Coverage for python/lsst/daf/butler/registry/_registry.py : 12%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "Registry",
26)
28from collections import defaultdict
29import contextlib
30import logging
31from typing import (
32 Any,
33 Dict,
34 Iterable,
35 Iterator,
36 List,
37 Mapping,
38 Optional,
39 Set,
40 Type,
41 TYPE_CHECKING,
42 Union,
43)
45import sqlalchemy
47from ..core import (
48 Config,
49 DataCoordinate,
50 DataCoordinateIterable,
51 DataId,
52 DatasetAssociation,
53 DatasetRef,
54 DatasetType,
55 ddl,
56 Dimension,
57 DimensionConfig,
58 DimensionElement,
59 DimensionGraph,
60 DimensionRecord,
61 DimensionUniverse,
62 NamedKeyMapping,
63 NameLookupMapping,
64 StorageClassFactory,
65 Timespan,
66)
67from . import queries
68from ..core.utils import doImport, iterable, transactional
69from ._config import RegistryConfig
70from ._collectionType import CollectionType
71from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
72from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis
73from .interfaces import ChainedCollectionRecord, RunRecord
74from .versions import ButlerVersionsManager, DigestMismatchError
76if TYPE_CHECKING:
77 from ..butlerConfig import ButlerConfig
78 from .interfaces import (
79 ButlerAttributeManager,
80 CollectionManager,
81 Database,
82 OpaqueTableStorageManager,
83 DimensionRecordStorageManager,
84 DatasetRecordStorageManager,
85 DatastoreRegistryBridgeManager,
86 )
89_LOG = logging.getLogger(__name__)
91# key for dimensions configuration in attributes table
92_DIMENSIONS_ATTR = "config:dimensions.json"
95class Registry:
96 """Registry interface.
98 Parameters
99 ----------
100 database : `Database`
101 Database instance to store Registry.
102 attributes : `type`
103 Manager class implementing `ButlerAttributeManager`.
104 opaque : `type`
105 Manager class implementing `OpaqueTableStorageManager`.
106 dimensions : `type`
107 Manager class implementing `DimensionRecordStorageManager`.
108 collections : `type`
109 Manager class implementing `CollectionManager`.
110 datasets : `type`
111 Manager class implementing `DatasetRecordStorageManager`.
112 datastoreBridges : `type`
113 Manager class implementing `DatastoreRegistryBridgeManager`.
114 dimensionConfig : `DimensionConfig`, optional
115 Dimension universe configuration, only used when ``create`` is True.
116 writeable : `bool`, optional
117 If True then Registry will support write operations.
118 create : `bool`, optional
119 If True then the database schema will be initialized; the database
120 must be empty before instantiating Registry.
121 """
123 defaultConfigFile: Optional[str] = None
124 """Path to configuration defaults. Accessed within the ``configs`` resource
125 or relative to a search path. Can be `None` if no defaults are specified.
126 """
128 @classmethod
129 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None,
130 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
131 butlerRoot: Optional[str] = None) -> Registry:
132 """Create registry database and return `Registry` instance.
134 This method initializes database contents; the database must be empty
135 prior to calling this method.
137 Parameters
138 ----------
139 config : `RegistryConfig` or `str`, optional
140 Registry configuration; if missing, the default configuration will
141 be loaded from registry.yaml.
142 dimensionConfig : `DimensionConfig` or `str`, optional
143 Dimension configuration; if missing, the default configuration
144 will be loaded from dimensions.yaml.
145 butlerRoot : `str`, optional
146 Path to the repository root this `Registry` will manage.
148 Returns
149 -------
150 registry : `Registry`
151 A new `Registry` instance.
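Examples
--------
A minimal usage sketch; the repository root below is an illustrative
assumption, and the default ``registry.yaml``/``dimensions.yaml``
configurations are used:
>>> from lsst.daf.butler.registry import Registry
>>> registry = Registry.createFromConfig(butlerRoot="/path/to/new/repo")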
152 """
153 if isinstance(config, str):
154 config = RegistryConfig(config)
155 elif config is None:
156 config = RegistryConfig()
157 elif not isinstance(config, RegistryConfig):
158 raise TypeError(f"Incompatible Registry configuration type: {type(config)}")
159 config.replaceRoot(butlerRoot)
161 if isinstance(dimensionConfig, str):
162 dimensionConfig = DimensionConfig(dimensionConfig)
163 elif dimensionConfig is None:
164 dimensionConfig = DimensionConfig()
165 elif not isinstance(dimensionConfig, DimensionConfig):
166 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
168 DatabaseClass = config.getDatabaseClass()
169 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
170 namespace=config.get("namespace"))
171 attributes = doImport(config["managers", "attributes"])
172 opaque = doImport(config["managers", "opaque"])
173 dimensions = doImport(config["managers", "dimensions"])
174 collections = doImport(config["managers", "collections"])
175 datasets = doImport(config["managers", "datasets"])
176 datastoreBridges = doImport(config["managers", "datastores"])
178 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque,
179 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges,
180 dimensionConfig=dimensionConfig, create=True)
182 @classmethod
183 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str],
184 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
185 """Create `Registry` subclass instance from `config`.
187 The registry database must be initialized prior to calling this method.
189 Parameters
190 ----------
191 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
192 Registry configuration.
193 butlerRoot : `str`, optional
194 Path to the repository root this `Registry` will manage.
195 writeable : `bool`, optional
196 If `True` (default) create a read-write connection to the database.
198 Returns
199 -------
200 registry : `Registry` (subclass)
201 A new `Registry` subclass instance.
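Examples
--------
A sketch of opening an existing registry read-only; the configuration
path is an illustrative assumption:
>>> from lsst.daf.butler.registry import Registry
>>> registry = Registry.fromConfig("/path/to/repo/butler.yaml",
...                                writeable=False)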
202 """
203 if not isinstance(config, RegistryConfig):
204 if isinstance(config, str) or isinstance(config, Config):
205 config = RegistryConfig(config)
206 else:
207 raise ValueError("Incompatible Registry configuration: {}".format(config))
208 config.replaceRoot(butlerRoot)
209 DatabaseClass = config.getDatabaseClass()
210 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
211 namespace=config.get("namespace"), writeable=writeable)
212 attributes = doImport(config["managers", "attributes"])
213 opaque = doImport(config["managers", "opaque"])
214 dimensions = doImport(config["managers", "dimensions"])
215 collections = doImport(config["managers", "collections"])
216 datasets = doImport(config["managers", "datasets"])
217 datastoreBridges = doImport(config["managers", "datastores"])
219 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque,
220 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges,
221 dimensionConfig=None, writeable=writeable)
223 def __init__(self, database: Database, *,
224 attributes: Type[ButlerAttributeManager],
225 opaque: Type[OpaqueTableStorageManager],
226 dimensions: Type[DimensionRecordStorageManager],
227 collections: Type[CollectionManager],
228 datasets: Type[DatasetRecordStorageManager],
229 datastoreBridges: Type[DatastoreRegistryBridgeManager],
230 dimensionConfig: Optional[DimensionConfig] = None,
231 writeable: bool = True,
232 create: bool = False):
233 self._db = database
234 self.storageClasses = StorageClassFactory()
236 # With existing registry we have to read dimensions config from
237 # database before we initialize all other managers.
238 if dimensionConfig is None:
239 assert not create, "missing DimensionConfig when create=True"
240 with self._db.declareStaticTables(create=False) as context:
241 self._attributes = attributes.initialize(self._db, context)
243 versions = ButlerVersionsManager(
244 self._attributes,
245 dict(attributes=self._attributes)
246 )
247 # verify that configured versions are compatible with schema
248 versions.checkManagersConfig()
249 versions.checkManagersVersions(writeable)
251 # get serialized as a string from database
252 dimensionsString = self._attributes.get(_DIMENSIONS_ATTR)
253 if dimensionsString is not None:
254 dimensionConfig = DimensionConfig(Config.fromString(dimensionsString, format="json"))
255 else:
256 raise LookupError(f"Registry attribute {_DIMENSIONS_ATTR} is missing from database")
258 # make universe
259 universe = DimensionUniverse(dimensionConfig)
261 with self._db.declareStaticTables(create=create) as context:
262 self._attributes = attributes.initialize(self._db, context)
263 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
264 self._collections = collections.initialize(self._db, context, dimensions=self._dimensions)
265 self._datasets = datasets.initialize(self._db, context,
266 collections=self._collections,
267 dimensions=self._dimensions)
268 self._opaque = opaque.initialize(self._db, context)
269 self._datastoreBridges = datastoreBridges.initialize(self._db, context,
270 opaque=self._opaque,
271 datasets=datasets,
272 universe=self._dimensions.universe)
273 versions = ButlerVersionsManager(
274 self._attributes,
275 dict(
276 attributes=self._attributes,
277 opaque=self._opaque,
278 dimensions=self._dimensions,
279 collections=self._collections,
280 datasets=self._datasets,
281 datastores=self._datastoreBridges,
282 )
283 )
284 # store managers and their versions in attributes table
285 context.addInitializer(lambda db: versions.storeManagersConfig())
286 context.addInitializer(lambda db: versions.storeManagersVersions())
287 # dump universe config as json into attributes (faster than YAML)
288 json = dimensionConfig.dump(format="json")
289 if json is not None:
290 # Convert Optional[str] to str for mypy
291 json_str = json
292 context.addInitializer(
293 lambda db: self._attributes.set(_DIMENSIONS_ATTR, json_str)
294 )
295 else:
296 raise RuntimeError("Unexpectedly failed to serialize DimensionConfig to JSON")
298 if not create:
299 # verify that configured versions are compatible with schema
300 versions.checkManagersConfig()
301 versions.checkManagersVersions(writeable)
302 try:
303 versions.checkManagersDigests()
304 except DigestMismatchError as exc:
305 # A digest mismatch is potentially a serious error, but during
306 # development it could be benign; treat it as a warning for
307 # now.
308 _LOG.warning(f"Registry schema digest mismatch: {exc}")
310 self._dimensions.refresh()
311 self._collections.refresh()
312 self._datasets.refresh()
314 def __str__(self) -> str:
315 return str(self._db)
317 def __repr__(self) -> str:
318 return f"Registry({self._db!r}, {self.dimensions!r})"
320 def isWriteable(self) -> bool:
321 """Return `True` if this registry allows write operations, and `False`
322 otherwise.
323 """
324 return self._db.isWriteable()
326 @property
327 def dimensions(self) -> DimensionUniverse:
328 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
329 """
330 return self._dimensions.universe
332 @contextlib.contextmanager
333 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
334 """Return a context manager that represents a transaction.
335 """
336 try:
337 with self._db.transaction(savepoint=savepoint):
338 yield
339 except BaseException:
340 # TODO: this clears the caches sometimes when we wouldn't actually
341 # need to. Can we avoid that?
342 self._dimensions.clearCaches()
343 raise
345 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
346 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
347 other data repository client.
349 Opaque table records can be added via `insertOpaqueData`, retrieved via
350 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
352 Parameters
353 ----------
354 tableName : `str`
355 Logical name of the opaque table. This may differ from the
356 actual name used in the database by a prefix and/or suffix.
357 spec : `ddl.TableSpec`
358 Specification for the table to be added.
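Examples
--------
A sketch of the opaque-table round trip; the table name and column
names are illustrative, and ``spec`` is assumed to be a `ddl.TableSpec`
built elsewhere:
>>> registry.registerOpaqueTable("my_datastore_records", spec)
>>> registry.insertOpaqueData("my_datastore_records",
...                           {"dataset_id": 42, "path": "a/b.fits"})
>>> rows = list(registry.fetchOpaqueData("my_datastore_records",
...                                      dataset_id=42))
>>> registry.deleteOpaqueData("my_datastore_records", dataset_id=42)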
359 """
360 self._opaque.register(tableName, spec)
362 @transactional
363 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
364 """Insert records into an opaque table.
366 Parameters
367 ----------
368 tableName : `str`
369 Logical name of the opaque table. Must match the name used in a
370 previous call to `registerOpaqueTable`.
371 data
372 Each additional positional argument is a dictionary that represents
373 a single row to be added.
374 """
375 self._opaque[tableName].insert(*data)
377 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
378 """Retrieve records from an opaque table.
380 Parameters
381 ----------
382 tableName : `str`
383 Logical name of the opaque table. Must match the name used in a
384 previous call to `registerOpaqueTable`.
385 where
386 Additional keyword arguments are interpreted as equality
387 constraints that restrict the returned rows (combined with AND);
388 keyword arguments are column names and values are the values they
389 must have.
391 Yields
392 ------
393 row : `dict`
394 A dictionary representing a single result row.
395 """
396 yield from self._opaque[tableName].fetch(**where)
398 @transactional
399 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
400 """Remove records from an opaque table.
402 Parameters
403 ----------
404 tableName : `str`
405 Logical name of the opaque table. Must match the name used in a
406 previous call to `registerOpaqueTable`.
407 where
408 Additional keyword arguments are interpreted as equality
409 constraints that restrict the deleted rows (combined with AND);
410 keyword arguments are column names and values are the values they
411 must have.
412 """
413 self._opaque[tableName].delete(**where)
415 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED,
416 doc: Optional[str] = None) -> None:
417 """Add a new collection if one with the given name does not exist.
419 Parameters
420 ----------
421 name : `str`
422 The name of the collection to create.
423 type : `CollectionType`
424 Enum value indicating the type of collection to create.
425 doc : `str`, optional
426 Documentation string for the collection.
428 Notes
429 -----
430 This method cannot be called within transactions, as it needs to be
431 able to perform its own transaction to be concurrent.
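Examples
--------
A sketch of creating a tagged collection and checking its type; the
collection name and documentation string are illustrative:
>>> from lsst.daf.butler.registry import CollectionType
>>> registry.registerCollection("my/tagged", CollectionType.TAGGED,
...                             doc="Hand-picked datasets.")
>>> registry.getCollectionType("my/tagged") is CollectionType.TAGGED
True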
432 """
433 self._collections.register(name, type, doc=doc)
435 def getCollectionType(self, name: str) -> CollectionType:
436 """Return an enumeration value indicating the type of the given
437 collection.
439 Parameters
440 ----------
441 name : `str`
442 The name of the collection.
444 Returns
445 -------
446 type : `CollectionType`
447 Enum value indicating the type of this collection.
449 Raises
450 ------
451 MissingCollectionError
452 Raised if no collection with the given name exists.
453 """
454 return self._collections.find(name).type
456 def registerRun(self, name: str, doc: Optional[str] = None) -> None:
457 """Add a new run if one with the given name does not exist.
459 Parameters
460 ----------
461 name : `str`
462 The name of the run to create.
463 doc : `str`, optional
464 Documentation string for the collection.
466 Notes
467 -----
468 This method cannot be called within transactions, as it needs to be
469 able to perform its own transaction to be concurrent.
470 """
471 self._collections.register(name, CollectionType.RUN, doc=doc)
473 @transactional
474 def removeCollection(self, name: str) -> None:
475 """Completely remove the given collection.
477 Parameters
478 ----------
479 name : `str`
480 The name of the collection to remove.
482 Raises
483 ------
484 MissingCollectionError
485 Raised if no collection with the given name exists.
487 Notes
488 -----
489 If this is a `~CollectionType.RUN` collection, all datasets and quanta
490 in it are also fully removed. This requires that those datasets be
491 removed (or at least trashed) from any datastores that hold them first.
493 A collection may not be deleted as long as it is referenced by a
494 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
495 be deleted or redefined first.
496 """
497 self._collections.remove(name)
499 def getCollectionChain(self, parent: str) -> CollectionSearch:
500 """Return the child collections in a `~CollectionType.CHAINED`
501 collection.
503 Parameters
504 ----------
505 parent : `str`
506 Name of the chained collection. Must have already been added via
507 a call to `Registry.registerCollection`.
509 Returns
510 -------
511 children : `CollectionSearch`
512 An object that defines the search path of the collection.
513 See :ref:`daf_butler_collection_expressions` for more information.
515 Raises
516 ------
517 MissingCollectionError
518 Raised if ``parent`` does not exist in the `Registry`.
519 TypeError
520 Raised if ``parent`` does not correspond to a
521 `~CollectionType.CHAINED` collection.
522 """
523 record = self._collections.find(parent)
524 if record.type is not CollectionType.CHAINED:
525 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
526 assert isinstance(record, ChainedCollectionRecord)
527 return record.children
529 @transactional
530 def setCollectionChain(self, parent: str, children: Any) -> None:
531 """Define or redefine a `~CollectionType.CHAINED` collection.
533 Parameters
534 ----------
535 parent : `str`
536 Name of the chained collection. Must have already been added via
537 a call to `Registry.registerCollection`.
538 children : `Any`
539 An expression defining an ordered search of child collections,
540 generally an iterable of `str`; see
541 :ref:`daf_butler_collection_expressions` for more information.
543 Raises
544 ------
545 MissingCollectionError
546 Raised when any of the given collections do not exist in the
547 `Registry`.
548 TypeError
549 Raised if ``parent`` does not correspond to a
550 `~CollectionType.CHAINED` collection.
551 ValueError
552 Raised if the given collections contain a cycle.
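Examples
--------
A sketch of defining a chained collection that searches two existing
collections in order; all collection names are illustrative:
>>> from lsst.daf.butler.registry import CollectionType
>>> registry.registerCollection("HSC/defaults", CollectionType.CHAINED)
>>> registry.setCollectionChain("HSC/defaults",
...                             ["HSC/runs/w_2021_10", "HSC/calib"])
>>> children = registry.getCollectionChain("HSC/defaults")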
553 """
554 record = self._collections.find(parent)
555 if record.type is not CollectionType.CHAINED:
556 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
557 assert isinstance(record, ChainedCollectionRecord)
558 children = CollectionSearch.fromExpression(children)
559 if children != record.children:
560 record.update(self._collections, children)
562 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
563 """Retrieve the documentation string for a collection.
565 Parameters
566 ----------
567 collection : `str`
568 Name of the collection.
570 Returns
571 -------
572 docs : `str` or `None`
573 Docstring for the collection with the given name.
574 """
575 return self._collections.getDocumentation(self._collections.find(collection).key)
577 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
578 """Set the documentation string for a collection.
580 Parameters
581 ----------
582 collection : `str`
583 Name of the collection.
584 doc : `str` or `None`
585 Docstring for the collection with the given name; will replace any
586 existing docstring. Passing `None` will remove any existing
587 docstring.
588 """
589 self._collections.setDocumentation(self._collections.find(collection).key, doc)
591 def registerDatasetType(self, datasetType: DatasetType) -> bool:
592 """
593 Add a new `DatasetType` to the Registry.
595 It is not an error to register the same `DatasetType` twice.
597 Parameters
598 ----------
599 datasetType : `DatasetType`
600 The `DatasetType` to be added.
602 Returns
603 -------
604 inserted : `bool`
605 `True` if ``datasetType`` was inserted, `False` if an identical
606 existing `DatasetType` was found. Note that in either case the
607 DatasetType is guaranteed to be defined in the Registry
608 consistently with the given definition.
610 Raises
611 ------
612 ValueError
613 Raised if the dimensions or storage class are invalid.
614 ConflictingDefinitionError
615 Raised if this DatasetType is already registered with a different
616 definition.
618 Notes
619 -----
620 This method cannot be called within transactions, as it needs to be
621 able to perform its own transaction to be concurrent.
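Examples
--------
A sketch of registering a dataset type; ``datasetType`` is assumed to
be a `DatasetType` constructed elsewhere from a name, dimensions, and
a storage class:
>>> registry.registerDatasetType(datasetType)
True
>>> # Registering the identical definition again is not an error.
>>> registry.registerDatasetType(datasetType)
False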
622 """
623 _, inserted = self._datasets.register(datasetType)
624 return inserted
626 def removeDatasetType(self, name: str) -> None:
627 """Remove the named `DatasetType` from the registry.
629 .. warning::
631 Registry caches the dataset type definitions. This means that
632 deleting the dataset type definition may result in unexpected
633 behavior from other active butler processes that have not seen
634 the deletion.
636 Parameters
637 ----------
638 name : `str`
639 Name of the type to be removed.
641 Raises
642 ------
643 lsst.daf.butler.registry.OrphanedRecordError
644 Raised if an attempt is made to remove the dataset type definition
645 when there are already datasets associated with it.
647 Notes
648 -----
649 If the dataset type is not registered the method will return without
650 action.
651 """
652 self._datasets.remove(name)
654 def getDatasetType(self, name: str) -> DatasetType:
655 """Get the `DatasetType`.
657 Parameters
658 ----------
659 name : `str`
660 Name of the type.
662 Returns
663 -------
664 type : `DatasetType`
665 The `DatasetType` associated with the given name.
667 Raises
668 ------
669 KeyError
670 Raised if the requested DatasetType could not be found in the registry.
671 """
672 return self._datasets[name].datasetType
674 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
675 collections: Any, timespan: Optional[Timespan] = None,
676 **kwargs: Any) -> Optional[DatasetRef]:
677 """Find a dataset given its `DatasetType` and data ID.
679 This can be used to obtain a `DatasetRef` that permits the dataset to
680 be read from a `Datastore`. If the dataset is a component and cannot
681 be found using the provided dataset type, a dataset ref for the parent
682 will be returned instead but with the correct dataset type.
684 Parameters
685 ----------
686 datasetType : `DatasetType` or `str`
687 A `DatasetType` or the name of one.
688 dataId : `dict` or `DataCoordinate`, optional
689 A `dict`-like object containing the `Dimension` links that identify
690 the dataset within a collection.
691 collections
692 An expression that fully or partially identifies the collections to
693 search for the dataset; see
694 :ref:`daf_butler_collection_expressions` for more information.
695 timespan : `Timespan`, optional
696 A timespan that the validity range of the dataset must overlap.
697 If not provided, any `~CollectionType.CALIBRATION` collections
698 matched by the ``collections`` argument will not be searched.
699 **kwargs
700 Additional keyword arguments passed to
701 `DataCoordinate.standardize` to convert ``dataId`` to a true
702 `DataCoordinate` or augment an existing one.
704 Returns
705 -------
706 ref : `DatasetRef` or `None`
707 A reference to the dataset, or `None` if no matching Dataset
708 was found.
710 Raises
711 ------
712 LookupError
713 Raised if one or more data ID keys are missing.
714 KeyError
715 Raised if the dataset type does not exist.
716 MissingCollectionError
717 Raised if any of ``collections`` does not exist in the registry.
719 Notes
720 -----
721 This method simply returns `None` and does not raise an exception even
722 when the set of collections searched is intrinsically incompatible with
723 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
724 only `~CollectionType.CALIBRATION` collections are being searched.
725 This may make it harder to debug some lookup failures, but the behavior
726 is intentional; we consider it more important that failed searches are
727 reported consistently, regardless of the reason, and that adding
728 additional collections that do not contain a match to the search path
729 never changes the behavior.
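Examples
--------
A sketch of looking up a single dataset; the dataset type name,
collection, and data ID values are illustrative:
>>> ref = registry.findDataset("calexp", instrument="HSC", visit=903334,
...                            detector=16, collections="HSC/runs/RC2")
>>> if ref is not None:
...     print(ref.dataId)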
730 """
731 if isinstance(datasetType, DatasetType):
732 storage = self._datasets[datasetType.name]
733 else:
734 storage = self._datasets[datasetType]
735 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
736 universe=self.dimensions, **kwargs)
737 collections = CollectionSearch.fromExpression(collections)
738 for collectionRecord in collections.iter(self._collections):
739 if (collectionRecord.type is CollectionType.CALIBRATION
740 and (not storage.datasetType.isCalibration() or timespan is None)):
741 continue
742 result = storage.find(collectionRecord, dataId, timespan=timespan)
743 if result is not None:
744 return result
746 return None
748 @transactional
749 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
750 run: str) -> List[DatasetRef]:
751 """Insert one or more datasets into the `Registry`
753 This always adds new datasets; to associate existing datasets with
754 a new collection, use ``associate``.
756 Parameters
757 ----------
758 datasetType : `DatasetType` or `str`
759 A `DatasetType` or the name of one.
760 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
761 Dimension-based identifiers for the new datasets.
762 run : `str`
763 The name of the run that produced the datasets.
765 Returns
766 -------
767 refs : `list` of `DatasetRef`
768 Resolved `DatasetRef` instances for all given data IDs (in the same
769 order).
771 Raises
772 ------
773 ConflictingDefinitionError
774 If a dataset with the same dataset type and data ID as one of those
775 given already exists in ``run``.
776 MissingCollectionError
777 Raised if ``run`` does not exist in the registry.
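Examples
--------
A sketch of inserting datasets into a run; the run name, dataset type,
and data ID values are illustrative, and the dataset type and dimension
records are assumed to exist already:
>>> registry.registerRun("HSC/raw/my_ingest")
>>> refs = registry.insertDatasets(
...     "raw",
...     dataIds=[{"instrument": "HSC", "exposure": 903334, "detector": 16}],
...     run="HSC/raw/my_ingest",
... )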
778 """
779 if isinstance(datasetType, DatasetType):
780 storage = self._datasets.find(datasetType.name)
781 if storage is None:
782 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
783 else:
784 storage = self._datasets.find(datasetType)
785 if storage is None:
786 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
787 runRecord = self._collections.find(run)
788 if runRecord.type is not CollectionType.RUN:
789 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
790 assert isinstance(runRecord, RunRecord)
791 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
792 for dataId in dataIds]
793 try:
794 refs = list(storage.insert(runRecord, expandedDataIds))
795 except sqlalchemy.exc.IntegrityError as err:
796 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
797 f"one or more datasets of type {storage.datasetType} into "
798 f"collection '{run}'. "
799 f"This probably means a dataset with the same data ID "
800 f"and dataset type already exists, but it may also mean a "
801 f"dimension row is missing.") from err
802 return refs
804 def getDataset(self, id: int) -> Optional[DatasetRef]:
805 """Retrieve a Dataset entry.
807 Parameters
808 ----------
809 id : `int`
810 The unique identifier for the dataset.
812 Returns
813 -------
814 ref : `DatasetRef` or `None`
815 A ref to the Dataset, or `None` if no matching Dataset
816 was found.
817 """
818 ref = self._datasets.getDatasetRef(id)
819 if ref is None:
820 return None
821 return ref
823 @transactional
824 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
825 """Remove datasets from the Registry.
827 The datasets will be removed unconditionally from all collections, and
828 any `Quantum` that consumed this dataset will instead be marked with
829 having a NULL input. `Datastore` records will *not* be deleted; the
830 caller is responsible for ensuring that the dataset has already been
831 removed from all Datastores.
833 Parameters
834 ----------
835 refs : `Iterable` of `DatasetRef`
836 References to the datasets to be removed. Must include a valid
837 ``id`` attribute, and should be considered invalidated upon return.
839 Raises
840 ------
841 AmbiguousDatasetError
842 Raised if any ``ref.id`` is `None`.
843 OrphanedRecordError
844 Raised if any dataset is still present in any `Datastore`.
845 """
846 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
847 storage = self._datasets.find(datasetType.name)
848 assert storage is not None
849 try:
850 storage.delete(refsForType)
851 except sqlalchemy.exc.IntegrityError as err:
852 raise OrphanedRecordError("One or more datasets is still "
853 "present in one or more Datastores.") from err
855 @transactional
856 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
857 """Add existing datasets to a `~CollectionType.TAGGED` collection.
859 If a DatasetRef with the same exact integer ID is already in a
860 collection nothing is changed. If a `DatasetRef` with the same
861 `DatasetType` and data ID but with different integer ID
862 exists in the collection, `ConflictingDefinitionError` is raised.
864 Parameters
865 ----------
866 collection : `str`
867 Indicates the collection the datasets should be associated with.
868 refs : `Iterable` [ `DatasetRef` ]
869 An iterable of resolved `DatasetRef` instances that already exist
870 in this `Registry`.
872 Raises
873 ------
874 ConflictingDefinitionError
875 If a Dataset with the given `DatasetRef` already exists in the
876 given collection.
877 AmbiguousDatasetError
878 Raised if ``any(ref.id is None for ref in refs)``.
879 MissingCollectionError
880 Raised if ``collection`` does not exist in the registry.
881 TypeError
882 Raised if adding new datasets to the given ``collection`` is not
883 allowed.
884 """
885 collectionRecord = self._collections.find(collection)
886 if collectionRecord.type is not CollectionType.TAGGED:
887 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
888 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
889 storage = self._datasets.find(datasetType.name)
890 assert storage is not None
891 try:
892 storage.associate(collectionRecord, refsForType)
893 except sqlalchemy.exc.IntegrityError as err:
894 raise ConflictingDefinitionError(
895 f"Constraint violation while associating dataset of type {datasetType.name} with "
896 f"collection {collection}. This probably means that one or more datasets with the same "
897 f"dataset type and data ID already exist in the collection, but it may also indicate "
898 f"that the datasets do not exist."
899 ) from err
901 @transactional
902 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
903 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
905 ``collection`` and ``ref`` combinations that are not currently
906 associated are silently ignored.
908 Parameters
909 ----------
910 collection : `str`
911 The collection the datasets should no longer be associated with.
912 refs : `Iterable` [ `DatasetRef` ]
913 An iterable of resolved `DatasetRef` instances that already exist
914 in this `Registry`.
916 Raises
917 ------
918 AmbiguousDatasetError
919 Raised if any of the given dataset references is unresolved.
920 MissingCollectionError
921 Raised if ``collection`` does not exist in the registry.
922 TypeError
923 Raised if removing datasets from the given ``collection`` is not
924 allowed.
925 """
926 collectionRecord = self._collections.find(collection)
927 if collectionRecord.type is not CollectionType.TAGGED:
928 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
929 "expected TAGGED.")
930 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
931 storage = self._datasets.find(datasetType.name)
932 assert storage is not None
933 storage.disassociate(collectionRecord, refsForType)
935 @transactional
936 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
937 """Associate one or more datasets with a calibration collection and a
938 validity range within it.
940 Parameters
941 ----------
942 collection : `str`
943 The name of an already-registered `~CollectionType.CALIBRATION`
944 collection.
945 refs : `Iterable` [ `DatasetRef` ]
946 Datasets to be associated.
947 timespan : `Timespan`
948 The validity range for these datasets within the collection.
950 Raises
951 ------
952 AmbiguousDatasetError
953 Raised if any of the given `DatasetRef` instances is unresolved.
954 ConflictingDefinitionError
955 Raised if the collection already contains a different dataset with
956 the same `DatasetType` and data ID and an overlapping validity
957 range.
958 TypeError
959 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
960 collection or if one or more datasets are of a dataset type for
961 which `DatasetType.isCalibration` returns `False`.
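Examples
--------
A sketch of certifying already-inserted datasets into a calibration
collection; ``refs`` and ``timespan`` are assumed to be built elsewhere
and the collection name is illustrative:
>>> from lsst.daf.butler.registry import CollectionType
>>> registry.registerCollection("HSC/calib", CollectionType.CALIBRATION)
>>> registry.certify("HSC/calib", refs, timespan)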
962 """
963 collectionRecord = self._collections.find(collection)
964 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
965 storage = self._datasets[datasetType.name]
966 storage.certify(collectionRecord, refsForType, timespan)
968 @transactional
969 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *,
970 dataIds: Optional[Iterable[DataId]] = None) -> None:
971 """Remove or adjust datasets to clear a validity range within a
972 calibration collection.
974 Parameters
975 ----------
976 collection : `str`
977 The name of an already-registered `~CollectionType.CALIBRATION`
978 collection.
979 datasetType : `str` or `DatasetType`
980 Name or `DatasetType` instance for the datasets to be decertified.
981 timespan : `Timespan`
982 The validity range to remove datasets from within the collection.
983 Datasets that overlap this range but are not contained by it will
984 have their validity ranges adjusted to not overlap it, which may
985 split a single dataset validity range into two.
986 dataIds : `Iterable` [ `DataId` ], optional
987 Data IDs that should be decertified within the given validity range.
988 If `None`, all data IDs for ``datasetType`` will be
989 decertified.
991 Raises
992 ------
993 TypeError
994 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
995 collection or if ``datasetType.isCalibration() is False``.
996 """
997 collectionRecord = self._collections.find(collection)
998 if isinstance(datasetType, str):
999 storage = self._datasets[datasetType]
1000 else:
1001 storage = self._datasets[datasetType.name]
1002 standardizedDataIds = None
1003 if dataIds is not None:
1004 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions)
1005 for d in dataIds]
1006 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
1008 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
1009 """Return an object that allows a new `Datastore` instance to
1010 communicate with this `Registry`.
1012 Returns
1013 -------
1014 manager : `DatastoreRegistryBridgeManager`
1015 Object that mediates communication between this `Registry` and its
1016 associated datastores.
1017 """
1018 return self._datastoreBridges
1020 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
1021 """Retrieve datastore locations for a given dataset.
1023 Parameters
1024 ----------
1025 ref : `DatasetRef`
1026 A reference to the dataset for which to retrieve storage
1027 information.
1029 Returns
1030 -------
1031 datastores : `Iterable` [ `str` ]
1032 All the matching datastores holding this dataset.
1034 Raises
1035 ------
1036 AmbiguousDatasetError
1037 Raised if ``ref.id`` is `None`.
1038 """
1039 return self._datastoreBridges.findDatastores(ref)
1041 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
1042 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
1043 **kwargs: Any) -> DataCoordinate:
1044 """Expand a dimension-based data ID to include additional information.
1046 Parameters
1047 ----------
1048 dataId : `DataCoordinate` or `dict`, optional
1049 Data ID to be expanded; augmented and overridden by ``kwargs``.
1050 graph : `DimensionGraph`, optional
1051 Set of dimensions for the expanded ID. If `None`, the dimensions
1052 will be inferred from the keys of ``dataId`` and ``kwargs``.
1053 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph``
1054 are silently ignored, providing a way to extract and expand a
1055 subset of a data ID.
1056 records : `Mapping` [`str`, `DimensionRecord`], optional
1057 Dimension record data to use before querying the database for that
1058 data, keyed by element name.
1059 **kwargs
1060 Additional keywords are treated like additional key-value pairs for
1061 ``dataId``, extending and overriding it.
1063 Returns
1064 -------
1065 expanded : `DataCoordinate`
1066 A data ID that includes full metadata for all of the dimensions it
1067 identifies, i.e. guarantees that ``expanded.hasRecords()`` and
1068 ``expanded.hasFull()`` both return `True`.
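Examples
--------
A sketch of expanding a minimal data ID; the instrument and detector
values are illustrative and their dimension records must already exist:
>>> dataId = registry.expandDataId(instrument="HSC", detector=16)
>>> dataId.hasFull(), dataId.hasRecords()
(True, True)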
1069 """
1070 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs)
1071 if standardized.hasRecords():
1072 return standardized
1073 if records is None:
1074 records = {}
1075 elif isinstance(records, NamedKeyMapping):
1076 records = records.byName()
1077 else:
1078 records = dict(records)
1079 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
1080 records.update(dataId.records.byName())
1081 keys = standardized.byName()
1082 for element in standardized.graph.primaryKeyTraversalOrder:
1083 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
1084 if record is ...:
1085 if isinstance(element, Dimension) and keys.get(element.name) is None:
1086 if element in standardized.graph.required:
1087 raise LookupError(
1088 f"No value or null value for required dimension {element.name}."
1089 )
1090 keys[element.name] = None
1091 record = None
1092 else:
1093 storage = self._dimensions[element]
1094 dataIdSet = DataCoordinateIterable.fromScalar(
1095 DataCoordinate.standardize(keys, graph=element.graph)
1096 )
1097 fetched = tuple(storage.fetch(dataIdSet))
1098 try:
1099 (record,) = fetched
1100 except ValueError:
1101 record = None
1102 records[element.name] = record
1103 if record is not None:
1104 for d in element.implied:
1105 value = getattr(record, d.name)
1106 if keys.setdefault(d.name, value) != value:
1107 raise InconsistentDataIdError(
1108 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
1109 f"but {element.name} implies {d.name}={value!r}."
1110 )
1111 else:
1112 if element in standardized.graph.required:
1113 raise LookupError(
1114 f"Could not fetch record for required dimension {element.name} via keys {keys}."
1115 )
1116 if element.alwaysJoin:
1117 raise InconsistentDataIdError(
1118 f"Could not fetch record for element {element.name} via keys {keys}, ",
1119 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
1120 "related."
1121 )
1122 for d in element.implied:
1123 keys.setdefault(d.name, None)
1124 records.setdefault(d.name, None)
1125 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
1127 def insertDimensionData(self, element: Union[DimensionElement, str],
1128 *data: Union[Mapping[str, Any], DimensionRecord],
1129 conform: bool = True) -> None:
1130 """Insert one or more dimension records into the database.
1132 Parameters
1133 ----------
1134 element : `DimensionElement` or `str`
1135 The `DimensionElement` or name thereof that identifies the table
1136 records will be inserted into.
1137 data : `dict` or `DimensionRecord` (variadic)
1138 One or more records to insert.
1139 conform : `bool`, optional
1140 If `False` (`True` is default) perform no checking or conversions,
1141 and assume that ``element`` is a `DimensionElement` instance and
1142 ``data`` is one or more `DimensionRecord` instances of the
1143 appropriate subclass.
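Examples
--------
A sketch of inserting a record as a plain mapping; the field names
shown depend on the configured dimension universe and are illustrative:
>>> registry.insertDimensionData(
...     "instrument",
...     {"name": "HSC", "visit_max": 100000000, "exposure_max": 100000000,
...      "detector_max": 200, "class_name": "lsst.obs.subaru.HyperSuprimeCam"},
... )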
1144 """
1145 if conform:
1146 if isinstance(element, str):
1147 element = self.dimensions[element]
1148 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
1149 for row in data]
1150 else:
1151 # Ignore typing since caller said to trust them with conform=False.
1152 records = data # type: ignore
1153 storage = self._dimensions[element] # type: ignore
1154 storage.insert(*records)
1156 def syncDimensionData(self, element: Union[DimensionElement, str],
1157 row: Union[Mapping[str, Any], DimensionRecord],
1158 conform: bool = True) -> bool:
1159 """Synchronize the given dimension record with the database, inserting
1160 if it does not already exist and comparing values if it does.
1162 Parameters
1163 ----------
1164 element : `DimensionElement` or `str`
1165 The `DimensionElement` or name thereof that identifies the table
1166 records will be inserted into.
1167 row : `dict` or `DimensionRecord`
1168 The record to insert.
1169 conform : `bool`, optional
1170 If `False` (`True` is default) perform no checking or conversions,
1171 and assume that ``element`` is a `DimensionElement` instance and
1172 ``row`` is a `DimensionRecord` instance of the appropriate
1173 subclass.
1175 Returns
1176 -------
1177 inserted : `bool`
1178 `True` if a new row was inserted, `False` otherwise.
1180 Raises
1181 ------
1182 ConflictingDefinitionError
1183 Raised if the record exists in the database (according to primary
1184 key lookup) but is inconsistent with the given one.
1185 """
1186 if conform:
1187 if isinstance(element, str):
1188 element = self.dimensions[element]
1189 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
1190 else:
1191 # Ignore typing since caller said to trust them with conform=False.
1192 record = row # type: ignore
1193 storage = self._dimensions[element] # type: ignore
1194 return storage.sync(record)
1196 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
1197 ) -> Iterator[DatasetType]:
1198 """Iterate over the dataset types whose names match an expression.
1200 Parameters
1201 ----------
1202 expression : `Any`, optional
1203 An expression that fully or partially identifies the dataset types
1204 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1205 `...` can be used to return all dataset types, and is the default.
1206 See :ref:`daf_butler_dataset_type_expressions` for more
1207 information.
1208 components : `bool`, optional
1209 If `True`, apply all expression patterns to component dataset type
1210 names as well. If `False`, never apply patterns to components.
1211 If `None` (default), apply patterns to components only if their
1212 parent datasets were not matched by the expression.
1213 Fully-specified component datasets (`str` or `DatasetType`
1214 instances) are always included.
1216 Yields
1217 ------
1218 datasetType : `DatasetType`
1219 A `DatasetType` instance whose name matches ``expression``.
1220 """
1221 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
1222 if wildcard is Ellipsis:
1223 for datasetType in self._datasets:
1224 # The dataset type can no longer be a component
1225 yield datasetType
1226 if components and datasetType.isComposite():
1227 # Automatically create the component dataset types
1228 for component in datasetType.makeAllComponentDatasetTypes():
1229 yield component
1230 return
1231 done: Set[str] = set()
1232 for name in wildcard.strings:
1233 storage = self._datasets.find(name)
1234 if storage is not None:
1235 done.add(storage.datasetType.name)
1236 yield storage.datasetType
1237 if wildcard.patterns:
1238 # If components (the argument) is None, we'll save component
1239 # dataset types that we might want to match, but only if their
1240 # parents didn't get included.
1241 componentsForLater = []
1242 for registeredDatasetType in self._datasets:
1243 # Components are not stored in registry so expand them here
1244 allDatasetTypes = [registeredDatasetType] \
1245 + registeredDatasetType.makeAllComponentDatasetTypes()
1246 for datasetType in allDatasetTypes:
1247 if datasetType.name in done:
1248 continue
1249 parentName, componentName = datasetType.nameAndComponent()
1250 if componentName is not None and not components:
1251 if components is None and parentName not in done:
1252 componentsForLater.append(datasetType)
1253 continue
1254 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1255 done.add(datasetType.name)
1256 yield datasetType
1257 # Go back and try to match saved components.
1258 for datasetType in componentsForLater:
1259 parentName, _ = datasetType.nameAndComponent()
1260 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1261 yield datasetType
1263 def queryCollections(self, expression: Any = ...,
1264 datasetType: Optional[DatasetType] = None,
1265 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1266 flattenChains: bool = False,
1267 includeChains: Optional[bool] = None) -> Iterator[str]:
1268 """Iterate over the collections whose names match an expression.
1270 Parameters
1271 ----------
1272 expression : `Any`, optional
1273 An expression that fully or partially identifies the collections
1274 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1275 `...` can be used to return all collections, and is the default.
1276 See :ref:`daf_butler_collection_expressions` for more
1277 information.
1278 datasetType : `DatasetType`, optional
1279 If provided, only yield collections that may contain datasets of
1280 this type. This is a conservative approximation in general; it may
1281 yield collections that do not have any such datasets.
1282 collectionTypes : `AbstractSet` [ `CollectionType` ], optional
1283 If provided, only yield collections of these types.
1284 flattenChains : `bool`, optional
1285 If `True` (`False` is default), recursively yield the child
1286 collections of matching `~CollectionType.CHAINED` collections.
1287 includeChains : `bool`, optional
1288 If `True`, yield records for matching `~CollectionType.CHAINED`
1289 collections. Default is the opposite of ``flattenChains``: include
1290 either CHAINED collections or their children, but not both.
1292 Yields
1293 ------
1294 collection : `str`
1295 The name of a collection that matches ``expression``.
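Examples
--------
A sketch of listing RUN collections whose names match a pattern; the
pattern is illustrative:
>>> import re
>>> from lsst.daf.butler.registry import CollectionType
>>> runs = list(registry.queryCollections(re.compile(r"HSC/runs/.*"),
...                                       collectionTypes={CollectionType.RUN}))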
1296 """
1297 # Right now the datasetType argument is completely ignored, but that
1298 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
1299 # ticket will take care of that.
1300 query = CollectionQuery.fromExpression(expression)
1301 for record in query.iter(self._collections, collectionTypes=frozenset(collectionTypes),
1302 flattenChains=flattenChains, includeChains=includeChains):
1303 yield record.name
1305 def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder:
1306 """Return a `QueryBuilder` instance capable of constructing and
1307 managing more complex queries than those obtainable via `Registry`
1308 interfaces.
1310 This is an advanced interface; downstream code should prefer
1311 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
1312 are sufficient.
1314 Parameters
1315 ----------
1316 summary : `queries.QuerySummary`
1317 Object describing and categorizing the full set of dimensions that
1318 will be included in the query.
1320 Returns
1321 -------
1322 builder : `queries.QueryBuilder`
1323 Object that can be used to construct and perform advanced queries.
1324 """
1325 return queries.QueryBuilder(
1326 summary,
1327 queries.RegistryManagers(
1328 collections=self._collections,
1329 dimensions=self._dimensions,
1330 datasets=self._datasets
1331 )
1332 )
1334 def queryDatasets(self, datasetType: Any, *,
1335 collections: Any,
1336 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1337 dataId: Optional[DataId] = None,
1338 where: Optional[str] = None,
1339 findFirst: bool = False,
1340 components: Optional[bool] = None,
1341 **kwargs: Any) -> queries.DatasetQueryResults:
1342 """Query for and iterate over dataset references matching user-provided
1343 criteria.
1345 Parameters
1346 ----------
1347 datasetType
1348 An expression that fully or partially identifies the dataset types
1349 to be queried. Allowed types include `DatasetType`, `str`,
1350 `re.Pattern`, and iterables thereof. The special value `...` can
1351 be used to query all dataset types. See
1352 :ref:`daf_butler_dataset_type_expressions` for more information.
1353 collections
1354 An expression that fully or partially identifies the collections
1355 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1356 thereof. `...` can be used to find datasets from all
1357 `~CollectionType.RUN` collections (no other collections are
1358 necessary, because all datasets are in a ``RUN`` collection). See
1359 :ref:`daf_butler_collection_expressions` for more information.
1360 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1361 Dimensions to include in the query (in addition to those used
1362 to identify the queried dataset type(s)), either to constrain
1363 the resulting datasets to those for which a matching dimension
1364 exists, or to relate the dataset type's dimensions to dimensions
1365 referenced by the ``dataId`` or ``where`` arguments.
1366 dataId : `dict` or `DataCoordinate`, optional
1367 A data ID whose key-value pairs are used as equality constraints
1368 in the query.
1369 where : `str`, optional
1370 A string expression similar to a SQL WHERE clause. May involve
1371 any column of a dimension table or (as a shortcut for the primary
1372 key column of a dimension table) dimension name. See
1373 :ref:`daf_butler_dimension_expressions` for more information.
1374 findFirst : `bool`, optional
1375 If `True` (`False` is default), for each result data ID, only
1376 yield one `DatasetRef` of each `DatasetType`, from the first
1377 collection in which a dataset of that dataset type appears
1378 (according to the order of ``collections`` passed in). If `True`,
1379 ``collections`` must not contain regular expressions and may not
1380 be `...`.
1381 components : `bool`, optional
1382 If `True`, apply all dataset expression patterns to component
1383 dataset type names as well. If `False`, never apply patterns to
1384 components. If `None` (default), apply patterns to components only
1385 if their parent datasets were not matched by the expression.
1386 Fully-specified component datasets (`str` or `DatasetType`
1387 instances) are always included.
1388 **kwargs
1389 Additional keyword arguments are forwarded to
1390 `DataCoordinate.standardize` when processing the ``dataId``
1391 argument (and may be used to provide a constraining data ID even
1392 when the ``dataId`` argument is `None`).
1394 Returns
1395 -------
1396 refs : `queries.DatasetQueryResults`
1397 Dataset references matching the given query criteria.
1399 Raises
1400 ------
1401 TypeError
1402 Raised when the arguments are incompatible, such as when a
1403 collection wildcard is passed when ``findFirst`` is `True`.
1405 Notes
1406 -----
1407 When multiple dataset types are queried in a single call, the
1408 results of this operation are equivalent to querying for each dataset
1409 type separately in turn, and no information about the relationships
1410 between datasets of different types is included. In contexts where
1411 that kind of information is important, the recommended pattern is to
1412 use `queryDataIds` to first obtain data IDs (possibly with the
1413 desired dataset types and collections passed as constraints to the
1414 query), and then use multiple (generally much simpler) calls to
1415 `queryDatasets` with the returned data IDs passed as constraints.
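Examples
--------
A sketch of a find-first dataset query constrained by a string
expression; the dataset type, collections, and expression are
illustrative:
>>> refs = registry.queryDatasets(
...     "calexp",
...     collections=["HSC/runs/RC2", "HSC/raw/all"],
...     where="instrument='HSC' AND detector=16",
...     findFirst=True,
... )
>>> for ref in refs:
...     print(ref.dataId)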
1416 """
1417 # Standardize the collections expression.
1418 if findFirst:
1419 collections = CollectionSearch.fromExpression(collections)
1420 else:
1421 collections = CollectionQuery.fromExpression(collections)
1422 # Standardize and expand the data ID provided as a constraint.
1423 standardizedDataId = self.expandDataId(dataId, **kwargs)
1425 # We can only query directly if given a non-component DatasetType
1426 # instance. If we were given an expression or str or a component
1427 # DatasetType instance, we'll populate this dict, recurse, and return.
1428 # If we already have a non-component DatasetType, it will remain None
1429 # and we'll run the query directly.
1430 composition: Optional[
1431 Dict[
1432 DatasetType, # parent dataset type
1433 List[Optional[str]] # component name, or None for parent
1434 ]
1435 ] = None
1436 if not isinstance(datasetType, DatasetType):
1437 # We were given a dataset type expression (which may be as simple
1438 # as a str). Loop over all matching datasets, delegating handling
1439 # of the `components` argument to queryDatasetTypes, as we populate
1440 # the composition dict.
1441 composition = defaultdict(list)
1442 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
1443 parentName, componentName = trueDatasetType.nameAndComponent()
1444 if componentName is not None:
1445 parentDatasetType = self.getDatasetType(parentName)
1446 composition.setdefault(parentDatasetType, []).append(componentName)
1447 else:
1448 composition.setdefault(trueDatasetType, []).append(None)
1449 elif datasetType.isComponent():
1450 # We were given a true DatasetType instance, but it's a component.
1451 # the composition dict will have exactly one item.
1452 parentName, componentName = datasetType.nameAndComponent()
1453 parentDatasetType = self.getDatasetType(parentName)
1454 composition = {parentDatasetType: [componentName]}
1455 if composition is not None:
1456 # We need to recurse. Do that once for each parent dataset type.
1457 chain = []
1458 for parentDatasetType, componentNames in composition.items():
1459 parentResults = self.queryDatasets(
1460 parentDatasetType,
1461 collections=collections,
1462 dimensions=dimensions,
1463 dataId=standardizedDataId,
1464 where=where,
1465 findFirst=findFirst
1466 )
1467 if isinstance(parentResults, queries.ParentDatasetQueryResults):
1468 chain.append(
1469 parentResults.withComponents(componentNames)
1470 )
1471 else:
1472 # Should only happen if we know there would be no results.
1473 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \
1474 and not parentResults._chain
1475 return queries.ChainedDatasetQueryResults(chain)
1476 # If we get here, there's no need to recurse (or we are already
1477 # recursing; there can only ever be one level of recursion).
1479 # The full set of dimensions in the query is the combination of those
1480 # needed for the DatasetType and those explicitly requested, if any.
1481 requestedDimensionNames = set(datasetType.dimensions.names)
1482 if dimensions is not None:
1483 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1484 # Construct the summary structure needed to construct a QueryBuilder.
1485 summary = queries.QuerySummary(
1486 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1487 dataId=standardizedDataId,
1488 expression=where,
1489 )
1490 builder = self.makeQueryBuilder(summary)
1491 # Add the dataset subquery to the query, telling the QueryBuilder to
1492 # include the rank of the selected collection in the results only if we
1493 # need to findFirst. Note that if any of the collections are
1494 # actually wildcard expressions, and we've asked for a find-first search,
1495 # this will raise TypeError for us.
1496 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst):
1497 return queries.ChainedDatasetQueryResults(())
1498 query = builder.finish()
1499 return queries.ParentDatasetQueryResults(self._db, query, components=[None])
1501 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1502 dataId: Optional[DataId] = None,
1503 datasets: Any = None,
1504 collections: Any = None,
1505 where: Optional[str] = None,
1506 components: Optional[bool] = None,
1507 **kwargs: Any) -> queries.DataCoordinateQueryResults:
1508 """Query for data IDs matching user-provided criteria.
1510 Parameters
1511 ----------
1512 dimensions : `Dimension` or `str`, or iterable thereof
1513 The dimensions of the data IDs to yield, as either `Dimension`
1514 instances or `str`. Will be automatically expanded to a complete
1515 `DimensionGraph`.
1516 dataId : `dict` or `DataCoordinate`, optional
1517 A data ID whose key-value pairs are used as equality constraints
1518 in the query.
1519 datasets : `Any`, optional
1520 An expression that fully or partially identifies dataset types
1521 that should constrain the yielded data IDs. For example, including
1522 "raw" here would constrain the yielded ``instrument``,
1523 ``exposure``, ``detector``, and ``physical_filter`` values to only
1524 those for which at least one "raw" dataset exists in
1525 ``collections``. Allowed types include `DatasetType`, `str`,
1526 `re.Pattern`, and iterables thereof. Unlike other dataset type
1527 expressions, ``...`` is not permitted - it doesn't make sense to
1528 constrain data IDs on the existence of *all* datasets.
1529 See :ref:`daf_butler_dataset_type_expressions` for more
1530 information.
1531 collections : `Any`, optional
1532 An expression that fully or partially identifies the collections
1533 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1534 thereof. `...` can be used to return all collections. Must be
1535 provided if ``datasets`` is, and is ignored if it is not. See
1536 :ref:`daf_butler_collection_expressions` for more information.
1537 where : `str`, optional
1538 A string expression similar to a SQL WHERE clause. May involve
1539 any column of a dimension table or (as a shortcut for the primary
1540 key column of a dimension table) dimension name. See
1541 :ref:`daf_butler_dimension_expressions` for more information.
1542 components : `bool`, optional
1543 If `True`, apply all dataset expression patterns to component
1544 dataset type names as well. If `False`, never apply patterns to
1545 components. If `None` (default), apply patterns to components only
1546 if their parent datasets were not matched by the expression.
1547 Fully-specified component datasets (`str` or `DatasetType`
1548 instances) are always included.
1549 **kwargs
1550 Additional keyword arguments are forwarded to
1551 `DataCoordinate.standardize` when processing the ``dataId``
1552 argument (and may be used to provide a constraining data ID even
1553 when the ``dataId`` argument is `None`).
1555 Returns
1556 -------
1557 dataIds : `DataCoordinateQueryResults`
1558 Data IDs matching the given query parameters. These are guaranteed
1559 to identify all dimensions (`DataCoordinate.hasFull` returns
1560 `True`), but will not contain `DimensionRecord` objects
1561 (`DataCoordinate.hasRecords` returns `False`). Call
1562 `DataCoordinateQueryResults.expanded` on the returned object to
1563 fetch those (and consider using
1564 `DataCoordinateQueryResults.materialize` on the returned object
1565 first if the expected number of rows is very large). See
1566 documentation for those methods for additional information.
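Examples
--------
A minimal sketch of a typical call; the dataset type, collection, and
instrument names below are purely illustrative::

    dataIds = registry.queryDataIds(
        ["exposure", "detector"],
        datasets="raw",
        collections="HSC/raw/all",
        where="instrument = 'HSC' AND detector = 10",
    )
    for dataId in dataIds.expanded():
        print(dataId["exposure"], dataId["detector"])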
1567 """
1568 dimensions = iterable(dimensions)
1569 standardizedDataId = self.expandDataId(dataId, **kwargs)
1570 standardizedDatasetTypes = set()
1571 requestedDimensions = self.dimensions.extract(dimensions)
1572 queryDimensionNames = set(requestedDimensions.names)
1573 if datasets is not None:
1574 if collections is None:
1575 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1576 for datasetType in self.queryDatasetTypes(datasets, components=components):
1577 queryDimensionNames.update(datasetType.dimensions.names)
1578 # If any matched dataset type is a component, just operate on
1579 # its parent instead, because Registry doesn't know anything
1580 # about what components exist, and here (unlike queryDatasets)
1581 # we don't care about returning them.
1582 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1583 if componentName is not None:
1584 datasetType = self.getDatasetType(parentDatasetTypeName)
1585 standardizedDatasetTypes.add(datasetType)
1586 # Preprocess collections expression in case the original included
1587 # single-pass iterators (we'll want to use it multiple times
1588 # below).
1589 collections = CollectionQuery.fromExpression(collections)
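# (For example, a generator expression passed as the collections
# argument would otherwise be exhausted after its first use.)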
1591 summary = queries.QuerySummary(
1592 requested=DimensionGraph(self.dimensions, names=queryDimensionNames),
1593 dataId=standardizedDataId,
1594 expression=where,
1595 )
1596 builder = self.makeQueryBuilder(summary)
1597 for datasetType in standardizedDatasetTypes:
1598 builder.joinDataset(datasetType, collections, isResult=False)
1599 query = builder.finish()
1600 return queries.DataCoordinateQueryResults(self._db, query)
1602 def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
1603 dataId: Optional[DataId] = None,
1604 datasets: Any = None,
1605 collections: Any = None,
1606 where: Optional[str] = None,
1607 components: Optional[bool] = None,
1608 **kwargs: Any) -> Iterator[DimensionRecord]:
1609 """Query for dimension information matching user-provided criteria.
1611 Parameters
1612 ----------
1613 element : `DimensionElement` or `str`
1614 The dimension element to obtain records for.
1615 dataId : `dict` or `DataCoordinate`, optional
1616 A data ID whose key-value pairs are used as equality constraints
1617 in the query.
1618 datasets : `Any`, optional
1619 An expression that fully or partially identifies dataset types
1620 that should constrain the yielded records. See `queryDataIds` and
1621 :ref:`daf_butler_dataset_type_expressions` for more information.
1622 collections : `Any`, optional
1623 An expression that fully or partially identifies the collections
1624 to search for datasets. See `queryDataIds` and
1625 :ref:`daf_butler_collection_expressions` for more information.
1626 where : `str`, optional
1627 A string expression similar to a SQL WHERE clause. See
1628 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more
1629 information.
1630 components : `bool`, optional
1631 Whether to apply dataset expressions to components as well.
1632 See `queryDataIds` for more information.
1633 **kwargs
1634 Additional keyword arguments are forwarded to
1635 `DataCoordinate.standardize` when processing the ``dataId``
1636 argument (and may be used to provide a constraining data ID even
1637 when the ``dataId`` argument is `None`).
1639 Returns
1640 -------
1641 records : `Iterator` [ `DimensionRecord` ]
1642 Dimension records matching the given query parameters.
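Examples
--------
A minimal sketch of a typical call; the instrument name is purely
illustrative, and the ``full_name`` field assumes the default dimension
universe::

    for record in registry.queryDimensionRecords(
            "detector", where="instrument = 'HSC'"):
        print(record.id, record.full_name)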
1643 """
1644 if not isinstance(element, DimensionElement):
1645 element = self.dimensions[element]
1646 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections,
1647 where=where, components=components, **kwargs)
1648 return iter(self._dimensions[element].fetch(dataIds))
1650 def queryDatasetAssociations(
1651 self,
1652 datasetType: Union[str, DatasetType],
1653 collections: Any = ...,
1654 *,
1655 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1656 flattenChains: bool = False,
1657 ) -> Iterator[DatasetAssociation]:
1658 """Iterate over dataset-collection combinations where the dataset is in
1659 the collection.
1661 This method is a temporary placeholder for better support for
1662 association results in `queryDatasets`. It will probably be
1663 removed in the future, and should be avoided in production code
1664 whenever possible.
1666 Parameters
1667 ----------
1668 datasetType : `DatasetType` or `str`
1669 A dataset type object or the name of one.
1670 collections : `Any`, optional
1671 An expression that fully or partially identifies the collections
1672 to search for datasets. See `queryCollections` and
1673 :ref:`daf_butler_collection_expressions` for more information.
1674 collectionTypes : `AbstractSet` [ `CollectionType` ], optional
1675 If provided, only yield associations from collections of these
1676 types.
1677 flattenChains : `bool`, optional
1678 If `True`, search in the children of
1679 `~CollectionType.CHAINED` collections. If `False` (default),
1680 ``CHAINED`` collections are ignored.
1682 Yields
1683 ------
1684 association : `DatasetAssociation`
1685 Object representing the relationship between a single dataset and
1686 a single collection.
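Examples
--------
A minimal sketch that lists every collection containing a "calexp"
dataset (an illustrative dataset type name), including the members of
chained collections::

    for assoc in registry.queryDatasetAssociations(
            "calexp", flattenChains=True):
        print(assoc.collection, assoc.ref.dataId, assoc.timespan)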
1687 """
1688 collections = CollectionQuery.fromExpression(collections)
1689 tsRepr = self._db.getTimespanRepresentation()
1690 if isinstance(datasetType, str):
1691 storage = self._datasets[datasetType]
1692 else:
1693 storage = self._datasets[datasetType.name]
1694 for collectionRecord in collections.iter(self._collections,
1695 collectionTypes=frozenset(collectionTypes),
1696 flattenChains=flattenChains):
1697 query = storage.select(collectionRecord)
1698 if query is None:
1699 continue
1700 for row in self._db.query(query.combine()):
1701 dataId = DataCoordinate.fromRequiredValues(
1702 storage.datasetType.dimensions,
1703 tuple(row[name] for name in storage.datasetType.dimensions.required.names)
1704 )
1705 runRecord = self._collections[row[self._collections.getRunForeignKeyName()]]
1706 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name,
1707 conform=False)
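# Only CALIBRATION collections record a validity-range timespan for
# their dataset associations; for other collection types the timespan
# is left as None.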
1708 if collectionRecord.type is CollectionType.CALIBRATION:
1709 timespan = tsRepr.extract(row)
1710 else:
1711 timespan = None
1712 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
1714 storageClasses: StorageClassFactory
1715 """All storage classes known to the registry (`StorageClassFactory`).
1716 """