Coverage for python/lsst/daf/butler/registry/_registry.py : 11%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "Registry",
26)
28from collections import defaultdict
29import contextlib
30import logging
31from typing import (
32 Any,
33 Dict,
34 Iterable,
35 Iterator,
36 List,
37 Mapping,
38 Optional,
39 Set,
40 Type,
41 TYPE_CHECKING,
42 Union,
43)
45import sqlalchemy
47from ..core import (
48 Config,
49 DataCoordinate,
50 DataCoordinateIterable,
51 DataId,
52 DatasetAssociation,
53 DatasetRef,
54 DatasetType,
55 ddl,
56 Dimension,
57 DimensionConfig,
58 DimensionElement,
59 DimensionGraph,
60 DimensionRecord,
61 DimensionUniverse,
62 NamedKeyMapping,
63 NameLookupMapping,
64 StorageClassFactory,
65 Timespan,
66)
67from . import queries
68from ..core.utils import doImport, iterable, transactional
69from ._config import RegistryConfig
70from ._collectionType import CollectionType
71from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
72from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis
73from .interfaces import ChainedCollectionRecord, RunRecord
74from .versions import ButlerVersionsManager, DigestMismatchError
76if TYPE_CHECKING:
77 from ..butlerConfig import ButlerConfig
78 from .interfaces import (
79 ButlerAttributeManager,
80 CollectionManager,
81 Database,
82 OpaqueTableStorageManager,
83 DimensionRecordStorageManager,
84 DatasetRecordStorageManager,
85 DatastoreRegistryBridgeManager,
86 )
89_LOG = logging.getLogger(__name__)
91# key for dimensions configuration in attributes table
92_DIMENSIONS_ATTR = "config:dimensions.json"
95class Registry:
96 """Registry interface.
98 Parameters
99 ----------
100 database : `Database`
101 Database instance to store Registry.
102 attributes : `type`
103 Manager class implementing `ButlerAttributeManager`.
104 opaque : `type`
105 Manager class implementing `OpaqueTableStorageManager`.
106 dimensions : `type`
107 Manager class implementing `DimensionRecordStorageManager`.
108 collections : `type`
109 Manager class implementing `CollectionManager`.
110 datasets : `type`
111 Manager class implementing `DatasetRecordStorageManager`.
112 datastoreBridges : `type`
113 Manager class implementing `DatastoreRegistryBridgeManager`.
114 dimensionConfig : `DimensionConfig`, optional
115 Dimension universe configuration, only used when ``create`` is True.
116 writeable : `bool`, optional
117 If True then Registry will support write operations.
118 create : `bool`, optional
119 If True then the database schema will be initialized; the database
120 must be empty before instantiating Registry.
121 """
123 defaultConfigFile: Optional[str] = None
124 """Path to configuration defaults. Accessed within the ``configs`` resource
125 or relative to a search path. Can be None if no defaults specified.
126 """
128 @classmethod
129 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None,
130 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
131 butlerRoot: Optional[str] = None) -> Registry:
132 """Create registry database and return `Registry` instance.
134 This method initializes database contents; the database must be empty
135 prior to calling this method.
137 Parameters
138 ----------
139 config : `RegistryConfig` or `str`, optional
140 Registry configuration, if missing then default configuration will
141 be loaded from registry.yaml.
142 dimensionConfig : `DimensionConfig` or `str`, optional
143 Dimensions configuration, if missing then default configuration
144 will be loaded from dimensions.yaml.
145 butlerRoot : `str`, optional
146 Path to the repository root this `Registry` will manage.
148 Returns
149 -------
150 registry : `Registry`
151 A new `Registry` instance.
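
        Examples
        --------
        A minimal sketch of creating a new SQLite-backed registry; the
        connection string and repository root below are hypothetical::

            from lsst.daf.butler.registry import Registry, RegistryConfig

            config = RegistryConfig()
            config["db"] = "sqlite:///some/repo/gen3.sqlite3"
            registry = Registry.createFromConfig(config, butlerRoot="some/repo")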
152 """
153 if isinstance(config, str):
154 config = RegistryConfig(config)
155 elif config is None:
156 config = RegistryConfig()
157 elif not isinstance(config, RegistryConfig):
158 raise TypeError(f"Incompatible Registry configuration type: {type(config)}")
159 config.replaceRoot(butlerRoot)
161 if isinstance(dimensionConfig, str):
162 dimensionConfig = DimensionConfig(dimensionConfig)
163 elif dimensionConfig is None:
164 dimensionConfig = DimensionConfig()
165 elif not isinstance(dimensionConfig, DimensionConfig):
166 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
168 DatabaseClass = config.getDatabaseClass()
169 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
170 namespace=config.get("namespace"))
171 attributes = doImport(config["managers", "attributes"])
172 opaque = doImport(config["managers", "opaque"])
173 dimensions = doImport(config["managers", "dimensions"])
174 collections = doImport(config["managers", "collections"])
175 datasets = doImport(config["managers", "datasets"])
176 datastoreBridges = doImport(config["managers", "datastores"])
178 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque,
179 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges,
180 dimensionConfig=dimensionConfig, create=True)
182 @classmethod
183 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str],
184 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
185 """Create `Registry` subclass instance from `config`.
187 The Registry database must be initialized prior to calling this method.
189 Parameters
190 ----------
191 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
192 Registry configuration
193 butlerRoot : `str`, optional
194 Path to the repository root this `Registry` will manage.
195 writeable : `bool`, optional
196 If `True` (default) create a read-write connection to the database.
198 Returns
199 -------
200 registry : `Registry` (subclass)
201 A new `Registry` subclass instance.
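
        Examples
        --------
        A sketch of opening an existing repository read-only; the
        configuration path is hypothetical::

            from lsst.daf.butler.registry import Registry

            registry = Registry.fromConfig("/path/to/butler.yaml", writeable=False)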
202 """
203 if not isinstance(config, RegistryConfig):
204 if isinstance(config, str) or isinstance(config, Config):
205 config = RegistryConfig(config)
206 else:
207 raise ValueError("Incompatible Registry configuration: {}".format(config))
208 config.replaceRoot(butlerRoot)
209 DatabaseClass = config.getDatabaseClass()
210 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
211 namespace=config.get("namespace"), writeable=writeable)
212 attributes = doImport(config["managers", "attributes"])
213 opaque = doImport(config["managers", "opaque"])
214 dimensions = doImport(config["managers", "dimensions"])
215 collections = doImport(config["managers", "collections"])
216 datasets = doImport(config["managers", "datasets"])
217 datastoreBridges = doImport(config["managers", "datastores"])
219 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque,
220 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges,
221 dimensionConfig=None, writeable=writeable)
223 def __init__(self, database: Database, *,
224 attributes: Type[ButlerAttributeManager],
225 opaque: Type[OpaqueTableStorageManager],
226 dimensions: Type[DimensionRecordStorageManager],
227 collections: Type[CollectionManager],
228 datasets: Type[DatasetRecordStorageManager],
229 datastoreBridges: Type[DatastoreRegistryBridgeManager],
230 dimensionConfig: Optional[DimensionConfig] = None,
231 writeable: bool = True,
232 create: bool = False):
233 self._db = database
234 self.storageClasses = StorageClassFactory()
236 # With existing registry we have to read dimensions config from
237 # database before we initialize all other managers.
238 if dimensionConfig is None:
239 assert not create, "missing DimensionConfig when create=True"
240 with self._db.declareStaticTables(create=False) as context:
241 self._attributes = attributes.initialize(self._db, context)
243 versions = ButlerVersionsManager(
244 self._attributes,
245 dict(attributes=self._attributes)
246 )
247 # verify that configured versions are compatible with schema
248 versions.checkManagersConfig()
249 versions.checkManagersVersions(writeable)
251 # The dimensions config is stored in the database as a serialized string.
252 dimensionsString = self._attributes.get(_DIMENSIONS_ATTR)
253 if dimensionsString is not None:
254 dimensionConfig = DimensionConfig(Config.fromString(dimensionsString, format="json"))
255 else:
256 raise LookupError(f"Registry attribute {_DIMENSIONS_ATTR} is missing from database")
258 # make universe
259 universe = DimensionUniverse(dimensionConfig)
261 with self._db.declareStaticTables(create=create) as context:
262 self._attributes = attributes.initialize(self._db, context)
263 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
264 self._collections = collections.initialize(self._db, context, dimensions=self._dimensions)
265 self._datasets = datasets.initialize(self._db, context,
266 collections=self._collections,
267 dimensions=self._dimensions)
268 self._opaque = opaque.initialize(self._db, context)
269 self._datastoreBridges = datastoreBridges.initialize(self._db, context,
270 opaque=self._opaque,
271 datasets=datasets,
272 universe=self._dimensions.universe)
273 versions = ButlerVersionsManager(
274 self._attributes,
275 dict(
276 attributes=self._attributes,
277 opaque=self._opaque,
278 dimensions=self._dimensions,
279 collections=self._collections,
280 datasets=self._datasets,
281 datastores=self._datastoreBridges,
282 )
283 )
284 # store managers and their versions in attributes table
285 context.addInitializer(lambda db: versions.storeManagersConfig())
286 context.addInitializer(lambda db: versions.storeManagersVersions())
287 # dump universe config as json into attributes (faster than YAML)
288 json = dimensionConfig.dump(format="json")
289 if json is not None:
290 # Convert Optional[str] to str for mypy
291 json_str = json
292 context.addInitializer(
293 lambda db: self._attributes.set(_DIMENSIONS_ATTR, json_str)
294 )
295 else:
296 raise RuntimeError("Unexpectedly failed to serialize DimensionConfig to JSON")
298 if not create:
299 # verify that configured versions are compatible with schema
300 versions.checkManagersConfig()
301 versions.checkManagersVersions(writeable)
302 try:
303 versions.checkManagersDigests()
304 except DigestMismatchError as exc:
305 # potentially digest mismatch is a serious error but during
306 # development it could be benign, treat this as warning for
307 # now.
308 _LOG.warning(f"Registry schema digest mismatch: {exc}")
310 self._dimensions.refresh()
311 self._collections.refresh()
312 self._datasets.refresh()
314 def __str__(self) -> str:
315 return str(self._db)
317 def __repr__(self) -> str:
318 return f"Registry({self._db!r}, {self.dimensions!r})"
320 def isWriteable(self) -> bool:
321 """Return `True` if this registry allows write operations, and `False`
322 otherwise.
323 """
324 return self._db.isWriteable()
326 @property
327 def dimensions(self) -> DimensionUniverse:
328 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
329 """
330 return self._dimensions.universe
332 @contextlib.contextmanager
333 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
334 """Return a context manager that represents a transaction.
335 """
336 try:
337 with self._db.transaction(savepoint=savepoint):
338 yield
339 except BaseException:
340 # TODO: this clears the caches sometimes when we wouldn't actually
341 # need to. Can we avoid that?
342 self._dimensions.clearCaches()
343 raise
345 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
346 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
347 other data repository client.
349 Opaque table records can be added via `insertOpaqueData`, retrieved via
350 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
352 Parameters
353 ----------
354 tableName : `str`
355 Logical name of the opaque table. This may differ from the
356 actual name used in the database by a prefix and/or suffix.
357 spec : `ddl.TableSpec`
358 Specification for the table to be added.
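
        Examples
        --------
        A sketch of registering a small opaque table and round-tripping a
        record through it; the table and field definitions are illustrative
        only::

            import sqlalchemy
            from lsst.daf.butler.core import ddl

            spec = ddl.TableSpec(fields=[
                ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
                ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
            ])
            registry.registerOpaqueTable("my_datastore_records", spec)
            registry.insertOpaqueData("my_datastore_records",
                                      {"dataset_id": 1, "path": "a/b.fits"})
            rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=1))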
359 """
360 self._opaque.register(tableName, spec)
362 @transactional
363 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
364 """Insert records into an opaque table.
366 Parameters
367 ----------
368 tableName : `str`
369 Logical name of the opaque table. Must match the name used in a
370 previous call to `registerOpaqueTable`.
371 data
372 Each additional positional argument is a dictionary that represents
373 a single row to be added.
374 """
375 self._opaque[tableName].insert(*data)
377 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
378 """Retrieve records from an opaque table.
380 Parameters
381 ----------
382 tableName : `str`
383 Logical name of the opaque table. Must match the name used in a
384 previous call to `registerOpaqueTable`.
385 where
386 Additional keyword arguments are interpreted as equality
387 constraints that restrict the returned rows (combined with AND);
388 keyword arguments are column names and values are the values they
389 must have.
391 Yields
392 ------
393 row : `dict`
394 A dictionary representing a single result row.
395 """
396 yield from self._opaque[tableName].fetch(**where)
398 @transactional
399 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
400 """Remove records from an opaque table.
402 Parameters
403 ----------
404 tableName : `str`
405 Logical name of the opaque table. Must match the name used in a
406 previous call to `registerOpaqueTable`.
407 where
408 Additional keyword arguments are interpreted as equality
409 constraints that restrict the deleted rows (combined with AND);
410 keyword arguments are column names and values are the values they
411 must have.
412 """
413 self._opaque[tableName].delete(**where)
415 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED) -> None:
416 """Add a new collection if one with the given name does not exist.
418 Parameters
419 ----------
420 name : `str`
421 The name of the collection to create.
422 type : `CollectionType`
423 Enum value indicating the type of collection to create.
425 Notes
426 -----
427 This method cannot be called within transactions, as it needs to be
428 able to perform its own transaction to be concurrent.
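
        Examples
        --------
        A short sketch; the collection names are hypothetical::

            from lsst.daf.butler.registry import CollectionType

            registry.registerCollection("MyCam/tagged", CollectionType.TAGGED)
            registry.registerCollection("MyCam/defaults", CollectionType.CHAINED)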
429 """
430 self._collections.register(name, type)
432 def getCollectionType(self, name: str) -> CollectionType:
433 """Return an enumeration value indicating the type of the given
434 collection.
436 Parameters
437 ----------
438 name : `str`
439 The name of the collection.
441 Returns
442 -------
443 type : `CollectionType`
444 Enum value indicating the type of this collection.
446 Raises
447 ------
448 MissingCollectionError
449 Raised if no collection with the given name exists.
450 """
451 return self._collections.find(name).type
453 def registerRun(self, name: str) -> None:
454 """Add a new run if one with the given name does not exist.
456 Parameters
457 ----------
458 name : `str`
459 The name of the run to create.
461 Notes
462 -----
463 This method cannot be called within transactions, as it needs to be
464 able to perform its own transaction to be concurrent.
465 """
466 self._collections.register(name, CollectionType.RUN)
468 @transactional
469 def removeCollection(self, name: str) -> None:
470 """Completely remove the given collection.
472 Parameters
473 ----------
474 name : `str`
475 The name of the collection to remove.
477 Raises
478 ------
479 MissingCollectionError
480 Raised if no collection with the given name exists.
482 Notes
483 -----
484 If this is a `~CollectionType.RUN` collection, all datasets and quanta
485 in it are also fully removed. This requires that those datasets be
486 removed (or at least trashed) from any datastores that hold them first.
488 A collection may not be deleted as long as it is referenced by a
489 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
490 be deleted or redefined first.
491 """
492 self._collections.remove(name)
494 def getCollectionChain(self, parent: str) -> CollectionSearch:
495 """Return the child collections in a `~CollectionType.CHAINED`
496 collection.
498 Parameters
499 ----------
500 parent : `str`
501 Name of the chained collection. Must have already been added via
502 a call to `Registry.registerCollection`.
504 Returns
505 -------
506 children : `CollectionSearch`
507 An object that defines the search path of the collection.
508 See :ref:`daf_butler_collection_expressions` for more information.
510 Raises
511 ------
512 MissingCollectionError
513 Raised if ``parent`` does not exist in the `Registry`.
514 TypeError
515 Raised if ``parent`` does not correspond to a
516 `~CollectionType.CHAINED` collection.
517 """
518 record = self._collections.find(parent)
519 if record.type is not CollectionType.CHAINED:
520 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
521 assert isinstance(record, ChainedCollectionRecord)
522 return record.children
524 @transactional
525 def setCollectionChain(self, parent: str, children: Any) -> None:
526 """Define or redefine a `~CollectionType.CHAINED` collection.
528 Parameters
529 ----------
530 parent : `str`
531 Name of the chained collection. Must have already been added via
532 a call to `Registry.registerCollection`.
533 children : `Any`
534 An expression defining an ordered search of child collections,
535 generally an iterable of `str`; see
536 :ref:`daf_butler_collection_expressions` for more information.
538 Raises
539 ------
540 MissingCollectionError
541 Raised when any of the given collections do not exist in the
542 `Registry`.
543 TypeError
544 Raised if ``parent`` does not correspond to a
545 `~CollectionType.CHAINED` collection.
546 ValueError
547 Raised if the given collections contain a cycle.
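
        Examples
        --------
        A sketch of defining a chain that searches a run first and a
        calibration collection second; the collection names are
        hypothetical and assumed to exist::

            from lsst.daf.butler.registry import CollectionType

            registry.registerCollection("MyCam/defaults", CollectionType.CHAINED)
            registry.setCollectionChain("MyCam/defaults", ["MyCam/runs/1", "MyCam/calib"])
            print(registry.getCollectionChain("MyCam/defaults"))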
548 """
549 record = self._collections.find(parent)
550 if record.type is not CollectionType.CHAINED:
551 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
552 assert isinstance(record, ChainedCollectionRecord)
553 children = CollectionSearch.fromExpression(children)
554 if children != record.children:
555 record.update(self._collections, children)
557 def registerDatasetType(self, datasetType: DatasetType) -> bool:
558 """
559 Add a new `DatasetType` to the Registry.
561 It is not an error to register the same `DatasetType` twice.
563 Parameters
564 ----------
565 datasetType : `DatasetType`
566 The `DatasetType` to be added.
568 Returns
569 -------
570 inserted : `bool`
571 `True` if ``datasetType`` was inserted, `False` if an identical
571 existing `DatasetType` was found. Note that in either case the
573 DatasetType is guaranteed to be defined in the Registry
574 consistently with the given definition.
576 Raises
577 ------
578 ValueError
579 Raised if the dimensions or storage class are invalid.
580 ConflictingDefinitionError
581 Raised if this DatasetType is already registered with a different
582 definition.
584 Notes
585 -----
586 This method cannot be called within transactions, as it needs to be
587 able to perform its own transaction to be concurrent.
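
        Examples
        --------
        A sketch of defining and registering a dataset type; the name,
        dimensions, and storage class are illustrative::

            from lsst.daf.butler import DatasetType

            datasetType = DatasetType("my_catalog",
                                      dimensions=["instrument", "visit"],
                                      storageClass="SourceCatalog",
                                      universe=registry.dimensions)
            registry.registerDatasetType(datasetType)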
588 """
589 _, inserted = self._datasets.register(datasetType)
590 return inserted
592 def removeDatasetType(self, name: str) -> None:
593 """Remove the named `DatasetType` from the registry.
595 .. warning::
597 Registry caches the dataset type definitions. This means that
598 deleting the dataset type definition may result in unexpected
599 behavior from other active butler processes that have not seen
600 the deletion.
602 Parameters
603 ----------
604 name : `str`
605 Name of the type to be removed.
607 Raises
608 ------
609 lsst.daf.butler.registry.OrphanedRecordError
610 Raised if an attempt is made to remove the dataset type definition
611 when there are already datasets associated with it.
613 Notes
614 -----
615 If the dataset type is not registered the method will return without
616 action.
617 """
618 self._datasets.remove(name)
620 def getDatasetType(self, name: str) -> DatasetType:
621 """Get the `DatasetType`.
623 Parameters
624 ----------
625 name : `str`
626 Name of the type.
628 Returns
629 -------
630 type : `DatasetType`
631 The `DatasetType` associated with the given name.
633 Raises
634 ------
635 KeyError
636 Raised if the requested dataset type could not be found in the registry.
637 """
638 return self._datasets[name].datasetType
640 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
641 collections: Any, timespan: Optional[Timespan] = None,
642 **kwargs: Any) -> Optional[DatasetRef]:
643 """Find a dataset given its `DatasetType` and data ID.
645 This can be used to obtain a `DatasetRef` that permits the dataset to
646 be read from a `Datastore`. If the dataset is a component and can not
647 be found using the provided dataset type, a dataset ref for the parent
648 will be returned instead but with the correct dataset type.
650 Parameters
651 ----------
652 datasetType : `DatasetType` or `str`
653 A `DatasetType` or the name of one.
654 dataId : `dict` or `DataCoordinate`, optional
655 A `dict`-like object containing the `Dimension` links that identify
656 the dataset within a collection.
657 collections
658 An expression that fully or partially identifies the collections to
659 search for the dataset; see
660 :ref:`daf_butler_collection_expressions` for more information.
661 timespan : `Timespan`, optional
662 A timespan that the validity range of the dataset must overlap.
663 If not provided, any `~CollectionType.CALIBRATION` collections
664 matched by the ``collections`` argument will not be searched.
665 **kwargs
666 Additional keyword arguments passed to
667 `DataCoordinate.standardize` to convert ``dataId`` to a true
668 `DataCoordinate` or augment an existing one.
670 Returns
671 -------
672 ref : `DatasetRef`
673 A reference to the dataset, or `None` if no matching Dataset
674 was found.
676 Raises
677 ------
678 LookupError
679 Raised if one or more data ID keys are missing.
680 KeyError
681 Raised if the dataset type does not exist.
682 MissingCollectionError
683 Raised if any of ``collections`` does not exist in the registry.
685 Notes
686 -----
687 This method simply returns `None` and does not raise an exception even
688 when the set of collections searched is intrinsically incompatible with
689 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
690 only `~CollectionType.CALIBRATION` collections are being searched.
691 This may make it harder to debug some lookup failures, but the behavior
692 is intentional; we consider it more important that failed searches are
693 reported consistently, regardless of the reason, and that adding
694 additional collections that do not contain a match to the search path
695 never changes the behavior.
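
        Examples
        --------
        A sketch of looking up a single raw exposure; the instrument, data
        ID values, and collection name are hypothetical::

            ref = registry.findDataset("raw",
                                       instrument="MyCam", exposure=1234, detector=5,
                                       collections="MyCam/raw/all")
            if ref is not None:
                print(ref.dataId, ref.run)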
696 """
697 if isinstance(datasetType, DatasetType):
698 storage = self._datasets[datasetType.name]
699 else:
700 storage = self._datasets[datasetType]
701 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
702 universe=self.dimensions, **kwargs)
703 collections = CollectionSearch.fromExpression(collections)
704 for collectionRecord in collections.iter(self._collections):
705 if (collectionRecord.type is CollectionType.CALIBRATION
706 and (not storage.datasetType.isCalibration() or timespan is None)):
707 continue
708 result = storage.find(collectionRecord, dataId, timespan=timespan)
709 if result is not None:
710 return result
712 return None
714 @transactional
715 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
716 run: str) -> List[DatasetRef]:
717 """Insert one or more datasets into the `Registry`
719 This always adds new datasets; to associate existing datasets with
720 a new collection, use ``associate``.
722 Parameters
723 ----------
724 datasetType : `DatasetType` or `str`
725 A `DatasetType` or the name of one.
726 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
727 Dimension-based identifiers for the new datasets.
728 run : `str`
729 The name of the run that produced the datasets.
731 Returns
732 -------
733 refs : `list` of `DatasetRef`
734 Resolved `DatasetRef` instances for all given data IDs (in the same
735 order).
737 Raises
738 ------
739 ConflictingDefinitionError
740 If a dataset with the same dataset type and data ID as one of those
741 given already exists in ``run``.
742 MissingCollectionError
743 Raised if ``run`` does not exist in the registry.
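
        Examples
        --------
        A sketch of registering a run and inserting two datasets into it;
        the dataset type and data ID values are hypothetical and must be
        consistent with previously inserted dimension data::

            registry.registerRun("MyCam/runs/1")
            refs = registry.insertDatasets(
                "my_catalog",
                dataIds=[{"instrument": "MyCam", "visit": 1},
                         {"instrument": "MyCam", "visit": 2}],
                run="MyCam/runs/1",
            )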
744 """
745 if isinstance(datasetType, DatasetType):
746 storage = self._datasets.find(datasetType.name)
747 if storage is None:
748 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
749 else:
750 storage = self._datasets.find(datasetType)
751 if storage is None:
752 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
753 runRecord = self._collections.find(run)
754 if runRecord.type is not CollectionType.RUN:
755 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
756 assert isinstance(runRecord, RunRecord)
757 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
758 for dataId in dataIds]
759 try:
760 refs = list(storage.insert(runRecord, expandedDataIds))
761 except sqlalchemy.exc.IntegrityError as err:
762 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
763 f"one or more datasets of type {storage.datasetType} into "
764 f"collection '{run}'. "
765 f"This probably means a dataset with the same data ID "
766 f"and dataset type already exists, but it may also mean a "
767 f"dimension row is missing.") from err
768 return refs
770 def getDataset(self, id: int) -> Optional[DatasetRef]:
771 """Retrieve a Dataset entry.
773 Parameters
774 ----------
775 id : `int`
776 The unique identifier for the dataset.
778 Returns
779 -------
780 ref : `DatasetRef` or `None`
781 A ref to the Dataset, or `None` if no matching Dataset
782 was found.
783 """
784 ref = self._datasets.getDatasetRef(id)
785 if ref is None:
786 return None
787 return ref
789 @transactional
790 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
791 """Remove datasets from the Registry.
793 The datasets will be removed unconditionally from all collections, and
794 any `Quantum` that consumed this dataset will instead be marked with
795 having a NULL input. `Datastore` records will *not* be deleted; the
796 caller is responsible for ensuring that the dataset has already been
797 removed from all Datastores.
799 Parameters
800 ----------
801 refs : `Iterable` of `DatasetRef`
802 References to the datasets to be removed. Must include a valid
803 ``id`` attribute, and should be considered invalidated upon return.
805 Raises
806 ------
807 AmbiguousDatasetError
808 Raised if any ``ref.id`` is `None`.
809 OrphanedRecordError
810 Raised if any dataset is still present in any `Datastore`.
811 """
812 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
813 storage = self._datasets.find(datasetType.name)
814 assert storage is not None
815 try:
816 storage.delete(refsForType)
817 except sqlalchemy.exc.IntegrityError as err:
818 raise OrphanedRecordError("One or more datasets is still "
819 "present in one or more Datastores.") from err
821 @transactional
822 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
823 """Add existing datasets to a `~CollectionType.TAGGED` collection.
825 If a DatasetRef with the same exact integer ID is already in a
826 collection nothing is changed. If a `DatasetRef` with the same
827 `DatasetType` and data ID but with different integer ID
828 exists in the collection, `ConflictingDefinitionError` is raised.
830 Parameters
831 ----------
832 collection : `str`
833 Indicates the collection the datasets should be associated with.
834 refs : `Iterable` [ `DatasetRef` ]
835 An iterable of resolved `DatasetRef` instances that already exist
836 in this `Registry`.
838 Raises
839 ------
840 ConflictingDefinitionError
841 If a Dataset with the given `DatasetRef` already exists in the
842 given collection.
843 AmbiguousDatasetError
844 Raised if ``any(ref.id is None for ref in refs)``.
845 MissingCollectionError
846 Raised if ``collection`` does not exist in the registry.
847 TypeError
848 Raised if adding new datasets to the given ``collection`` is not
849 allowed.
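
        Examples
        --------
        A sketch of tagging the results of a query into a curated
        collection; the names are hypothetical::

            from lsst.daf.butler.registry import CollectionType

            registry.registerCollection("MyCam/best", CollectionType.TAGGED)
            refs = registry.queryDatasets("my_catalog", collections="MyCam/runs/1")
            registry.associate("MyCam/best", refs)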
850 """
851 collectionRecord = self._collections.find(collection)
852 if collectionRecord.type is not CollectionType.TAGGED:
853 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
854 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
855 storage = self._datasets.find(datasetType.name)
856 assert storage is not None
857 try:
858 storage.associate(collectionRecord, refsForType)
859 except sqlalchemy.exc.IntegrityError as err:
860 raise ConflictingDefinitionError(
861 f"Constraint violation while associating dataset of type {datasetType.name} with "
862 f"collection {collection}. This probably means that one or more datasets with the same "
863 f"dataset type and data ID already exist in the collection, but it may also indicate "
864 f"that the datasets do not exist."
865 ) from err
867 @transactional
868 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
869 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
871 ``collection`` and ``ref`` combinations that are not currently
872 associated are silently ignored.
874 Parameters
875 ----------
876 collection : `str`
877 The collection the datasets should no longer be associated with.
878 refs : `Iterable` [ `DatasetRef` ]
879 An iterable of resolved `DatasetRef` instances that already exist
880 in this `Registry`.
882 Raises
883 ------
884 AmbiguousDatasetError
885 Raised if any of the given dataset references is unresolved.
886 MissingCollectionError
887 Raised if ``collection`` does not exist in the registry.
888 TypeError
889 Raised if removing datasets from the given ``collection`` is not
890 allowed.
891 """
892 collectionRecord = self._collections.find(collection)
893 if collectionRecord.type is not CollectionType.TAGGED:
894 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
895 "expected TAGGED.")
896 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
897 storage = self._datasets.find(datasetType.name)
898 assert storage is not None
899 storage.disassociate(collectionRecord, refsForType)
901 @transactional
902 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
903 """Associate one or more datasets with a calibration collection and a
904 validity range within it.
906 Parameters
907 ----------
908 collection : `str`
909 The name of an already-registered `~CollectionType.CALIBRATION`
910 collection.
911 refs : `Iterable` [ `DatasetRef` ]
912 Datasets to be associated.
913 timespan : `Timespan`
914 The validity range for these datasets within the collection.
916 Raises
917 ------
918 AmbiguousDatasetError
919 Raised if any of the given `DatasetRef` instances is unresolved.
920 ConflictingDefinitionError
921 Raised if the collection already contains a different dataset with
922 the same `DatasetType` and data ID and an overlapping validity
923 range.
924 TypeError
925 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
926 collection or if one or more datasets are of a dataset type for
927 which `DatasetType.isCalibration` returns `False`.
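
        Examples
        --------
        A sketch of declaring a set of bias frames valid for the first half
        of 2020; the collection name and ``biasRefs`` are hypothetical::

            import astropy.time
            from lsst.daf.butler import Timespan

            timespan = Timespan(astropy.time.Time("2020-01-01", scale="tai"),
                                astropy.time.Time("2020-07-01", scale="tai"))
            registry.certify("MyCam/calib", biasRefs, timespan)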
928 """
929 collectionRecord = self._collections.find(collection)
930 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
931 storage = self._datasets[datasetType.name]
932 storage.certify(collectionRecord, refsForType, timespan)
934 @transactional
935 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *,
936 dataIds: Optional[Iterable[DataId]] = None) -> None:
937 """Remove or adjust datasets to clear a validity range within a
938 calibration collection.
940 Parameters
941 ----------
942 collection : `str`
943 The name of an already-registered `~CollectionType.CALIBRATION`
944 collection.
945 datasetType : `str` or `DatasetType`
946 Name or `DatasetType` instance for the datasets to be decertified.
947 timespan : `Timespan`
948 The validity range to remove datasets from within the collection.
949 Datasets that overlap this range but are not contained by it will
950 have their validity ranges adjusted to not overlap it, which may
951 split a single dataset validity range into two.
952 dataIds : `Iterable` [ `DataId` ], optional
953 Data IDs that should be decertified within the given validity range.
954 If `None`, all data IDs for the given dataset type will be
955 decertified.
957 Raises
958 ------
959 TypeError
960 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
961 collection or if ``datasetType.isCalibration() is False``.
962 """
963 collectionRecord = self._collections.find(collection)
964 if isinstance(datasetType, str):
965 storage = self._datasets[datasetType]
966 else:
967 storage = self._datasets[datasetType.name]
968 standardizedDataIds = None
969 if dataIds is not None:
970 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions)
971 for d in dataIds]
972 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
974 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
975 """Return an object that allows a new `Datastore` instance to
976 communicate with this `Registry`.
978 Returns
979 -------
980 manager : `DatastoreRegistryBridgeManager`
981 Object that mediates communication between this `Registry` and its
982 associated datastores.
983 """
984 return self._datastoreBridges
986 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
987 """Retrieve datastore locations for a given dataset.
989 Parameters
990 ----------
991 ref : `DatasetRef`
992 A reference to the dataset for which to retrieve storage
993 information.
995 Returns
996 -------
997 datastores : `Iterable` [ `str` ]
998 All the matching datastores holding this dataset.
1000 Raises
1001 ------
1002 AmbiguousDatasetError
1003 Raised if ``ref.id`` is `None`.
1004 """
1005 return self._datastoreBridges.findDatastores(ref)
1007 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
1008 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
1009 **kwargs: Any) -> DataCoordinate:
1010 """Expand a dimension-based data ID to include additional information.
1012 Parameters
1013 ----------
1014 dataId : `DataCoordinate` or `dict`, optional
1015 Data ID to be expanded; augmented and overridden by ``kwargs``.
1016 graph : `DimensionGraph`, optional
1017 Set of dimensions for the expanded ID. If `None`, the dimensions
1018 will be inferred from the keys of ``dataId`` and ``kwargs``.
1019 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph``
1020 are silently ignored, providing a way to extract and expand a
1021 subset of a data ID.
1022 records : `Mapping` [`str`, `DimensionRecord`], optional
1023 Dimension record data to use before querying the database for that
1024 data, keyed by element name.
1025 **kwargs
1026 Additional keywords are treated like additional key-value pairs for
1027 ``dataId``, extending and overriding it.
1029 Returns
1030 -------
1031 expanded : `DataCoordinate`
1032 A data ID that includes full metadata for all of the dimensions it
1033 identifies, i.e. guarantees that ``expanded.hasRecords()`` and
1034 ``expanded.hasFull()`` both return `True`.
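
        Examples
        --------
        A sketch of expanding a minimal exposure data ID so that its
        dimension records become available; the instrument and exposure
        values are hypothetical and the record field shown assumes the
        default dimension configuration::

            dataId = registry.expandDataId(instrument="MyCam", exposure=1234)
            print(dataId.records["exposure"].timespan)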
1035 """
1036 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs)
1037 if standardized.hasRecords():
1038 return standardized
1039 if records is None:
1040 records = {}
1041 elif isinstance(records, NamedKeyMapping):
1042 records = records.byName()
1043 else:
1044 records = dict(records)
1045 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
1046 records.update(dataId.records.byName())
1047 keys = standardized.byName()
1048 for element in standardized.graph.primaryKeyTraversalOrder:
1049 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
1050 if record is ...:
1051 if isinstance(element, Dimension) and keys.get(element.name) is None:
1052 if element in standardized.graph.required:
1053 raise LookupError(
1054 f"No value or null value for required dimension {element.name}."
1055 )
1056 keys[element.name] = None
1057 record = None
1058 else:
1059 storage = self._dimensions[element]
1060 dataIdSet = DataCoordinateIterable.fromScalar(
1061 DataCoordinate.standardize(keys, graph=element.graph)
1062 )
1063 fetched = tuple(storage.fetch(dataIdSet))
1064 try:
1065 (record,) = fetched
1066 except ValueError:
1067 record = None
1068 records[element.name] = record
1069 if record is not None:
1070 for d in element.implied:
1071 value = getattr(record, d.name)
1072 if keys.setdefault(d.name, value) != value:
1073 raise InconsistentDataIdError(
1074 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
1075 f"but {element.name} implies {d.name}={value!r}."
1076 )
1077 else:
1078 if element in standardized.graph.required:
1079 raise LookupError(
1080 f"Could not fetch record for required dimension {element.name} via keys {keys}."
1081 )
1082 if element.alwaysJoin:
1083 raise InconsistentDataIdError(
1084 f"Could not fetch record for element {element.name} via keys {keys}, ",
1085 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
1086 "related."
1087 )
1088 for d in element.implied:
1089 keys.setdefault(d.name, None)
1090 records.setdefault(d.name, None)
1091 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
1093 def insertDimensionData(self, element: Union[DimensionElement, str],
1094 *data: Union[Mapping[str, Any], DimensionRecord],
1095 conform: bool = True) -> None:
1096 """Insert one or more dimension records into the database.
1098 Parameters
1099 ----------
1100 element : `DimensionElement` or `str`
1101 The `DimensionElement` or name thereof that identifies the table
1102 records will be inserted into.
1103 data : `dict` or `DimensionRecord` (variadic)
1104 One or more records to insert.
1105 conform : `bool`, optional
1106 If `False` (`True` is default) perform no checking or conversions,
1107 and assume that ``element`` is a `DimensionElement` instance and
1108 ``data`` contains one or more `DimensionRecord` instances of the
1109 appropriate subclass.
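
        Examples
        --------
        A sketch of adding an instrument record; the field names follow the
        default dimension configuration and the values are hypothetical::

            registry.insertDimensionData(
                "instrument",
                {"name": "MyCam", "visit_max": 999999, "exposure_max": 999999,
                 "detector_max": 16, "class_name": "my.pkg.MyCamInstrument"},
            )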
1110 """
1111 if conform:
1112 if isinstance(element, str):
1113 element = self.dimensions[element]
1114 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
1115 for row in data]
1116 else:
1117 # Ignore typing since caller said to trust them with conform=False.
1118 records = data # type: ignore
1119 storage = self._dimensions[element] # type: ignore
1120 storage.insert(*records)
1122 def syncDimensionData(self, element: Union[DimensionElement, str],
1123 row: Union[Mapping[str, Any], DimensionRecord],
1124 conform: bool = True) -> bool:
1125 """Synchronize the given dimension record with the database, inserting
1126 if it does not already exist and comparing values if it does.
1128 Parameters
1129 ----------
1130 element : `DimensionElement` or `str`
1131 The `DimensionElement` or name thereof that identifies the table
1132 records will be inserted into.
1133 row : `dict` or `DimensionRecord`
1134 The record to insert.
1135 conform : `bool`, optional
1136 If `False` (`True` is default) perform no checking or conversions,
1137 and assume that ``element`` is a `DimensionElement` instance and
1138 ``row`` is a `DimensionRecord` instance of the appropriate
1139 subclass.
1141 Returns
1142 -------
1143 inserted : `bool`
1144 `True` if a new row was inserted, `False` otherwise.
1146 Raises
1147 ------
1148 ConflictingDefinitionError
1149 Raised if the record exists in the database (according to primary
1150 key lookup) but is inconsistent with the given one.
1151 """
1152 if conform:
1153 if isinstance(element, str):
1154 element = self.dimensions[element]
1155 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
1156 else:
1157 # Ignore typing since caller said to trust them with conform=False.
1158 record = row # type: ignore
1159 storage = self._dimensions[element] # type: ignore
1160 return storage.sync(record)
1162 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
1163 ) -> Iterator[DatasetType]:
1164 """Iterate over the dataset types whose names match an expression.
1166 Parameters
1167 ----------
1168 expression : `Any`, optional
1169 An expression that fully or partially identifies the dataset types
1170 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1171 `...` can be used to return all dataset types, and is the default.
1172 See :ref:`daf_butler_dataset_type_expressions` for more
1173 information.
1174 components : `bool`, optional
1175 If `True`, apply all expression patterns to component dataset type
1176 names as well. If `False`, never apply patterns to components.
1177 If `None` (default), apply patterns to components only if their
1178 parent datasets were not matched by the expression.
1179 Fully-specified component datasets (`str` or `DatasetType`
1180 instances) are always included.
1182 Yields
1183 ------
1184 datasetType : `DatasetType`
1185 A `DatasetType` instance whose name matches ``expression``.
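
        Examples
        --------
        A sketch of listing dataset types matching a pattern; the pattern is
        hypothetical (note that patterns must match the full name)::

            import re

            for datasetType in registry.queryDatasetTypes(re.compile(r"deepCoadd.*")):
                print(datasetType.name, datasetType.dimensions)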
1186 """
1187 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
1188 if wildcard is Ellipsis:
1189 for datasetType in self._datasets:
1190 # The dataset type can no longer be a component
1191 yield datasetType
1192 if components and datasetType.isComposite():
1193 # Automatically create the component dataset types
1194 for component in datasetType.makeAllComponentDatasetTypes():
1195 yield component
1196 return
1197 done: Set[str] = set()
1198 for name in wildcard.strings:
1199 storage = self._datasets.find(name)
1200 if storage is not None:
1201 done.add(storage.datasetType.name)
1202 yield storage.datasetType
1203 if wildcard.patterns:
1204 # If components (the argument) is None, we'll save component
1205 # dataset that we might want to match, but only if their parents
1206 # didn't get included.
1207 componentsForLater = []
1208 for registeredDatasetType in self._datasets:
1209 # Components are not stored in registry so expand them here
1210 allDatasetTypes = [registeredDatasetType] \
1211 + registeredDatasetType.makeAllComponentDatasetTypes()
1212 for datasetType in allDatasetTypes:
1213 if datasetType.name in done:
1214 continue
1215 parentName, componentName = datasetType.nameAndComponent()
1216 if componentName is not None and not components:
1217 if components is None and parentName not in done:
1218 componentsForLater.append(datasetType)
1219 continue
1220 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1221 done.add(datasetType.name)
1222 yield datasetType
1223 # Go back and try to match saved components.
1224 for datasetType in componentsForLater:
1225 parentName, _ = datasetType.nameAndComponent()
1226 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1227 yield datasetType
1229 def queryCollections(self, expression: Any = ...,
1230 datasetType: Optional[DatasetType] = None,
1231 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1232 flattenChains: bool = False,
1233 includeChains: Optional[bool] = None) -> Iterator[str]:
1234 """Iterate over the collections whose names match an expression.
1236 Parameters
1237 ----------
1238 expression : `Any`, optional
1239 An expression that fully or partially identifies the collections
1240 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1241 `...` can be used to return all collections, and is the default.
1242 See :ref:`daf_butler_collection_expressions` for more
1243 information.
1244 datasetType : `DatasetType`, optional
1245 If provided, only yield collections that may contain datasets of
1246 this type. This is a conservative approximation in general; it may
1247 yield collections that do not have any such datasets.
1248 collectionTypes : `AbstractSet` [ `CollectionType` ], optional
1249 If provided, only yield collections of these types.
1250 flattenChains : `bool`, optional
1251 If `True` (`False` is default), recursively yield the child
1252 collections of matching `~CollectionType.CHAINED` collections.
1253 includeChains : `bool`, optional
1254 If `True`, yield records for matching `~CollectionType.CHAINED`
1255 collections. Default is the opposite of ``flattenChains``: include
1256 either CHAINED collections or their children, but not both.
1258 Yields
1259 ------
1260 collection : `str`
1261 The name of a collection that matches ``expression``.
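
        Examples
        --------
        A sketch of listing all `~CollectionType.RUN` collections whose
        names match a pattern; the pattern is hypothetical::

            import re
            from lsst.daf.butler.registry import CollectionType

            for name in registry.queryCollections(re.compile(r"MyCam/runs/.*"),
                                                  collectionTypes={CollectionType.RUN}):
                print(name)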
1262 """
1263 # Right now the datasetType argument is completely ignored, but that
1264 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
1265 # ticket will take care of that.
1266 query = CollectionQuery.fromExpression(expression)
1267 for record in query.iter(self._collections, collectionTypes=frozenset(collectionTypes),
1268 flattenChains=flattenChains, includeChains=includeChains):
1269 yield record.name
1271 def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder:
1272 """Return a `QueryBuilder` instance capable of constructing and
1273 managing more complex queries than those obtainable via `Registry`
1274 interfaces.
1276 This is an advanced interface; downstream code should prefer
1277 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
1278 are sufficient.
1280 Parameters
1281 ----------
1282 summary : `queries.QuerySummary`
1283 Object describing and categorizing the full set of dimensions that
1284 will be included in the query.
1286 Returns
1287 -------
1288 builder : `queries.QueryBuilder`
1289 Object that can be used to construct and perform advanced queries.
1290 """
1291 return queries.QueryBuilder(
1292 summary,
1293 queries.RegistryManagers(
1294 collections=self._collections,
1295 dimensions=self._dimensions,
1296 datasets=self._datasets
1297 )
1298 )
1300 def queryDatasets(self, datasetType: Any, *,
1301 collections: Any,
1302 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1303 dataId: Optional[DataId] = None,
1304 where: Optional[str] = None,
1305 findFirst: bool = False,
1306 components: Optional[bool] = None,
1307 **kwargs: Any) -> queries.DatasetQueryResults:
1308 """Query for and iterate over dataset references matching user-provided
1309 criteria.
1311 Parameters
1312 ----------
1313 datasetType
1314 An expression that fully or partially identifies the dataset types
1315 to be queried. Allowed types include `DatasetType`, `str`,
1316 `re.Pattern`, and iterables thereof. The special value `...` can
1317 be used to query all dataset types. See
1318 :ref:`daf_butler_dataset_type_expressions` for more information.
1319 collections
1320 An expression that fully or partially identifies the collections
1321 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1322 thereof. `...` can be used to find datasets from all
1323 `~CollectionType.RUN` collections (no other collections are
1324 necessary, because all datasets are in a ``RUN`` collection). See
1325 :ref:`daf_butler_collection_expressions` for more information.
1326 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1327 Dimensions to include in the query (in addition to those used
1328 to identify the queried dataset type(s)), either to constrain
1329 the resulting datasets to those for which a matching dimension
1330 exists, or to relate the dataset type's dimensions to dimensions
1331 referenced by the ``dataId`` or ``where`` arguments.
1332 dataId : `dict` or `DataCoordinate`, optional
1333 A data ID whose key-value pairs are used as equality constraints
1334 in the query.
1335 where : `str`, optional
1336 A string expression similar to a SQL WHERE clause. May involve
1337 any column of a dimension table or (as a shortcut for the primary
1338 key column of a dimension table) dimension name. See
1339 :ref:`daf_butler_dimension_expressions` for more information.
1340 findFirst : `bool`, optional
1341 If `True` (`False` is default), for each result data ID, only
1342 yield one `DatasetRef` of each `DatasetType`, from the first
1343 collection in which a dataset of that dataset type appears
1344 (according to the order of ``collections`` passed in). If `True`,
1345 ``collections`` must not contain regular expressions and may not
1346 be `...`.
1347 components : `bool`, optional
1348 If `True`, apply all dataset expression patterns to component
1349 dataset type names as well. If `False`, never apply patterns to
1350 components. If `None` (default), apply patterns to components only
1351 if their parent datasets were not matched by the expression.
1352 Fully-specified component datasets (`str` or `DatasetType`
1353 instances) are always included.
1354 **kwargs
1355 Additional keyword arguments are forwarded to
1356 `DataCoordinate.standardize` when processing the ``dataId``
1357 argument (and may be used to provide a constraining data ID even
1358 when the ``dataId`` argument is `None`).
1360 Returns
1361 -------
1362 refs : `queries.DatasetQueryResults`
1363 Dataset references matching the given query criteria.
1365 Raises
1366 ------
1367 TypeError
1368 Raised when the arguments are incompatible, such as when a
1369 collection wildcard is passed when ``findFirst`` is `True`.
1371 Notes
1372 -----
1373 When multiple dataset types are queried in a single call, the
1374 results of this operation are equivalent to querying for each dataset
1375 type separately in turn, and no information about the relationships
1376 between datasets of different types is included. In contexts where
1377 that kind of information is important, the recommended pattern is to
1378 use `queryDataIds` to first obtain data IDs (possibly with the
1379 desired dataset types and collections passed as constraints to the
1380 query), and then use multiple (generally much simpler) calls to
1381 `queryDatasets` with the returned data IDs passed as constraints.
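
        Examples
        --------
        A sketch of a find-first search for calibrated exposures of one
        detector; the dataset type, collections, and data ID values are
        hypothetical::

            refs = registry.queryDatasets(
                "calexp",
                collections=["MyCam/runs/2", "MyCam/runs/1"],
                where="detector = 5 AND visit > 1000",
                dataId={"instrument": "MyCam"},
                findFirst=True,
            )
            for ref in refs:
                print(ref.dataId, ref.run)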
1382 """
1383 # Standardize the collections expression.
1384 if findFirst:
1385 collections = CollectionSearch.fromExpression(collections)
1386 else:
1387 collections = CollectionQuery.fromExpression(collections)
1388 # Standardize and expand the data ID provided as a constraint.
1389 standardizedDataId = self.expandDataId(dataId, **kwargs)
1391 # We can only query directly if given a non-component DatasetType
1392 # instance. If we were given an expression or str or a component
1393 # DatasetType instance, we'll populate this dict, recurse, and return.
1394 # If we already have a non-component DatasetType, it will remain None
1395 # and we'll run the query directly.
1396 composition: Optional[
1397 Dict[
1398 DatasetType, # parent dataset type
1399 List[Optional[str]] # component name, or None for parent
1400 ]
1401 ] = None
1402 if not isinstance(datasetType, DatasetType):
1403 # We were given a dataset type expression (which may be as simple
1404 # as a str). Loop over all matching datasets, delegating handling
1405 # of the `components` argument to queryDatasetTypes, as we populate
1406 # the composition dict.
1407 composition = defaultdict(list)
1408 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
1409 parentName, componentName = trueDatasetType.nameAndComponent()
1410 if componentName is not None:
1411 parentDatasetType = self.getDatasetType(parentName)
1412 composition.setdefault(parentDatasetType, []).append(componentName)
1413 else:
1414 composition.setdefault(trueDatasetType, []).append(None)
1415 elif datasetType.isComponent():
1416 # We were given a true DatasetType instance, but it's a component.
1417 # the composition dict will have exactly one item.
1418 parentName, componentName = datasetType.nameAndComponent()
1419 parentDatasetType = self.getDatasetType(parentName)
1420 composition = {parentDatasetType: [componentName]}
1421 if composition is not None:
1422 # We need to recurse. Do that once for each parent dataset type.
1423 chain = []
1424 for parentDatasetType, componentNames in composition.items():
1425 parentResults = self.queryDatasets(
1426 parentDatasetType,
1427 collections=collections,
1428 dimensions=dimensions,
1429 dataId=standardizedDataId,
1430 where=where,
1431 findFirst=findFirst
1432 )
1433 if isinstance(parentResults, queries.ParentDatasetQueryResults):
1434 chain.append(
1435 parentResults.withComponents(componentNames)
1436 )
1437 else:
1438 # Should only happen if we know there would be no results.
1439 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \
1440 and not parentResults._chain
1441 return queries.ChainedDatasetQueryResults(chain)
1442 # If we get here, there's no need to recurse (or we are already
1443 # recursing; there can only ever be one level of recursion).
1445 # The full set of dimensions in the query is the combination of those
1446 # needed for the DatasetType and those explicitly requested, if any.
1447 requestedDimensionNames = set(datasetType.dimensions.names)
1448 if dimensions is not None:
1449 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1450 # Construct the summary structure needed to construct a QueryBuilder.
1451 summary = queries.QuerySummary(
1452 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1453 dataId=standardizedDataId,
1454 expression=where,
1455 )
1456 builder = self.makeQueryBuilder(summary)
1457 # Add the dataset subquery to the query, telling the QueryBuilder to
1458 # include the rank of the selected collection in the results only if we
1459 # need to findFirst. Note that if any of the collections are
1460 # actually wildcard expressions, and we've asked for deduplication,
1461 # this will raise TypeError for us.
1462 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst):
1463 return queries.ChainedDatasetQueryResults(())
1464 query = builder.finish()
1465 return queries.ParentDatasetQueryResults(self._db, query, components=[None])
1467 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1468 dataId: Optional[DataId] = None,
1469 datasets: Any = None,
1470 collections: Any = None,
1471 where: Optional[str] = None,
1472 components: Optional[bool] = None,
1473 **kwargs: Any) -> queries.DataCoordinateQueryResults:
1474 """Query for data IDs matching user-provided criteria.
1476 Parameters
1477 ----------
1478 dimensions : `Dimension` or `str`, or iterable thereof
1479 The dimensions of the data IDs to yield, as either `Dimension`
1480 instances or `str`. Will be automatically expanded to a complete
1481 `DimensionGraph`.
1482 dataId : `dict` or `DataCoordinate`, optional
1483 A data ID whose key-value pairs are used as equality constraints
1484 in the query.
1485 datasets : `Any`, optional
1486 An expression that fully or partially identifies dataset types
1487 that should constrain the yielded data IDs. For example, including
1488 "raw" here would constrain the yielded ``instrument``,
1489 ``exposure``, ``detector``, and ``physical_filter`` values to only
1490 those for which at least one "raw" dataset exists in
1491 ``collections``. Allowed types include `DatasetType`, `str`,
1492 `re.Pattern`, and iterables thereof. Unlike other dataset type
1493 expressions, ``...`` is not permitted - it doesn't make sense to
1494 constrain data IDs on the existence of *all* datasets.
1495 See :ref:`daf_butler_dataset_type_expressions` for more
1496 information.
1497 collections : `Any`, optional
1498 An expression that fully or partially identifies the collections
1499 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1500 thereof. `...` can be used to search all collections. Must be
1501 provided if ``datasets`` is, and is ignored if it is not. See
1502 :ref:`daf_butler_collection_expressions` for more information.
1503 where : `str`, optional
1504 A string expression similar to a SQL WHERE clause. May involve
1505 any column of a dimension table or (as a shortcut for the primary
1506 key column of a dimension table) dimension name. See
1507 :ref:`daf_butler_dimension_expressions` for more information.
1508 components : `bool`, optional
1509 If `True`, apply all dataset expression patterns to component
1510 dataset type names as well. If `False`, never apply patterns to
1511 components. If `None` (default), apply patterns to components only
1512 if their parent datasets were not matched by the expression.
1513 Fully-specified component datasets (`str` or `DatasetType`
1514 instances) are always included.
1515 **kwargs
1516 Additional keyword arguments are forwarded to
1517 `DataCoordinate.standardize` when processing the ``dataId``
1518 argument (and may be used to provide a constraining data ID even
1519 when the ``dataId`` argument is `None`).
1521 Returns
1522 -------
1523 dataIds : `DataCoordinateQueryResults`
1524 Data IDs matching the given query parameters. These are guaranteed
1525 to identify all dimensions (`DataCoordinate.hasFull` returns
1526 `True`), but will not contain `DimensionRecord` objects
1527 (`DataCoordinate.hasRecords` returns `False`). Call
1528 `DataCoordinateQueryResults.expanded` on the returned object to
1529 fetch those (and consider using
1530 `DataCoordinateQueryResults.materialize` on the returned object
1531 first if the expected number of rows is very large). See
1532 documentation for those methods for additional information.
1533 """
1534 dimensions = iterable(dimensions)
1535 standardizedDataId = self.expandDataId(dataId, **kwargs)
1536 standardizedDatasetTypes = set()
1537 requestedDimensions = self.dimensions.extract(dimensions)
1538 queryDimensionNames = set(requestedDimensions.names)
1539 if datasets is not None:
1540 if collections is None:
1541 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1542 for datasetType in self.queryDatasetTypes(datasets, components=components):
1543 queryDimensionNames.update(datasetType.dimensions.names)
1544 # If any matched dataset type is a component, just operate on
1545 # its parent instead, because Registry doesn't know anything
1546 # about what components exist, and here (unlike queryDatasets)
1547 # we don't care about returning them.
1548 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1549 if componentName is not None:
1550 datasetType = self.getDatasetType(parentDatasetTypeName)
1551 standardizedDatasetTypes.add(datasetType)
1552 # Preprocess collections expression in case the original included
1553 # single-pass iterators (we'll want to use it multiple times
1554 # below).
1555 collections = CollectionQuery.fromExpression(collections)
1557 summary = queries.QuerySummary(
1558 requested=DimensionGraph(self.dimensions, names=queryDimensionNames),
1559 dataId=standardizedDataId,
1560 expression=where,
1561 )
1562 builder = self.makeQueryBuilder(summary)
1563 for datasetType in standardizedDatasetTypes:
1564 builder.joinDataset(datasetType, collections, isResult=False)
1565 query = builder.finish()
1566 return queries.DataCoordinateQueryResults(self._db, query)
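# Illustrative usage sketch (editor's addition): a hypothetical queryDataIds
# call constrained by the existence of "raw" datasets in "my_collection";
# both names are assumed placeholders.
#
#     dataIds = registry.queryDataIds(
#         ["exposure", "detector"],
#         datasets="raw",
#         collections="my_collection",
#         where="instrument='HSC'",
#     )
#     # The returned coordinates identify all dimensions; call expanded()
#     # only if DimensionRecords are needed downstream.
#     for dataId in dataIds.expanded():
#         print(dataId)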
1568 def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
1569 dataId: Optional[DataId] = None,
1570 datasets: Any = None,
1571 collections: Any = None,
1572 where: Optional[str] = None,
1573 components: Optional[bool] = None,
1574 **kwargs: Any) -> Iterator[DimensionRecord]:
1575 """Query for dimension information matching user-provided criteria.
1577 Parameters
1578 ----------
1579 element : `DimensionElement` or `str`
1580 The dimension element to obtain records for.
1581 dataId : `dict` or `DataCoordinate`, optional
1582 A data ID whose key-value pairs are used as equality constraints
1583 in the query.
1584 datasets : `Any`, optional
1585 An expression that fully or partially identifies dataset types
1586 that should constrain the yielded records. See `queryDataIds` and
1587 :ref:`daf_butler_dataset_type_expressions` for more information.
1588 collections : `Any`, optional
1589 An expression that fully or partially identifies the collections
1590 to search for datasets. See `queryDataIds` and
1591 :ref:`daf_butler_collection_expressions` for more information.
1592 where : `str`, optional
1593 A string expression similar to a SQL WHERE clause. See
1594 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more
1595 information.
1596 components : `bool`, optional
1597 Whether to apply dataset expressions to components as well.
1598 See `queryDataIds` for more information.
1599 **kwargs
1600 Additional keyword arguments are forwarded to
1601 `DataCoordinate.standardize` when processing the ``dataId``
1602 argument (and may be used to provide a constraining data ID even
1603 when the ``dataId`` argument is `None`).
1605 Returns
1606 -------
1607 dimensionRecords : `Iterator` [ `DimensionRecord` ]
1608 Dimension records matching the given query parameters.
1609 """
1610 if not isinstance(element, DimensionElement):
1611 element = self.dimensions[element]
1612 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections,
1613 where=where, components=components, **kwargs)
1614 return iter(self._dimensions[element].fetch(dataIds))
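# Illustrative usage sketch (editor's addition): fetching records for a
# single dimension element. "detector" and the instrument value are assumed
# examples, not values defined in this module.
#
#     for record in registry.queryDimensionRecords(
#             "detector", where="instrument='HSC'"):
#         print(record)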
1616 def queryDatasetAssociations(
1617 self,
1618 datasetType: Union[str, DatasetType],
1619 collections: Any = ...,
1620 *,
1621 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1622 flattenChains: bool = False,
1623 ) -> Iterator[DatasetAssociation]:
1624 """Iterate over dataset-collection combinations where the dataset is in
1625 the collection.
1627 This method is a temporary placeholder for better support for
1628 association results in `queryDatasets`. It will probably be
1629 removed in the future, and should be avoided in production code
1630 whenever possible.
1632 Parameters
1633 ----------
1634 datasetType : `DatasetType` or `str`
1635 A dataset type object or the name of one.
1636 collections : `Any`, optional
1637 An expression that fully or partially identifies the collections
1638 to search for datasets. See `queryCollections` and
1639 :ref:`daf_butler_collection_expressions` for more information.
1640 collectionTypes : `AbstractSet` [ `CollectionType` ], optional
1641 If provided, only yield associations from collections of these
1642 types.
1643 flattenChains : `bool`, optional
1644 If `True`, search in the children of
1645 `~CollectionType.CHAINED` collections. If `False` (the default),
1646 ``CHAINED`` collections are ignored.
1648 Yields
1649 ------
1650 association : `DatasetAssociation`
1651 Object representing the relationship between a single dataset and
1652 a single collection.
1653 """
1654 collections = CollectionQuery.fromExpression(collections)
1655 tsRepr = self._db.getTimespanRepresentation()
1656 if isinstance(datasetType, str):
1657 storage = self._datasets[datasetType]
1658 else:
1659 storage = self._datasets[datasetType.name]
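# Walk the matching collections (optionally flattening CHAINED ones) and
# build a DatasetRef for every result row; CALIBRATION collections also
# carry a validity-range timespan extracted from the row.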
1660 for collectionRecord in collections.iter(self._collections,
1661 collectionTypes=frozenset(collectionTypes),
1662 flattenChains=flattenChains):
1663 query = storage.select(collectionRecord)
1664 if query is None:
1665 continue
1666 for row in self._db.query(query.combine()):
1667 dataId = DataCoordinate.fromRequiredValues(
1668 storage.datasetType.dimensions,
1669 tuple(row[name] for name in storage.datasetType.dimensions.required.names)
1670 )
1671 runRecord = self._collections[row[self._collections.getRunForeignKeyName()]]
1672 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name,
1673 conform=False)
1674 if collectionRecord.type is CollectionType.CALIBRATION:
1675 timespan = tsRepr.extract(row)
1676 else:
1677 timespan = None
1678 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
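# Illustrative usage sketch (editor's addition): iterating over associations
# for a "bias" dataset type in TAGGED collections only; both the dataset
# type name and the type filter are assumed examples.
#
#     for assoc in registry.queryDatasetAssociations(
#             "bias",
#             collections=...,
#             collectionTypes={CollectionType.TAGGED},
#     ):
#         print(assoc.collection, assoc.ref.dataId, assoc.timespan)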
1680 storageClasses: StorageClassFactory
1681 """All storage classes known to the registry (`StorageClassFactory`).
1682 """