Coverage for python/lsst/daf/butler/registry/_registry.py : 13%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("Registry", "AmbiguousDatasetError", "ConflictingDefinitionError", "OrphanedRecordError")
26import contextlib
27import sys
28from typing import (
29 Any,
30 Iterable,
31 Iterator,
32 List,
33 Mapping,
34 Optional,
35 Set,
36 Type,
37 TYPE_CHECKING,
38 Union,
39)
41import sqlalchemy
43from ..core import (
44 Config,
45 DataCoordinate,
46 DataId,
47 DatasetRef,
48 DatasetType,
49 Dimension,
50 DimensionElement,
51 DimensionGraph,
52 DimensionRecord,
53 DimensionUniverse,
54 ExpandedDataCoordinate,
55 StorageClassFactory,
56)
57from ..core import ddl
58from ..core.utils import doImport, iterable, transactional
59from ._config import RegistryConfig
60from .queries import (
61 DatasetRegistryStorage,
62 QueryBuilder,
63 QuerySummary,
64)
65from .tables import makeRegistryTableSpecs
66from ._collectionType import CollectionType
67from .wildcards import CollectionQuery, CollectionSearch
69if TYPE_CHECKING:
70 from ..butlerConfig import ButlerConfig
71 from ..core import (
72 Quantum
73 )
74 from .interfaces import (
75 CollectionManager,
76 Database,
77 OpaqueTableStorageManager,
78 DimensionRecordStorageManager,
79 )
82class AmbiguousDatasetError(Exception):
83 """Exception raised when a `DatasetRef` has no ID and a `Registry`
84 operation requires one.
85 """
88class ConflictingDefinitionError(Exception):
89 """Exception raised when trying to insert a database record when a
90 conflicting record already exists.
91 """
94class OrphanedRecordError(Exception):
95 """Exception raised when trying to remove or modify a database record
96 that is still being used in some other table.
97 """
100def _checkAndGetId(ref: DatasetRef) -> int:
101 """Return the ID of the given `DatasetRef`, or raise if it is `None`.
103 This trivial function exists to allow operations that would otherwise be
104 natural list comprehensions to check that the ID is not `None` as well.
106 Parameters
107 ----------
108 ref : `DatasetRef`
109 Dataset reference.
111 Returns
112 -------
113 id : `int`
114 ``ref.id``
116 Raises
117 ------
118 AmbiguousDatasetError
119 Raised if ``ref.id`` is `None`.
120 """
121 if ref.id is None:
122 raise AmbiguousDatasetError("Dataset ID must not be `None`.")
123 return ref.id
126class Registry:
127 """Registry interface.
129 Parameters
130 ----------
131 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
132 Registry configuration
133 """
135 defaultConfigFile = None
136 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
137 absolute path. Can be None if no defaults specified.
138 """
140 @classmethod
141 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
142 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
143 """Create `Registry` subclass instance from `config`.
145 Uses ``registry.cls`` from `config` to determine which subclass to
146 instantiate.
148 Parameters
149 ----------
150 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
151 Registry configuration
152 create : `bool`, optional
153 Assume empty Registry and create a new one.
154 butlerRoot : `str`, optional
155 Path to the repository root this `Registry` will manage.
156 writeable : `bool`, optional
157 If `True` (default) create a read-write connection to the database.
159 Returns
160 -------
161 registry : `Registry` (subclass)
162 A new `Registry` subclass instance.
163 """
164 if not isinstance(config, RegistryConfig):
165 if isinstance(config, (str, Config)):
166 config = RegistryConfig(config)
167 else:
168 raise ValueError("Incompatible Registry configuration: {}".format(config))
169 config.replaceRoot(butlerRoot)
170 DatabaseClass = config.getDatabaseClass()
171 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
172 namespace=config.get("namespace"), writeable=writeable)
173 universe = DimensionUniverse(config)
174 opaque = doImport(config["managers", "opaque"])
175 dimensions = doImport(config["managers", "dimensions"])
176 collections = doImport(config["managers", "collections"])
177 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections,
178 create=create)
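    # Illustrative usage sketch (not part of the original module; the config
    # path and repository root below are hypothetical):
    #
    #     config = RegistryConfig("registry.yaml")
    #     registry = Registry.fromConfig(config, butlerRoot="/path/to/repo",
    #                                    create=True, writeable=True)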
180 def __init__(self, database: Database, universe: DimensionUniverse, *,
181 opaque: Type[OpaqueTableStorageManager],
182 dimensions: Type[DimensionRecordStorageManager],
183 collections: Type[CollectionManager],
184 create: bool = False):
185 self._db = database
186 self.storageClasses = StorageClassFactory()
187 with self._db.declareStaticTables(create=create) as context:
188 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
189 self._collections = collections.initialize(self._db, context)
190 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions, self._collections))
191 self._opaque = opaque.initialize(self._db, context)
192 self._collections.refresh()
193 # TODO: we shouldn't be grabbing the private connection from the
194 # Database instance like this, but it's a reasonable way to proceed
195 # while we transition to using the Database API more.
196 self._connection = self._db._connection
197 self._datasetStorage = DatasetRegistryStorage(connection=self._connection,
198 universe=self.dimensions,
199 tables=self._tables._asdict(),
200 collections=self._collections)
201 self._datasetTypes = {}
203 def __str__(self) -> str:
204 return str(self._db)
206 def __repr__(self) -> str:
207 return f"Registry({self._db!r}, {self.dimensions!r})"
209 def isWriteable(self) -> bool:
210 """Return `True` if this registry allows write operations, and `False`
211 otherwise.
212 """
213 return self._db.isWriteable()
215 @property
216 def dimensions(self) -> DimensionUniverse:
217 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
218 """
219 return self._dimensions.universe
221 @contextlib.contextmanager
222 def transaction(self):
223 """Return a context manager that represents a transaction.
224 """
225 # TODO make savepoint=False the default.
226 try:
227 with self._db.transaction():
228 yield
229 except BaseException:
230 # TODO: this clears the caches sometimes when we wouldn't actually
231 # need to. Can we avoid that?
232 self._dimensions.clearCaches()
233 self._datasetTypes.clear()
234 raise
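    # Usage sketch (hypothetical names, not from the original module; ``dataIds``
    # is assumed to be an iterable of data IDs): grouping Registry calls so they
    # commit or roll back together.  Run/collection registration must happen
    # outside the transaction (see registerRun below).
    #
    #     registry.registerRun("HSC/runs/2020-01")
    #     with registry.transaction():
    #         refs = registry.insertDatasets("calexp", dataIds, run="HSC/runs/2020-01")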
236 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec):
237 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
238 other data repository client.
240 Opaque table records can be added via `insertOpaqueData`, retrieved via
241 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
243 Parameters
244 ----------
245 tableName : `str`
246 Logical name of the opaque table. This may differ from the
247 actual name used in the database by a prefix and/or suffix.
248 spec : `ddl.TableSpec`
249 Specification for the table to be added.
250 """
251 self._opaque.register(tableName, spec)
253 @transactional
254 def insertOpaqueData(self, tableName: str, *data: dict):
255 """Insert records into an opaque table.
257 Parameters
258 ----------
259 tableName : `str`
260 Logical name of the opaque table. Must match the name used in a
261 previous call to `registerOpaqueTable`.
262 data
263 Each additional positional argument is a dictionary that represents
264 a single row to be added.
265 """
266 self._opaque[tableName].insert(*data)
268 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
269 """Retrieve records from an opaque table.
271 Parameters
272 ----------
273 tableName : `str`
274 Logical name of the opaque table. Must match the name used in a
275 previous call to `registerOpaqueTable`.
276 where
277 Additional keyword arguments are interpreted as equality
278 constraints that restrict the returned rows (combined with AND);
279 keyword arguments are column names and values are the values they
280 must have.
282 Yields
283 ------
284 row : `dict`
285 A dictionary representing a single result row.
286 """
287 yield from self._opaque[tableName].fetch(**where)
289 @transactional
290 def deleteOpaqueData(self, tableName: str, **where: Any):
291 """Remove records from an opaque table.
293 Parameters
294 ----------
295 tableName : `str`
296 Logical name of the opaque table. Must match the name used in a
297 previous call to `registerOpaqueTable`.
298 where
299 Additional keyword arguments are interpreted as equality
300 constraints that restrict the deleted rows (combined with AND);
301 keyword arguments are column names and values are the values they
302 must have.
303 """
304 self._opaque[tableName].delete(**where)
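    # Usage sketch for the opaque-table API above (table name and columns are
    # hypothetical; the ddl.TableSpec field definitions, normally supplied by a
    # Datastore, are abbreviated):
    #
    #     spec = ddl.TableSpec(fields=[...])
    #     registry.registerOpaqueTable("datastore_records", spec)
    #     registry.insertOpaqueData("datastore_records", {"dataset_id": 1, "path": "a.fits"})
    #     rows = list(registry.fetchOpaqueData("datastore_records", dataset_id=1))
    #     registry.deleteOpaqueData("datastore_records", dataset_id=1)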
306 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED):
307 """Add a new collection if one with the given name does not exist.
309 Parameters
310 ----------
311 name : `str`
312 The name of the collection to create.
313 type : `CollectionType`
314 Enum value indicating the type of collection to create.
316 Notes
317 -----
318 This method cannot be called within transactions, as it needs to be
319 able to perform its own transaction to be concurrent.
320 """
321 self._collections.register(name, type)
323 def getCollectionType(self, name: str) -> CollectionType:
324 """Return an enumeration value indicating the type of the given
325 collection.
327 Parameters
328 ----------
329 name : `str`
330 The name of the collection.
332 Returns
333 -------
334 type : `CollectionType`
335 Enum value indicating the type of this collection.
337 Raises
338 ------
339 MissingCollectionError
340 Raised if no collection with the given name exists.
341 """
342 return self._collections.find(name).type
344 def registerRun(self, name: str):
345 """Add a new run if one with the given name does not exist.
347 Parameters
348 ----------
349 name : `str`
350 The name of the run to create.
352 Notes
353 -----
354 This method cannot be called within transactions, as it needs to be
355 able to perform its own transaction to be concurrent.
356 """
357 self._collections.register(name, CollectionType.RUN)
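    # Usage sketch (hypothetical collection names):
    #
    #     registry.registerCollection("tagged/good-seeing", CollectionType.TAGGED)
    #     registry.registerRun("HSC/runs/2020-01")
    #     registry.getCollectionType("HSC/runs/2020-01")   # -> CollectionType.RUN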
359 def getCollectionChain(self, parent: str) -> CollectionSearch:
360 """Return the child collections in a `~CollectionType.CHAINED`
361 collection.
363 Parameters
364 ----------
365 parent : `str`
366 Name of the chained collection. Must have already been added via
367 a call to `Registry.registerCollection`.
369 Returns
370 -------
371 children : `CollectionSearch`
372 An object that defines the search path of the collection.
373 See :ref:`daf_butler_collection_expressions` for more information.
375 Raises
376 ------
377 MissingCollectionError
378 Raised if ``parent`` does not exist in the `Registry`.
379 TypeError
380 Raised if ``parent`` does not correspond to a
381 `~CollectionType.CHAINED` collection.
382 """
383 record = self._collections.find(parent)
384 if record.type is not CollectionType.CHAINED:
385 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
386 return record.children
388 def setCollectionChain(self, parent: str, children: Any):
389 """Define or redefine a `~CollectionType.CHAINED` collection.
391 Parameters
392 ----------
393 parent : `str`
394 Name of the chained collection. Must have already been added via
395 a call to `Registry.registerCollection`.
396 children : `Any`
397 An expression defining an ordered search of child collections,
398 generally an iterable of `str`. Restrictions on the dataset types
399 to be searched can also be included, by passing mapping or an
400 iterable containing tuples; see
401 :ref:`daf_butler_collection_expressions` for more information.
403 Raises
404 ------
405 MissingCollectionError
406 Raised when any of the given collections do not exist in the
407 `Registry`.
408 TypeError
409 Raised if ``parent`` does not correspond to a
410 `~CollectionType.CHAINED` collection.
411 ValueError
412 Raised if the given collections contains a cycle.
413 """
414 record = self._collections.find(parent)
415 if record.type is not CollectionType.CHAINED:
416 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
417 children = CollectionSearch.fromExpression(children)
418 if children != record.children:
419 record.update(self._collections, children)
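    # Usage sketch for CHAINED collections (hypothetical names): the children
    # given to setCollectionChain define the search order later reported by
    # getCollectionChain.
    #
    #     registry.registerCollection("defaults", CollectionType.CHAINED)
    #     registry.setCollectionChain("defaults", ["HSC/runs/2020-01", "calib"])
    #     registry.getCollectionChain("defaults")   # -> CollectionSearch over the children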
421 @transactional
422 def registerDatasetType(self, datasetType: DatasetType) -> bool:
423 """
424 Add a new `DatasetType` to the Registry.
426 It is not an error to register the same `DatasetType` twice.
428 Parameters
429 ----------
430 datasetType : `DatasetType`
431 The `DatasetType` to be added.
433 Returns
434 -------
435 inserted : `bool`
436 `True` if ``datasetType`` was inserted, `False` if an identical
437 existing `DatasetType` was found. Note that in either case the
438 DatasetType is guaranteed to be defined in the Registry
439 consistently with the given definition.
441 Raises
442 ------
443 ValueError
444 Raised if the dimensions or storage class are invalid.
445 ConflictingDefinitionError
446 Raised if this DatasetType is already registered with a different
447 definition.
448 """
449 # TODO: this implementation isn't concurrent, except *maybe* in SQLite
450 # with aggressive locking (where starting a transaction is essentially
451 # the same as grabbing a full-database lock). Should be reimplemented
452 # with Database.sync to fix this, but that may require schema changes
453 # as well so we only have to synchronize one row to know if we have
454 # inconsistent definitions.
456 # If the DatasetType is already in the cache, we assume it's already in
457 # the DB (note that we don't actually provide a way to remove them from
458 # the DB).
459 existingDatasetType = self._datasetTypes.get(datasetType.name)
460 # If it's not in the cache, try to insert it.
461 if existingDatasetType is None:
462 try:
463 with self._db.transaction():
464 self._db.insert(
465 self._tables.dataset_type,
466 {
467 "dataset_type_name": datasetType.name,
468 "storage_class": datasetType.storageClass.name,
469 }
470 )
471 except sqlalchemy.exc.IntegrityError:
472 # Insert failed on the only unique constraint on this table:
473 # dataset_type_name. So now the question is whether the one in
474 # there is the same as the one we tried to insert.
475 existingDatasetType = self.getDatasetType(datasetType.name)
476 else:
477 # If adding the DatasetType record itself succeeded, add its
478 # dimensions (if any). We don't guard this in a try block
479 # because a problem with this insert means the database
480 # content must be corrupted.
481 if datasetType.dimensions:
482 self._db.insert(
483 self._tables.dataset_type_dimensions,
484 *[{"dataset_type_name": datasetType.name,
485 "dimension_name": dimensionName}
486 for dimensionName in datasetType.dimensions.names]
487 )
488 # Update the cache.
489 self._datasetTypes[datasetType.name] = datasetType
490 # Also register component DatasetTypes (if any).
491 for compName, compStorageClass in datasetType.storageClass.components.items():
492 compType = DatasetType(datasetType.componentTypeName(compName),
493 dimensions=datasetType.dimensions,
494 storageClass=compStorageClass)
495 self.registerDatasetType(compType)
496 # Inserts succeeded, nothing left to do here.
497 return True
498 # A DatasetType with this name exists; check whether it is equal.
499 if datasetType == existingDatasetType:
500 return False
501 else:
502 raise ConflictingDefinitionError(f"DatasetType: {datasetType} != existing {existingDatasetType}")
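    # Usage sketch (hypothetical dataset type; the dimension names and the
    # "ExposureF" storage class are assumed to be defined in this repository):
    #
    #     calexpType = DatasetType("calexp",
    #                              dimensions=registry.dimensions.extract(
    #                                  ["instrument", "visit", "detector"]),
    #                              storageClass="ExposureF")
    #     registry.registerDatasetType(calexpType)   # True on first call, False if identical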
504 def getDatasetType(self, name: str) -> DatasetType:
505 """Get the `DatasetType`.
507 Parameters
508 ----------
509 name : `str`
510 Name of the type.
512 Returns
513 -------
514 type : `DatasetType`
515 The `DatasetType` associated with the given name.
517 Raises
518 ------
519 KeyError
520 The requested DatasetType could not be found in the registry.
521 """
522 datasetType = self._datasetTypes.get(name)
523 if datasetType is None:
524 # Get StorageClass from DatasetType table
525 result = self._db.query(
526 sqlalchemy.sql.select(
527 [self._tables.dataset_type.c.storage_class]
528 ).where(
529 self._tables.dataset_type.columns.dataset_type_name == name
530 )
531 ).fetchone()
533 if result is None:
534 raise KeyError("Could not find entry for datasetType {}".format(name))
536 storageClass = self.storageClasses.getStorageClass(result["storage_class"])
537 # Get Dimensions (if any) from DatasetTypeDimensions table
538 result = self._db.query(
539 sqlalchemy.sql.select(
540 [self._tables.dataset_type_dimensions.columns.dimension_name]
541 ).where(
542 self._tables.dataset_type_dimensions.columns.dataset_type_name == name
543 )
544 ).fetchall()
545 dimensions = DimensionGraph(self.dimensions, names=(r[0] for r in result) if result else ())
546 datasetType = DatasetType(name=name,
547 storageClass=storageClass,
548 dimensions=dimensions)
549 self._datasetTypes[name] = datasetType
550 return datasetType
552 def _makeDatasetRefFromRow(self, row: sqlalchemy.engine.RowProxy,
553 datasetType: Optional[DatasetType] = None,
554 dataId: Optional[DataCoordinate] = None):
555 """Construct a DatasetRef from the result of a query on the Dataset
556 table.
558 Parameters
559 ----------
560 row : `sqlalchemy.engine.RowProxy`.
561 Row of a query that contains all columns from the `Dataset` table.
562 May include additional fields (which will be ignored).
563 datasetType : `DatasetType`, optional
564 `DatasetType` associated with this dataset. Will be retrieved
565 if not provided. If provided, the caller guarantees that it is
566 already consistent with what would have been retrieved from the
567 database.
568 dataId : `DataCoordinate`, optional
569 Dimensions associated with this dataset. Will be retrieved if not
570 provided. If provided, the caller guarantees that it is already
571 consistent with what would have been retrieved from the database.
573 Returns
574 -------
575 ref : `DatasetRef`.
576 A new `DatasetRef` instance.
577 """
578 if datasetType is None:
579 datasetType = self.getDatasetType(row["dataset_type_name"])
580 runRecord = self._collections[row[self._collections.getRunForeignKeyName()]]
581 assert runRecord is not None, "Should be guaranteed by foreign key constraints."
582 run = runRecord.name
583 datasetRefHash = row["dataset_ref_hash"]
584 if dataId is None:
585 # TODO: should we expand here?
586 dataId = DataCoordinate.standardize(
587 row,
588 graph=datasetType.dimensions,
589 universe=self.dimensions
590 )
591 # Get components (if present)
592 components = {}
593 if datasetType.storageClass.isComposite():
594 t = self._tables
595 columns = list(t.dataset.columns)
596 columns.append(t.dataset_composition.columns.component_name)
597 results = self._db.query(
598 sqlalchemy.sql.select(
599 columns
600 ).select_from(
601 t.dataset.join(
602 t.dataset_composition,
603 (t.dataset.columns.dataset_id == t.dataset_composition.columns.component_dataset_id)
604 )
605 ).where(
606 t.dataset_composition.columns.parent_dataset_id == row["dataset_id"]
607 )
608 ).fetchall()
609 for result in results:
610 componentName = result["component_name"]
611 componentDatasetType = DatasetType(
612 DatasetType.nameWithComponent(datasetType.name, componentName),
613 dimensions=datasetType.dimensions,
614 storageClass=datasetType.storageClass.components[componentName]
615 )
616 components[componentName] = self._makeDatasetRefFromRow(result, dataId=dataId,
617 datasetType=componentDatasetType)
618 if not components.keys() <= datasetType.storageClass.components.keys():
619 raise RuntimeError(
620 f"Inconsistency detected between dataset and storage class definitions: "
621 f"{datasetType.storageClass.name} has components "
622 f"{set(datasetType.storageClass.components.keys())}, "
623 f"but dataset has components {set(components.keys())}"
624 )
625 return DatasetRef(datasetType=datasetType, dataId=dataId, id=row["dataset_id"], run=run,
626 hash=datasetRefHash, components=components)
628 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
629 collections: Any, **kwds: Any) -> Optional[DatasetRef]:
630 """Find a dataset given its `DatasetType` and data ID.
632 This can be used to obtain a `DatasetRef` that permits the dataset to
633 be read from a `Datastore`.
635 Parameters
636 ----------
637 datasetType : `DatasetType` or `str`
638 A `DatasetType` or the name of one.
639 dataId : `dict` or `DataCoordinate`, optional
640 A `dict`-like object containing the `Dimension` links that identify
641 the dataset within a collection.
642 collections
643 An expression that fully or partially identifies the collections
644 to search for the dataset, such as a `str`, `re.Pattern`, or
645 iterable thereof. `...` can be used to return all collections.
646 See :ref:`daf_butler_collection_expressions` for more information.
647 **kwds
648 Additional keyword arguments passed to
649 `DataCoordinate.standardize` to convert ``dataId`` to a true
650 `DataCoordinate` or augment an existing one.
652 Returns
653 -------
654 ref : `DatasetRef`
655 A reference to the dataset, or `None` if no matching Dataset
656 was found.
658 Raises
659 ------
660 LookupError
661 Raised if one or more data ID keys are missing.
662 MissingCollectionError
663 Raised if any of ``collections`` does not exist in the registry.
664 """
665 if not isinstance(datasetType, DatasetType):
666 datasetType = self.getDatasetType(datasetType)
667 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
668 universe=self.dimensions, **kwds)
669 collections = CollectionSearch.fromExpression(collections)
670 for collectionRecord in collections.iter(self._collections, datasetType=datasetType):
671 if collectionRecord.type is CollectionType.TAGGED:
672 collectionColumn = \
673 self._tables.dataset_collection.columns[self._collections.getCollectionForeignKeyName()]
674 fromClause = self._tables.dataset.join(self._tables.dataset_collection)
675 elif collectionRecord.type is CollectionType.RUN:
676 collectionColumn = self._tables.dataset.columns[self._collections.getRunForeignKeyName()]
677 fromClause = self._tables.dataset
678 else:
679 raise NotImplementedError(f"Unrecognized CollectionType: '{collectionRecord.type}'.")
680 whereTerms = [
681 self._tables.dataset.columns.dataset_type_name == datasetType.name,
682 collectionColumn == collectionRecord.key,
683 ]
684 whereTerms.extend(self._tables.dataset.columns[name] == dataId[name] for name in dataId.keys())
685 query = self._tables.dataset.select().select_from(
686 fromClause
687 ).where(
688 sqlalchemy.sql.and_(*whereTerms)
689 )
690 result = self._db.query(query).fetchone()
691 if result is not None:
692 return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId)
693 return None
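    # Usage sketch (hypothetical data ID and collections): returns a resolved
    # DatasetRef, or None when no matching dataset exists.
    #
    #     ref = registry.findDataset("calexp",
    #                                instrument="HSC", visit=903334, detector=16,
    #                                collections=["HSC/runs/2020-01", "defaults"])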
695 @transactional
696 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
697 run: str, *, producer: Optional[Quantum] = None, recursive: bool = False
698 ) -> List[DatasetRef]:
699 """Insert one or more datasets into the `Registry`
701 This always adds new datasets; to associate existing datasets with
702 a new collection, use ``associate``.
704 Parameters
705 ----------
706 datasetType : `DatasetType` or `str`
707 A `DatasetType` or the name of one.
708 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
709 Dimension-based identifiers for the new datasets.
710 run : `str`
711 The name of the run that produced the datasets.
712 producer : `Quantum`
713 Unit of work that produced the datasets. May be `None` to store
714 no provenance information, but if present the `Quantum` must
715 already have been added to the Registry.
716 recursive : `bool`
717 If True, recursively add datasets and attach entries for component
718 datasets as well.
720 Returns
721 -------
722 refs : `list` of `DatasetRef`
723 Resolved `DatasetRef` instances for all given data IDs (in the same
724 order).
726 Raises
727 ------
728 ConflictingDefinitionError
729 If a dataset with the same dataset type and data ID as one of those
730 given already exists in the given collection.
731 MissingCollectionError
732 Raised if ``run`` does not exist in the registry.
733 """
734 if not isinstance(datasetType, DatasetType):
735 datasetType = self.getDatasetType(datasetType)
736 rows = []
737 refs = []
738 runRecord = self._collections.find(run)
739 base = {
740 "dataset_type_name": datasetType.name,
741 self._collections.getRunForeignKeyName(): runRecord.key,
742 "quantum_id": producer.id if producer is not None else None,
743 }
744 # Expand data IDs and build both a list of unresolved DatasetRefs
745 # and a list of dictionary rows for the dataset table.
746 for dataId in dataIds:
747 ref = DatasetRef(datasetType, self.expandDataId(dataId, graph=datasetType.dimensions))
748 refs.append(ref)
749 row = dict(base, dataset_ref_hash=ref.hash)
750 for dimension, value in ref.dataId.full.items():
751 row[dimension.name] = value
752 rows.append(row)
753 # Actually insert into the dataset table.
754 try:
755 datasetIds = self._db.insert(self._tables.dataset, *rows, returnIds=True)
756 except sqlalchemy.exc.IntegrityError as err:
757 raise ConflictingDefinitionError(
758 f"Constraint violation while inserting datasets into run {run}. "
759 f"This usually means that one or more datasets with the same dataset type and data ID "
760 f"already exist in the collection, but it may be a foreign key violation."
761 ) from err
762 # Resolve the DatasetRefs with the autoincrement IDs we generated.
763 refs = [ref.resolved(id=datasetId, run=run) for datasetId, ref in zip(datasetIds, refs)]
764 if recursive and datasetType.isComposite():
765 # Insert component rows by recursing, and gather a single big list
766 # of rows to insert into the dataset_composition table.
767 compositionRows = []
768 for componentName in datasetType.storageClass.components:
769 componentDatasetType = datasetType.makeComponentDatasetType(componentName)
770 componentRefs = self.insertDatasets(componentDatasetType,
771 dataIds=(ref.dataId for ref in refs),
772 run=run,
773 producer=producer,
774 recursive=True)
775 for parentRef, componentRef in zip(refs, componentRefs):
776 parentRef._components[componentName] = componentRef
777 compositionRows.append({
778 "parent_dataset_id": parentRef.id,
779 "component_dataset_id": componentRef.id,
780 "component_name": componentName,
781 })
782 if compositionRows:
783 self._db.insert(self._tables.dataset_composition, *compositionRows)
784 return refs
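    # Usage sketch (hypothetical run and data ID): the run must already have
    # been registered via registerRun; the returned refs carry the new IDs.
    #
    #     refs = registry.insertDatasets("calexp",
    #                                    dataIds=[{"instrument": "HSC",
    #                                              "visit": 903334, "detector": 16}],
    #                                    run="HSC/runs/2020-01")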
786 def getDataset(self, id: int, datasetType: Optional[DatasetType] = None,
787 dataId: Optional[DataCoordinate] = None) -> Optional[DatasetRef]:
788 """Retrieve a Dataset entry.
790 Parameters
791 ----------
792 id : `int`
793 The unique identifier for the Dataset.
794 datasetType : `DatasetType`, optional
795 The `DatasetType` of the dataset to retrieve. This is used to
796 short-circuit retrieving the `DatasetType`, so if provided, the
797 caller is guaranteeing that it is what would have been retrieved.
798 dataId : `DataCoordinate`, optional
799 A `Dimension`-based identifier for the dataset within a
800 collection, possibly containing additional metadata. This is used
801 to short-circuit retrieving the dataId, so if provided, the
802 caller is guaranteeing that it is what would have been retrieved.
804 Returns
805 -------
806 ref : `DatasetRef`
807 A ref to the Dataset, or `None` if no matching Dataset
808 was found.
809 """
810 result = self._db.query(
811 self._tables.dataset.select().where(
812 self._tables.dataset.columns.dataset_id == id
813 )
814 ).fetchone()
815 if result is None:
816 return None
817 return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId)
819 @transactional
820 def removeDataset(self, ref: DatasetRef):
821 """Remove a dataset from the Registry.
823 The dataset and all components will be removed unconditionally from
824 all collections, and any associated `Quantum` records will also be
825 removed. `Datastore` records will *not* be deleted; the caller is
826 responsible for ensuring that the dataset has already been removed
827 from all Datastores.
829 Parameters
830 ----------
831 ref : `DatasetRef`
832 Reference to the dataset to be removed. Must include a valid
833 ``id`` attribute, and should be considered invalidated upon return.
835 Raises
836 ------
837 AmbiguousDatasetError
838 Raised if ``ref.id`` is `None`.
839 OrphanedRecordError
840 Raised if the dataset is still present in any `Datastore`.
841 """
842 if not ref.id:
843 raise AmbiguousDatasetError(f"Cannot remove dataset {ref} without ID.")
844 # Remove component datasets. We assume ``ref.components`` is already
845 # correctly populated, and rely on ON DELETE CASCADE to remove entries
846 # from DatasetComposition.
847 for componentRef in ref.components.values():
848 self.removeDataset(componentRef)
850 # Remove related quanta. We rely on ON DELETE CASCADE to remove any
851 # related records in dataset_consumers. Note that we permit a Quantum
852 # to be deleted without removing the datasets it refers to, but do not
853 # allow a dataset to be deleted without removing the Quanta that refer
854 # to them. A dataset is still quite usable without provenance, but
855 # provenance is worthless if it's inaccurate.
856 t = self._tables
857 selectProducer = sqlalchemy.sql.select(
858 [t.dataset.columns.quantum_id]
859 ).where(
860 t.dataset.columns.dataset_id == ref.id
861 )
862 selectConsumers = sqlalchemy.sql.select(
863 [t.dataset_consumers.columns.quantum_id]
864 ).where(
865 t.dataset_consumers.columns.dataset_id == ref.id
866 )
867 # TODO: we'd like to use Database.delete here, but it doesn't support
868 # general queries yet.
869 self._connection.execute(
870 t.quantum.delete().where(
871 t.quantum.columns.id.in_(sqlalchemy.sql.union(selectProducer, selectConsumers))
872 )
873 )
874 # Remove the Dataset record itself. We rely on ON DELETE CASCADE to
875 # remove from DatasetCollection, and assume foreign key violations
876 # come from DatasetLocation (everything else should have an ON DELETE).
877 try:
878 self._connection.execute(
879 t.dataset.delete().where(t.dataset.c.dataset_id == ref.id)
880 )
881 except sqlalchemy.exc.IntegrityError as err:
882 raise OrphanedRecordError(f"Dataset {ref} is still present in one or more Datastores.") from err
884 @transactional
885 def attachComponent(self, name: str, parent: DatasetRef, component: DatasetRef):
886 """Attach a component to a dataset.
888 Parameters
889 ----------
890 name : `str`
891 Name of the component.
892 parent : `DatasetRef`
893 A reference to the parent dataset. Will be updated to reference
894 the component.
895 component : `DatasetRef`
896 A reference to the component dataset.
898 Raises
899 ------
900 AmbiguousDatasetError
901 Raised if ``parent.id`` or ``component.id`` is `None`.
902 """
903 # TODO Insert check for component name and type against
904 # parent.storageClass specified components
905 if parent.id is None:
906 raise AmbiguousDatasetError(f"Cannot attach component to dataset {parent} without ID.")
907 if component.id is None:
908 raise AmbiguousDatasetError(f"Cannot attach component {component} without ID.")
909 values = dict(component_name=name,
910 parent_dataset_id=parent.id,
911 component_dataset_id=component.id)
912 self._db.insert(self._tables.dataset_composition, values)
913 parent._components[name] = component
915 @transactional
916 def associate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
917 """Add existing Datasets to a collection, implicitly creating the
918 collection if it does not already exist.
920 If a DatasetRef with the same exact ``dataset_id`` is already in a
921 collection nothing is changed. If a `DatasetRef` with the same
922 `DatasetType` and dimension values but with different ``dataset_id``
923 exists in the collection, `ValueError` is raised.
925 Parameters
926 ----------
927 collection : `str`
928 Indicates the collection the Datasets should be associated with.
929 refs : iterable of `DatasetRef`
930 An iterable of resolved `DatasetRef` instances that already exist
931 in this `Registry`.
932 recursive : `bool`, optional
933 If `True`, associate all component datasets as well. Note that
934 this only associates components that are actually included in the
935 given `DatasetRef` instances, which may not be the same as those in
936 the database (especially if they were obtained from
937 `queryDatasets`, which does not populate `DatasetRef.components`).
939 Raises
940 ------
941 ConflictingDefinitionError
942 If a Dataset with the given `DatasetRef` already exists in the
943 given collection.
944 AmbiguousDatasetError
945 Raised if ``any(ref.id is None for ref in refs)``.
946 MissingCollectionError
947 Raised if ``collection`` does not exist in the registry.
948 TypeError
949 Raised if adding new datasets to the given ``collection`` is not
950 allowed.
951 """
952 collectionRecord = self._collections.find(collection)
953 if collectionRecord.type is not CollectionType.TAGGED:
954 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
955 if recursive:
956 refs = DatasetRef.flatten(refs)
957 rows = [{"dataset_id": _checkAndGetId(ref),
958 "dataset_ref_hash": ref.hash,
959 self._collections.getCollectionForeignKeyName(): collectionRecord.key}
960 for ref in refs]
961 try:
962 self._db.replace(self._tables.dataset_collection, *rows)
963 except sqlalchemy.exc.IntegrityError as err:
964 raise ConflictingDefinitionError(
965 f"Constraint violation while associating datasets with collection {collection}. "
966 f"This probably means that one or more datasets with the same dataset type and data ID "
967 f"already exist in the collection, but it may also indicate that the datasets do not exist."
968 ) from err
970 @transactional
971 def disassociate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
972 """Remove existing Datasets from a collection.
974 ``collection`` and ``ref`` combinations that are not currently
975 associated are silently ignored.
977 Parameters
978 ----------
979 collection : `str`
980 The collection the Datasets should no longer be associated with.
981 refs : iterable of `DatasetRef`
982 An iterable of resolved `DatasetRef` instances that already exist
983 in this `Registry`.
984 recursive : `bool`, optional
985 If `True`, disassociate all component datasets as well. Note that
986 this only disassociates components that are actually included in
987 the given `DatasetRef` instances, which may not be the same as
988 those in the database (especially if they were obtained from
989 `queryDatasets`, which does not populate `DatasetRef.components`).
991 Raises
992 ------
993 AmbiguousDatasetError
994 Raised if ``any(ref.id is None for ref in refs)``.
995 MissingCollectionError
996 Raised if ``collection`` does not exist in the registry.
997 TypeError
998 Raised if removing datasets from the given ``collection`` is not
999 allowed.
1000 """
1001 collectionFieldName = self._collections.getCollectionForeignKeyName()
1002 collectionRecord = self._collections.find(collection)
1003 if collectionRecord.type is not CollectionType.TAGGED:
1004 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
1005 "expected TAGGED.")
1006 if recursive:
1007 refs = DatasetRef.flatten(refs)
1008 rows = [{"dataset_id": _checkAndGetId(ref), collectionFieldName: collectionRecord.key}
1009 for ref in refs]
1010 self._db.delete(self._tables.dataset_collection, ["dataset_id", collectionFieldName], *rows)
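    # Usage sketch (hypothetical TAGGED collection; ``refs`` is assumed to be
    # an iterable of resolved DatasetRefs, e.g. from insertDatasets):
    #
    #     registry.registerCollection("tagged/good-seeing", CollectionType.TAGGED)
    #     registry.associate("tagged/good-seeing", refs)
    #     registry.disassociate("tagged/good-seeing", refs)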
1012 @transactional
1013 def insertDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]):
1014 """Record that a datastore holds the given datasets.
1016 Typically used by `Datastore`.
1018 Parameters
1019 ----------
1020 datastoreName : `str`
1021 Name of the datastore holding these datasets.
1022 refs : `~collections.abc.Iterable` of `DatasetRef`
1023 References to the datasets.
1025 Raises
1026 ------
1027 AmbiguousDatasetError
1028 Raised if ``any(ref.id is None for ref in refs)``.
1029 """
1030 self._db.insert(
1031 self._tables.dataset_storage,
1032 *[{"datastore_name": datastoreName, "dataset_id": _checkAndGetId(ref)} for ref in refs]
1033 )
1035 def getDatasetLocations(self, ref: DatasetRef) -> Set[str]:
1036 """Retrieve datastore locations for a given dataset.
1038 Typically used by `Datastore`.
1040 Parameters
1041 ----------
1042 ref : `DatasetRef`
1043 A reference to the dataset for which to retrieve storage
1044 information.
1046 Returns
1047 -------
1048 datastores : `set` of `str`
1049 All the matching datastores holding this dataset. Empty set
1050 if the dataset does not exist anywhere.
1052 Raises
1053 ------
1054 AmbiguousDatasetError
1055 Raised if ``ref.id`` is `None`.
1056 """
1057 table = self._tables.dataset_storage
1058 result = self._db.query(
1059 sqlalchemy.sql.select(
1060 [table.columns.datastore_name]
1061 ).where(
1062 table.columns.dataset_id == ref.id
1063 )
1064 ).fetchall()
1065 return {r["datastore_name"] for r in result}
1067 @transactional
1068 def removeDatasetLocation(self, datastoreName, ref):
1069 """Remove datastore location associated with this dataset.
1071 Typically used by `Datastore` when a dataset is removed.
1073 Parameters
1074 ----------
1075 datastoreName : `str`
1076 Name of this `Datastore`.
1077 ref : `DatasetRef`
1078 A reference to the dataset for which information is to be removed.
1080 Raises
1081 ------
1082 AmbiguousDatasetError
1083 Raised if ``ref.id`` is `None`.
1084 """
1085 self._db.delete(
1086 self._tables.dataset_storage,
1087 ["dataset_id", "datastore_name"],
1088 {"dataset_id": _checkAndGetId(ref), "datastore_name": datastoreName}
1089 )
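    # Usage sketch (hypothetical datastore name; these calls are normally made
    # by a Datastore rather than by end users):
    #
    #     registry.insertDatasetLocations("PosixDatastore@repo", refs)
    #     registry.getDatasetLocations(refs[0])    # -> {"PosixDatastore@repo"}
    #     registry.removeDatasetLocation("PosixDatastore@repo", refs[0])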
1091 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
1092 records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds):
1093 """Expand a dimension-based data ID to include additional information.
1095 Parameters
1096 ----------
1097 dataId : `DataCoordinate` or `dict`, optional
1098 Data ID to be expanded; augmented and overridden by ``kwds``.
1099 graph : `DimensionGraph`, optional
1100 Set of dimensions for the expanded ID. If `None`, the dimensions
1101 will be inferred from the keys of ``dataId`` and ``kwds``.
1102 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph``
1103 are silently ignored, providing a way to extract and expand a
1104 subset of a data ID.
1105 records : mapping [`DimensionElement`, `DimensionRecord`], optional
1106 Dimension record data to use before querying the database for that
1107 data.
1108 **kwds
1109 Additional keywords are treated like additional key-value pairs for
1110 ``dataId``, extending and overriding it.
1112 Returns
1113 -------
1114 expanded : `ExpandedDataCoordinate`
1115 A data ID that includes full metadata for all of the dimensions it
1116 identifies.
1117 """
1118 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds)
1119 if isinstance(standardized, ExpandedDataCoordinate):
1120 return standardized
1121 elif isinstance(dataId, ExpandedDataCoordinate):
1122 records = dict(records) if records is not None else {}
1123 records.update(dataId.records)
1124 else:
1125 records = dict(records) if records is not None else {}
1126 keys = dict(standardized)
1127 for element in standardized.graph.primaryKeyTraversalOrder:
1128 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
1129 if record is ...:
1130 storage = self._dimensions[element]
1131 record = storage.fetch(keys)
1132 records[element] = record
1133 if record is not None:
1134 keys.update((d, getattr(record, d.name)) for d in element.implied)
1135 else:
1136 if element in standardized.graph.required:
1137 raise LookupError(
1138 f"Could not fetch record for required dimension {element.name} via keys {keys}."
1139 )
1140 records.update((d, None) for d in element.implied)
1141 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)
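    # Usage sketch (hypothetical keys): expanding a minimal data ID fetches the
    # matching dimension records and fills in implied dimension values.
    #
    #     expanded = registry.expandDataId(instrument="HSC", exposure=903334)
    #     # expanded.records now holds the fetched DimensionRecords.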
1143 def insertDimensionData(self, element: Union[DimensionElement, str],
1144 *data: Union[dict, DimensionRecord],
1145 conform: bool = True):
1146 """Insert one or more dimension records into the database.
1148 Parameters
1149 ----------
1150 element : `DimensionElement` or `str`
1151 The `DimensionElement` or name thereof that identifies the table
1152 records will be inserted into.
1153 data : `dict` or `DimensionRecord` (variadic)
1154 One or more records to insert.
1155 conform : `bool`, optional
1156 If `False` (`True` is default) perform no checking or conversions,
1157 and assume that ``element`` is a `DimensionElement` instance and
1158 ``data`` is a one or more `DimensionRecord` instances of the
1159 appropriate subclass.
1160 """
1161 if conform:
1162 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
1163 records = [element.RecordClass.fromDict(row) if type(row) is not element.RecordClass else row
1164 for row in data]
1165 else:
1166 records = data
1167 storage = self._dimensions[element]
1168 storage.insert(*records)
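    # Usage sketch (hypothetical record; the required fields are defined by the
    # dimension element's RecordClass in the configured universe):
    #
    #     registry.insertDimensionData("instrument",
    #                                  {"name": "HSC", "visit_max": 999999,
    #                                   "exposure_max": 999999, "detector_max": 200})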
1170 def queryDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]:
1171 """Iterate over the dataset types whose names match an expression.
1173 Parameters
1174 ----------
1175 expression : `Any`, optional
1176 An expression that fully or partially identifies the dataset types
1177 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1178 `...` can be used to return all dataset types, and is the default.
1179 See :ref:`daf_butler_dataset_type_expressions` for more
1180 information.
1182 Yields
1183 ------
1184 datasetType : `DatasetType`
1185 A `DatasetType` instance whose name matches ``expression``.
1186 """
1187 yield from self._datasetStorage.fetchDatasetTypes(expression)
1189 def queryCollections(self, expression: Any = ...,
1190 datasetType: Optional[DatasetType] = None,
1191 collectionType: Optional[CollectionType] = None,
1192 flattenChains: bool = False,
1193 includeChains: Optional[bool] = None) -> Iterator[str]:
1194 """Iterate over the collections whose names match an expression.
1196 Parameters
1197 ----------
1198 expression : `Any`, optional
1199 An expression that fully or partially identifies the collections
1200 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1201 `...` can be used to return all collections, and is the default.
1202 See :ref:`daf_butler_collection_expressions` for more
1203 information.
1204 datasetType : `DatasetType`, optional
1205 If provided, only yield collections that should be searched for
1206 this dataset type according to ``expression``. If this is
1207 not provided, any dataset type restrictions in ``expression`` are
1208 ignored.
1209 collectionType : `CollectionType`, optional
1210 If provided, only yield collections of this type.
1211 flattenChains : `bool`, optional
1212 If `True` (`False` is default), recursively yield the child
1213 collections of matching `~CollectionType.CHAINED` collections.
1214 includeChains : `bool`, optional
1215 If `True`, yield records for matching `~CollectionType.CHAINED`
1216 collections. Default is the opposite of ``flattenChains``: include
1217 either CHAINED collections or their children, but not both.
1219 Yields
1220 ------
1221 collection : `str`
1222 The name of a collection that matches ``expression``.
1223 """
1224 query = CollectionQuery.fromExpression(expression)
1225 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType,
1226 flattenChains=flattenChains, includeChains=includeChains):
1227 yield record.name
1229 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
1230 """Return a `QueryBuilder` instance capable of constructing and
1231 managing more complex queries than those obtainable via `Registry`
1232 interfaces.
1234 This is an advanced interface; downstream code should prefer
1235 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those
1236 are sufficient.
1238 Parameters
1239 ----------
1240 summary : `QuerySummary`
1241 Object describing and categorizing the full set of dimensions that
1242 will be included in the query.
1244 Returns
1245 -------
1246 builder : `QueryBuilder`
1247 Object that can be used to construct and perform advanced queries.
1248 """
1249 return QueryBuilder(connection=self._connection, summary=summary,
1250 dimensionStorage=self._dimensions,
1251 datasetStorage=self._datasetStorage)
1253 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1254 dataId: Optional[DataId] = None,
1255 datasets: Any = None,
1256 collections: Any = None,
1257 where: Optional[str] = None,
1258 expand: bool = True,
1259 **kwds) -> Iterator[DataCoordinate]:
1260 """Query for and iterate over data IDs matching user-provided criteria.
1262 Parameters
1263 ----------
1264 dimensions : `Dimension` or `str`, or iterable thereof
1265 The dimensions of the data IDs to yield, as either `Dimension`
1266 instances or `str`. Will be automatically expanded to a complete
1267 `DimensionGraph`.
1268 dataId : `dict` or `DataCoordinate`, optional
1269 A data ID whose key-value pairs are used as equality constraints
1270 in the query.
1271 datasets : `Any`, optional
1272 An expression that fully or partially identifies dataset types
1273 that should constrain the yielded data IDs. For example, including
1274 "raw" here would constrain the yielded ``instrument``,
1275 ``exposure``, ``detector``, and ``physical_filter`` values to only
1276 those for which at least one "raw" dataset exists in
1277 ``collections``. Allowed types include `DatasetType`, `str`,
1278 `re.Pattern`, and iterables thereof. Unlike other dataset type
1279 expressions, `...` is not permitted - it doesn't make sense to
1280 constrain data IDs on the existence of *all* datasets.
1281 See :ref:`daf_butler_dataset_type_expressions` for more
1282 information.
1283 collections: `Any`, optional
1284 An expression that fully or partially identifies the collections
1285 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1286 thereof. `...` can be used to return all collections. Must be
1287 provided if ``datasets`` is, and is ignored if it is not. See
1288 :ref:`daf_butler_collection_expressions` for more information.
1289 where : `str`, optional
1290 A string expression similar to a SQL WHERE clause. May involve
1291 any column of a dimension table or (as a shortcut for the primary
1292 key column of a dimension table) dimension name. See
1293 :ref:`daf_butler_dimension_expressions` for more information.
1294 expand : `bool`, optional
1295 If `True` (default) yield `ExpandedDataCoordinate` instead of
1296 minimal `DataCoordinate` base-class instances.
1297 kwds
1298 Additional keyword arguments are forwarded to
1299 `DataCoordinate.standardize` when processing the ``dataId``
1300 argument (and may be used to provide a constraining data ID even
1301 when the ``dataId`` argument is `None`).
1303 Yields
1304 ------
1305 dataId : `DataCoordinate`
1306 Data IDs matching the given query parameters. Order is
1307 unspecified.
1308 """
1309 dimensions = iterable(dimensions)
1310 standardizedDataId = self.expandDataId(dataId, **kwds)
1311 standardizedDatasetTypes = []
1312 requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
1313 if datasets is not None:
1314 if collections is None:
1315 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1316 for datasetType in self._datasetStorage.fetchDatasetTypes(datasets):
1317 requestedDimensionNames.update(datasetType.dimensions.names)
1318 standardizedDatasetTypes.append(datasetType)
1319 # Preprocess collections expression in case the original included
1320 # single-pass iterators (we'll want to use it multiple times
1321 # below).
1322 collections = CollectionQuery.fromExpression(collections)
1324 summary = QuerySummary(
1325 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1326 dataId=standardizedDataId,
1327 expression=where,
1328 )
1329 builder = self.makeQueryBuilder(summary)
1330 for datasetType in standardizedDatasetTypes:
1331 builder.joinDataset(datasetType, collections, isResult=False)
1332 query = builder.finish()
1333 predicate = query.predicate()
1334 for row in query.execute():
1335 if predicate(row):
1336 result = query.extractDataId(row)
1337 if expand:
1338 yield self.expandDataId(result, records=standardizedDataId.records)
1339 else:
1340 yield result
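    # Usage sketch (hypothetical constraints): iterate over visit/detector data
    # IDs for which at least one "raw" dataset exists in the given collections.
    #
    #     for dataId in registry.queryDimensions(["visit", "detector"],
    #                                            datasets="raw",
    #                                            collections=["HSC/runs/2020-01"],
    #                                            where="instrument = 'HSC'"):
    #         ...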
1342 def queryDatasets(self, datasetType: Any, *,
1343 collections: Any,
1344 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1345 dataId: Optional[DataId] = None,
1346 where: Optional[str] = None,
1347 deduplicate: bool = False,
1348 expand: bool = True,
1349 **kwds) -> Iterator[DatasetRef]:
1350 """Query for and iterate over dataset references matching user-provided
1351 criteria.
1353 Parameters
1354 ----------
1355 datasetType
1356 An expression that fully or partially identifies the dataset types
1357 to be queried. Allowed types include `DatasetType`, `str`,
1358 `re.Pattern`, and iterables thereof. The special value `...` can
1359 be used to query all dataset types. See
1360 :ref:`daf_butler_dataset_type_expressions` for more information.
1361 collections
1362 An expression that fully or partially identifies the collections
1363 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1364 thereof. `...` can be used to return all collections. See
1365 :ref:`daf_butler_collection_expressions` for more information.
1366 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1367 Dimensions to include in the query (in addition to those used
1368 to identify the queried dataset type(s)), either to constrain
1369 the resulting datasets to those for which a matching dimension
1370 exists, or to relate the dataset type's dimensions to dimensions
1371 referenced by the ``dataId`` or ``where`` arguments.
1372 dataId : `dict` or `DataCoordinate`, optional
1373 A data ID whose key-value pairs are used as equality constraints
1374 in the query.
1375 where : `str`, optional
1376 A string expression similar to a SQL WHERE clause. May involve
1377 any column of a dimension table or (as a shortcut for the primary
1378 key column of a dimension table) dimension name. See
1379 :ref:`daf_butler_dimension_expressions` for more information.
1380 deduplicate : `bool`, optional
1381 If `True` (`False` is default), for each result data ID, only
1382 yield one `DatasetRef` of each `DatasetType`, from the first
1383 collection in which a dataset of that dataset type appears
1384 (according to the order of ``collections`` passed in). If `True`,
1385 ``collections`` must not contain regular expressions and may not
1386 be `...`.
1387 expand : `bool`, optional
1388 If `True` (default) attach `ExpandedDataCoordinate` instead of
1389 minimal `DataCoordinate` base-class instances.
1390 kwds
1391 Additional keyword arguments are forwarded to
1392 `DataCoordinate.standardize` when processing the ``dataId``
1393 argument (and may be used to provide a constraining data ID even
1394 when the ``dataId`` argument is `None`).
1396 Yields
1397 ------
1398 ref : `DatasetRef`
1399 Dataset references matching the given query criteria. These
1400 are grouped by `DatasetType` if the query evaluates to multiple
1401 dataset types, but order is otherwise unspecified.
1403 Raises
1404 ------
1405 TypeError
1406 Raised when the arguments are incompatible, such as when a
1407 collection wildcard is passed when ``deduplicate`` is `True`.
1409 Notes
1410 -----
1411 When multiple dataset types are queried in a single call, the
1412 results of this operation are equivalent to querying for each dataset
1413 type separately in turn, and no information about the relationships
1414 between datasets of different types is included. In contexts where
1415 that kind of information is important, the recommended pattern is to
1416 use `queryDimensions` to first obtain data IDs (possibly with the
1417 desired dataset types and collections passed as constraints to the
1418 query), and then use multiple (generally much simpler) calls to
1419 `queryDatasets` with the returned data IDs passed as constraints.
1420 """
1421 # Standardize and expand the data ID provided as a constraint.
1422 standardizedDataId = self.expandDataId(dataId, **kwds)
1423 # If the datasetType passed isn't actually a DatasetType, expand it
1424 # (it could be an expression that yields multiple DatasetTypes) and
1425 # recurse.
1426 if not isinstance(datasetType, DatasetType):
1427 for trueDatasetType in self._datasetStorage.fetchDatasetTypes(datasetType):
1428 yield from self.queryDatasets(trueDatasetType, collections=collections,
1429 dimensions=dimensions, dataId=standardizedDataId,
1430 where=where, deduplicate=deduplicate, expand=expand)
1431 return
1432 # The full set of dimensions in the query is the combination of those
1433 # needed for the DatasetType and those explicitly requested, if any.
1434 requestedDimensionNames = set(datasetType.dimensions.names)
1435 if dimensions is not None:
1436 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1437 # Construct the summary structure needed to construct a QueryBuilder.
1438 summary = QuerySummary(
1439 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1440 dataId=standardizedDataId,
1441 expression=where,
1442 )
1443 builder = self.makeQueryBuilder(summary)
1444 # Add the dataset subquery to the query, telling the QueryBuilder to
1445 # include the rank of the selected collection in the results only if we
1446 # need to deduplicate. Note that if any of the collections are
1447 # actually wildcard expressions, and we've asked for deduplication,
1448 # this will raise TypeError for us.
1449 builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate)
1450 query = builder.finish()
1451 predicate = query.predicate()
1452 if not deduplicate or len(collections) == 1:
1453 # No need to de-duplicate across collections.
1454 for row in query.execute():
1455 if predicate(row):
1456 dataId = query.extractDataId(row, graph=datasetType.dimensions)
1457 if expand:
1458 dataId = self.expandDataId(dataId, records=standardizedDataId.records)
1459 yield query.extractDatasetRef(row, datasetType, dataId)[0]
1460 else:
1461 # For each data ID, yield only the DatasetRef with the lowest
1462 # collection rank.
1463 bestRefs = {}
1464 bestRanks = {}
1465 for row in query.execute():
1466 if predicate(row):
1467 ref, rank = query.extractDatasetRef(row, datasetType)
1468 bestRank = bestRanks.get(ref.dataId, sys.maxsize)
1469 if rank < bestRank:
1470 bestRefs[ref.dataId] = ref
1471 bestRanks[ref.dataId] = rank
1472 # If caller requested expanded data IDs, we defer that until here
1473 # so we do as little expansion as possible.
1474 if expand:
1475 for ref in bestRefs.values():
1476 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records)
1477 yield ref.expanded(dataId)
1478 else:
1479 yield from bestRefs.values()
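    # Usage sketch (hypothetical names): find every "calexp" in an ordered list
    # of collections, keeping only the first match per data ID.
    #
    #     refs = list(registry.queryDatasets("calexp",
    #                                        collections=["HSC/runs/2020-01",
    #                                                     "HSC/runs/2020-02"],
    #                                        where="detector = 16 AND instrument = 'HSC'",
    #                                        deduplicate=True))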
1481 dimensions: DimensionUniverse
1482 """The universe of all dimensions known to the registry
1483 (`DimensionUniverse`).
1484 """
1486 storageClasses: StorageClassFactory
1487 """All storage classes known to the registry (`StorageClassFactory`).
1488 """