Coverage for python/lsst/daf/butler/registry/_registry.py : 13%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "AmbiguousDatasetError",
26 "ConflictingDefinitionError",
27 "ConsistentDataIds",
28 "InconsistentDataIdError",
29 "OrphanedRecordError",
30 "Registry",
31)
33import contextlib
34from dataclasses import dataclass
35import sys
36from typing import (
37 Any,
38 Iterable,
39 Iterator,
40 List,
41 Mapping,
42 Optional,
43 Set,
44 Type,
45 TYPE_CHECKING,
46 Union,
47)
49import sqlalchemy
51import lsst.sphgeom
52from ..core import (
53 Config,
54 DataCoordinate,
55 DataId,
56 DatasetRef,
57 DatasetType,
58 Dimension,
59 DimensionElement,
60 DimensionGraph,
61 DimensionRecord,
62 DimensionUniverse,
63 ExpandedDataCoordinate,
64 FakeDatasetRef,
65 StorageClassFactory,
66)
67from ..core import ddl
68from ..core.utils import doImport, iterable, transactional
69from ._config import RegistryConfig
70from .queries import (
71 DatasetRegistryStorage,
72 QueryBuilder,
73 QuerySummary,
74)
75from .tables import makeRegistryTableSpecs
76from ._collectionType import CollectionType
77from .wildcards import CollectionQuery, CollectionSearch
78from .interfaces import DatabaseConflictError
80if TYPE_CHECKING:
81 from ..butlerConfig import ButlerConfig
82 from ..core import (
83 Quantum
84 )
85 from .interfaces import (
86 CollectionManager,
87 Database,
88 OpaqueTableStorageManager,
89 DimensionRecordStorageManager,
90 )
93@dataclass
94class ConsistentDataIds:
95 """A struct used to report relationships between data IDs by
96 `Registry.relateDataIds`.
98 If an instance of this class is returned (instead of `None`), the data IDs
99 are "not inconsistent" - any keys they have in common have the same value,
100 and any spatial or temporal relationships they have at least might involve
101 an overlap. To capture this, any instance of `ConsistentDataIds` coerces
102 to `True` in boolean contexts.
103 """
105 overlaps: bool
106 """If `True`, the data IDs have at least one key in common, associated with
107 the same value.
109 Note that data IDs are not inconsistent even if overlaps is `False` - they
110 may simply have no keys in common, which means they cannot have
111 inconsistent values for any keys. They may even be equal, in the case that
112 both data IDs are empty.
114 This field does _not_ indicate whether a spatial or temporal overlap
115 relationship exists.
116 """
118 contains: bool
119 """If `True`, all keys in the first data ID are in the second, and are
120 associated with the same values.
122 This includes the case where the first data ID is empty.
123 """
125 within: bool
126 """If `True`, all keys in the second data ID are in the first, and are
127 associated with the same values.
129 This includes the case where the second data ID is empty.
130 """
132 @property
133 def equal(self) -> bool:
134 """If `True`, the two data IDs are the same.
136 Data IDs are equal if they have both a `contains` and a `within`
137 relationship.
138 """
139 return self.contains and self.within
141 @property
142 def disjoint(self) -> bool:
143 """If `True`, the two data IDs have no keys in common.
145 This is simply the opposite of `overlaps`. Disjoint data IDs are by
146 definition not inconsistent.
147 """
148 return not self.overlaps
150 def __bool__(self) -> bool:
151 return True
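# Illustrative sketch (not part of the original source): because any
# ConsistentDataIds instance coerces to `True` and `Registry.relateDataIds`
# returns `None` for inconsistent data IDs, callers can branch on the result
# directly. The dimension values below are hypothetical:
#
#     rel = registry.relateDataIds({"instrument": "HSC"},
#                                  {"instrument": "HSC", "detector": 50})
#     if rel:
#         print(rel.contains, rel.within, rel.overlaps)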
154class InconsistentDataIdError(ValueError):
155 """Exception raised when a data ID contains contradictory key-value pairs,
156 according to dimension relationships.
158 This can include the case where the data ID identifies multiple spatial
159 regions or timespans that are disjoint.
160 """
163class AmbiguousDatasetError(Exception):
164 """Exception raised when a `DatasetRef` has no ID and a `Registry`
165 operation requires one.
166 """
169class ConflictingDefinitionError(Exception):
170 """Exception raised when trying to insert a database record when a
171 conflicting record already exists.
172 """
175class OrphanedRecordError(Exception):
176 """Exception raised when trying to remove or modify a database record
177 that is still being used in some other table.
178 """
181def _checkAndGetId(ref: DatasetRef) -> int:
182 """Return the ID of the given `DatasetRef`, or raise if it is `None`.
184 This trivial function exists to allow operations that would otherwise be
185 natural list comprehensions to check that the ID is not `None` as well.
187 Parameters
188 ----------
189 ref : `DatasetRef`
190 Dataset reference.
192 Returns
193 -------
194 id : `int`
195 ``ref.id``
197 Raises
198 ------
199 AmbiguousDatasetError
200 Raised if ``ref.id`` is `None`.
201 """
202 if ref.id is None:
203 raise AmbiguousDatasetError("Dataset ID must not be `None`.")
204 return ref.id
207class Registry:
208 """Registry interface.
210 Parameters
211 ----------
212 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
213 Registry configuration
214 """
216 defaultConfigFile = None
217 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
218 absolute path. Can be None if no defaults specified.
219 """
221 @classmethod
222 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
223 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
224 """Create `Registry` subclass instance from `config`.
226 Uses ``registry.cls`` from `config` to determine which subclass to
227 instantiate.
229 Parameters
230 ----------
231 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
232 Registry configuration
233 create : `bool`, optional
234 Assume empty Registry and create a new one.
235 butlerRoot : `str`, optional
236 Path to the repository root this `Registry` will manage.
237 writeable : `bool`, optional
238 If `True` (default) create a read-write connection to the database.
240 Returns
241 -------
242 registry : `Registry` (subclass)
243 A new `Registry` subclass instance.
244 """
245 if not isinstance(config, RegistryConfig):
246 if isinstance(config, str) or isinstance(config, Config):
247 config = RegistryConfig(config)
248 else:
249 raise ValueError("Incompatible Registry configuration: {}".format(config))
250 config.replaceRoot(butlerRoot)
251 DatabaseClass = config.getDatabaseClass()
252 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
253 namespace=config.get("namespace"), writeable=writeable)
254 universe = DimensionUniverse(config)
255 opaque = doImport(config["managers", "opaque"])
256 dimensions = doImport(config["managers", "dimensions"])
257 collections = doImport(config["managers", "collections"])
258 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections,
259 create=create)
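# Illustrative sketch (not part of the original source): a Registry is
# normally obtained from configuration rather than constructed directly.
# The configuration path below is hypothetical:
#
#     config = RegistryConfig("registry.yaml")
#     registry = Registry.fromConfig(config, create=True, writeable=True)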
261 def __init__(self, database: Database, universe: DimensionUniverse, *,
262 opaque: Type[OpaqueTableStorageManager],
263 dimensions: Type[DimensionRecordStorageManager],
264 collections: Type[CollectionManager],
265 create: bool = False):
266 self._db = database
267 self.storageClasses = StorageClassFactory()
268 with self._db.declareStaticTables(create=create) as context:
269 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
270 self._collections = collections.initialize(self._db, context)
271 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions, self._collections))
272 self._opaque = opaque.initialize(self._db, context)
273 self._collections.refresh()
274 # TODO: we shouldn't be grabbing the private connection from the
275 # Database instance like this, but it's a reasonable way to proceed
276 # while we transition to using the Database API more.
277 self._connection = self._db._connection
278 self._datasetStorage = DatasetRegistryStorage(connection=self._connection,
279 universe=self.dimensions,
280 tables=self._tables._asdict(),
281 collections=self._collections)
282 self._datasetTypes = {}
284 def __str__(self) -> str:
285 return str(self._db)
287 def __repr__(self) -> str:
288 return f"Registry({self._db!r}, {self.dimensions!r})"
290 def isWriteable(self) -> bool:
291 """Return `True` if this registry allows write operations, and `False`
292 otherwise.
293 """
294 return self._db.isWriteable()
296 @property
297 def dimensions(self) -> DimensionUniverse:
298 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
299 """
300 return self._dimensions.universe
302 @contextlib.contextmanager
303 def transaction(self):
304 """Return a context manager that represents a transaction.
305 """
306 # TODO make savepoint=False the default.
307 try:
308 with self._db.transaction():
309 yield
310 except BaseException:
311 # TODO: this clears the caches sometimes when we wouldn't actually
312 # need to. Can we avoid that?
313 self._dimensions.clearCaches()
314 self._datasetTypes.clear()
315 raise
317 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec):
318 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
319 other data repository client.
321 Opaque table records can be added via `insertOpaqueData`, retrieved via
322 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
324 Parameters
325 ----------
326 tableName : `str`
327 Logical name of the opaque table. This may differ from the
328 actual name used in the database by a prefix and/or suffix.
329 spec : `ddl.TableSpec`
330 Specification for the table to be added.
331 """
332 self._opaque.register(tableName, spec)
334 @transactional
335 def insertOpaqueData(self, tableName: str, *data: dict):
336 """Insert records into an opaque table.
338 Parameters
339 ----------
340 tableName : `str`
341 Logical name of the opaque table. Must match the name used in a
342 previous call to `registerOpaqueTable`.
343 data
344 Each additional positional argument is a dictionary that represents
345 a single row to be added.
346 """
347 self._opaque[tableName].insert(*data)
349 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
350 """Retrieve records from an opaque table.
352 Parameters
353 ----------
354 tableName : `str`
355 Logical name of the opaque table. Must match the name used in a
356 previous call to `registerOpaqueTable`.
357 where
358 Additional keyword arguments are interpreted as equality
359 constraints that restrict the returned rows (combined with AND);
360 keyword arguments are column names and values are the values they
361 must have.
363 Yields
364 ------
365 row : `dict`
366 A dictionary representing a single result row.
367 """
368 yield from self._opaque[tableName].fetch(**where)
370 @transactional
371 def deleteOpaqueData(self, tableName: str, **where: Any):
372 """Remove records from an opaque table.
374 Parameters
375 ----------
376 tableName : `str`
377 Logical name of the opaque table. Must match the name used in a
378 previous call to `registerOpaqueTable`.
379 where
380 Additional keyword arguments are interpreted as equality
381 constraints that restrict the deleted rows (combined with AND);
382 keyword arguments are column names and values are the values they
383 must have.
384 """
385 self._opaque[tableName].delete(**where)
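# Illustrative sketch (not part of the original source): the opaque-table
# methods above are meant to be used together, typically by a Datastore.
# The table name, column names, and ``spec`` (a ddl.TableSpec built
# elsewhere) are hypothetical:
#
#     registry.registerOpaqueTable("datastore_records", spec)
#     registry.insertOpaqueData("datastore_records", {"dataset_id": 1, "path": "a.fits"})
#     rows = list(registry.fetchOpaqueData("datastore_records", dataset_id=1))
#     registry.deleteOpaqueData("datastore_records", dataset_id=1)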
387 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED):
388 """Add a new collection if one with the given name does not exist.
390 Parameters
391 ----------
392 name : `str`
393 The name of the collection to create.
394 type : `CollectionType`
395 Enum value indicating the type of collection to create.
397 Notes
398 -----
399 This method cannot be called within transactions, as it needs to be
400 able to perform its own transaction to be concurrent.
401 """
402 self._collections.register(name, type)
404 def getCollectionType(self, name: str) -> CollectionType:
405 """Return an enumeration value indicating the type of the given
406 collection.
408 Parameters
409 ----------
410 name : `str`
411 The name of the collection.
413 Returns
414 -------
415 type : `CollectionType`
416 Enum value indicating the type of this collection.
418 Raises
419 ------
420 MissingCollectionError
421 Raised if no collection with the given name exists.
422 """
423 return self._collections.find(name).type
425 def registerRun(self, name: str):
426 """Add a new run if one with the given name does not exist.
428 Parameters
429 ----------
430 name : `str`
431 The name of the run to create.
433 Notes
434 -----
435 This method cannot be called within transactions, as it needs to be
436 able to perform its own transaction to be concurrent.
437 """
438 self._collections.register(name, CollectionType.RUN)
440 def getCollectionChain(self, parent: str) -> CollectionSearch:
441 """Return the child collections in a `~CollectionType.CHAINED`
442 collection.
444 Parameters
445 ----------
446 parent : `str`
447 Name of the chained collection. Must have already been added via
448 a call to `Registry.registerCollection`.
450 Returns
451 -------
452 children : `CollectionSearch`
453 An object that defines the search path of the collection.
454 See :ref:`daf_butler_collection_expressions` for more information.
456 Raises
457 ------
458 MissingCollectionError
459 Raised if ``parent`` does not exist in the `Registry`.
460 TypeError
461 Raised if ``parent`` does not correspond to a
462 `~CollectionType.CHAINED` collection.
463 """
464 record = self._collections.find(parent)
465 if record.type is not CollectionType.CHAINED:
466 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
467 return record.children
469 def setCollectionChain(self, parent: str, children: Any):
470 """Define or redefine a `~CollectionType.CHAINED` collection.
472 Parameters
473 ----------
474 parent : `str`
475 Name of the chained collection. Must have already been added via
476 a call to `Registry.registerCollection`.
477 children : `Any`
478 An expression defining an ordered search of child collections,
479 generally an iterable of `str`. Restrictions on the dataset types
480 to be searched can also be included, by passing mapping or an
481 iterable containing tuples; see
482 :ref:`daf_butler_collection_expressions` for more information.
484 Raises
485 ------
486 MissingCollectionError
487 Raised when any of the given collections do not exist in the
488 `Registry`.
489 TypeError
490 Raised if ``parent`` does not correspond to a
491 `~CollectionType.CHAINED` collection.
492 ValueError
493 Raised if the given collections contains a cycle.
494 """
495 record = self._collections.find(parent)
496 if record.type is not CollectionType.CHAINED:
497 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
498 children = CollectionSearch.fromExpression(children)
499 if children != record.children:
500 record.update(self._collections, children)
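# Illustrative sketch (not part of the original source): building a CHAINED
# collection from a run and a tagged collection, then reading the chain back.
# The collection names are hypothetical:
#
#     registry.registerRun("run/raw")
#     registry.registerCollection("calibs", CollectionType.TAGGED)
#     registry.registerCollection("defaults", CollectionType.CHAINED)
#     registry.setCollectionChain("defaults", ["run/raw", "calibs"])
#     children = registry.getCollectionChain("defaults")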
502 @transactional
503 def registerDatasetType(self, datasetType: DatasetType) -> bool:
504 """
505 Add a new `DatasetType` to the Registry.
507 It is not an error to register the same `DatasetType` twice.
509 Parameters
510 ----------
511 datasetType : `DatasetType`
512 The `DatasetType` to be added.
514 Returns
515 -------
516 inserted : `bool`
517 `True` if ``datasetType`` was inserted, `False` if an identical
518 existing `DatasetType` was found. Note that in either case the
519 DatasetType is guaranteed to be defined in the Registry
520 consistently with the given definition.
522 Raises
523 ------
524 ValueError
525 Raised if the dimensions or storage class are invalid.
526 ConflictingDefinitionError
527 Raised if this DatasetType is already registered with a different
528 definition.
529 """
530 # TODO: this implementation isn't concurrent, except *maybe* in SQLite
531 # with aggressive locking (where starting a transaction is essentially
532 # the same as grabbing a full-database lock). Should be reimplemented
533 # with Database.sync to fix this, but that may require schema changes
534 # as well so we only have to synchronize one row to know if we have
535 # inconsistent definitions.
537 # If the DatasetType is already in the cache, we assume it's already in
538 # the DB (note that we don't actually provide a way to remove them from
539 # the DB).
540 existingDatasetType = self._datasetTypes.get(datasetType.name)
541 # If it's not in the cache, try to insert it.
542 if existingDatasetType is None:
543 try:
544 with self._db.transaction():
545 self._db.insert(
546 self._tables.dataset_type,
547 {
548 "dataset_type_name": datasetType.name,
549 "storage_class": datasetType.storageClass.name,
550 }
551 )
552 except sqlalchemy.exc.IntegrityError:
553 # Insert failed on the only unique constraint on this table:
554 # dataset_type_name. So now the question is whether the one in
555 # there is the same as the one we tried to insert.
556 existingDatasetType = self.getDatasetType(datasetType.name)
557 else:
558 # If adding the DatasetType record itself succeeded, add its
559 # dimensions (if any). We don't guard this in a try block
560 # because a problem with this insert means the database
561 # content must be corrupted.
562 if datasetType.dimensions:
563 self._db.insert(
564 self._tables.dataset_type_dimensions,
565 *[{"dataset_type_name": datasetType.name,
566 "dimension_name": dimensionName}
567 for dimensionName in datasetType.dimensions.names]
568 )
569 # Update the cache.
570 self._datasetTypes[datasetType.name] = datasetType
571 # Also register component DatasetTypes (if any).
572 for compName, compStorageClass in datasetType.storageClass.components.items():
573 compType = DatasetType(datasetType.componentTypeName(compName),
574 dimensions=datasetType.dimensions,
575 storageClass=compStorageClass)
576 self.registerDatasetType(compType)
577 # Inserts succeeded, nothing left to do here.
578 return True
579 # A DatasetType with this name exists; check whether it is equal.
580 if datasetType == existingDatasetType:
581 return False
582 else:
583 raise ConflictingDefinitionError(f"DatasetType: {datasetType} != existing {existingDatasetType}")
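# Illustrative sketch (not part of the original source): registering a new
# DatasetType. The dataset type name, dimension names, and storage class
# name are hypothetical:
#
#     storageClass = registry.storageClasses.getStorageClass("Exposure")
#     calexpType = DatasetType("calexp",
#                              dimensions=registry.dimensions.extract(
#                                  ["instrument", "visit", "detector"]),
#                              storageClass=storageClass)
#     registry.registerDatasetType(calexpType)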
585 def getDatasetType(self, name: str) -> DatasetType:
586 """Get the `DatasetType`.
588 Parameters
589 ----------
590 name : `str`
591 Name of the type.
593 Returns
594 -------
595 type : `DatasetType`
596 The `DatasetType` associated with the given name.
598 Raises
599 ------
600 KeyError
601 Requested named DatasetType could not be found in registry.
602 """
603 datasetType = self._datasetTypes.get(name)
604 if datasetType is None:
605 # Get StorageClass from DatasetType table
606 result = self._db.query(
607 sqlalchemy.sql.select(
608 [self._tables.dataset_type.c.storage_class]
609 ).where(
610 self._tables.dataset_type.columns.dataset_type_name == name
611 )
612 ).fetchone()
614 if result is None:
615 raise KeyError("Could not find entry for datasetType {}".format(name))
617 storageClass = self.storageClasses.getStorageClass(result["storage_class"])
618 # Get Dimensions (if any) from DatasetTypeDimensions table
619 result = self._db.query(
620 sqlalchemy.sql.select(
621 [self._tables.dataset_type_dimensions.columns.dimension_name]
622 ).where(
623 self._tables.dataset_type_dimensions.columns.dataset_type_name == name
624 )
625 ).fetchall()
626 dimensions = DimensionGraph(self.dimensions, names=(r[0] for r in result) if result else ())
627 datasetType = DatasetType(name=name,
628 storageClass=storageClass,
629 dimensions=dimensions)
630 self._datasetTypes[name] = datasetType
631 return datasetType
633 def _makeDatasetRefFromRow(self, row: sqlalchemy.engine.RowProxy,
634 datasetType: Optional[DatasetType] = None,
635 dataId: Optional[DataCoordinate] = None):
636 """Construct a DatasetRef from the result of a query on the Dataset
637 table.
639 Parameters
640 ----------
641 row : `sqlalchemy.engine.RowProxy`.
642 Row of a query that contains all columns from the `Dataset` table.
643 May include additional fields (which will be ignored).
644 datasetType : `DatasetType`, optional
645 `DatasetType` associated with this dataset. Will be retrieved
646 if not provided. If provided, the caller guarantees that it is
647 already consistent with what would have been retrieved from the
648 database.
649 dataId : `DataCoordinate`, optional
650 Dimensions associated with this dataset. Will be retrieved if not
651 provided. If provided, the caller guarantees that it is already
652 consistent with what would have been retrieved from the database.
654 Returns
655 -------
656 ref : `DatasetRef`.
657 A new `DatasetRef` instance.
658 """
659 if datasetType is None:
660 datasetType = self.getDatasetType(row["dataset_type_name"])
661 runRecord = self._collections[row[self._collections.getRunForeignKeyName()]]
662 assert runRecord is not None, "Should be guaranteed by foreign key constraints."
663 run = runRecord.name
664 datasetRefHash = row["dataset_ref_hash"]
665 if dataId is None:
666 # TODO: should we expand here?
667 dataId = DataCoordinate.standardize(
668 row,
669 graph=datasetType.dimensions,
670 universe=self.dimensions
671 )
672 # Get components (if present)
673 components = {}
674 if datasetType.storageClass.isComposite():
675 t = self._tables
676 columns = list(t.dataset.columns)
677 columns.append(t.dataset_composition.columns.component_name)
678 results = self._db.query(
679 sqlalchemy.sql.select(
680 columns
681 ).select_from(
682 t.dataset.join(
683 t.dataset_composition,
684 (t.dataset.columns.dataset_id == t.dataset_composition.columns.component_dataset_id)
685 )
686 ).where(
687 t.dataset_composition.columns.parent_dataset_id == row["dataset_id"]
688 )
689 ).fetchall()
690 for result in results:
691 componentName = result["component_name"]
692 componentDatasetType = DatasetType(
693 DatasetType.nameWithComponent(datasetType.name, componentName),
694 dimensions=datasetType.dimensions,
695 storageClass=datasetType.storageClass.components[componentName]
696 )
697 components[componentName] = self._makeDatasetRefFromRow(result, dataId=dataId,
698 datasetType=componentDatasetType)
699 if not components.keys() <= datasetType.storageClass.components.keys():
700 raise RuntimeError(
701 f"Inconsistency detected between dataset and storage class definitions: "
702 f"{datasetType.storageClass.name} has components "
703 f"{set(datasetType.storageClass.components.keys())}, "
704 f"but dataset has components {set(components.keys())}"
705 )
706 return DatasetRef(datasetType=datasetType, dataId=dataId, id=row["dataset_id"], run=run,
707 hash=datasetRefHash, components=components)
709 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
710 collections: Any, **kwds: Any) -> Optional[DatasetRef]:
711 """Find a dataset given its `DatasetType` and data ID.
713 This can be used to obtain a `DatasetRef` that permits the dataset to
714 be read from a `Datastore`.
716 Parameters
717 ----------
718 datasetType : `DatasetType` or `str`
719 A `DatasetType` or the name of one.
720 dataId : `dict` or `DataCoordinate`, optional
721 A `dict`-like object containing the `Dimension` links that identify
722 the dataset within a collection.
723 collections
724 An expression that fully or partially identifies the collections
725 to search for the dataset, such as a `str`, `re.Pattern`, or
726 iterable thereof. `...` can be used to return all collections.
727 See :ref:`daf_butler_collection_expressions` for more information.
728 **kwds
729 Additional keyword arguments passed to
730 `DataCoordinate.standardize` to convert ``dataId`` to a true
731 `DataCoordinate` or augment an existing one.
733 Returns
734 -------
735 ref : `DatasetRef`
736 A reference to the dataset, or `None` if no matching Dataset
737 was found.
739 Raises
740 ------
741 LookupError
742 Raised if one or more data ID keys are missing.
743 MissingCollectionError
744 Raised if any of ``collections`` does not exist in the registry.
745 """
746 if not isinstance(datasetType, DatasetType):
747 datasetType = self.getDatasetType(datasetType)
748 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
749 universe=self.dimensions, **kwds)
750 collections = CollectionSearch.fromExpression(collections)
751 for collectionRecord in collections.iter(self._collections, datasetType=datasetType):
752 if collectionRecord.type is CollectionType.TAGGED:
753 collectionColumn = \
754 self._tables.dataset_collection.columns[self._collections.getCollectionForeignKeyName()]
755 fromClause = self._tables.dataset.join(self._tables.dataset_collection)
756 elif collectionRecord.type is CollectionType.RUN:
757 collectionColumn = self._tables.dataset.columns[self._collections.getRunForeignKeyName()]
758 fromClause = self._tables.dataset
759 else:
760 raise NotImplementedError(f"Unrecognized CollectionType: '{collectionRecord.type}'.")
761 whereTerms = [
762 self._tables.dataset.columns.dataset_type_name == datasetType.name,
763 collectionColumn == collectionRecord.key,
764 ]
765 whereTerms.extend(self._tables.dataset.columns[name] == dataId[name] for name in dataId.keys())
766 query = self._tables.dataset.select().select_from(
767 fromClause
768 ).where(
769 sqlalchemy.sql.and_(*whereTerms)
770 )
771 result = self._db.query(query).fetchone()
772 if result is not None:
773 return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId)
774 return None
776 @transactional
777 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
778 run: str, *, producer: Optional[Quantum] = None, recursive: bool = False
779 ) -> List[DatasetRef]:
780 """Insert one or more datasets into the `Registry`
782 This always adds new datasets; to associate existing datasets with
783 a new collection, use ``associate``.
785 Parameters
786 ----------
787 datasetType : `DatasetType` or `str`
788 A `DatasetType` or the name of one.
789 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
790 Dimension-based identifiers for the new datasets.
791 run : `str`
792 The name of the run that produced the datasets.
793 producer : `Quantum`
794 Unit of work that produced the datasets. May be `None` to store
795 no provenance information, but if present the `Quantum` must
796 already have been added to the Registry.
797 recursive : `bool`
798 If True, recursively add datasets and attach entries for component
799 datasets as well.
801 Returns
802 -------
803 refs : `list` of `DatasetRef`
804 Resolved `DatasetRef` instances for all given data IDs (in the same
805 order).
807 Raises
808 ------
809 ConflictingDefinitionError
810 If a dataset with the same dataset type and data ID as one of those
811 given already exists in the given collection.
812 MissingCollectionError
813 Raised if ``run`` does not exist in the registry.
814 """
815 if not isinstance(datasetType, DatasetType):
816 datasetType = self.getDatasetType(datasetType)
817 rows = []
818 refs = []
819 runRecord = self._collections.find(run)
820 base = {
821 "dataset_type_name": datasetType.name,
822 self._collections.getRunForeignKeyName(): runRecord.key,
823 "quantum_id": producer.id if producer is not None else None,
824 }
825 # Expand data IDs and build both a list of unresolved DatasetRefs
826 # and a list of dictionary rows for the dataset table.
827 for dataId in dataIds:
828 ref = DatasetRef(datasetType, self.expandDataId(dataId, graph=datasetType.dimensions))
829 refs.append(ref)
830 row = dict(base, dataset_ref_hash=ref.hash)
831 for dimension, value in ref.dataId.full.items():
832 row[dimension.name] = value
833 rows.append(row)
834 # Actually insert into the dataset table.
835 try:
836 datasetIds = self._db.insert(self._tables.dataset, *rows, returnIds=True)
837 except sqlalchemy.exc.IntegrityError as err:
838 raise ConflictingDefinitionError(
839 f"Constraint violation while inserting datasets into run {run}. "
840 f"This usually means that one or more datasets with the same dataset type and data ID "
841 f"already exist in the collection, but it may be a foreign key violation."
842 ) from err
843 # Resolve the DatasetRefs with the autoincrement IDs we generated.
844 refs = [ref.resolved(id=datasetId, run=run) for datasetId, ref in zip(datasetIds, refs)]
845 if recursive and datasetType.isComposite():
846 # Insert component rows by recursing, and gather a single big list
847 # of rows to insert into the dataset_composition table.
848 compositionRows = []
849 for componentName in datasetType.storageClass.components:
850 componentDatasetType = datasetType.makeComponentDatasetType(componentName)
851 componentRefs = self.insertDatasets(componentDatasetType,
852 dataIds=(ref.dataId for ref in refs),
853 run=run,
854 producer=producer,
855 recursive=True)
856 for parentRef, componentRef in zip(refs, componentRefs):
857 parentRef._components[componentName] = componentRef
858 compositionRows.append({
859 "parent_dataset_id": parentRef.id,
860 "component_dataset_id": componentRef.id,
861 "component_name": componentName,
862 })
863 if compositionRows:
864 self._db.insert(self._tables.dataset_composition, *compositionRows)
865 return refs
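# Illustrative sketch (not part of the original source): inserting a dataset
# into a RUN collection and finding it again. The run name, dataset type,
# and data ID values are hypothetical:
#
#     registry.registerRun("HSC/raw")
#     (ref,) = registry.insertDatasets("raw",
#                                      dataIds=[{"instrument": "HSC",
#                                                "exposure": 903334,
#                                                "detector": 50}],
#                                      run="HSC/raw")
#     found = registry.findDataset("raw", ref.dataId, collections=["HSC/raw"])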
867 def getDataset(self, id: int, datasetType: Optional[DatasetType] = None,
868 dataId: Optional[DataCoordinate] = None) -> Optional[DatasetRef]:
869 """Retrieve a Dataset entry.
871 Parameters
872 ----------
873 id : `int`
874 The unique identifier for the Dataset.
875 datasetType : `DatasetType`, optional
876 The `DatasetType` of the dataset to retrieve. This is used to
877 short-circuit retrieving the `DatasetType`, so if provided, the
878 caller is guaranteeing that it is what would have been retrieved.
879 dataId : `DataCoordinate`, optional
880 A `Dimension`-based identifier for the dataset within a
881 collection, possibly containing additional metadata. This is used
882 to short-circuit retrieving the dataId, so if provided, the
883 caller is guaranteeing that it is what would have been retrieved.
885 Returns
886 -------
887 ref : `DatasetRef`
888 A ref to the Dataset, or `None` if no matching Dataset
889 was found.
890 """
891 result = self._db.query(
892 self._tables.dataset.select().where(
893 self._tables.dataset.columns.dataset_id == id
894 )
895 ).fetchone()
896 if result is None:
897 return None
898 return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId)
900 @transactional
901 def removeDataset(self, ref: DatasetRef):
902 """Remove a dataset from the Registry.
904 The dataset and all components will be removed unconditionally from
905 all collections, and any associated `Quantum` records will also be
906 removed. `Datastore` records will *not* be deleted; the caller is
907 responsible for ensuring that the dataset has already been removed
908 from all Datastores.
910 Parameters
911 ----------
912 ref : `DatasetRef`
913 Reference to the dataset to be removed. Must include a valid
914 ``id`` attribute, and should be considered invalidated upon return.
916 Raises
917 ------
918 AmbiguousDatasetError
919 Raised if ``ref.id`` is `None`.
920 OrphanedRecordError
921 Raised if the dataset is still present in any `Datastore`.
922 """
923 if not ref.id:
924 raise AmbiguousDatasetError(f"Cannot remove dataset {ref} without ID.")
925 # Remove component datasets. We assume ``ref.components`` is already
926 # correctly populated, and rely on ON DELETE CASCADE to remove entries
927 # from DatasetComposition.
928 for componentRef in ref.components.values():
929 self.removeDataset(componentRef)
931 # Remove related quanta. We rely on ON DELETE CASCADE to remove any
932 # related records in dataset_consumers. Note that we permit a Quantum
933 # to be deleted without removing the datasets it refers to, but do not
934 # allow a dataset to be deleted without removing the Quanta that refer
935 # to it. A dataset is still quite usable without provenance, but
936 # provenance is worthless if it's inaccurate.
937 t = self._tables
938 selectProducer = sqlalchemy.sql.select(
939 [t.dataset.columns.quantum_id]
940 ).where(
941 t.dataset.columns.dataset_id == ref.id
942 )
943 selectConsumers = sqlalchemy.sql.select(
944 [t.dataset_consumers.columns.quantum_id]
945 ).where(
946 t.dataset_consumers.columns.dataset_id == ref.id
947 )
948 # TODO: we'd like to use Database.delete here, but it doesn't support
949 # general queries yet.
950 self._connection.execute(
951 t.quantum.delete().where(
952 t.quantum.columns.id.in_(sqlalchemy.sql.union(selectProducer, selectConsumers))
953 )
954 )
955 # Remove the Dataset record itself. We rely on ON DELETE CASCADE to
956 # remove from DatasetCollection, and assume foreign key violations
957 # come from DatasetLocation (everything else should have an ON DELETE).
958 try:
959 self._connection.execute(
960 t.dataset.delete().where(t.dataset.c.dataset_id == ref.id)
961 )
962 except sqlalchemy.exc.IntegrityError as err:
963 raise OrphanedRecordError(f"Dataset {ref} is still present in one or more Datastores.") from err
965 @transactional
966 def attachComponent(self, name: str, parent: DatasetRef, component: DatasetRef):
967 """Attach a component to a dataset.
969 Parameters
970 ----------
971 name : `str`
972 Name of the component.
973 parent : `DatasetRef`
974 A reference to the parent dataset. Will be updated to reference
975 the component.
976 component : `DatasetRef`
977 A reference to the component dataset.
979 Raises
980 ------
981 AmbiguousDatasetError
982 Raised if ``parent.id`` or ``component.id`` is `None`.
983 """
984 # TODO Insert check for component name and type against
985 # parent.storageClass specified components
986 if parent.id is None:
987 raise AmbiguousDatasetError(f"Cannot attach component to dataset {parent} without ID.")
988 if component.id is None:
989 raise AmbiguousDatasetError(f"Cannot attach component {component} without ID.")
990 values = dict(component_name=name,
991 parent_dataset_id=parent.id,
992 component_dataset_id=component.id)
993 self._db.insert(self._tables.dataset_composition, values)
994 parent._components[name] = component
996 @transactional
997 def associate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
998 """Add existing Datasets to a collection, implicitly creating the
999 collection if it does not already exist.
1001 If a DatasetRef with the same exact ``dataset_id`` is already in a
1002 collection, nothing is changed. If a `DatasetRef` with the same
1003 `DatasetType` and dimension values but with different ``dataset_id``
1004 exists in the collection, `ConflictingDefinitionError` is raised.
1006 Parameters
1007 ----------
1008 collection : `str`
1009 Indicates the collection the Datasets should be associated with.
1010 refs : iterable of `DatasetRef`
1011 An iterable of resolved `DatasetRef` instances that already exist
1012 in this `Registry`.
1013 recursive : `bool`, optional
1014 If `True`, associate all component datasets as well. Note that
1015 this only associates components that are actually included in the
1016 given `DatasetRef` instances, which may not be the same as those in
1017 the database (especially if they were obtained from
1018 `queryDatasets`, which does not populate `DatasetRef.components`).
1020 Raises
1021 ------
1022 ConflictingDefinitionError
1023 If a Dataset with the given `DatasetRef` already exists in the
1024 given collection.
1025 AmbiguousDatasetError
1026 Raised if ``any(ref.id is None for ref in refs)``.
1027 MissingCollectionError
1028 Raised if ``collection`` does not exist in the registry.
1029 TypeError
1030 Raised if adding new datasets to the given ``collection`` is not
1031 allowed.
1032 """
1033 collectionRecord = self._collections.find(collection)
1034 if collectionRecord.type is not CollectionType.TAGGED:
1035 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
1036 if recursive:
1037 refs = DatasetRef.flatten(refs)
1038 rows = [{"dataset_id": _checkAndGetId(ref),
1039 "dataset_ref_hash": ref.hash,
1040 self._collections.getCollectionForeignKeyName(): collectionRecord.key}
1041 for ref in refs]
1042 try:
1043 self._db.replace(self._tables.dataset_collection, *rows)
1044 except sqlalchemy.exc.IntegrityError as err:
1045 raise ConflictingDefinitionError(
1046 f"Constraint violation while associating datasets with collection {collection}. "
1047 f"This probably means that one or more datasets with the same dataset type and data ID "
1048 f"already exist in the collection, but it may also indicate that the datasets do not exist."
1049 ) from err
1051 @transactional
1052 def disassociate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
1053 """Remove existing Datasets from a collection.
1055 ``collection`` and ``ref`` combinations that are not currently
1056 associated are silently ignored.
1058 Parameters
1059 ----------
1060 collection : `str`
1061 The collection the Datasets should no longer be associated with.
1062 refs : iterable of `DatasetRef`
1063 An iterable of resolved `DatasetRef` instances that already exist
1064 in this `Registry`.
1065 recursive : `bool`, optional
1066 If `True`, disassociate all component datasets as well. Note that
1067 this only disassociates components that are actually included in
1068 the given `DatasetRef` instances, which may not be the same as
1069 those in the database (especially if they were obtained from
1070 `queryDatasets`, which does not populate `DatasetRef.components`).
1072 Raises
1073 ------
1074 AmbiguousDatasetError
1075 Raised if ``any(ref.id is None for ref in refs)``.
1076 MissingCollectionError
1077 Raised if ``collection`` does not exist in the registry.
1078 TypeError
1079 Raised if removing datasets from the given ``collection`` is not
1080 allowed.
1081 """
1082 collectionFieldName = self._collections.getCollectionForeignKeyName()
1083 collectionRecord = self._collections.find(collection)
1084 if collectionRecord.type is not CollectionType.TAGGED:
1085 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
1086 "expected TAGGED.")
1087 if recursive:
1088 refs = DatasetRef.flatten(refs)
1089 rows = [{"dataset_id": _checkAndGetId(ref), collectionFieldName: collectionRecord.key}
1090 for ref in refs]
1091 self._db.delete(self._tables.dataset_collection, ["dataset_id", collectionFieldName], *rows)
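# Illustrative sketch (not part of the original source): tagging resolved
# refs into a TAGGED collection and removing them again. The collection name
# and ``refs`` (an iterable of resolved DatasetRefs) are hypothetical:
#
#     registry.registerCollection("best-seeing", CollectionType.TAGGED)
#     registry.associate("best-seeing", refs)
#     registry.disassociate("best-seeing", refs)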
1093 @transactional
1094 def insertDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]):
1095 """Record that a datastore holds the given datasets.
1097 Typically used by `Datastore`.
1099 Parameters
1100 ----------
1101 datastoreName : `str`
1102 Name of the datastore holding these datasets.
1103 refs : `~collections.abc.Iterable` of `DatasetRef`
1104 References to the datasets.
1106 Raises
1107 ------
1108 AmbiguousDatasetError
1109 Raised if ``any(ref.id is None for ref in refs)``.
1110 """
1111 self._db.insert(
1112 self._tables.dataset_location,
1113 *[{"datastore_name": datastoreName, "dataset_id": _checkAndGetId(ref)} for ref in refs]
1114 )
1116 @transactional
1117 def moveDatasetLocationToTrash(self, datastoreName: str, refs: Iterable[DatasetRef]):
1118 """Move the dataset location information to trash.
1120 Parameters
1121 ----------
1122 datastoreName : `str`
1123 Name of the datastore holding these datasets.
1124 refs : `~collections.abc.Iterable` of `DatasetRef`
1125 References to the datasets.
1126 """
1127 # We only want to move rows that already exist in the main table
1128 filtered = self.checkDatasetLocations(datastoreName, refs)
1129 self.canDeleteDatasetLocations(datastoreName, filtered)
1130 self.removeDatasetLocation(datastoreName, filtered)
1132 @transactional
1133 def canDeleteDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]):
1134 """Record that a datastore can delete this dataset
1136 Parameters
1137 ----------
1138 datastoreName : `str`
1139 Name of the datastore holding these datasets.
1140 refs : `~collections.abc.Iterable` of `DatasetRef`
1141 References to the datasets.
1143 Raises
1144 ------
1145 AmbiguousDatasetError
1146 Raised if ``any(ref.id is None for ref in refs)``.
1147 """
1148 self._db.insert(
1149 self._tables.dataset_location_trash,
1150 *[{"datastore_name": datastoreName, "dataset_id": _checkAndGetId(ref)} for ref in refs]
1151 )
1153 def checkDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]) -> List[DatasetRef]:
1154 """Check which refs are listed for this datastore.
1156 Parameters
1157 ----------
1158 datastoreName : `str`
1159 Name of the datastore holding these datasets.
1160 refs : `~collections.abc.Iterable` of `DatasetRef`
1161 References to the datasets.
1163 Returns
1164 -------
1165 present : `list` of `DatasetRef`
1166 All the `DatasetRef` that are listed.
1167 """
1169 table = self._tables.dataset_location
1170 result = self._db.query(
1171 sqlalchemy.sql.select(
1172 [table.columns.datastore_name, table.columns.dataset_id]
1173 ).where(
1174 sqlalchemy.sql.and_(table.columns.dataset_id.in_([ref.id for ref in refs]),
1175 table.columns.datastore_name == datastoreName)
1176 )
1177 ).fetchall()
1179 matched_ids = {r["dataset_id"] for r in result}
1180 return [ref for ref in refs if ref.id in matched_ids]
1182 def getDatasetLocations(self, ref: DatasetRef) -> Set[str]:
1183 """Retrieve datastore locations for a given dataset.
1185 Typically used by `Datastore`.
1187 Parameters
1188 ----------
1189 ref : `DatasetRef`
1190 A reference to the dataset for which to retrieve storage
1191 information.
1193 Returns
1194 -------
1195 datastores : `set` of `str`
1196 All the matching datastores holding this dataset. Empty set
1197 if the dataset does not exist anywhere.
1199 Raises
1200 ------
1201 AmbiguousDatasetError
1202 Raised if ``ref.id`` is `None`.
1203 """
1204 table = self._tables.dataset_location
1205 result = self._db.query(
1206 sqlalchemy.sql.select(
1207 [table.columns.datastore_name]
1208 ).where(
1209 table.columns.dataset_id == ref.id
1210 )
1211 ).fetchall()
1212 return {r["datastore_name"] for r in result}
1214 @transactional
1215 def getTrashedDatasets(self, datastoreName: str) -> Set[FakeDatasetRef]:
1216 """Retrieve all the dataset ref IDs that are in the trash
1217 associated with the specified datastore.
1219 Parameters
1220 ----------
1221 datastoreName : `str`
1222 The relevant datastore name to use.
1224 Returns
1225 -------
1226 ids : `set` of `FakeDatasetRef`
1227 The IDs of datasets that can be safely removed from this datastore.
1228 Can be empty.
1229 """
1230 table = self._tables.dataset_location_trash
1231 result = self._db.query(
1232 sqlalchemy.sql.select(
1233 [table.columns.dataset_id]
1234 ).where(
1235 table.columns.datastore_name == datastoreName
1236 )
1237 ).fetchall()
1238 return {FakeDatasetRef(r["dataset_id"]) for r in result}
1240 @transactional
1241 def emptyDatasetLocationsTrash(self, datastoreName: str, refs: Iterable[FakeDatasetRef]) -> None:
1242 """Remove datastore location associated with these datasets from trash.
1244 Typically used by `Datastore` when a dataset is removed.
1246 Parameters
1247 ----------
1248 datastoreName : `str`
1249 Name of this `Datastore`.
1250 refs : iterable of `FakeDatasetRef`
1251 The dataset IDs to be removed.
1253 Raises
1254 ------
1255 AmbiguousDatasetError
1256 Raised if ``ref.id`` is `None`.
1257 """
1258 if not refs:
1259 return
1260 self._db.delete(
1261 self._tables.dataset_location_trash,
1262 ["dataset_id", "datastore_name"],
1263 *[{"dataset_id": ref.id, "datastore_name": datastoreName} for ref in refs]
1264 )
1266 @transactional
1267 def removeDatasetLocation(self, datastoreName: str, refs: Iterable[DatasetRef]) -> None:
1268 """Remove datastore location associated with this dataset.
1270 Typically used by `Datastore` when a dataset is removed.
1272 Parameters
1273 ----------
1274 datastoreName : `str`
1275 Name of this `Datastore`.
1276 refs : iterable of `DatasetRef`
1277 References to the datasets for which information is to be removed.
1279 Raises
1280 ------
1281 AmbiguousDatasetError
1282 Raised if ``ref.id`` is `None`.
1283 """
1284 if not refs:
1285 return
1286 self._db.delete(
1287 self._tables.dataset_location,
1288 ["dataset_id", "datastore_name"],
1289 *[{"dataset_id": _checkAndGetId(ref), "datastore_name": datastoreName} for ref in refs]
1290 )
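# Illustrative sketch (not part of the original source): the dataset-location
# methods above form a simple lifecycle, normally driven by a Datastore. The
# datastore name and ``refs`` are hypothetical:
#
#     registry.insertDatasetLocations("FileDatastore", refs)
#     assert "FileDatastore" in registry.getDatasetLocations(refs[0])
#     registry.moveDatasetLocationToTrash("FileDatastore", refs)
#     trashed = registry.getTrashedDatasets("FileDatastore")
#     registry.emptyDatasetLocationsTrash("FileDatastore", trashed)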
1292 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
1293 records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds):
1294 """Expand a dimension-based data ID to include additional information.
1296 Parameters
1297 ----------
1298 dataId : `DataCoordinate` or `dict`, optional
1299 Data ID to be expanded; augmented and overridden by ``kwds``.
1300 graph : `DimensionGraph`, optional
1301 Set of dimensions for the expanded ID. If `None`, the dimensions
1302 will be inferred from the keys of ``dataId`` and ``kwds``.
1303 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph``
1304 are silently ignored, providing a way to extract and expand a
1305 subset of a data ID.
1306 records : mapping [`DimensionElement`, `DimensionRecord`], optional
1307 Dimension record data to use before querying the database for that
1308 data.
1309 **kwds
1310 Additional keywords are treated like additional key-value pairs for
1311 ``dataId``, extending and overriding it.
1313 Returns
1314 -------
1315 expanded : `ExpandedDataCoordinate`
1316 A data ID that includes full metadata for all of the dimensions it
1317 identifies.
1318 """
1319 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds)
1320 if isinstance(standardized, ExpandedDataCoordinate):
1321 return standardized
1322 elif isinstance(dataId, ExpandedDataCoordinate):
1323 records = dict(records) if records is not None else {}
1324 records.update(dataId.records)
1325 else:
1326 records = dict(records) if records is not None else {}
1327 keys = dict(standardized)
1328 regions = []
1329 timespans = []
1330 for element in standardized.graph.primaryKeyTraversalOrder:
1331 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
1332 if record is ...:
1333 storage = self._dimensions[element]
1334 record = storage.fetch(keys)
1335 records[element] = record
1336 if record is not None:
1337 for d in element.implied:
1338 value = getattr(record, d.name)
1339 if keys.setdefault(d, value) != value:
1340 raise InconsistentDataIdError(f"Data ID {standardized} has {d.name}={keys[d]!r}, "
1341 f"but {element.name} implies {d.name}={value!r}.")
1342 if element in standardized.graph.spatial and record.region is not None:
1343 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions):
1344 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} "
1345 f"is disjoint with those for other elements.")
1346 regions.append(record.region)
1347 if element in standardized.graph.temporal:
1348 if any(not record.timespan.overlaps(t) for t in timespans):
1349 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}"
1350 f" is disjoint with those for other elements.")
1351 timespans.append(record.timespan)
1352 else:
1353 if element in standardized.graph.required:
1354 raise LookupError(
1355 f"Could not fetch record for required dimension {element.name} via keys {keys}."
1356 )
1357 if element.alwaysJoin:
1358 raise InconsistentDataIdError(
1359 f"Could not fetch record for element {element.name} via keys {keys}, ",
1360 f"but it is marked alwaysJoin=True; this means one or more dimensions are not "
1361 f"related."
1362 )
1363 records.update((d, None) for d in element.implied)
1364 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)
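# Illustrative sketch (not part of the original source): expanding a minimal
# data ID fetches the matching dimension records (and implied dimensions).
# The data ID values are hypothetical:
#
#     expanded = registry.expandDataId({"instrument": "HSC", "exposure": 903334})
#     # ``expanded`` is an ExpandedDataCoordinate carrying DimensionRecords
#     # for "instrument", "exposure", and any dimensions they imply.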
1366 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]:
1367 """Compare the keys and values of a pair of data IDs for consistency.
1369 See `ConsistentDataIds` for more information.
1371 Parameters
1372 ----------
1373 a : `dict` or `DataCoordinate`
1374 First data ID to be compared.
1375 b : `dict` or `DataCoordinate`
1376 Second data ID to be compared.
1378 Returns
1379 -------
1380 relationship : `ConsistentDataIds` or `None`
1381 Relationship information. This is not `None` and coerces to
1382 `True` in boolean contexts if and only if the data IDs are
1383 consistent in terms of all common key-value pairs, all many-to-many
1384 join tables, and all spatial and temporal relationships.
1385 """
1386 a = DataCoordinate.standardize(a, universe=self.dimensions)
1387 b = DataCoordinate.standardize(b, universe=self.dimensions)
1388 aFull = getattr(a, "full", None)
1389 bFull = getattr(b, "full", None)
1390 aBest = aFull if aFull is not None else a
1391 bBest = bFull if bFull is not None else b
1392 jointKeys = aBest.keys() & bBest.keys()
1393 # If any common values are not equal, we know they are inconsistent.
1394 if any(aBest[k] != bBest[k] for k in jointKeys):
1395 return None
1396 # If the graphs are equal, we know the data IDs are.
1397 if a.graph == b.graph:
1398 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys))
1399 # Result is still inconclusive. Try to expand a data ID containing
1400 # keys from both; that will fail if they are inconsistent.
1401 # First, if either input was already an ExpandedDataCoordinate, extract
1402 # its records so we don't have to query for them.
1403 records = {}
1404 if hasattr(a, "records"):
1405 records.update(a.records)
1406 if hasattr(b, "records"):
1407 records.update(b.records)
1408 try:
1409 self.expandDataId({**a, **b}, graph=(a.graph | b.graph), records=records)
1410 except InconsistentDataIdError:
1411 return None
1412 # We know the answer is not `None`; time to figure out what it is.
1413 return ConsistentDataIds(
1414 contains=(a.graph >= b.graph),
1415 within=(a.graph <= b.graph),
1416 overlaps=bool(a.graph & b.graph),
1417 )
1419 def insertDimensionData(self, element: Union[DimensionElement, str],
1420 *data: Union[dict, DimensionRecord],
1421 conform: bool = True):
1422 """Insert one or more dimension records into the database.
1424 Parameters
1425 ----------
1426 element : `DimensionElement` or `str`
1427 The `DimensionElement` or name thereof that identifies the table
1428 records will be inserted into.
1429 data : `dict` or `DimensionRecord` (variadic)
1430 One or more records to insert.
1431 conform : `bool`, optional
1432 If `False` (`True` is default) perform no checking or conversions,
1433 and assume that ``element`` is a `DimensionElement` instance and
1434 ``data`` is one or more `DimensionRecord` instances of the
1435 appropriate subclass.
1436 """
1437 if conform:
1438 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
1439 records = [element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row
1440 for row in data]
1441 else:
1442 records = data
1443 storage = self._dimensions[element]
1444 storage.insert(*records)
1446 def syncDimensionData(self, element: Union[DimensionElement, str],
1447 row: Union[dict, DimensionRecord],
1448 conform: bool = True) -> bool:
1449 """Synchronize the given dimension record with the database, inserting
1450 if it does not already exist and comparing values if it does.
1452 Parameters
1453 ----------
1454 element : `DimensionElement` or `str`
1455 The `DimensionElement` or name thereof that identifies the table
1456 records will be inserted into.
1457 row : `dict` or `DimensionRecord`
1458 The record to insert.
1459 conform : `bool`, optional
1460 If `False` (`True` is default) perform no checking or conversions,
1461 and assume that ``element`` is a `DimensionElement` instance and
1462 ``row`` is a `DimensionRecord` instance of the
1463 appropriate subclass.
1465 Returns
1466 -------
1467 inserted : `bool`
1468 `True` if a new row was inserted, `False` otherwise.
1470 Raises
1471 ------
1472 ConflictingDefinitionError
1473 Raised if the record exists in the database (according to primary
1474 key lookup) but is inconsistent with the given one.
1476 Notes
1477 -----
1478 This method cannot be called within transactions, as it needs to be
1479 able to perform its own transaction to be concurrent.
1480 """
1481 if conform:
1482 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
1483 record = element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row
1484 else:
1485 record = row
1486 storage = self._dimensions[element]
1487 try:
1488 return storage.sync(record)
1489 except DatabaseConflictError as err:
1490 raise ConflictingDefinitionError(str(err)) from err
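# Illustrative sketch (not part of the original source): inserting dimension
# records from plain dicts. The element name and record fields are
# hypothetical:
#
#     registry.insertDimensionData("instrument", {"name": "HSC", "detector_max": 200})
#     inserted = registry.syncDimensionData("instrument", {"name": "HSC", "detector_max": 200})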
1492 def queryDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]:
1493 """Iterate over the dataset types whose names match an expression.
1495 Parameters
1496 ----------
1497 expression : `Any`, optional
1498 An expression that fully or partially identifies the dataset types
1499 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1500 `...` can be used to return all dataset types, and is the default.
1501 See :ref:`daf_butler_dataset_type_expressions` for more
1502 information.
1504 Yields
1505 ------
1506 datasetType : `DatasetType`
1507 A `DatasetType` instance whose name matches ``expression``.
1508 """
1509 yield from self._datasetStorage.fetchDatasetTypes(expression)
1511 def queryCollections(self, expression: Any = ...,
1512 datasetType: Optional[DatasetType] = None,
1513 collectionType: Optional[CollectionType] = None,
1514 flattenChains: bool = False,
1515 includeChains: Optional[bool] = None) -> Iterator[str]:
1516 """Iterate over the collections whose names match an expression.
1518 Parameters
1519 ----------
1520 expression : `Any`, optional
1521 An expression that fully or partially identifies the collections
1522 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1523 `...` can be used to return all collections, and is the default.
1524 See :ref:`daf_butler_collection_expressions` for more
1525 information.
1526 datasetType : `DatasetType`, optional
1527 If provided, only yield collections that should be searched for
1528 this dataset type according to ``expression``. If this is
1529 not provided, any dataset type restrictions in ``expression`` are
1530 ignored.
1531 collectionType : `CollectionType`, optional
1532 If provided, only yield collections of this type.
1533 flattenChains : `bool`, optional
1534 If `True` (`False` is default), recursively yield the child
1535 collections of matching `~CollectionType.CHAINED` collections.
1536 includeChains : `bool`, optional
1537 If `True`, yield records for matching `~CollectionType.CHAINED`
1538 collections. Default is the opposite of ``flattenChains``: include
1539 either CHAINED collections or their children, but not both.
1541 Yields
1542 ------
1543 collection : `str`
1544 The name of a collection that matches ``expression``.
1545 """
1546 query = CollectionQuery.fromExpression(expression)
1547 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType,
1548 flattenChains=flattenChains, includeChains=includeChains):
1549 yield record.name
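# Illustrative sketch (not part of the original source): wildcard queries over
# dataset types and collections. The pattern and names are hypothetical:
#
#     import re
#     rawTypes = list(registry.queryDatasetTypes(re.compile("raw.*")))
#     runs = list(registry.queryCollections(..., collectionType=CollectionType.RUN))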
1551 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
1552 """Return a `QueryBuilder` instance capable of constructing and
1553 managing more complex queries than those obtainable via `Registry`
1554 interfaces.
1556 This is an advanced interface; downstream code should prefer
1557 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those
1558 are sufficient.
1560 Parameters
1561 ----------
1562 summary : `QuerySummary`
1563 Object describing and categorizing the full set of dimensions that
1564 will be included in the query.
1566 Returns
1567 -------
1568 builder : `QueryBuilder`
1569 Object that can be used to construct and perform advanced queries.
1570 """
1571 return QueryBuilder(connection=self._connection, summary=summary,
1572 dimensionStorage=self._dimensions,
1573 datasetStorage=self._datasetStorage)
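# Illustrative sketch of the advanced query-builder path, not part of the
# original source; it mirrors the way `queryDimensions` below drives the
# builder. Assumes a configured `Registry` as ``registry``; the instrument
# name and the `where` expression are hypothetical placeholders.
def _exampleMakeQueryBuilder(registry):
    summary = QuerySummary(
        requested=registry.dimensions.extract(["visit", "detector"]),
        dataId=registry.expandDataId(instrument="HSC"),
        expression="detector = 50",
    )
    builder = registry.makeQueryBuilder(summary)
    query = builder.finish()
    predicate = query.predicate()
    return [query.extractDataId(row) for row in query.execute() if predicate(row)]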
1575 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1576 dataId: Optional[DataId] = None,
1577 datasets: Any = None,
1578 collections: Any = None,
1579 where: Optional[str] = None,
1580 expand: bool = True,
1581 **kwds) -> Iterator[DataCoordinate]:
1582 """Query for and iterate over data IDs matching user-provided criteria.
1584 Parameters
1585 ----------
1586 dimensions : `Dimension` or `str`, or iterable thereof
1587 The dimensions of the data IDs to yield, as either `Dimension`
1588 instances or `str`. Will be automatically expanded to a complete
1589 `DimensionGraph`.
1590 dataId : `dict` or `DataCoordinate`, optional
1591 A data ID whose key-value pairs are used as equality constraints
1592 in the query.
1593 datasets : `Any`, optional
1594 An expression that fully or partially identifies dataset types
1595 that should constrain the yielded data IDs. For example, including
1596 "raw" here would constrain the yielded ``instrument``,
1597 ``exposure``, ``detector``, and ``physical_filter`` values to only
1598 those for which at least one "raw" dataset exists in
1599 ``collections``. Allowed types include `DatasetType`, `str`,
1600 `re.Pattern`, and iterables thereof. Unlike other dataset type
1601 expressions, `...` is not permitted - it doesn't make sense to
1602 constrain data IDs on the existence of *all* datasets.
1603 See :ref:`daf_butler_dataset_type_expressions` for more
1604 information.
1605 collections : `Any`, optional
1606 An expression that fully or partially identifies the collections
1607 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1608 thereof. `...` can be used to return all collections. Must be
1609 provided if ``datasets`` is, and is ignored if it is not. See
1610 :ref:`daf_butler_collection_expressions` for more information.
1611 where : `str`, optional
1612 A string expression similar to a SQL WHERE clause. May involve
1613 any column of a dimension table or (as a shortcut for the primary
1614 key column of a dimension table) dimension name. See
1615 :ref:`daf_butler_dimension_expressions` for more information.
1616 expand : `bool`, optional
1617 If `True` (default) yield `ExpandedDataCoordinate` instead of
1618 minimal `DataCoordinate` base-class instances.
1619 kwds
1620 Additional keyword arguments are forwarded to
1621 `DataCoordinate.standardize` when processing the ``dataId``
1622 argument (and may be used to provide a constraining data ID even
1623 when the ``dataId`` argument is `None`).
1625 Yields
1626 ------
1627 dataId : `DataCoordinate`
1628 Data IDs matching the given query parameters. Order is
1629 unspecified.
1630 """
1631 dimensions = iterable(dimensions)
1632 standardizedDataId = self.expandDataId(dataId, **kwds)
1633 standardizedDatasetTypes = []
1634 requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
1635 if datasets is not None:
1636 if collections is None:
1637 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1638 for datasetType in self._datasetStorage.fetchDatasetTypes(datasets):
1639 requestedDimensionNames.update(datasetType.dimensions.names)
1640 standardizedDatasetTypes.append(datasetType)
1641 # Preprocess collections expression in case the original included
1642 # single-pass iterators (we'll want to use it multiple times
1643 # below).
1644 collections = CollectionQuery.fromExpression(collections)
1646 summary = QuerySummary(
1647 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1648 dataId=standardizedDataId,
1649 expression=where,
1650 )
1651 builder = self.makeQueryBuilder(summary)
1652 for datasetType in standardizedDatasetTypes:
1653 builder.joinDataset(datasetType, collections, isResult=False)
1654 query = builder.finish()
1655 predicate = query.predicate()
1656 for row in query.execute():
1657 if predicate(row):
1658 result = query.extractDataId(row)
1659 if expand:
1660 yield self.expandDataId(result, records=standardizedDataId.records)
1661 else:
1662 yield result
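# Illustrative usage sketch, not part of the original source: assumes a
# configured `Registry` as ``registry``; the instrument, collection, and
# `where` expression are hypothetical placeholders.
def _exampleQueryDimensions(registry):
    # Data IDs of all exposure+detector combinations for which a "raw"
    # dataset exists in the named collection, further constrained by a
    # keyword data-ID value and a string expression.
    return list(registry.queryDimensions(
        ["exposure", "detector"],
        datasets="raw",
        collections="HSC/raw/all",
        where="exposure > 100",
        instrument="HSC",
    ))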
1664 def queryDatasets(self, datasetType: Any, *,
1665 collections: Any,
1666 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1667 dataId: Optional[DataId] = None,
1668 where: Optional[str] = None,
1669 deduplicate: bool = False,
1670 expand: bool = True,
1671 **kwds) -> Iterator[DatasetRef]:
1672 """Query for and iterate over dataset references matching user-provided
1673 criteria.
1675 Parameters
1676 ----------
1677 datasetType
1678 An expression that fully or partially identifies the dataset types
1679 to be queried. Allowed types include `DatasetType`, `str`,
1680 `re.Pattern`, and iterables thereof. The special value `...` can
1681 be used to query all dataset types. See
1682 :ref:`daf_butler_dataset_type_expressions` for more information.
1683 collections
1684 An expression that fully or partially identifies the collections
1685 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1686 thereof. `...` can be used to return all collections. See
1687 :ref:`daf_butler_collection_expressions` for more information.
1688 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1689 Dimensions to include in the query (in addition to those used
1690 to identify the queried dataset type(s)), either to constrain
1691 the resulting datasets to those for which a matching dimension
1692 exists, or to relate the dataset type's dimensions to dimensions
1693 referenced by the ``dataId`` or ``where`` arguments.
1694 dataId : `dict` or `DataCoordinate`, optional
1695 A data ID whose key-value pairs are used as equality constraints
1696 in the query.
1697 where : `str`, optional
1698 A string expression similar to a SQL WHERE clause. May involve
1699 any column of a dimension table or (as a shortcut for the primary
1700 key column of a dimension table) dimension name. See
1701 :ref:`daf_butler_dimension_expressions` for more information.
1702 deduplicate : `bool`, optional
1703 If `True` (`False` is default), for each result data ID, only
1704 yield one `DatasetRef` of each `DatasetType`, from the first
1705 collection in which a dataset of that dataset type appears
1706 (according to the order of ``collections`` passed in). If `True`,
1707 ``collections`` must not contain regular expressions and may not
1708 be `...`.
1709 expand : `bool`, optional
1710 If `True` (default), attach `ExpandedDataCoordinate` instances to
1711 the yielded `DatasetRef` objects instead of minimal `DataCoordinate` ones.
1712 kwds
1713 Additional keyword arguments are forwarded to
1714 `DataCoordinate.standardize` when processing the ``dataId``
1715 argument (and may be used to provide a constraining data ID even
1716 when the ``dataId`` argument is `None`).
1718 Yields
1719 ------
1720 ref : `DatasetRef`
1721 Dataset references matching the given query criteria. These
1722 are grouped by `DatasetType` if the query evaluates to multiple
1723 dataset types, but order is otherwise unspecified.
1725 Raises
1726 ------
1727 TypeError
1728 Raised when the arguments are incompatible, such as when a
1729 collection wildcard is passed when ``deduplicate`` is `True`.
1731 Notes
1732 -----
1733 When multiple dataset types are queried in a single call, the
1734 results of this operation are equivalent to querying for each dataset
1735 type separately in turn, and no information about the relationships
1736 between datasets of different types is included. In contexts where
1737 that kind of information is important, the recommended pattern is to
1738 use `queryDimensions` to first obtain data IDs (possibly with the
1739 desired dataset types and collections passed as constraints to the
1740 query), and then use multiple (generally much simpler) calls to
1741 `queryDatasets` with the returned data IDs passed as constraints.
1742 """
1743 # Standardize and expand the data ID provided as a constraint.
1744 standardizedDataId = self.expandDataId(dataId, **kwds)
1745 # If the datasetType passed isn't actually a DatasetType, expand it
1746 # (it could be an expression that yields multiple DatasetTypes) and
1747 # recurse.
1748 if not isinstance(datasetType, DatasetType):
1749 for trueDatasetType in self._datasetStorage.fetchDatasetTypes(datasetType):
1750 yield from self.queryDatasets(trueDatasetType, collections=collections,
1751 dimensions=dimensions, dataId=standardizedDataId,
1752 where=where, deduplicate=deduplicate, expand=expand)
1753 return
1754 # The full set of dimensions in the query is the combination of those
1755 # needed for the DatasetType and those explicitly requested, if any.
1756 requestedDimensionNames = set(datasetType.dimensions.names)
1757 if dimensions is not None:
1758 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1759 # Construct the summary structure needed to construct a QueryBuilder.
1760 summary = QuerySummary(
1761 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1762 dataId=standardizedDataId,
1763 expression=where,
1764 )
1765 builder = self.makeQueryBuilder(summary)
1766 # Add the dataset subquery to the query, telling the QueryBuilder to
1767 # include the rank of the selected collection in the results only if we
1768 # need to deduplicate. Note that if any of the collections are
1769 # actually wildcard expressions, and we've asked for deduplication,
1770 # this will raise TypeError for us.
1771 builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate)
1772 query = builder.finish()
1773 predicate = query.predicate()
1774 if not deduplicate or len(collections) == 1:
1775 # No need to de-duplicate across collections.
1776 for row in query.execute():
1777 if predicate(row):
1778 dataId = query.extractDataId(row, graph=datasetType.dimensions)
1779 if expand:
1780 dataId = self.expandDataId(dataId, records=standardizedDataId.records)
1781 yield query.extractDatasetRef(row, datasetType, dataId)[0]
1782 else:
1783 # For each data ID, yield only the DatasetRef with the lowest
1784 # collection rank.
1785 bestRefs = {}
1786 bestRanks = {}
1787 for row in query.execute():
1788 if predicate(row):
1789 ref, rank = query.extractDatasetRef(row, datasetType)
1790 bestRank = bestRanks.get(ref.dataId, sys.maxsize)
1791 if rank < bestRank:
1792 bestRefs[ref.dataId] = ref
1793 bestRanks[ref.dataId] = rank
1794 # If caller requested expanded data IDs, we defer that until here
1795 # so we do as little expansion as possible.
1796 if expand:
1797 for ref in bestRefs.values():
1798 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records)
1799 yield ref.expanded(dataId)
1800 else:
1801 yield from bestRefs.values()
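# Illustrative usage sketch, not part of the original source, following the
# pattern recommended in the Notes section of `queryDatasets` above. It
# assumes a configured `Registry` as ``registry``; dataset type and
# collection names are hypothetical placeholders.
def _exampleQueryDatasets(registry):
    refs = []
    # Constrain the data IDs on the joint existence of both dataset types,
    # then fetch each dataset type separately for every data ID.
    for dataId in registry.queryDimensions(["visit", "detector"],
                                           datasets=["calexp", "src"],
                                           collections="processed/run1"):
        for name in ("calexp", "src"):
            refs.extend(registry.queryDatasets(name,
                                               collections="processed/run1",
                                               dataId=dataId,
                                               deduplicate=True))
    return refs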
1803 dimensions: DimensionUniverse
1804 """The universe of all dimensions known to the registry
1805 (`DimensionUniverse`).
1806 """
1808 storageClasses: StorageClassFactory
1809 """All storage classes known to the registry (`StorageClassFactory`).
1810 """