Coverage for python/lsst/daf/butler/registry/_registry.py : 14%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "AmbiguousDatasetError",
26 "ConflictingDefinitionError",
27 "ConsistentDataIds",
28 "InconsistentDataIdError",
29 "OrphanedRecordError",
30 "Registry",
31)
33import contextlib
34from dataclasses import dataclass
35import sys
36from typing import (
37 Any,
38 Iterable,
39 Iterator,
40 List,
41 Mapping,
42 Optional,
43 Set,
44 Type,
45 TYPE_CHECKING,
46 Union,
47)
49import sqlalchemy
51import lsst.sphgeom
52from ..core import (
53 Config,
54 DataCoordinate,
55 DataId,
56 DatasetRef,
57 DatasetType,
58 Dimension,
59 DimensionElement,
60 DimensionGraph,
61 DimensionRecord,
62 DimensionUniverse,
63 ExpandedDataCoordinate,
64 FakeDatasetRef,
65 StorageClassFactory,
66)
67from ..core import ddl
68from ..core.utils import doImport, iterable, transactional
69from ._config import RegistryConfig
70from .queries import (
71 DatasetRegistryStorage,
72 QueryBuilder,
73 QuerySummary,
74)
75from .tables import makeRegistryTableSpecs
76from ._collectionType import CollectionType
77from .wildcards import CollectionQuery, CollectionSearch
78from .interfaces import DatabaseConflictError
80if TYPE_CHECKING: 80 ↛ 81line 80 didn't jump to line 81, because the condition on line 80 was never true
81 from ..butlerConfig import ButlerConfig
82 from ..core import (
83 Quantum
84 )
85 from .interfaces import (
86 CollectionManager,
87 Database,
88 OpaqueTableStorageManager,
89 DimensionRecordStorageManager,
90 )
93@dataclass
94class ConsistentDataIds:
95 """A struct used to report relationships between data IDs by
96 `Registry.relateDataIds`.
98 If an instance of this class is returned (instead of `None`), the data IDs
99 are "not inconsistent" - any keys they have in common have the same value,
100 and any spatial or temporal relationships they have at least might involve
101 an overlap. To capture this, any instance of `ConsistentDataIds` coerces
102 to `True` in boolean contexts.
103 """
105 overlaps: bool
106 """If `True`, the data IDs have at least one key in common, associated with
107 the same value.
109 Note that data IDs are not inconsistent even if overlaps is `False` - they
110 may simply have no keys in common, which means they cannot have
111 inconsistent values for any keys. They may even be equal, in the case that
112 both data IDs are empty.
114 This field does _not_ indicate whether a spatial or temporal overlap
115 relationship exists.
116 """
118 contains: bool
119 """If `True`, all keys in the first data ID are in the second, and are
120 associated with the same values.
122 This includes case where the first data ID is empty.
123 """
125 within: bool
126 """If `True`, all keys in the second data ID are in the first, and are
127 associated with the same values.
129 This includes case where the second data ID is empty.
130 """
132 @property
133 def equal(self) -> bool:
134 """If `True`, the two data IDs are the same.
136 Data IDs are equal if they have both a `contains` and a `within`
137 relationship.
138 """
139 return self.contains and self.within
141 @property
142 def disjoint(self) -> bool:
143 """If `True`, the two data IDs have no keys in common.
145 This is simply the oppose of `overlaps`. Disjoint datasets are by
146 definition not inconsistent.
147 """
148 return not self.overlaps
150 def __bool__(self) -> bool:
151 return True
154class InconsistentDataIdError(ValueError):
155 """Exception raised when a data ID contains contradictory key-value pairs,
156 according to dimension relationships.
158 This can include the case where the data ID identifies mulitple spatial
159 regions or timspans that are disjoint.
160 """
163class AmbiguousDatasetError(Exception):
164 """Exception raised when a `DatasetRef` has no ID and a `Registry`
165 operation requires one.
166 """
169class ConflictingDefinitionError(Exception):
170 """Exception raised when trying to insert a database record when a
171 conflicting record already exists.
172 """
175class OrphanedRecordError(Exception):
176 """Exception raised when trying to remove or modify a database record
177 that is still being used in some other table.
178 """
181def _checkAndGetId(ref: DatasetRef) -> int:
182 """Return the ID of the given `DatasetRef`, or raise if it is `None`.
184 This trivial function exists to allow operations that would otherwise be
185 natural list comprehensions to check that the ID is not `None` as well.
187 Parameters
188 ----------
189 ref : `DatasetRef`
190 Dataset reference.
192 Returns
193 -------
194 id : `int`
195 ``ref.id``
197 Raises
198 ------
199 AmbiguousDatasetError
200 Raised if ``ref.id`` is `None`.
201 """
202 if ref.id is None:
203 raise AmbiguousDatasetError("Dataset ID must not be `None`.")
204 return ref.id
207class Registry:
208 """Registry interface.
210 Parameters
211 ----------
212 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
213 Registry configuration
214 """
216 defaultConfigFile = None
217 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
218 absolute path. Can be None if no defaults specified.
219 """
221 @classmethod
222 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
223 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
224 """Create `Registry` subclass instance from `config`.
226 Uses ``registry.cls`` from `config` to determine which subclass to
227 instantiate.
229 Parameters
230 ----------
231 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
232 Registry configuration
233 create : `bool`, optional
234 Assume empty Registry and create a new one.
235 butlerRoot : `str`, optional
236 Path to the repository root this `Registry` will manage.
237 writeable : `bool`, optional
238 If `True` (default) create a read-write connection to the database.
240 Returns
241 -------
242 registry : `Registry` (subclass)
243 A new `Registry` subclass instance.
244 """
245 if not isinstance(config, RegistryConfig):
246 if isinstance(config, str) or isinstance(config, Config):
247 config = RegistryConfig(config)
248 else:
249 raise ValueError("Incompatible Registry configuration: {}".format(config))
250 config.replaceRoot(butlerRoot)
251 DatabaseClass = config.getDatabaseClass()
252 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
253 namespace=config.get("namespace"), writeable=writeable)
254 universe = DimensionUniverse(config)
255 opaque = doImport(config["managers", "opaque"])
256 dimensions = doImport(config["managers", "dimensions"])
257 collections = doImport(config["managers", "collections"])
258 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections,
259 create=create)
261 def __init__(self, database: Database, universe: DimensionUniverse, *,
262 opaque: Type[OpaqueTableStorageManager],
263 dimensions: Type[DimensionRecordStorageManager],
264 collections: Type[CollectionManager],
265 create: bool = False):
266 self._db = database
267 self.storageClasses = StorageClassFactory()
268 with self._db.declareStaticTables(create=create) as context:
269 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
270 self._collections = collections.initialize(self._db, context)
271 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions, self._collections))
272 self._opaque = opaque.initialize(self._db, context)
273 self._collections.refresh()
274 # TODO: we shouldn't be grabbing the private connection from the
275 # Database instance like this, but it's a reasonable way to proceed
276 # while we transition to using the Database API more.
277 self._connection = self._db._connection
278 self._datasetStorage = DatasetRegistryStorage(connection=self._connection,
279 universe=self.dimensions,
280 tables=self._tables._asdict(),
281 collections=self._collections)
282 self._datasetTypes = {}
284 def __str__(self) -> str:
285 return str(self._db)
287 def __repr__(self) -> str:
288 return f"Registry({self._db!r}, {self.dimensions!r})"
290 def isWriteable(self) -> bool:
291 """Return `True` if this registry allows write operations, and `False`
292 otherwise.
293 """
294 return self._db.isWriteable()
296 @property
297 def dimensions(self) -> DimensionUniverse:
298 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
299 """
300 return self._dimensions.universe
302 @contextlib.contextmanager
303 def transaction(self):
304 """Return a context manager that represents a transaction.
305 """
306 # TODO make savepoint=False the default.
307 try:
308 with self._db.transaction():
309 yield
310 except BaseException:
311 # TODO: this clears the caches sometimes when we wouldn't actually
312 # need to. Can we avoid that?
313 self._dimensions.clearCaches()
314 self._datasetTypes.clear()
315 raise
317 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec):
318 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
319 other data repository client.
321 Opaque table records can be added via `insertOpaqueData`, retrieved via
322 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
324 Parameters
325 ----------
326 tableName : `str`
327 Logical name of the opaque table. This may differ from the
328 actual name used in the database by a prefix and/or suffix.
329 spec : `ddl.TableSpec`
330 Specification for the table to be added.
331 """
332 self._opaque.register(tableName, spec)
334 @transactional
335 def insertOpaqueData(self, tableName: str, *data: dict):
336 """Insert records into an opaque table.
338 Parameters
339 ----------
340 tableName : `str`
341 Logical name of the opaque table. Must match the name used in a
342 previous call to `registerOpaqueTable`.
343 data
344 Each additional positional argument is a dictionary that represents
345 a single row to be added.
346 """
347 self._opaque[tableName].insert(*data)
349 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
350 """Retrieve records from an opaque table.
352 Parameters
353 ----------
354 tableName : `str`
355 Logical name of the opaque table. Must match the name used in a
356 previous call to `registerOpaqueTable`.
357 where
358 Additional keyword arguments are interpreted as equality
359 constraints that restrict the returned rows (combined with AND);
360 keyword arguments are column names and values are the values they
361 must have.
363 Yields
364 ------
365 row : `dict`
366 A dictionary representing a single result row.
367 """
368 yield from self._opaque[tableName].fetch(**where)
370 @transactional
371 def deleteOpaqueData(self, tableName: str, **where: Any):
372 """Remove records from an opaque table.
374 Parameters
375 ----------
376 tableName : `str`
377 Logical name of the opaque table. Must match the name used in a
378 previous call to `registerOpaqueTable`.
379 where
380 Additional keyword arguments are interpreted as equality
381 constraints that restrict the deleted rows (combined with AND);
382 keyword arguments are column names and values are the values they
383 must have.
384 """
385 self._opaque[tableName].delete(**where)
387 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED):
388 """Add a new collection if one with the given name does not exist.
390 Parameters
391 ----------
392 name : `str`
393 The name of the collection to create.
394 type : `CollectionType`
395 Enum value indicating the type of collection to create.
397 Notes
398 -----
399 This method cannot be called within transactions, as it needs to be
400 able to perform its own transaction to be concurrent.
401 """
402 self._collections.register(name, type)
404 def getCollectionType(self, name: str) -> CollectionType:
405 """Return an enumeration value indicating the type of the given
406 collection.
408 Parameters
409 ----------
410 name : `str`
411 The name of the collection.
413 Returns
414 -------
415 type : `CollectionType`
416 Enum value indicating the type of this collection.
418 Raises
419 ------
420 MissingCollectionError
421 Raised if no collection with the given name exists.
422 """
423 return self._collections.find(name).type
425 def registerRun(self, name: str):
426 """Add a new run if one with the given name does not exist.
428 Parameters
429 ----------
430 name : `str`
431 The name of the run to create.
433 Notes
434 -----
435 This method cannot be called within transactions, as it needs to be
436 able to perform its own transaction to be concurrent.
437 """
438 self._collections.register(name, CollectionType.RUN)
440 @transactional
441 def removeCollection(self, name: str):
442 """Completely remove the given collection.
444 Parameters
445 ----------
446 name : `str`
447 The name of the collection to remove.
449 Raises
450 ------
451 MissingCollectionError
452 Raised if no collection with the given name exists.
454 Notes
455 -----
456 If this is a `~CollectionType.RUN` collection, all datasets and quanta
457 in it are also fully removed. This requires that those datasets be
458 removed (or at least trashed) from any datastores that hold them first.
460 A collection may not be deleted as long as it is referenced by a
461 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
462 be deleted or redefined first.
463 """
464 self._collections.remove(name)
466 def getCollectionChain(self, parent: str) -> CollectionSearch:
467 """Return the child collections in a `~CollectionType.CHAINED`
468 collection.
470 Parameters
471 ----------
472 parent : `str`
473 Name of the chained collection. Must have already been added via
474 a call to `Registry.registerCollection`.
476 Returns
477 -------
478 children : `CollectionSearch`
479 An object that defines the search path of the collection.
480 See :ref:`daf_butler_collection_expressions` for more information.
482 Raises
483 ------
484 MissingCollectionError
485 Raised if ``parent`` does not exist in the `Registry`.
486 TypeError
487 Raised if ``parent`` does not correspond to a
488 `~CollectionType.CHAINED` collection.
489 """
490 record = self._collections.find(parent)
491 if record.type is not CollectionType.CHAINED:
492 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
493 return record.children
495 @transactional
496 def setCollectionChain(self, parent: str, children: Any):
497 """Define or redefine a `~CollectionType.CHAINED` collection.
499 Parameters
500 ----------
501 parent : `str`
502 Name of the chained collection. Must have already been added via
503 a call to `Registry.registerCollection`.
504 children : `Any`
505 An expression defining an ordered search of child collections,
506 generally an iterable of `str`. Restrictions on the dataset types
507 to be searched can also be included, by passing mapping or an
508 iterable containing tuples; see
509 :ref:`daf_butler_collection_expressions` for more information.
511 Raises
512 ------
513 MissingCollectionError
514 Raised when any of the given collections do not exist in the
515 `Registry`.
516 TypeError
517 Raised if ``parent`` does not correspond to a
518 `~CollectionType.CHAINED` collection.
519 ValueError
520 Raised if the given collections contains a cycle.
521 """
522 record = self._collections.find(parent)
523 if record.type is not CollectionType.CHAINED:
524 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
525 children = CollectionSearch.fromExpression(children)
526 if children != record.children:
527 record.update(self._collections, children)
529 @transactional
530 def registerDatasetType(self, datasetType: DatasetType) -> bool:
531 """
532 Add a new `DatasetType` to the Registry.
534 It is not an error to register the same `DatasetType` twice.
536 Parameters
537 ----------
538 datasetType : `DatasetType`
539 The `DatasetType` to be added.
541 Returns
542 -------
543 inserted : `bool`
544 `True` if ``datasetType`` was inserted, `False` if an identical
545 existing `DatsetType` was found. Note that in either case the
546 DatasetType is guaranteed to be defined in the Registry
547 consistently with the given definition.
549 Raises
550 ------
551 ValueError
552 Raised if the dimensions or storage class are invalid.
553 ConflictingDefinitionError
554 Raised if this DatasetType is already registered with a different
555 definition.
556 """
557 # TODO: this implementation isn't concurrent, except *maybe* in SQLite
558 # with aggressive locking (where starting a transaction is essentially
559 # the same as grabbing a full-database lock). Should be reimplemented
560 # with Database.sync to fix this, but that may require schema changes
561 # as well so we only have to synchronize one row to know if we have
562 # inconsistent definitions.
564 # If the DatasetType is already in the cache, we assume it's already in
565 # the DB (note that we don't actually provide a way to remove them from
566 # the DB).
567 existingDatasetType = self._datasetTypes.get(datasetType.name)
568 # If it's not in the cache, try to insert it.
569 if existingDatasetType is None:
570 try:
571 with self._db.transaction():
572 self._db.insert(
573 self._tables.dataset_type,
574 {
575 "dataset_type_name": datasetType.name,
576 "storage_class": datasetType.storageClass.name,
577 }
578 )
579 except sqlalchemy.exc.IntegrityError:
580 # Insert failed on the only unique constraint on this table:
581 # dataset_type_name. So now the question is whether the one in
582 # there is the same as the one we tried to insert.
583 existingDatasetType = self.getDatasetType(datasetType.name)
584 else:
585 # If adding the DatasetType record itself succeeded, add its
586 # dimensions (if any). We don't guard this in a try block
587 # because a problem with this insert means the database
588 # content must be corrupted.
589 if datasetType.dimensions:
590 self._db.insert(
591 self._tables.dataset_type_dimensions,
592 *[{"dataset_type_name": datasetType.name,
593 "dimension_name": dimensionName}
594 for dimensionName in datasetType.dimensions.names]
595 )
596 # Update the cache.
597 self._datasetTypes[datasetType.name] = datasetType
598 # Also register component DatasetTypes (if any).
599 for compName, compStorageClass in datasetType.storageClass.components.items():
600 compType = DatasetType(datasetType.componentTypeName(compName),
601 dimensions=datasetType.dimensions,
602 storageClass=compStorageClass)
603 self.registerDatasetType(compType)
604 # Inserts succeeded, nothing left to do here.
605 return True
606 # A DatasetType with this name exists, check if is equal
607 if datasetType == existingDatasetType:
608 return False
609 else:
610 raise ConflictingDefinitionError(f"DatasetType: {datasetType} != existing {existingDatasetType}")
612 def getDatasetType(self, name: str) -> DatasetType:
613 """Get the `DatasetType`.
615 Parameters
616 ----------
617 name : `str`
618 Name of the type.
620 Returns
621 -------
622 type : `DatasetType`
623 The `DatasetType` associated with the given name.
625 Raises
626 ------
627 KeyError
628 Requested named DatasetType could not be found in registry.
629 """
630 datasetType = self._datasetTypes.get(name)
631 if datasetType is None:
632 # Get StorageClass from DatasetType table
633 result = self._db.query(
634 sqlalchemy.sql.select(
635 [self._tables.dataset_type.c.storage_class]
636 ).where(
637 self._tables.dataset_type.columns.dataset_type_name == name
638 )
639 ).fetchone()
641 if result is None:
642 raise KeyError("Could not find entry for datasetType {}".format(name))
644 storageClass = self.storageClasses.getStorageClass(result["storage_class"])
645 # Get Dimensions (if any) from DatasetTypeDimensions table
646 result = self._db.query(
647 sqlalchemy.sql.select(
648 [self._tables.dataset_type_dimensions.columns.dimension_name]
649 ).where(
650 self._tables.dataset_type_dimensions.columns.dataset_type_name == name
651 )
652 ).fetchall()
653 dimensions = DimensionGraph(self.dimensions, names=(r[0] for r in result) if result else ())
654 datasetType = DatasetType(name=name,
655 storageClass=storageClass,
656 dimensions=dimensions)
657 self._datasetTypes[name] = datasetType
658 return datasetType
660 def _makeDatasetRefFromRow(self, row: sqlalchemy.engine.RowProxy,
661 datasetType: Optional[DatasetType] = None,
662 dataId: Optional[DataCoordinate] = None):
663 """Construct a DatasetRef from the result of a query on the Dataset
664 table.
666 Parameters
667 ----------
668 row : `sqlalchemy.engine.RowProxy`.
669 Row of a query that contains all columns from the `Dataset` table.
670 May include additional fields (which will be ignored).
671 datasetType : `DatasetType`, optional
672 `DatasetType` associated with this dataset. Will be retrieved
673 if not provided. If provided, the caller guarantees that it is
674 already consistent with what would have been retrieved from the
675 database.
676 dataId : `DataCoordinate`, optional
677 Dimensions associated with this dataset. Will be retrieved if not
678 provided. If provided, the caller guarantees that it is already
679 consistent with what would have been retrieved from the database.
681 Returns
682 -------
683 ref : `DatasetRef`.
684 A new `DatasetRef` instance.
685 """
686 if datasetType is None:
687 datasetType = self.getDatasetType(row["dataset_type_name"])
688 runRecord = self._collections[row[self._collections.getRunForeignKeyName()]]
689 assert runRecord is not None, "Should be guaranteed by foreign key constraints."
690 run = runRecord.name
691 datasetRefHash = row["dataset_ref_hash"]
692 if dataId is None:
693 # TODO: should we expand here?
694 dataId = DataCoordinate.standardize(
695 row,
696 graph=datasetType.dimensions,
697 universe=self.dimensions
698 )
699 # Get components (if present)
700 components = {}
701 if datasetType.storageClass.isComposite():
702 t = self._tables
703 columns = list(t.dataset.columns)
704 columns.append(t.dataset_composition.columns.component_name)
705 results = self._db.query(
706 sqlalchemy.sql.select(
707 columns
708 ).select_from(
709 t.dataset.join(
710 t.dataset_composition,
711 (t.dataset.columns.dataset_id == t.dataset_composition.columns.component_dataset_id)
712 )
713 ).where(
714 t.dataset_composition.columns.parent_dataset_id == row["dataset_id"]
715 )
716 ).fetchall()
717 for result in results:
718 componentName = result["component_name"]
719 componentDatasetType = DatasetType(
720 DatasetType.nameWithComponent(datasetType.name, componentName),
721 dimensions=datasetType.dimensions,
722 storageClass=datasetType.storageClass.components[componentName]
723 )
724 components[componentName] = self._makeDatasetRefFromRow(result, dataId=dataId,
725 datasetType=componentDatasetType)
726 if not components.keys() <= datasetType.storageClass.components.keys():
727 raise RuntimeError(
728 f"Inconsistency detected between dataset and storage class definitions: "
729 f"{datasetType.storageClass.name} has components "
730 f"{set(datasetType.storageClass.components.keys())}, "
731 f"but dataset has components {set(components.keys())}"
732 )
733 return DatasetRef(datasetType=datasetType, dataId=dataId, id=row["dataset_id"], run=run,
734 hash=datasetRefHash, components=components)
736 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
737 collections: Any, **kwds: Any) -> Optional[DatasetRef]:
738 """Find a dataset given its `DatasetType` and data ID.
740 This can be used to obtain a `DatasetRef` that permits the dataset to
741 be read from a `Datastore`.
743 Parameters
744 ----------
745 datasetType : `DatasetType` or `str`
746 A `DatasetType` or the name of one.
747 dataId : `dict` or `DataCoordinate`, optional
748 A `dict`-like object containing the `Dimension` links that identify
749 the dataset within a collection.
750 collections
751 An expression that fully or partially identifies the collections
752 to search for the dataset, such as a `str`, `re.Pattern`, or
753 iterable thereof. `...` can be used to return all collections.
754 See :ref:`daf_butler_collection_expressions` for more information.
755 **kwds
756 Additional keyword arguments passed to
757 `DataCoordinate.standardize` to convert ``dataId`` to a true
758 `DataCoordinate` or augment an existing one.
760 Returns
761 -------
762 ref : `DatasetRef`
763 A reference to the dataset, or `None` if no matching Dataset
764 was found.
766 Raises
767 ------
768 LookupError
769 Raised if one or more data ID keys are missing.
770 MissingCollectionError
771 Raised if any of ``collections`` does not exist in the registry.
772 """
773 if not isinstance(datasetType, DatasetType):
774 datasetType = self.getDatasetType(datasetType)
775 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
776 universe=self.dimensions, **kwds)
777 collections = CollectionSearch.fromExpression(collections)
778 for collectionRecord in collections.iter(self._collections, datasetType=datasetType):
779 if collectionRecord.type is CollectionType.TAGGED:
780 collectionColumn = \
781 self._tables.dataset_collection.columns[self._collections.getCollectionForeignKeyName()]
782 fromClause = self._tables.dataset.join(self._tables.dataset_collection)
783 elif collectionRecord.type is CollectionType.RUN:
784 collectionColumn = self._tables.dataset.columns[self._collections.getRunForeignKeyName()]
785 fromClause = self._tables.dataset
786 else:
787 raise NotImplementedError(f"Unrecognized CollectionType: '{collectionRecord.type}'.")
788 whereTerms = [
789 self._tables.dataset.columns.dataset_type_name == datasetType.name,
790 collectionColumn == collectionRecord.key,
791 ]
792 whereTerms.extend(self._tables.dataset.columns[name] == dataId[name] for name in dataId.keys())
793 query = self._tables.dataset.select().select_from(
794 fromClause
795 ).where(
796 sqlalchemy.sql.and_(*whereTerms)
797 )
798 result = self._db.query(query).fetchone()
799 if result is not None:
800 return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId)
801 return None
803 @transactional
804 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
805 run: str, *, producer: Optional[Quantum] = None, recursive: bool = False
806 ) -> List[DatasetRef]:
807 """Insert one or more datasets into the `Registry`
809 This always adds new datasets; to associate existing datasets with
810 a new collection, use ``associate``.
812 Parameters
813 ----------
814 datasetType : `DatasetType` or `str`
815 A `DatasetType` or the name of one.
816 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
817 Dimension-based identifiers for the new datasets.
818 run : `str`
819 The name of the run that produced the datasets.
820 producer : `Quantum`
821 Unit of work that produced the datasets. May be `None` to store
822 no provenance information, but if present the `Quantum` must
823 already have been added to the Registry.
824 recursive : `bool`
825 If True, recursively add datasets and attach entries for component
826 datasets as well.
828 Returns
829 -------
830 refs : `list` of `DatasetRef`
831 Resolved `DatasetRef` instances for all given data IDs (in the same
832 order).
834 Raises
835 ------
836 ConflictingDefinitionError
837 If a dataset with the same dataset type and data ID as one of those
838 given already exists in the given collection.
839 MissingCollectionError
840 Raised if ``run`` does not exist in the registry.
841 """
842 if not isinstance(datasetType, DatasetType):
843 datasetType = self.getDatasetType(datasetType)
844 rows = []
845 refs = []
846 runRecord = self._collections.find(run)
847 base = {
848 "dataset_type_name": datasetType.name,
849 self._collections.getRunForeignKeyName(): runRecord.key,
850 "quantum_id": producer.id if producer is not None else None,
851 }
852 # Expand data IDs and build both a list of unresolved DatasetRefs
853 # and a list of dictionary rows for the dataset table.
854 for dataId in dataIds:
855 ref = DatasetRef(datasetType, self.expandDataId(dataId, graph=datasetType.dimensions))
856 refs.append(ref)
857 row = dict(base, dataset_ref_hash=ref.hash)
858 for dimension, value in ref.dataId.full.items():
859 row[dimension.name] = value
860 rows.append(row)
861 # Actually insert into the dataset table.
862 try:
863 datasetIds = self._db.insert(self._tables.dataset, *rows, returnIds=True)
864 except sqlalchemy.exc.IntegrityError as err:
865 raise ConflictingDefinitionError(
866 f"Constraint violation while inserting datasets into run {run}. "
867 f"This usually means that one or more datasets with the same dataset type and data ID "
868 f"already exist in the collection, but it may be a foreign key violation."
869 ) from err
870 # Resolve the DatasetRefs with the autoincrement IDs we generated.
871 refs = [ref.resolved(id=datasetId, run=run) for datasetId, ref in zip(datasetIds, refs)]
872 if recursive and datasetType.isComposite():
873 # Insert component rows by recursing, and gather a single big list
874 # of rows to insert into the dataset_composition table.
875 compositionRows = []
876 for componentName in datasetType.storageClass.components:
877 componentDatasetType = datasetType.makeComponentDatasetType(componentName)
878 componentRefs = self.insertDatasets(componentDatasetType,
879 dataIds=(ref.dataId for ref in refs),
880 run=run,
881 producer=producer,
882 recursive=True)
883 for parentRef, componentRef in zip(refs, componentRefs):
884 parentRef._components[componentName] = componentRef
885 compositionRows.append({
886 "parent_dataset_id": parentRef.id,
887 "component_dataset_id": componentRef.id,
888 "component_name": componentName,
889 })
890 if compositionRows:
891 self._db.insert(self._tables.dataset_composition, *compositionRows)
892 return refs
894 def getDataset(self, id: int, datasetType: Optional[DatasetType] = None,
895 dataId: Optional[DataCoordinate] = None) -> Optional[DatasetRef]:
896 """Retrieve a Dataset entry.
898 Parameters
899 ----------
900 id : `int`
901 The unique identifier for the Dataset.
902 datasetType : `DatasetType`, optional
903 The `DatasetType` of the dataset to retrieve. This is used to
904 short-circuit retrieving the `DatasetType`, so if provided, the
905 caller is guaranteeing that it is what would have been retrieved.
906 dataId : `DataCoordinate`, optional
907 A `Dimension`-based identifier for the dataset within a
908 collection, possibly containing additional metadata. This is used
909 to short-circuit retrieving the dataId, so if provided, the
910 caller is guaranteeing that it is what would have been retrieved.
912 Returns
913 -------
914 ref : `DatasetRef`
915 A ref to the Dataset, or `None` if no matching Dataset
916 was found.
917 """
918 result = self._db.query(
919 self._tables.dataset.select().where(
920 self._tables.dataset.columns.dataset_id == id
921 )
922 ).fetchone()
923 if result is None:
924 return None
925 return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId)
927 @transactional
928 def removeDatasets(self, refs: Iterable[DatasetRef], *, recursive: bool = True):
929 """Remove datasets from the Registry.
931 The datasets will be removed unconditionally from all collections, and
932 any `Quantum` that consumed this dataset will instead be marked with
933 having a NULL input. `Datastore` records will *not* be deleted; the
934 caller is responsible for ensuring that the dataset has already been
935 removed from all Datastores.
937 Parameters
938 ----------
939 refs : `Iterable` of `DatasetRef`
940 References to the datasets to be removed. Must include a valid
941 ``id`` attribute, and should be considered invalidated upon return.
942 recursive : `bool`, optional
943 If `True`, remove all component datasets as well. Note that
944 this only removes components that are actually included in the
945 given `DatasetRef` instances, which may not be the same as those in
946 the database (especially if they were obtained from
947 `queryDatasets`, which does not populate `DatasetRef.components`).
949 Raises
950 ------
951 AmbiguousDatasetError
952 Raised if any ``ref.id`` is `None`.
953 OrphanedRecordError
954 Raised if any dataset is still present in any `Datastore`.
955 """
956 if recursive:
957 refs = DatasetRef.flatten(refs)
958 rows = [{"dataset_id": _checkAndGetId(ref)} for ref in refs]
959 # Remove the dataset records. We rely on ON DELETE clauses to
960 # take care of other dependencies:
961 # - ON DELETE CASCADE will remove dataset_composition rows.
962 # - ON DELETE CASCADE will remove dataset_collection rows.
963 # - ON DELETE SET NULL will apply to dataset_consumer rows, making it
964 # clear that the provenance of any quanta that used this dataset as
965 # an input is now incomplete.
966 try:
967 self._db.delete(self._tables.dataset, ["dataset_id"], *rows)
968 except sqlalchemy.exc.IntegrityError as err:
969 raise OrphanedRecordError("One or more datasets is still "
970 "present in one or more Datastores.") from err
972 @transactional
973 def attachComponent(self, name: str, parent: DatasetRef, component: DatasetRef):
974 """Attach a component to a dataset.
976 Parameters
977 ----------
978 name : `str`
979 Name of the component.
980 parent : `DatasetRef`
981 A reference to the parent dataset. Will be updated to reference
982 the component.
983 component : `DatasetRef`
984 A reference to the component dataset.
986 Raises
987 ------
988 AmbiguousDatasetError
989 Raised if ``parent.id`` or ``component.id`` is `None`.
990 """
991 # TODO Insert check for component name and type against
992 # parent.storageClass specified components
993 if parent.id is None:
994 raise AmbiguousDatasetError(f"Cannot attach component to dataset {parent} without ID.")
995 if component.id is None:
996 raise AmbiguousDatasetError(f"Cannot attach component {component} without ID.")
997 values = dict(component_name=name,
998 parent_dataset_id=parent.id,
999 component_dataset_id=component.id)
1000 self._db.insert(self._tables.dataset_composition, values)
1001 parent._components[name] = component
1003 @transactional
1004 def associate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
1005 """Add existing Datasets to a collection, implicitly creating the
1006 collection if it does not already exist.
1008 If a DatasetRef with the same exact ``dataset_id`` is already in a
1009 collection nothing is changed. If a `DatasetRef` with the same
1010 `DatasetType1` and dimension values but with different ``dataset_id``
1011 exists in the collection, `ValueError` is raised.
1013 Parameters
1014 ----------
1015 collection : `str`
1016 Indicates the collection the Datasets should be associated with.
1017 refs : iterable of `DatasetRef`
1018 An iterable of resolved `DatasetRef` instances that already exist
1019 in this `Registry`.
1020 recursive : `bool`, optional
1021 If `True`, associate all component datasets as well. Note that
1022 this only associates components that are actually included in the
1023 given `DatasetRef` instances, which may not be the same as those in
1024 the database (especially if they were obtained from
1025 `queryDatasets`, which does not populate `DatasetRef.components`).
1027 Raises
1028 ------
1029 ConflictingDefinitionError
1030 If a Dataset with the given `DatasetRef` already exists in the
1031 given collection.
1032 AmbiguousDatasetError
1033 Raised if ``any(ref.id is None for ref in refs)``.
1034 MissingCollectionError
1035 Raised if ``collection`` does not exist in the registry.
1036 TypeError
1037 Raise adding new datasets to the given ``collection`` is not
1038 allowed.
1039 """
1040 collectionRecord = self._collections.find(collection)
1041 if collectionRecord.type is not CollectionType.TAGGED:
1042 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
1043 if recursive:
1044 refs = DatasetRef.flatten(refs)
1045 rows = [{"dataset_id": _checkAndGetId(ref),
1046 "dataset_ref_hash": ref.hash,
1047 self._collections.getCollectionForeignKeyName(): collectionRecord.key}
1048 for ref in refs]
1049 try:
1050 self._db.replace(self._tables.dataset_collection, *rows)
1051 except sqlalchemy.exc.IntegrityError as err:
1052 raise ConflictingDefinitionError(
1053 f"Constraint violation while associating datasets with collection {collection}. "
1054 f"This probably means that one or more datasets with the same dataset type and data ID "
1055 f"already exist in the collection, but it may also indicate that the datasets do not exist."
1056 ) from err
1058 @transactional
1059 def disassociate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
1060 """Remove existing Datasets from a collection.
1062 ``collection`` and ``ref`` combinations that are not currently
1063 associated are silently ignored.
1065 Parameters
1066 ----------
1067 collection : `str`
1068 The collection the Datasets should no longer be associated with.
1069 refs : iterable of `DatasetRef`
1070 An iterable of resolved `DatasetRef` instances that already exist
1071 in this `Registry`.
1072 recursive : `bool`, optional
1073 If `True`, disassociate all component datasets as well. Note that
1074 this only disassociates components that are actually included in
1075 the given `DatasetRef` instances, which may not be the same as
1076 those in the database (especially if they were obtained from
1077 `queryDatasets`, which does not populate `DatasetRef.components`).
1079 Raises
1080 ------
1081 AmbiguousDatasetError
1082 Raised if ``any(ref.id is None for ref in refs)``.
1083 MissingCollectionError
1084 Raised if ``collection`` does not exist in the registry.
1085 TypeError
1086 Raise adding new datasets to the given ``collection`` is not
1087 allowed.
1088 """
1089 collectionFieldName = self._collections.getCollectionForeignKeyName()
1090 collectionRecord = self._collections.find(collection)
1091 if collectionRecord.type is not CollectionType.TAGGED:
1092 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
1093 "expected TAGGED.")
1094 if recursive:
1095 refs = DatasetRef.flatten(refs)
1096 rows = [{"dataset_id": _checkAndGetId(ref), collectionFieldName: collectionRecord.key}
1097 for ref in refs]
1098 self._db.delete(self._tables.dataset_collection, ["dataset_id", collectionFieldName], *rows)
1100 @transactional
1101 def insertDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]):
1102 """Record that a datastore holds the given datasets.
1104 Typically used by `Datastore`.
1106 Parameters
1107 ----------
1108 datastoreName : `str`
1109 Name of the datastore holding these datasets.
1110 refs : `~collections.abc.Iterable` of `DatasetRef`
1111 References to the datasets.
1113 Raises
1114 ------
1115 AmbiguousDatasetError
1116 Raised if ``any(ref.id is None for ref in refs)``.
1117 """
1118 self._db.insert(
1119 self._tables.dataset_location,
1120 *[{"datastore_name": datastoreName, "dataset_id": _checkAndGetId(ref)} for ref in refs]
1121 )
1123 @transactional
1124 def moveDatasetLocationToTrash(self, datastoreName: str, refs: Iterable[DatasetRef]):
1125 """Move the dataset location information to trash.
1127 Parameters
1128 ----------
1129 datastoreName : `str`
1130 Name of the datastore holding these datasets.
1131 refs : `~collections.abc.Iterable` of `DatasetRef`
1132 References to the datasets.
1133 """
1134 # We only want to move rows that already exist in the main table
1135 filtered = self.checkDatasetLocations(datastoreName, refs)
1136 self.canDeleteDatasetLocations(datastoreName, filtered)
1137 self.removeDatasetLocation(datastoreName, filtered)
1139 @transactional
1140 def canDeleteDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]):
1141 """Record that a datastore can delete this dataset
1143 Parameters
1144 ----------
1145 datastoreName : `str`
1146 Name of the datastore holding these datasets.
1147 refs : `~collections.abc.Iterable` of `DatasetRef`
1148 References to the datasets.
1150 Raises
1151 ------
1152 AmbiguousDatasetError
1153 Raised if ``any(ref.id is None for ref in refs)``.
1154 """
1155 self._db.insert(
1156 self._tables.dataset_location_trash,
1157 *[{"datastore_name": datastoreName, "dataset_id": _checkAndGetId(ref)} for ref in refs]
1158 )
1160 def checkDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]) -> List[DatasetRef]:
1161 """Check which refs are listed for this datastore.
1163 Parameters
1164 ----------
1165 datastoreName : `str`
1166 Name of the datastore holding these datasets.
1167 refs : `~collections.abc.Iterable` of `DatasetRef`
1168 References to the datasets.
1170 Returns
1171 -------
1172 present : `list` of `DatasetRef`
1173 All the `DatasetRef` that are listed.
1174 """
1176 table = self._tables.dataset_location
1177 result = self._db.query(
1178 sqlalchemy.sql.select(
1179 [table.columns.datastore_name, table.columns.dataset_id]
1180 ).where(
1181 sqlalchemy.sql.and_(table.columns.dataset_id.in_([ref.id for ref in refs]),
1182 table.columns.datastore_name == datastoreName)
1183 )
1184 ).fetchall()
1186 matched_ids = {r["dataset_id"] for r in result}
1187 return [ref for ref in refs if ref.id in matched_ids]
1189 def getDatasetLocations(self, ref: DatasetRef) -> Set[str]:
1190 """Retrieve datastore locations for a given dataset.
1192 Typically used by `Datastore`.
1194 Parameters
1195 ----------
1196 ref : `DatasetRef`
1197 A reference to the dataset for which to retrieve storage
1198 information.
1200 Returns
1201 -------
1202 datastores : `set` of `str`
1203 All the matching datastores holding this dataset. Empty set
1204 if the dataset does not exist anywhere.
1206 Raises
1207 ------
1208 AmbiguousDatasetError
1209 Raised if ``ref.id`` is `None`.
1210 """
1211 table = self._tables.dataset_location
1212 result = self._db.query(
1213 sqlalchemy.sql.select(
1214 [table.columns.datastore_name]
1215 ).where(
1216 table.columns.dataset_id == ref.id
1217 )
1218 ).fetchall()
1219 return {r["datastore_name"] for r in result}
1221 @transactional
1222 def getTrashedDatasets(self, datastoreName: str) -> Set[FakeDatasetRef]:
1223 """Retrieve all the dataset ref IDs that are in the trash
1224 associated with the specified datastore.
1226 Parameters
1227 ----------
1228 datastoreName : `str`
1229 The relevant datastore name to use.
1231 Returns
1232 -------
1233 ids : `set` of `FakeDatasetRef`
1234 The IDs of datasets that can be safely removed from this datastore.
1235 Can be empty.
1236 """
1237 table = self._tables.dataset_location_trash
1238 result = self._db.query(
1239 sqlalchemy.sql.select(
1240 [table.columns.dataset_id]
1241 ).where(
1242 table.columns.datastore_name == datastoreName
1243 )
1244 ).fetchall()
1245 return {FakeDatasetRef(r["dataset_id"]) for r in result}
1247 @transactional
1248 def emptyDatasetLocationsTrash(self, datastoreName: str, refs: Iterable[FakeDatasetRef]) -> None:
1249 """Remove datastore location associated with these datasets from trash.
1251 Typically used by `Datastore` when a dataset is removed.
1253 Parameters
1254 ----------
1255 datastoreName : `str`
1256 Name of this `Datastore`.
1257 refs : iterable of `FakeDatasetRef`
1258 The dataset IDs to be removed.
1260 Raises
1261 ------
1262 AmbiguousDatasetError
1263 Raised if ``ref.id`` is `None`.
1264 """
1265 if not refs:
1266 return
1267 self._db.delete(
1268 self._tables.dataset_location_trash,
1269 ["dataset_id", "datastore_name"],
1270 *[{"dataset_id": ref.id, "datastore_name": datastoreName} for ref in refs]
1271 )
1273 @transactional
1274 def removeDatasetLocation(self, datastoreName: str, refs: Iterable[DatasetRef]) -> None:
1275 """Remove datastore location associated with this dataset.
1277 Typically used by `Datastore` when a dataset is removed.
1279 Parameters
1280 ----------
1281 datastoreName : `str`
1282 Name of this `Datastore`.
1283 refs : iterable of `DatasetRef`
1284 A reference to the dataset for which information is to be removed.
1286 Raises
1287 ------
1288 AmbiguousDatasetError
1289 Raised if ``ref.id`` is `None`.
1290 """
1291 if not refs:
1292 return
1293 self._db.delete(
1294 self._tables.dataset_location,
1295 ["dataset_id", "datastore_name"],
1296 *[{"dataset_id": _checkAndGetId(ref), "datastore_name": datastoreName} for ref in refs]
1297 )
1299 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
1300 records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds):
1301 """Expand a dimension-based data ID to include additional information.
1303 Parameters
1304 ----------
1305 dataId : `DataCoordinate` or `dict`, optional
1306 Data ID to be expanded; augmented and overridden by ``kwds``.
1307 graph : `DimensionGraph`, optional
1308 Set of dimensions for the expanded ID. If `None`, the dimensions
1309 will be inferred from the keys of ``dataId`` and ``kwds``.
1310 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph``
1311 are silently ignored, providing a way to extract and expand a
1312 subset of a data ID.
1313 records : mapping [`DimensionElement`, `DimensionRecord`], optional
1314 Dimension record data to use before querying the database for that
1315 data.
1316 **kwds
1317 Additional keywords are treated like additional key-value pairs for
1318 ``dataId``, extending and overriding
1320 Returns
1321 -------
1322 expanded : `ExpandedDataCoordinate`
1323 A data ID that includes full metadata for all of the dimensions it
1324 identifieds.
1325 """
1326 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds)
1327 if isinstance(standardized, ExpandedDataCoordinate):
1328 return standardized
1329 elif isinstance(dataId, ExpandedDataCoordinate):
1330 records = dict(records) if records is not None else {}
1331 records.update(dataId.records)
1332 else:
1333 records = dict(records) if records is not None else {}
1334 keys = dict(standardized)
1335 regions = []
1336 timespans = []
1337 for element in standardized.graph.primaryKeyTraversalOrder:
1338 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
1339 if record is ...:
1340 storage = self._dimensions[element]
1341 record = storage.fetch(keys)
1342 records[element] = record
1343 if record is not None:
1344 for d in element.implied:
1345 value = getattr(record, d.name)
1346 if keys.setdefault(d, value) != value:
1347 raise InconsistentDataIdError(f"Data ID {standardized} has {d.name}={keys[d]!r}, "
1348 f"but {element.name} implies {d.name}={value!r}.")
1349 if element in standardized.graph.spatial and record.region is not None:
1350 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions):
1351 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} "
1352 f"is disjoint with those for other elements.")
1353 regions.append(record.region)
1354 if element in standardized.graph.temporal:
1355 if any(not record.timespan.overlaps(t) for t in timespans):
1356 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}"
1357 f" is disjoint with those for other elements.")
1358 timespans.append(record.timespan)
1359 else:
1360 if element in standardized.graph.required:
1361 raise LookupError(
1362 f"Could not fetch record for required dimension {element.name} via keys {keys}."
1363 )
1364 if element.alwaysJoin:
1365 raise InconsistentDataIdError(
1366 f"Could not fetch record for element {element.name} via keys {keys}, ",
1367 f"but it is marked alwaysJoin=True; this means one or more dimensions are not "
1368 f"related."
1369 )
1370 records.update((d, None) for d in element.implied)
1371 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)
1373 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]:
1374 """Compare the keys and values of a pair of data IDs for consistency.
1376 See `ConsistentDataIds` for more information.
1378 Parameters
1379 ----------
1380 a : `dict` or `DataCoordinate`
1381 First data ID to be compared.
1382 b : `dict` or `DataCoordinate`
1383 Second data ID to be compared.
1385 Returns
1386 -------
1387 relationship : `ConsistentDataIds` or `None`
1388 Relationship information. This is not `None` and coerces to
1389 `True` in boolean contexts if and only if the data IDs are
1390 consistent in terms of all common key-value pairs, all many-to-many
1391 join tables, and all spatial andtemporal relationships.
1392 """
1393 a = DataCoordinate.standardize(a, universe=self.dimensions)
1394 b = DataCoordinate.standardize(b, universe=self.dimensions)
1395 aFull = getattr(a, "full", None)
1396 bFull = getattr(b, "full", None)
1397 aBest = aFull if aFull is not None else a
1398 bBest = bFull if bFull is not None else b
1399 jointKeys = aBest.keys() & bBest.keys()
1400 # If any common values are not equal, we know they are inconsistent.
1401 if any(aBest[k] != bBest[k] for k in jointKeys):
1402 return None
1403 # If the graphs are equal, we know the data IDs are.
1404 if a.graph == b.graph:
1405 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys))
1406 # Result is still inconclusive. Try to expand a data ID containing
1407 # keys from both; that will fail if they are inconsistent.
1408 # First, if either input was already an ExpandedDataCoordinate, extract
1409 # its records so we don't have to query for them.
1410 records = {}
1411 if hasattr(a, "records"):
1412 records.update(a.records)
1413 if hasattr(b, "records"):
1414 records.update(b.records)
1415 try:
1416 self.expandDataId({**a, **b}, graph=(a.graph | b.graph), records=records)
1417 except InconsistentDataIdError:
1418 return None
1419 # We know the answer is not `None`; time to figure out what it is.
1420 return ConsistentDataIds(
1421 contains=(a.graph >= b.graph),
1422 within=(a.graph <= b.graph),
1423 overlaps=bool(a.graph & b.graph),
1424 )
1426 def insertDimensionData(self, element: Union[DimensionElement, str],
1427 *data: Union[dict, DimensionRecord],
1428 conform: bool = True):
1429 """Insert one or more dimension records into the database.
1431 Parameters
1432 ----------
1433 element : `DimensionElement` or `str`
1434 The `DimensionElement` or name thereof that identifies the table
1435 records will be inserted into.
1436 data : `dict` or `DimensionRecord` (variadic)
1437 One or more records to insert.
1438 conform : `bool`, optional
1439 If `False` (`True` is default) perform no checking or conversions,
1440 and assume that ``element`` is a `DimensionElement` instance and
1441 ``data`` is a one or more `DimensionRecord` instances of the
1442 appropriate subclass.
1443 """
1444 if conform:
1445 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
1446 records = [element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row
1447 for row in data]
1448 else:
1449 records = data
1450 storage = self._dimensions[element]
1451 storage.insert(*records)
1453 def syncDimensionData(self, element: Union[DimensionElement, str],
1454 row: Union[dict, DimensionRecord],
1455 conform: bool = True) -> bool:
1456 """Synchronize the given dimension record with the database, inserting
1457 if it does not already exist and comparing values if it does.
1459 Parameters
1460 ----------
1461 element : `DimensionElement` or `str`
1462 The `DimensionElement` or name thereof that identifies the table
1463 records will be inserted into.
1464 row : `dict` or `DimensionRecord`
1465 The record to insert.
1466 conform : `bool`, optional
1467 If `False` (`True` is default) perform no checking or conversions,
1468 and assume that ``element`` is a `DimensionElement` instance and
1469 ``data`` is a one or more `DimensionRecord` instances of the
1470 appropriate subclass.
1472 Returns
1473 -------
1474 inserted : `bool`
1475 `True` if a new row was inserted, `False` otherwise.
1477 Raises
1478 ------
1479 ConflictingDefinitionError
1480 Raised if the record exists in the database (according to primary
1481 key lookup) but is inconsistent with the given one.
1483 Notes
1484 -----
1485 This method cannot be called within transactions, as it needs to be
1486 able to perform its own transaction to be concurrent.
1487 """
1488 if conform:
1489 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
1490 record = element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row
1491 else:
1492 record = row
1493 storage = self._dimensions[element]
1494 try:
1495 return storage.sync(record)
1496 except DatabaseConflictError as err:
1497 raise ConflictingDefinitionError(str(err)) from err
1499 def queryDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]:
1500 """Iterate over the dataset types whose names match an expression.
1502 Parameters
1503 ----------
1504 expression : `Any`, optional
1505 An expression that fully or partially identifies the dataset types
1506 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1507 `...` can be used to return all dataset types, and is the default.
1508 See :ref:`daf_butler_dataset_type_expressions` for more
1509 information.
1511 Yields
1512 ------
1513 datasetType : `DatasetType`
1514 A `DatasetType` instance whose name matches ``expression``.
1515 """
1516 yield from self._datasetStorage.fetchDatasetTypes(expression)
1518 def queryCollections(self, expression: Any = ...,
1519 datasetType: Optional[DatasetType] = None,
1520 collectionType: Optional[CollectionType] = None,
1521 flattenChains: bool = False,
1522 includeChains: Optional[bool] = None) -> Iterator[str]:
1523 """Iterate over the collections whose names match an expression.
1525 Parameters
1526 ----------
1527 expression : `Any`, optional
1528 An expression that fully or partially identifies the collections
1529 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1530 `...` can be used to return all collections, and is the default.
1531 See :ref:`daf_butler_collection_expressions` for more
1532 information.
1533 datasetType : `DatasetType`, optional
1534 If provided, only yield collections that should be searched for
1535 this dataset type according to ``expression``. If this is
1536 not provided, any dataset type restrictions in ``expression`` are
1537 ignored.
1538 collectionType : `CollectionType`, optional
1539 If provided, only yield collections of this type.
1540 flattenChains : `bool`, optional
1541 If `True` (`False` is default), recursively yield the child
1542 collections of matching `~CollectionType.CHAINED` collections.
1543 includeChains : `bool`, optional
1544 If `True`, yield records for matching `~CollectionType.CHAINED`
1545 collections. Default is the opposite of ``flattenChains``: include
1546 either CHAINED collections or their children, but not both.
1548 Yields
1549 ------
1550 collection : `str`
1551 The name of a collection that matches ``expression``.
1552 """
1553 query = CollectionQuery.fromExpression(expression)
1554 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType,
1555 flattenChains=flattenChains, includeChains=includeChains):
1556 yield record.name
1558 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
1559 """Return a `QueryBuilder` instance capable of constructing and
1560 managing more complex queries than those obtainable via `Registry`
1561 interfaces.
1563 This is an advanced interface; downstream code should prefer
1564 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those
1565 are sufficient.
1567 Parameters
1568 ----------
1569 summary : `QuerySummary`
1570 Object describing and categorizing the full set of dimensions that
1571 will be included in the query.
1573 Returns
1574 -------
1575 builder : `QueryBuilder`
1576 Object that can be used to construct and perform advanced queries.
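Examples
--------
A minimal sketch of advanced usage, not part of the original docstring;
the dimension names, instrument value, and expression string are
assumptions about the repository contents:

>>> from lsst.daf.butler.registry.queries import QuerySummary
>>> summary = QuerySummary(
...     requested=registry.dimensions.extract(["exposure", "detector"]),
...     dataId=registry.expandDataId(instrument="HSC"),
...     expression="exposure > 100",
... )
>>> builder = registry.makeQueryBuilder(summary)
>>> query = builder.finish()
>>> predicate = query.predicate()
>>> dataIds = [query.extractDataId(row) for row in query.execute()
...            if predicate(row)]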
1577 """
1578 return QueryBuilder(connection=self._connection, summary=summary,
1579 dimensionStorage=self._dimensions,
1580 datasetStorage=self._datasetStorage)
1582 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1583 dataId: Optional[DataId] = None,
1584 datasets: Any = None,
1585 collections: Any = None,
1586 where: Optional[str] = None,
1587 expand: bool = True,
1588 **kwds) -> Iterator[DataCoordinate]:
1589 """Query for and iterate over data IDs matching user-provided criteria.
1591 Parameters
1592 ----------
1593 dimensions : `Dimension` or `str`, or iterable thereof
1594 The dimensions of the data IDs to yield, as either `Dimension`
1595 instances or `str`. Will be automatically expanded to a complete
1596 `DimensionGraph`.
1597 dataId : `dict` or `DataCoordinate`, optional
1598 A data ID whose key-value pairs are used as equality constraints
1599 in the query.
1600 datasets : `Any`, optional
1601 An expression that fully or partially identifies dataset types
1602 that should constrain the yielded data IDs. For example, including
1603 "raw" here would constrain the yielded ``instrument``,
1604 ``exposure``, ``detector``, and ``physical_filter`` values to only
1605 those for which at least one "raw" dataset exists in
1606 ``collections``. Allowed types include `DatasetType`, `str`,
1607 `re.Pattern`, and iterables thereof. Unlike other dataset type
1608 expressions, `...` is not permitted - it doesn't make sense to
1609 constrain data IDs on the existence of *all* datasets.
1610 See :ref:`daf_butler_dataset_type_expressions` for more
1611 information.
1612 collections : `Any`, optional
1613 An expression that fully or partially identifies the collections
1614 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1615 thereof. `...` can be used to return all collections. Must be
1616 provided if ``datasets`` is, and is ignored if it is not. See
1617 :ref:`daf_butler_collection_expressions` for more information.
1618 where : `str`, optional
1619 A string expression similar to a SQL WHERE clause. May involve
1620 any column of a dimension table or (as a shortcut for the primary
1621 key column of a dimension table) dimension name. See
1622 :ref:`daf_butler_dimension_expressions` for more information.
1623 expand : `bool`, optional
1624 If `True` (default) yield `ExpandedDataCoordinate` instead of
1625 minimal `DataCoordinate` base-class instances.
1626 kwds
1627 Additional keyword arguments are forwarded to
1628 `DataCoordinate.standardize` when processing the ``dataId``
1629 argument (and may be used to provide a constraining data ID even
1630 when the ``dataId`` argument is `None`).
1632 Yields
1633 ------
1634 dataId : `DataCoordinate`
1635 Data IDs matching the given query parameters. Order is
1636 unspecified.
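Examples
--------
Illustrative sketch only, not part of the original docstring; the
dataset type, collection, and instrument names are assumptions about
the repository contents:

>>> dataIds = registry.queryDimensions(
...     ["exposure", "detector"],
...     datasets="raw",
...     collections="HSC/raw/all",
...     instrument="HSC",
...     where="exposure > 100",
... )
>>> for dataId in dataIds:
...     print(dataId["exposure"], dataId["detector"])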
1637 """
1638 dimensions = iterable(dimensions)
1639 standardizedDataId = self.expandDataId(dataId, **kwds)
1640 standardizedDatasetTypes = []
1641 requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
1642 if datasets is not None:
1643 if collections is None:
1644 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1645 for datasetType in self._datasetStorage.fetchDatasetTypes(datasets):
1646 requestedDimensionNames.update(datasetType.dimensions.names)
1647 standardizedDatasetTypes.append(datasetType)
1648 # Preprocess collections expression in case the original included
1649 # single-pass iterators (we'll want to use it multiple times
1650 # below).
1651 collections = CollectionQuery.fromExpression(collections)
1653 summary = QuerySummary(
1654 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1655 dataId=standardizedDataId,
1656 expression=where,
1657 )
1658 builder = self.makeQueryBuilder(summary)
1659 for datasetType in standardizedDatasetTypes:
1660 builder.joinDataset(datasetType, collections, isResult=False)
1661 query = builder.finish()
1662 predicate = query.predicate()
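# The predicate applies any remaining filtering that could not be
# pushed into the SQL query itself (typically spatial/temporal
# overlap tests); rows that fail it are skipped.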
1663 for row in query.execute():
1664 if predicate(row):
1665 result = query.extractDataId(row)
1666 if expand:
1667 yield self.expandDataId(result, records=standardizedDataId.records)
1668 else:
1669 yield result
1671 def queryDatasets(self, datasetType: Any, *,
1672 collections: Any,
1673 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1674 dataId: Optional[DataId] = None,
1675 where: Optional[str] = None,
1676 deduplicate: bool = False,
1677 expand: bool = True,
1678 **kwds) -> Iterator[DatasetRef]:
1679 """Query for and iterate over dataset references matching user-provided
1680 criteria.
1682 Parameters
1683 ----------
1684 datasetType : `Any`
1685 An expression that fully or partially identifies the dataset types
1686 to be queried. Allowed types include `DatasetType`, `str`,
1687 `re.Pattern`, and iterables thereof. The special value `...` can
1688 be used to query all dataset types. See
1689 :ref:`daf_butler_dataset_type_expressions` for more information.
1690 collections : `Any`
1691 An expression that fully or partially identifies the collections
1692 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1693 thereof. `...` can be used to return all collections. See
1694 :ref:`daf_butler_collection_expressions` for more information.
1695 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1696 Dimensions to include in the query (in addition to those used
1697 to identify the queried dataset type(s)), either to constrain
1698 the resulting datasets to those for which a matching dimension
1699 exists, or to relate the dataset type's dimensions to dimensions
1700 referenced by the ``dataId`` or ``where`` arguments.
1701 dataId : `dict` or `DataCoordinate`, optional
1702 A data ID whose key-value pairs are used as equality constraints
1703 in the query.
1704 where : `str`, optional
1705 A string expression similar to a SQL WHERE clause. May involve
1706 any column of a dimension table or (as a shortcut for the primary
1707 key column of a dimension table) dimension name. See
1708 :ref:`daf_butler_dimension_expressions` for more information.
1709 deduplicate : `bool`, optional
1710 If `True` (`False` is default), for each result data ID, only
1711 yield one `DatasetRef` of each `DatasetType`, from the first
1712 collection in which a dataset of that dataset type appears
1713 (according to the order of ``collections`` passed in). In that
1714 case, ``collections`` must not contain regular expressions and may
1715 not be `...`.
1716 expand : `bool`, optional
1717 If `True` (default) attach `ExpandedDataCoordinate` instead of
1718 minimal `DataCoordinate` base-class instances.
1719 kwds
1720 Additional keyword arguments are forwarded to
1721 `DataCoordinate.standardize` when processing the ``dataId``
1722 argument (and may be used to provide a constraining data ID even
1723 when the ``dataId`` argument is `None`).
1725 Yields
1726 ------
1727 ref : `DatasetRef`
1728 Dataset references matching the given query criteria. These
1729 are grouped by `DatasetType` if the query evaluates to multiple
1730 dataset types, but order is otherwise unspecified.
1732 Raises
1733 ------
1734 TypeError
1735 Raised when the arguments are incompatible, such as when a
1736 collection wildcard is passed when ``deduplicate`` is `True`.
1738 Notes
1739 -----
1740 When multiple dataset types are queried in a single call, the
1741 results of this operation are equivalent to querying for each dataset
1742 type separately in turn, and no information about the relationships
1743 between datasets of different types is included. In contexts where
1744 that kind of information is important, the recommended pattern is to
1745 use `queryDimensions` to first obtain data IDs (possibly with the
1746 desired dataset types and collections passed as constraints to the
1747 query), and then use multiple (generally much simpler) calls to
1748 `queryDatasets` with the returned data IDs passed as constraints.
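Examples
--------
A sketch only, not part of the original docstring; the dataset type and
collection names are assumptions about the repository contents:

>>> refs = registry.queryDatasets(
...     "calexp",
...     collections=["HSC/runs/RC2", "HSC/calib"],
...     where="detector = 50",
...     deduplicate=True,
... )
>>> for ref in refs:
...     print(ref.datasetType.name, ref.dataId)

Because ``deduplicate`` is `True`, ``collections`` is given as an
ordered list of explicit collection names (no wildcards), and at most
one ``calexp`` is yielded per data ID, taken from the earliest
collection that contains it.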
1749 """
1750 # Standardize the collections expression.
1751 if deduplicate:
1752 collections = CollectionSearch.fromExpression(collections)
1753 else:
1754 collections = CollectionQuery.fromExpression(collections)
1755 # Standardize and expand the data ID provided as a constraint.
1756 standardizedDataId = self.expandDataId(dataId, **kwds)
1757 # If the datasetType passed isn't actually a DatasetType, expand it
1758 # (it could be an expression that yields multiple DatasetTypes) and
1759 # recurse.
1760 if not isinstance(datasetType, DatasetType):
1761 for trueDatasetType in self._datasetStorage.fetchDatasetTypes(datasetType):
1762 yield from self.queryDatasets(trueDatasetType, collections=collections,
1763 dimensions=dimensions, dataId=standardizedDataId,
1764 where=where, deduplicate=deduplicate, expand=expand)
1765 return
1766 # The full set of dimensions in the query is the combination of those
1767 # needed for the DatasetType and those explicitly requested, if any.
1768 requestedDimensionNames = set(datasetType.dimensions.names)
1769 if dimensions is not None:
1770 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1771 # Construct the summary structure needed to construct a QueryBuilder.
1772 summary = QuerySummary(
1773 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1774 dataId=standardizedDataId,
1775 expression=where,
1776 )
1777 builder = self.makeQueryBuilder(summary)
1778 # Add the dataset subquery to the query, telling the QueryBuilder to
1779 # include the rank of the selected collection in the results only if we
1780 # need to deduplicate. Note that if any of the collections are
1781 # actually wildcard expressions, and we've asked for deduplication,
1782 # this will raise TypeError for us.
1783 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate):
1784 return
1785 query = builder.finish()
1786 predicate = query.predicate()
1787 if not deduplicate:
1788 # No need to de-duplicate across collections.
1789 for row in query.execute():
1790 if predicate(row):
1791 dataId = query.extractDataId(row, graph=datasetType.dimensions)
1792 if expand:
1793 dataId = self.expandDataId(dataId, records=standardizedDataId.records)
1794 yield query.extractDatasetRef(row, datasetType, dataId)[0]
1795 else:
1796 # For each data ID, yield only the DatasetRef with the lowest
1797 # collection rank.
1798 bestRefs = {}
1799 bestRanks = {}
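# A dataset's rank is the position of its collection in the ordered
# search path passed by the caller, so keeping the lowest rank per
# data ID implements the "first matching collection wins" rule
# described in the docstring.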
1800 for row in query.execute():
1801 if predicate(row):
1802 ref, rank = query.extractDatasetRef(row, datasetType)
1803 bestRank = bestRanks.get(ref.dataId, sys.maxsize)
1804 if rank < bestRank:
1805 bestRefs[ref.dataId] = ref
1806 bestRanks[ref.dataId] = rank
1807 # If caller requested expanded data IDs, we defer that until here
1808 # so we do as little expansion as possible.
1809 if expand:
1810 for ref in bestRefs.values():
1811 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records)
1812 yield ref.expanded(dataId)
1813 else:
1814 yield from bestRefs.values()
1816 dimensions: DimensionUniverse
1817 """The universe of all dimensions known to the registry
1818 (`DimensionUniverse`).
1819 """
1821 storageClasses: StorageClassFactory
1822 """All storage classes known to the registry (`StorageClassFactory`).
1823 """