Coverage for python/lsst/daf/butler/registry/_registry.py : 12%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("Registry", "AmbiguousDatasetError", "ConflictingDefinitionError", "OrphanedRecordError")
26import contextlib
27import sys
28from typing import (
29 Any,
30 FrozenSet,
31 Iterable,
32 Iterator,
33 List,
34 Mapping,
35 Optional,
36 Set,
37 Type,
38 TYPE_CHECKING,
39 Union,
40)
42import sqlalchemy
44from ..core import (
45 Config,
46 DataCoordinate,
47 DataId,
48 DatasetRef,
49 DatasetType,
50 Dimension,
51 DimensionElement,
52 DimensionGraph,
53 DimensionRecord,
54 DimensionUniverse,
55 ExpandedDataCoordinate,
56 StorageClassFactory,
57)
58from ..core import ddl
59from ..core.utils import doImport, iterable, transactional, NamedKeyDict
60from ._config import RegistryConfig
61from .queries import (
62 CollectionsExpression,
63 DatasetRegistryStorage,
64 DatasetTypeExpression,
65 QueryBuilder,
66 QuerySummary,
67)
68from .tables import makeRegistryTableSpecs
70if TYPE_CHECKING:
71 from ..butlerConfig import ButlerConfig
72 from ..core import (
73 Quantum
74 )
75 from .interfaces import (
76 Database,
77 OpaqueTableStorageManager,
78 DimensionRecordStorageManager,
79 )
82class AmbiguousDatasetError(Exception):
83 """Exception raised when a `DatasetRef` has no ID and a `Registry`
84 operation requires one.
85 """
88class ConflictingDefinitionError(Exception):
89 """Exception raised when trying to insert a database record when a
90 conflicting record already exists.
91 """
94class OrphanedRecordError(Exception):
95 """Exception raised when trying to remove or modify a database record
96 that is still being used in some other table.
97 """
100def _expandComponents(refs: Iterable[DatasetRef]) -> Iterator[DatasetRef]:
101 """Expand an iterable of datasets to include its components.
103 Parameters
104 ----------
105 refs : iterable of `DatasetRef`
106 An iterable of `DatasetRef` instances.
108 Yields
109 ------
110 refs : `DatasetRef`
111 Recursively expanded datasets.
112 """
113 for ref in refs:
114 yield ref
115 yield from _expandComponents(ref.components.values())
118def _checkAndGetId(ref: DatasetRef) -> int:
119 """Return the ID of the given `DatasetRef`, or raise if it is `None`.
121 This trivial function exists to allow operations that would otherwise be
122 natural list comprehensions to check that the ID is not `None` as well.
124 Parameters
125 ----------
126 ref : `DatasetRef`
127 Dataset reference.
129 Returns
130 -------
131 id : `int`
132 ``ref.id``
134 Raises
135 ------
136 AmbiguousDatasetError
137 Raised if ``ref.id`` is `None`.
138 """
139 if ref.id is None:
140 raise AmbiguousDatasetError("Dataset ID must not be `None`.")
141 return ref.id
144class Registry:
145 """Registry interface.
147 Parameters
148 ----------
149 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
150 Registry configuration
151 """
153 defaultConfigFile = None
154 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
155 absolute path. Can be None if no defaults specified.
156 """
158 @classmethod
159 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
160 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
161 """Create `Registry` subclass instance from `config`.
163 Uses ``registry.cls`` from `config` to determine which subclass to
164 instantiate.
166 Parameters
167 ----------
168 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
169 Registry configuration
170 create : `bool`, optional
171 If `True`, assume an empty Registry and create a new one.
172 butlerRoot : `str`, optional
173 Path to the repository root this `Registry` will manage.
174 writeable : `bool`, optional
175 If `True` (default) create a read-write connection to the database.
177 Returns
178 -------
179 registry : `Registry` (subclass)
180 A new `Registry` subclass instance.
181 """
182 if not isinstance(config, RegistryConfig):
183 if isinstance(config, str) or isinstance(config, Config):
184 config = RegistryConfig(config)
185 else:
186 raise ValueError("Incompatible Registry configuration: {}".format(config))
187 config.replaceRoot(butlerRoot)
188 DatabaseClass = config.getDatabaseClass()
189 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
190 namespace=config.get("namespace"), writeable=writeable)
191 universe = DimensionUniverse(config)
192 opaque = doImport(config["managers", "opaque"])
193 dimensions = doImport(config["managers", "dimensions"])
194 return cls(database, universe, dimensions=dimensions, opaque=opaque, create=create)
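    # Usage sketch (not part of this module): build a Registry from a
    # configuration file. The file name and the flag values are assumptions
    # for illustration only.
    #
    #     registry = Registry.fromConfig("registry.yaml", create=True,
    #                                    writeable=True)
    #     assert registry.isWriteable()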
196 def __init__(self, database: Database, universe: DimensionUniverse, *,
197 opaque: Type[OpaqueTableStorageManager],
198 dimensions: Type[DimensionRecordStorageManager],
199 create: bool = False):
200 self._db = database
201 self.storageClasses = StorageClassFactory()
202 with self._db.declareStaticTables(create=create) as context:
203 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
204 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions))
205 self._opaque = opaque.initialize(self._db, context)
206 # TODO: we shouldn't be grabbing the private connection from the
207 # Database instance like this, but it's a reasonable way to proceed
208 # while we transition to using the Database API more.
209 self._connection = self._db._connection
210 self._datasetStorage = DatasetRegistryStorage(connection=self._connection,
211 universe=self.dimensions,
212 tables=self._tables._asdict())
213 self._datasetTypes = {}
214 self._runIdsByName = {} # key = name, value = id
215 self._runNamesById = {} # key = id, value = name
217 def __str__(self) -> str:
218 return str(self._db)
220 def __repr__(self) -> str:
221 return f"Registry({self._db!r}, {self.dimensions!r})"
223 def isWriteable(self) -> bool:
224 """Return `True` if this registry allows write operations, and `False`
225 otherwise.
226 """
227 return self._db.isWriteable()
229 @property
230 def dimensions(self) -> DimensionUniverse:
231 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
232 """
233 return self._dimensions.universe
235 @contextlib.contextmanager
236 def transaction(self):
237 """Return a context manager that represents a transaction.
238 """
239 # TODO make savepoint=False the default.
240 try:
241 with self._db.transaction():
242 yield
243 except BaseException:
244 # TODO: this clears the caches sometimes when we wouldn't actually
245 # need to. Can we avoid that?
246 self._dimensions.clearCaches()
247 self._datasetTypes.clear()
248 raise
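    # Sketch of using the transaction context manager: everything inside the
    # ``with`` block commits or rolls back together, and the internal caches
    # are cleared on failure. The dataset type and run names used here are
    # illustrative assumptions.
    #
    #     with registry.transaction():
    #         registry.registerDatasetType(datasetType)
    #         registry.insertDatasets(datasetType, dataIds=[dataId], run="my_run")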
250 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec):
251 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
252 other data repository client.
254 Opaque table records can be added via `insertOpaqueData`, retrieved via
255 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
257 Parameters
258 ----------
259 tableName : `str`
260 Logical name of the opaque table. This may differ from the
261 actual name used in the database by a prefix and/or suffix.
262 spec : `ddl.TableSpec`
263 Specification for the table to be added.
264 """
265 self._opaque.register(tableName, spec)
267 @transactional
268 def insertOpaqueData(self, tableName: str, *data: dict):
269 """Insert records into an opaque table.
271 Parameters
272 ----------
273 tableName : `str`
274 Logical name of the opaque table. Must match the name used in a
275 previous call to `registerOpaqueTable`.
276 data
277 Each additional positional argument is a dictionary that represents
278 a single row to be added.
279 """
280 self._opaque[tableName].insert(*data)
282 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
283 """Retrieve records from an opaque table.
285 Parameters
286 ----------
287 tableName : `str`
288 Logical name of the opaque table. Must match the name used in a
289 previous call to `registerOpaqueTable`.
290 where
291 Additional keyword arguments are interpreted as equality
292 constraints that restrict the returned rows (combined with AND);
293 keyword arguments are column names and values are the values they
294 must have.
296 Yields
297 ------
298 row : `dict`
299 A dictionary representing a single result row.
300 """
301 yield from self._opaque[tableName].fetch(**where)
303 @transactional
304 def deleteOpaqueData(self, tableName: str, **where: Any):
305 """Remove records from an opaque table.
307 Parameters
308 ----------
309 tableName : `str`
310 Logical name of the opaque table. Must match the name used in a
311 previous call to `registerOpaqueTable`.
312 where
313 Additional keyword arguments are interpreted as equality
314 constraints that restrict the deleted rows (combined with AND);
315 keyword arguments are column names and values are the values they
316 must have.
317 """
318 self._opaque[tableName].delete(**where)
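    # Round-trip sketch for the opaque-table API above. The table name, column
    # names, and values are illustrative assumptions; ``spec`` is assumed to be
    # an existing `ddl.TableSpec` describing the table.
    #
    #     registry.registerOpaqueTable("datastore_records", spec)
    #     registry.insertOpaqueData("datastore_records",
    #                               {"dataset_id": 1, "path": "a.fits"},
    #                               {"dataset_id": 2, "path": "b.fits"})
    #     rows = list(registry.fetchOpaqueData("datastore_records", dataset_id=1))
    #     registry.deleteOpaqueData("datastore_records", dataset_id=1)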
320 def getAllCollections(self):
321 """Get names of all the collections found in this repository.
323 Returns
324 -------
325 collections : `set` of `str`
326 The collections.
327 """
328 table = self._tables.dataset_collection
329 result = self._db.query(sqlalchemy.sql.select([table.c.collection]).distinct()).fetchall()
330 if result is None:
331 return set()
332 return {r[0] for r in result}
334 def registerRun(self, name: str):
335 """Add a new run if one with the given name does not exist.
337 Parameters
338 ----------
339 name : `str`
340 The name of the run to create.
342 Notes
343 -----
344 This method cannot be called within transactions, as it needs to be
345 able to perform its own transaction to be concurrent.
346 """
347 id = self._runIdsByName.get(name)
348 if id is None:
349 (id,), _ = self._db.sync(self._tables.run, keys={"name": name}, returning=["id"])
350 self._runIdsByName[name] = id
351 self._runNamesById[id] = name
352 # Assume that if the run is in the cache, it's in the database, because
353 # right now there's no way to delete them.
355 def _getRunNameFromId(self, id: int) -> str:
356 """Return the name of the run associated with the given integer ID.
357 """
358 assert isinstance(id, int)
359 name = self._runNamesById.get(id)
360 if name is None:
361 table = self._tables.run
362 name = self._db.query(
363 sqlalchemy.sql.select(
364 [table.columns.name]
365 ).select_from(
366 table
367 ).where(
368 table.columns.id == id
369 )
370 ).scalar()
371 self._runNamesById[id] = name
372 self._runIdsByName[name] = id
373 return name
375 def _getRunIdFromName(self, name: str) -> int:
376 """Return the integer ID of the run associated with the given name.
377 """
378 assert isinstance(name, str)
379 id = self._runIdsByName.get(name)
380 if id is None:
381 table = self._tables.run
382 id = self._db.query(
383 sqlalchemy.sql.select(
384 [table.columns.id]
385 ).select_from(
386 table
387 ).where(
388 table.columns.name == name
389 )
390 ).scalar()
391 self._runNamesById[id] = name
392 self._runIdsByName[name] = id
393 return id
395 @transactional
396 def registerDatasetType(self, datasetType: DatasetType) -> bool:
397 """
398 Add a new `DatasetType` to the Registry.
400 It is not an error to register the same `DatasetType` twice.
402 Parameters
403 ----------
404 datasetType : `DatasetType`
405 The `DatasetType` to be added.
407 Returns
408 -------
409 inserted : `bool`
410 `True` if ``datasetType`` was inserted, `False` if an identical
411 existing `DatasetType` was found. Note that in either case the
412 DatasetType is guaranteed to be defined in the Registry
413 consistently with the given definition.
415 Raises
416 ------
417 ValueError
418 Raised if the dimensions or storage class are invalid.
419 ConflictingDefinitionError
420 Raised if this DatasetType is already registered with a different
421 definition.
422 """
423 # TODO: this implementation isn't concurrent, except *maybe* in SQLite
424 # with aggressive locking (where starting a transaction is essentially
425 # the same as grabbing a full-database lock). Should be reimplemented
426 # with Database.sync to fix this, but that may require schema changes
427 # as well so we only have to synchronize one row to know if we have
428 # inconsistent definitions.
430 # If the DatasetType is already in the cache, we assume it's already in
431 # the DB (note that we don't actually provide a way to remove them from
432 # the DB).
433 existingDatasetType = self._datasetTypes.get(datasetType.name)
434 # If it's not in the cache, try to insert it.
435 if existingDatasetType is None:
436 try:
437 with self._db.transaction():
438 self._db.insert(
439 self._tables.dataset_type,
440 {
441 "dataset_type_name": datasetType.name,
442 "storage_class": datasetType.storageClass.name,
443 }
444 )
445 except sqlalchemy.exc.IntegrityError:
446 # Insert failed on the only unique constraint on this table:
447 # dataset_type_name. So now the question is whether the one in
448 # there is the same as the one we tried to insert.
449 existingDatasetType = self.getDatasetType(datasetType.name)
450 else:
451 # If adding the DatasetType record itself succeeded, add its
452 # dimensions (if any). We don't guard this in a try block
453 # because a problem with this insert means the database
454 # content must be corrupted.
455 if datasetType.dimensions:
456 self._db.insert(
457 self._tables.dataset_type_dimensions,
458 *[{"dataset_type_name": datasetType.name,
459 "dimension_name": dimensionName}
460 for dimensionName in datasetType.dimensions.names]
461 )
462 # Update the cache.
463 self._datasetTypes[datasetType.name] = datasetType
464 # Also register component DatasetTypes (if any).
465 for compName, compStorageClass in datasetType.storageClass.components.items():
466 compType = DatasetType(datasetType.componentTypeName(compName),
467 dimensions=datasetType.dimensions,
468 storageClass=compStorageClass)
469 self.registerDatasetType(compType)
470 # Inserts succeeded, nothing left to do here.
471 return True
472 # A DatasetType with this name exists, check if it is equal
473 if datasetType == existingDatasetType:
474 return False
475 else:
476 raise ConflictingDefinitionError(f"DatasetType: {datasetType} != existing {existingDatasetType}")
478 def getDatasetType(self, name: str) -> DatasetType:
479 """Get the `DatasetType`.
481 Parameters
482 ----------
483 name : `str`
484 Name of the type.
486 Returns
487 -------
488 type : `DatasetType`
489 The `DatasetType` associated with the given name.
491 Raises
492 ------
493 KeyError
494 Requested named DatasetType could not be found in registry.
495 """
496 datasetType = self._datasetTypes.get(name)
497 if datasetType is None:
498 # Get StorageClass from DatasetType table
499 result = self._db.query(
500 sqlalchemy.sql.select(
501 [self._tables.dataset_type.c.storage_class]
502 ).where(
503 self._tables.dataset_type.columns.dataset_type_name == name
504 )
505 ).fetchone()
507 if result is None:
508 raise KeyError("Could not find entry for datasetType {}".format(name))
510 storageClass = self.storageClasses.getStorageClass(result["storage_class"])
511 # Get Dimensions (if any) from DatasetTypeDimensions table
512 result = self._db.query(
513 sqlalchemy.sql.select(
514 [self._tables.dataset_type_dimensions.columns.dimension_name]
515 ).where(
516 self._tables.dataset_type_dimensions.columns.dataset_type_name == name
517 )
518 ).fetchall()
519 dimensions = DimensionGraph(self.dimensions, names=(r[0] for r in result) if result else ())
520 datasetType = DatasetType(name=name,
521 storageClass=storageClass,
522 dimensions=dimensions)
523 self._datasetTypes[name] = datasetType
524 return datasetType
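    # Sketch of defining, registering, and reading back a `DatasetType`. The
    # dataset type name, dimension names, and storage class name are
    # illustrative assumptions, not values defined in this module.
    #
    #     storageClass = registry.storageClasses.getStorageClass("StructuredDataDict")
    #     dimensions = registry.dimensions.extract(["instrument", "visit", "detector"])
    #     datasetType = DatasetType("rawMetadata", dimensions=dimensions,
    #                               storageClass=storageClass)
    #     registry.registerDatasetType(datasetType)   # returns True on first insert
    #     assert registry.getDatasetType("rawMetadata") == datasetType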
526 def getAllDatasetTypes(self) -> FrozenSet[DatasetType]:
527 """Get every registered `DatasetType`.
529 Returns
530 -------
531 types : `frozenset` of `DatasetType`
532 Every `DatasetType` in the registry.
533 """
534 # Get all the registered names
535 result = self._db.query(
536 sqlalchemy.sql.select(
537 [self._tables.dataset_type.columns.dataset_type_name]
538 )
539 ).fetchall()
540 if result is None:
541 return frozenset()
542 datasetTypeNames = [r[0] for r in result]
543 return frozenset(self.getDatasetType(name) for name in datasetTypeNames)
545 def _makeDatasetRefFromRow(self, row: sqlalchemy.engine.RowProxy,
546 datasetType: Optional[DatasetType] = None,
547 dataId: Optional[DataCoordinate] = None):
548 """Construct a DatasetRef from the result of a query on the Dataset
549 table.
551 Parameters
552 ----------
553 row : `sqlalchemy.engine.RowProxy`.
554 Row of a query that contains all columns from the `Dataset` table.
555 May include additional fields (which will be ignored).
556 datasetType : `DatasetType`, optional
557 `DatasetType` associated with this dataset. Will be retrieved
558 if not provided. If provided, the caller guarantees that it is
559 already consistent with what would have been retrieved from the
560 database.
561 dataId : `DataCoordinate`, optional
562 Dimensions associated with this dataset. Will be retrieved if not
563 provided. If provided, the caller guarantees that it is already
564 consistent with what would have been retrieved from the database.
566 Returns
567 -------
568 ref : `DatasetRef`.
569 A new `DatasetRef` instance.
570 """
571 if datasetType is None:
572 datasetType = self.getDatasetType(row["dataset_type_name"])
573 run = self._getRunNameFromId(row["run_id"])
574 datasetRefHash = row["dataset_ref_hash"]
575 if dataId is None:
576 # TODO: should we expand here?
577 dataId = DataCoordinate.standardize(
578 row,
579 graph=datasetType.dimensions,
580 universe=self.dimensions
581 )
582 # Get components (if present)
583 components = {}
584 if datasetType.storageClass.isComposite():
585 t = self._tables
586 columns = list(t.dataset.columns)
587 columns.append(t.dataset_composition.columns.component_name)
588 results = self._db.query(
589 sqlalchemy.sql.select(
590 columns
591 ).select_from(
592 t.dataset.join(
593 t.dataset_composition,
594 (t.dataset.columns.dataset_id == t.dataset_composition.columns.component_dataset_id)
595 )
596 ).where(
597 t.dataset_composition.columns.parent_dataset_id == row["dataset_id"]
598 )
599 ).fetchall()
600 for result in results:
601 componentName = result["component_name"]
602 componentDatasetType = DatasetType(
603 DatasetType.nameWithComponent(datasetType.name, componentName),
604 dimensions=datasetType.dimensions,
605 storageClass=datasetType.storageClass.components[componentName]
606 )
607 components[componentName] = self._makeDatasetRefFromRow(result, dataId=dataId,
608 datasetType=componentDatasetType)
609 if not components.keys() <= datasetType.storageClass.components.keys():
610 raise RuntimeError(
611 f"Inconsistency detected between dataset and storage class definitions: "
612 f"{datasetType.storageClass.name} has components "
613 f"{set(datasetType.storageClass.components.keys())}, "
614 f"but dataset has components {set(components.keys())}"
615 )
616 return DatasetRef(datasetType=datasetType, dataId=dataId, id=row["dataset_id"], run=run,
617 hash=datasetRefHash, components=components)
619 def find(self, collection: str, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None,
620 **kwds: Any) -> Optional[DatasetRef]:
621 """Lookup a dataset.
623 This can be used to obtain a `DatasetRef` that permits the dataset to
624 be read from a `Datastore`.
626 Parameters
627 ----------
628 collection : `str`
629 Identifies the collection to search.
630 datasetType : `DatasetType` or `str`
631 A `DatasetType` or the name of one.
632 dataId : `dict` or `DataCoordinate`, optional
633 A `dict`-like object containing the `Dimension` links that identify
634 the dataset within a collection.
635 **kwds
636 Additional keyword arguments passed to
637 `DataCoordinate.standardize` to convert ``dataId`` to a true
638 `DataCoordinate` or augment an existing one.
640 Returns
641 -------
642 ref : `DatasetRef`
643 A ref to the Dataset, or `None` if no matching Dataset
644 was found.
646 Raises
647 ------
648 LookupError
649 Raised if one or more data ID keys are missing.
650 """
651 if not isinstance(datasetType, DatasetType):
652 datasetType = self.getDatasetType(datasetType)
653 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
654 universe=self.dimensions, **kwds)
655 whereTerms = [
656 self._tables.dataset.columns.dataset_type_name == datasetType.name,
657 self._tables.dataset_collection.columns.collection == collection,
658 ]
659 whereTerms.extend(self._tables.dataset.columns[name] == dataId[name] for name in dataId.keys())
660 result = self._db.query(
661 self._tables.dataset.select().select_from(
662 self._tables.dataset.join(self._tables.dataset_collection)
663 ).where(
664 sqlalchemy.sql.and_(*whereTerms)
665 )
666 ).fetchone()
667 if result is None:
668 return None
669 return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId)
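    # Lookup sketch: the collection name, dataset type name, and data ID keys
    # below are assumptions for illustration.
    #
    #     ref = registry.find("my_run", "rawMetadata",
    #                         {"instrument": "DummyCam", "visit": 42, "detector": 1})
    #     if ref is not None:
    #         print(ref.id, ref.dataId)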
671 @transactional
672 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
673 run: str, *, producer: Optional[Quantum] = None, recursive: bool = False
674 ) -> List[DatasetRef]:
675 """Insert one or more datasets into the `Registry`
677 This always adds new datasets; to associate existing datasets with
678 a new collection, use ``associate``.
680 Parameters
681 ----------
682 datasetType : `DatasetType` or `str`
683 A `DatasetType` or the name of one.
684 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
685 Dimension-based identifiers for the new datasets.
686 run : `str`
687 The name of the run that produced the datasets.
688 producer : `Quantum`
689 Unit of work that produced the datasets. May be `None` to store
690 no provenance information, but if present the `Quantum` must
691 already have been added to the Registry.
692 recursive : `bool`
693 If True, recursively add datasets and attach entries for component
694 datasets as well.
696 Returns
697 -------
698 refs : `list` of `DatasetRef`
699 Resolved `DatasetRef` instances for all given data IDs (in the same
700 order).
Raises
------
701 ConflictingDefinitionError
702 Raised if a dataset with the same dataset type and data ID as one of those
703 given already exists in the given collection.
704 """
705 if not isinstance(datasetType, DatasetType):
706 datasetType = self.getDatasetType(datasetType)
707 rows = []
708 refs = []
709 base = {
710 "dataset_type_name": datasetType.name,
711 "run_id": self._getRunIdFromName(run),
712 "quantum_id": producer.id if producer is not None else None,
713 }
714 # Expand data IDs and build both a list of unresolved DatasetRefs
715 # and a list of dictionary rows for the dataset table.
716 for dataId in dataIds:
717 ref = DatasetRef(datasetType, self.expandDataId(dataId, graph=datasetType.dimensions))
718 refs.append(ref)
719 row = dict(base, dataset_ref_hash=ref.hash)
720 for dimension, value in ref.dataId.full.items():
721 row[dimension.name] = value
722 rows.append(row)
723 # Actually insert into the dataset table.
724 datasetIds = self._db.insert(self._tables.dataset, *rows, returnIds=True)
725 # Resolve the DatasetRefs with the autoincrement IDs we generated.
726 refs = [ref.resolved(id=datasetId, run=run) for datasetId, ref in zip(datasetIds, refs)]
727 # Associate the datasets with the run as a collection. Note that we
728 # do this before inserting component datasets so recursing doesn't try
729 # to associate those twice.
730 self.associate(run, refs)
731 if recursive and datasetType.isComposite():
732 # Insert component rows by recursing, and gather a single big list
733 # of rows to insert into the dataset_composition table.
734 compositionRows = []
735 for componentName in datasetType.storageClass.components:
736 componentDatasetType = datasetType.makeComponentDatasetType(componentName)
737 componentRefs = self.insertDatasets(componentDatasetType,
738 dataIds=(ref.dataId for ref in refs),
739 run=run,
740 producer=producer,
741 recursive=True)
742 for parentRef, componentRef in zip(refs, componentRefs):
743 parentRef._components[componentName] = componentRef
744 compositionRows.append({
745 "parent_dataset_id": parentRef.id,
746 "component_dataset_id": componentRef.id,
747 "component_name": componentName,
748 })
749 if compositionRows:
750 self._db.insert(self._tables.dataset_composition, *compositionRows)
751 return refs
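    # Insertion sketch: the run, dataset type, and data ID values are
    # illustrative. The run must exist (see `registerRun`), the dataset type
    # must already be registered, and the relevant dimension records must have
    # been inserted (see `insertDimensionData`).
    #
    #     registry.registerRun("my_run")
    #     refs = registry.insertDatasets("rawMetadata",
    #                                    dataIds=[{"instrument": "DummyCam",
    #                                              "visit": 42, "detector": 1}],
    #                                    run="my_run",
    #                                    recursive=True)
    #     print(refs[0].id, refs[0].run)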
753 def getDataset(self, id: int, datasetType: Optional[DatasetType] = None,
754 dataId: Optional[DataCoordinate] = None) -> Optional[DatasetRef]:
755 """Retrieve a Dataset entry.
757 Parameters
758 ----------
759 id : `int`
760 The unique identifier for the Dataset.
761 datasetType : `DatasetType`, optional
762 The `DatasetType` of the dataset to retrieve. This is used to
763 short-circuit retrieving the `DatasetType`, so if provided, the
764 caller is guaranteeing that it is what would have been retrieved.
765 dataId : `DataCoordinate`, optional
766 A `Dimension`-based identifier for the dataset within a
767 collection, possibly containing additional metadata. This is used
768 to short-circuit retrieving the dataId, so if provided, the
769 caller is guaranteeing that it is what would have been retrieved.
771 Returns
772 -------
773 ref : `DatasetRef`
774 A ref to the Dataset, or `None` if no matching Dataset
775 was found.
776 """
777 result = self._db.query(
778 self._tables.dataset.select().where(
779 self._tables.dataset.columns.dataset_id == id
780 )
781 ).fetchone()
782 if result is None:
783 return None
784 return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId)
786 @transactional
787 def removeDataset(self, ref: DatasetRef):
788 """Remove a dataset from the Registry.
790 The dataset and all components will be removed unconditionally from
791 all collections, and any associated `Quantum` records will also be
792 removed. `Datastore` records will *not* be deleted; the caller is
793 responsible for ensuring that the dataset has already been removed
794 from all Datastores.
796 Parameters
797 ----------
798 ref : `DatasetRef`
799 Reference to the dataset to be removed. Must include a valid
800 ``id`` attribute, and should be considered invalidated upon return.
802 Raises
803 ------
804 AmbiguousDatasetError
805 Raised if ``ref.id`` is `None`.
806 OrphanedRecordError
807 Raised if the dataset is still present in any `Datastore`.
808 """
809 if not ref.id:
810 raise AmbiguousDatasetError(f"Cannot remove dataset {ref} without ID.")
811 # Remove component datasets. We assume ``ref.components`` is already
812 # correctly populated, and rely on ON DELETE CASCADE to remove entries
813 # from DatasetComposition.
814 for componentRef in ref.components.values():
815 self.removeDataset(componentRef)
817 # Remove related quanta. We rely on ON DELETE CASCADE to remove any
818 # related records in dataset_consumers. Note that we permit a Quantum
819 # to be deleted without removing the datasets it refers to, but do not
820 # allow a dataset to be deleted without removing the Quanta that refer
821 # to them. A dataset is still quite usable without provenance, but
822 # provenance is worthless if it's inaccurate.
823 t = self._tables
824 selectProducer = sqlalchemy.sql.select(
825 [t.dataset.columns.quantum_id]
826 ).where(
827 t.dataset.columns.dataset_id == ref.id
828 )
829 selectConsumers = sqlalchemy.sql.select(
830 [t.dataset_consumers.columns.quantum_id]
831 ).where(
832 t.dataset_consumers.columns.dataset_id == ref.id
833 )
834 # TODO: we'd like to use Database.delete here, but it doesn't support
835 # general queries yet.
836 self._connection.execute(
837 t.quantum.delete().where(
838 t.quantum.columns.id.in_(sqlalchemy.sql.union(selectProducer, selectConsumers))
839 )
840 )
841 # Remove the Dataset record itself. We rely on ON DELETE CASCADE to
842 # remove from DatasetCollection, and assume foreign key violations
843 # come from DatasetLocation (everything else should have an ON DELETE).
844 try:
845 self._connection.execute(
846 t.dataset.delete().where(t.dataset.c.dataset_id == ref.id)
847 )
848 except sqlalchemy.exc.IntegrityError as err:
849 raise OrphanedRecordError(f"Dataset {ref} is still present in one or more Datastores.") from err
851 @transactional
852 def attachComponent(self, name: str, parent: DatasetRef, component: DatasetRef):
853 """Attach a component to a dataset.
855 Parameters
856 ----------
857 name : `str`
858 Name of the component.
859 parent : `DatasetRef`
860 A reference to the parent dataset. Will be updated to reference
861 the component.
862 component : `DatasetRef`
863 A reference to the component dataset.
865 Raises
866 ------
867 AmbiguousDatasetError
868 Raised if ``parent.id`` or ``component.id`` is `None`.
869 """
870 # TODO Insert check for component name and type against
871 # parent.storageClass specified components
872 if parent.id is None:
873 raise AmbiguousDatasetError(f"Cannot attach component to dataset {parent} without ID.")
874 if component.id is None:
875 raise AmbiguousDatasetError(f"Cannot attach component {component} without ID.")
876 values = dict(component_name=name,
877 parent_dataset_id=parent.id,
878 component_dataset_id=component.id)
879 self._db.insert(self._tables.dataset_composition, values)
880 parent._components[name] = component
882 @transactional
883 def associate(self, collection: str, refs: List[DatasetRef]):
884 """Add existing Datasets to a collection, implicitly creating the
885 collection if it does not already exist.
887 If a DatasetRef with the same exact ``dataset_id`` is already in a
888 collection nothing is changed. If a `DatasetRef` with the same
889 `DatasetType` and dimension values but with different ``dataset_id``
890 exists in the collection, `ConflictingDefinitionError` is raised.
892 Parameters
893 ----------
894 collection : `str`
895 Indicates the collection the Datasets should be associated with.
896 refs : iterable of `DatasetRef`
897 An iterable of `DatasetRef` instances that already exist in this
898 `Registry`. All component datasets will be associated with the
899 collection as well.
901 Raises
902 ------
903 ConflictingDefinitionError
904 If a Dataset with the given `DatasetRef` already exists in the
905 given collection.
906 AmbiguousDatasetError
907 Raised if ``any(ref.id is None for ref in refs)``.
908 """
909 rows = [{"dataset_id": _checkAndGetId(ref),
910 "dataset_ref_hash": ref.hash,
911 "collection": collection}
912 for ref in _expandComponents(refs)]
913 try:
914 self._db.replace(self._tables.dataset_collection, *rows)
915 except sqlalchemy.exc.IntegrityError as err:
916 raise ConflictingDefinitionError(
917 f"Constraint violation while associating datasets with collection {collection}. "
918 f"This probably means that one or more datasets with the same dataset type and data ID "
919 f"already exist in the collection, but it may also indicate that the datasets do not exist."
920 ) from err
922 @transactional
923 def disassociate(self, collection: str, refs: List[DatasetRef]):
924 """Remove existing Datasets from a collection.
926 ``collection`` and ``ref`` combinations that are not currently
927 associated are silently ignored.
929 Parameters
930 ----------
931 collection : `str`
932 The collection the Datasets should no longer be associated with.
933 refs : `list` of `DatasetRef`
934 A `list` of `DatasetRef` instances that already exist in this
935 `Registry`. All component datasets will also be removed.
937 Raises
938 ------
939 AmbiguousDatasetError
940 Raised if ``any(ref.id is None for ref in refs)``.
941 """
942 rows = [{"dataset_id": _checkAndGetId(ref), "collection": collection}
943 for ref in _expandComponents(refs)]
944 self._db.delete(self._tables.dataset_collection, ["dataset_id", "collection"], *rows)
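    # Tagging sketch: add previously-inserted datasets to a second collection,
    # then remove them again. ``refs`` are resolved `DatasetRef` instances
    # (i.e. with non-`None` IDs); the collection name is an assumption.
    #
    #     registry.associate("my_tagged_collection", refs)
    #     registry.disassociate("my_tagged_collection", refs)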
946 @transactional
947 def insertDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]):
948 """Record that a datastore holds the given datasets.
950 Typically used by `Datastore`.
952 Parameters
953 ----------
954 datastoreName : `str`
955 Name of the datastore holding these datasets.
956 refs : `~collections.abc.Iterable` of `DatasetRef`
957 References to the datasets.
959 Raises
960 ------
961 AmbiguousDatasetError
962 Raised if ``any(ref.id is None for ref in refs)``.
963 """
964 self._db.insert(
965 self._tables.dataset_storage,
966 *[{"datastore_name": datastoreName, "dataset_id": _checkAndGetId(ref)} for ref in refs]
967 )
969 def getDatasetLocations(self, ref: DatasetRef) -> Set[str]:
970 """Retrieve datastore locations for a given dataset.
972 Typically used by `Datastore`.
974 Parameters
975 ----------
976 ref : `DatasetRef`
977 A reference to the dataset for which to retrieve storage
978 information.
980 Returns
981 -------
982 datastores : `set` of `str`
983 All the matching datastores holding this dataset. Empty set
984 if the dataset does not exist anywhere.
986 Raises
987 ------
988 AmbiguousDatasetError
989 Raised if ``ref.id`` is `None`.
990 """
991 table = self._tables.dataset_storage
992 result = self._db.query(
993 sqlalchemy.sql.select(
994 [table.columns.datastore_name]
995 ).where(
996 table.columns.dataset_id == _checkAndGetId(ref)
997 )
998 ).fetchall()
999 return {r["datastore_name"] for r in result}
1001 @transactional
1002 def removeDatasetLocation(self, datastoreName: str, ref: DatasetRef):
1003 """Remove datastore location associated with this dataset.
1005 Typically used by `Datastore` when a dataset is removed.
1007 Parameters
1008 ----------
1009 datastoreName : `str`
1010 Name of this `Datastore`.
1011 ref : `DatasetRef`
1012 A reference to the dataset for which information is to be removed.
1014 Raises
1015 ------
1016 AmbiguousDatasetError
1017 Raised if ``ref.id`` is `None`.
1018 """
1019 self._db.delete(
1020 self._tables.dataset_storage,
1021 ["dataset_id", "datastore_name"],
1022 {"dataset_id": _checkAndGetId(ref), "datastore_name": datastoreName}
1023 )
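    # Datastore-location bookkeeping sketch (normally driven by a `Datastore`
    # rather than user code); the datastore name is an assumption.
    #
    #     registry.insertDatasetLocations("PosixDatastore", [ref])
    #     assert "PosixDatastore" in registry.getDatasetLocations(ref)
    #     registry.removeDatasetLocation("PosixDatastore", ref)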
1025 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
1026 records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds):
1027 """Expand a dimension-based data ID to include additional information.
1029 Parameters
1030 ----------
1031 dataId : `DataCoordinate` or `dict`, optional
1032 Data ID to be expanded; augmented and overridden by ``kwds``.
1033 graph : `DimensionGraph`, optional
1034 Set of dimensions for the expanded ID. If `None`, the dimensions
1035 will be inferred from the keys of ``dataId`` and ``kwds``.
1036 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph``
1037 are silently ignored, providing a way to extract and expand a
1038 subset of a data ID.
1039 records : mapping [`DimensionElement`, `DimensionRecord`], optional
1040 Dimension record data to use before querying the database for that
1041 data.
1042 **kwds
1043 Additional keywords are treated like additional key-value pairs for
1044 ``dataId``, extending and overriding it.
1046 Returns
1047 -------
1048 expanded : `ExpandedDataCoordinate`
1049 A data ID that includes full metadata for all of the dimensions it
1050 identifies.
1051 """
1052 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds)
1053 if isinstance(standardized, ExpandedDataCoordinate):
1054 return standardized
1055 elif isinstance(dataId, ExpandedDataCoordinate):
1056 records = dict(records) if records is not None else {}
1057 records.update(dataId.records)
1058 else:
1059 records = dict(records) if records is not None else {}
1060 keys = dict(standardized)
1061 for element in standardized.graph._primaryKeyTraversalOrder:
1062 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
1063 if record is ...:
1064 storage = self._dimensions[element]
1065 record = storage.fetch(keys)
1066 records[element] = record
1067 if record is not None:
1068 keys.update((d, getattr(record, d.name)) for d in element.implied)
1069 else:
1070 if element in standardized.graph.required:
1071 raise LookupError(
1072 f"Could not fetch record for required dimension {element.name} via keys {keys}."
1073 )
1074 records.update((d, None) for d in element.implied)
1075 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)
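    # Expansion sketch: the dimension names and values are illustrative. The
    # returned `ExpandedDataCoordinate` carries the matching `DimensionRecord`
    # for each element, fetched from the database unless supplied via
    # ``records``.
    #
    #     dataId = registry.expandDataId(instrument="DummyCam", visit=42)
    #     print(dataId.records)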
1077 def insertDimensionData(self, element: Union[DimensionElement, str],
1078 *data: Union[dict, DimensionRecord],
1079 conform: bool = True):
1080 """Insert one or more dimension records into the database.
1082 Parameters
1083 ----------
1084 element : `DimensionElement` or `str`
1085 The `DimensionElement` or name thereof that identifies the table
1086 records will be inserted into.
1087 data : `dict` or `DimensionRecord` (variadic)
1088 One or more records to insert.
1089 conform : `bool`, optional
1090 If `False` (`True` is the default), perform no checking or conversions,
1091 and assume that ``element`` is a `DimensionElement` instance and
1092 ``data`` is one or more `DimensionRecord` instances of the
1093 appropriate subclass.
1094 """
1095 if conform:
1096 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
1097 records = [element.RecordClass.fromDict(row) if type(row) is not element.RecordClass else row
1098 for row in data]
1099 else:
1100 records = data
1101 storage = self._dimensions[element]
1102 storage.insert(*records)
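    # Dimension-record insertion sketch: the element names and record fields
    # below are assumptions; the real fields are defined by each element's
    # schema in the configured dimension universe.
    #
    #     registry.insertDimensionData("instrument",
    #                                  {"name": "DummyCam", "detector_max": 2})
    #     registry.insertDimensionData("visit",
    #                                  {"instrument": "DummyCam", "id": 42,
    #                                   "name": "42"})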
1104 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
1105 """Return a `QueryBuilder` instance capable of constructing and
1106 managing more complex queries than those obtainable via `Registry`
1107 interfaces.
1109 This is an advanced `SqlRegistry`-only interface; downstream code
1110 should prefer `Registry.queryDimensions` and `Registry.queryDatasets`
1111 whenever those are sufficient.
1113 Parameters
1114 ----------
1115 summary : `QuerySummary`
1116 Object describing and categorizing the full set of dimensions that
1117 will be included in the query.
1119 Returns
1120 -------
1121 builder : `QueryBuilder`
1122 Object that can be used to construct and perform advanced queries.
1123 """
1124 return QueryBuilder(connection=self._connection, summary=summary,
1125 dimensionStorage=self._dimensions,
1126 datasetStorage=self._datasetStorage)
1128 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1129 dataId: Optional[DataId] = None,
1130 datasets: Optional[Mapping[DatasetTypeExpression, CollectionsExpression]] = None,
1131 where: Optional[str] = None,
1132 expand: bool = True,
1133 **kwds) -> Iterator[DataCoordinate]:
1134 """Query for and iterate over data IDs matching user-provided criteria.
1136 Parameters
1137 ----------
1138 dimensions : `Dimension` or `str`, or iterable thereof
1139 The dimensions of the data IDs to yield, as either `Dimension`
1140 instances or `str`. Will be automatically expanded to a complete
1141 `DimensionGraph`.
1142 dataId : `dict` or `DataCoordinate`, optional
1143 A data ID whose key-value pairs are used as equality constraints
1144 in the query.
1145 datasets : `~collections.abc.Mapping`, optional
1146 Datasets whose existence in the registry constrain the set of data
1147 IDs returned. This is a mapping from a dataset type expression
1148 (a `str` name, a true `DatasetType` instance, a `Like` pattern
1149 for the name, or ``...`` for all DatasetTypes) to a collections
1150 expression (a sequence of `str` or `Like` patterns, or `...` for
1151 all collections).
1152 where : `str`, optional
1153 A string expression similar to a SQL WHERE clause. May involve
1154 any column of a dimension table or (as a shortcut for the primary
1155 key column of a dimension table) dimension name.
1156 expand : `bool`, optional
1157 If `True` (default) yield `ExpandedDataCoordinate` instead of
1158 minimal `DataCoordinate` base-class instances.
1159 kwds
1160 Additional keyword arguments are forwarded to
1161 `DataCoordinate.standardize` when processing the ``dataId``
1162 argument (and may be used to provide a constraining data ID even
1163 when the ``dataId`` argument is `None`).
1165 Yields
1166 ------
1167 dataId : `DataCoordinate`
1168 Data IDs matching the given query parameters. Order is
1169 unspecified.
1170 """
1171 dimensions = iterable(dimensions)
1172 standardizedDataId = self.expandDataId(dataId, **kwds)
1173 standardizedDatasets = NamedKeyDict()
1174 requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
1175 if datasets is not None:
1176 for datasetTypeExpr, collectionsExpr in datasets.items():
1177 for trueDatasetType in self._datasetStorage.fetchDatasetTypes(datasetTypeExpr,
1178 collections=collectionsExpr,
1179 dataId=standardizedDataId):
1180 requestedDimensionNames.update(trueDatasetType.dimensions.names)
1181 standardizedDatasets[trueDatasetType] = collectionsExpr
1182 summary = QuerySummary(
1183 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1184 dataId=standardizedDataId,
1185 expression=where,
1186 )
1187 builder = self.makeQueryBuilder(summary)
1188 for datasetType, collections in standardizedDatasets.items():
1189 builder.joinDataset(datasetType, collections, isResult=False)
1190 query = builder.finish()
1191 predicate = query.predicate()
1192 for row in query.execute():
1193 if predicate(row):
1194 result = query.extractDataId(row)
1195 if expand:
1196 yield self.expandDataId(result, records=standardizedDataId.records)
1197 else:
1198 yield result
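    # Query sketch: the dimension names, dataset type, collection, ``where``
    # expression, and instrument value are all illustrative assumptions.
    #
    #     for dataId in registry.queryDimensions(["visit", "detector"],
    #                                            datasets={"rawMetadata": ["my_run"]},
    #                                            where="visit > 40",
    #                                            instrument="DummyCam"):
    #         print(dataId)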
1200 def queryDatasets(self, datasetType: DatasetTypeExpression, *,
1201 collections: CollectionsExpression,
1202 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1203 dataId: Optional[DataId] = None,
1204 where: Optional[str] = None,
1205 deduplicate: bool = False,
1206 expand: bool = True,
1207 **kwds) -> Iterator[DatasetRef]:
1208 """Query for and iterate over dataset references matching user-provided
1209 criteria.
1211 Parameters
1212 ----------
1213 datasetType : `DatasetType`, `str`, `Like`, or ``...``
1214 An expression indicating type(s) of datasets to query for.
1215 ``...`` may be used to query for all known DatasetTypes.
1216 Multiple explicitly-provided dataset types cannot be queried in a
1217 single call to `queryDatasets` even though wildcard expressions
1218 can, because the results would be identical to chaining the
1219 iterators produced by multiple calls to `queryDatasets`.
1220 collections : `~collections.abc.Sequence` of `str` or `Like`, or ``...``
1221 An expression indicating the collections to be searched for
1222 datasets. ``...`` may be passed to search all collections.
1223 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1224 Dimensions to include in the query (in addition to those used
1225 to identify the queried dataset type(s)), either to constrain
1226 the resulting datasets to those for which a matching dimension
1227 exists, or to relate the dataset type's dimensions to dimensions
1228 referenced by the ``dataId`` or ``where`` arguments.
1229 dataId : `dict` or `DataCoordinate`, optional
1230 A data ID whose key-value pairs are used as equality constraints
1231 in the query.
1232 where : `str`, optional
1233 A string expression similar to a SQL WHERE clause. May involve
1234 any column of a dimension table or (as a shortcut for the primary
1235 key column of a dimension table) dimension name.
1236 deduplicate : `bool`, optional
1237 If `True` (`False` is default), for each result data ID, only
1238 yield one `DatasetRef` of each `DatasetType`, from the first
1239 collection in which a dataset of that dataset type appears
1240 (according to the order of ``collections`` passed in). Cannot be
1241 used if any element in ``collections`` is an expression.
1242 expand : `bool`, optional
1243 If `True` (default) attach `ExpandedDataCoordinate` instead of
1244 minimal `DataCoordinate` base-class instances.
1245 kwds
1246 Additional keyword arguments are forwarded to
1247 `DataCoordinate.standardize` when processing the ``dataId``
1248 argument (and may be used to provide a constraining data ID even
1249 when the ``dataId`` argument is `None`).
1251 Yields
1252 ------
1253 ref : `DatasetRef`
1254 Dataset references matching the given query criteria. These
1255 are grouped by `DatasetType` if the query evaluates to multiple
1256 dataset types, but order is otherwise unspecified.
1258 Raises
1259 ------
1260 TypeError
1261 Raised when the arguments are incompatible, such as when a
1262 collection wildcard is passed when ``deduplicate`` is `True`.
1264 Notes
1265 -----
1266 When multiple dataset types are queried via a wildcard expression, the
1267 results of this operation are equivalent to querying for each dataset
1268 type separately in turn, and no information about the relationships
1269 between datasets of different types is included. In contexts where
1270 that kind of information is important, the recommended pattern is to
1271 use `queryDimensions` to first obtain data IDs (possibly with the
1272 desired dataset types and collections passed as constraints to the
1273 query), and then use multiple (generally much simpler) calls to
1274 `queryDatasets` with the returned data IDs passed as constraints.
1275 """
1276 # Standardize and expand the data ID provided as a constraint.
1277 standardizedDataId = self.expandDataId(dataId, **kwds)
1278 # If the datasetType passed isn't actually a DatasetType, expand it
1279 # (it could be an expression that yields multiple DatasetTypes) and
1280 # recurse.
1281 if not isinstance(datasetType, DatasetType):
1282 for trueDatasetType in self._datasetStorage.fetchDatasetTypes(datasetType,
1283 collections=collections,
1284 dataId=standardizedDataId):
1285 yield from self.queryDatasets(trueDatasetType, collections=collections,
1286 dimensions=dimensions, dataId=standardizedDataId,
1287 where=where, deduplicate=deduplicate, expand=expand)
1288 return
1289 # The full set of dimensions in the query is the combination of those
1290 # needed for the DatasetType and those explicitly requested, if any.
1291 requestedDimensionNames = set(datasetType.dimensions.names)
1292 if dimensions is not None:
1293 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1294 # Construct the summary structure needed to construct a QueryBuilder.
1295 summary = QuerySummary(
1296 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1297 dataId=standardizedDataId,
1298 expression=where,
1299 )
1300 builder = self.makeQueryBuilder(summary)
1301 # Add the dataset subquery to the query, telling the QueryBuilder to
1302 # include the rank of the selected collection in the results only if we
1303 # need to deduplicate. Note that if any of the collections are
1304 # actually wildcard expressions, and we've asked for deduplication,
1305 # this will raise TypeError for us.
1306 builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate)
1307 query = builder.finish()
1308 predicate = query.predicate()
1309 if not deduplicate or len(collections) == 1:
1310 # No need to de-duplicate across collections.
1311 for row in query.execute():
1312 if predicate(row):
1313 dataId = query.extractDataId(row, graph=datasetType.dimensions)
1314 if expand:
1315 dataId = self.expandDataId(dataId, records=standardizedDataId.records)
1316 yield query.extractDatasetRef(row, datasetType, dataId)[0]
1317 else:
1318 # For each data ID, yield only the DatasetRef with the lowest
1319 # collection rank.
1320 bestRefs = {}
1321 bestRanks = {}
1322 for row in query.execute():
1323 if predicate(row):
1324 ref, rank = query.extractDatasetRef(row, datasetType)
1325 bestRank = bestRanks.get(ref.dataId, sys.maxsize)
1326 if rank < bestRank:
1327 bestRefs[ref.dataId] = ref
1328 bestRanks[ref.dataId] = rank
1329 # If caller requested expanded data IDs, we defer that until here
1330 # so we do as little expansion as possible.
1331 if expand:
1332 for ref in bestRefs.values():
1333 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records)
1334 yield ref.expanded(dataId)
1335 else:
1336 yield from bestRefs.values()
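    # Sketch of the recommended pattern from the notes above: use
    # `queryDimensions` to obtain the data IDs of interest, then issue simple
    # `queryDatasets` calls constrained by each data ID. The dataset type,
    # collection, and dimension names are illustrative assumptions.
    #
    #     for dataId in registry.queryDimensions(["visit", "detector"],
    #                                            datasets={"rawMetadata": ["my_run"]}):
    #         for ref in registry.queryDatasets("rawMetadata", collections=["my_run"],
    #                                           dataId=dataId, deduplicate=True):
    #             print(ref)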
1338 dimensions: DimensionUniverse
1339 """The universe of all dimensions known to the registry
1340 (`DimensionUniverse`).
1341 """
1343 storageClasses: StorageClassFactory
1344 """All storage classes known to the registry (`StorageClassFactory`).
1345 """