Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 67% (97 statements, coverage.py v6.5.0, created at 2023-04-14 09:22 +0000)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdFactory", "DatasetIdGenEnum")

import enum
import uuid
from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Set
from typing import TYPE_CHECKING, Any

from lsst.daf.relation import Relation

from ...core import DataCoordinate, DatasetId, DatasetRef, DatasetType, Timespan, ddl
from .._exceptions import MissingDatasetTypeError
from ._versioning import VersionedExtension, VersionTuple

if TYPE_CHECKING:
    from .._collection_summary import CollectionSummary
    from ..queries import SqlQueryContext
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetIdGenEnum(enum.Enum):
    """This enum is used to specify dataset ID generation options for the
    ``insert()`` method.
    """

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    one auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of dataset type and data ID.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of dataset type, data ID, and run collection name.
    """


class DatasetIdFactory:
    """Factory for dataset IDs (UUIDs).

    For now the logic is hard-coded and is controlled by the user-provided
    value of `DatasetIdGenEnum`. In the future we may implement configurable
    logic that can guess the `DatasetIdGenEnum` value from other parameters.
    """

    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    """Namespace UUID used for UUID5 generation. Do not change. This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def makeDatasetId(
        self,
        run: str,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        idGenerationMode: DatasetIdGenEnum,
    ) -> uuid.UUID:
        """Generate dataset ID for a dataset.

        Parameters
        ----------
        run : `str`
            Name of the RUN collection for the dataset.
        datasetType : `DatasetType`
            Dataset type.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
        """
        if idGenerationMode is DatasetIdGenEnum.UNIQUE:
            return uuid.uuid4()
        else:
            # WARNING: If you modify this code make sure that the order of
            # items in the `items` list below never changes.
            items: list[tuple[str, str]] = []
            if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
                items = [
                    ("dataset_type", datasetType.name),
                ]
            elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
                items = [
                    ("dataset_type", datasetType.name),
                    ("run", run),
                ]
            else:
                raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

            for name, value in sorted(dataId.byName().items()):
                items.append((name, str(value)))
            data = ",".join(f"{key}={value}" for key, value in items)
            return uuid.uuid5(self.NS_UUID, data)
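
    # Illustrative sketch (not part of the original source): the deterministic
    # modes hash only the items listed above, so DATAID_TYPE ignores the run
    # name. The ``datasetType`` and ``dataId`` values here are assumed to come
    # from an existing registry.
    #
    #     factory = DatasetIdFactory()
    #     id_a = factory.makeDatasetId("run_a", datasetType, dataId, DatasetIdGenEnum.DATAID_TYPE)
    #     id_b = factory.makeDatasetId("run_b", datasetType, dataId, DatasetIdGenEnum.DATAID_TYPE)
    #     assert id_a == id_b  # same dataset type + data ID -> same UUID5; run ignored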

    def resolveRef(
        self,
        ref: DatasetRef,
        run: str,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> DatasetRef:
        """Generate a resolved dataset reference for predicted datasets.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset reference, which may already be resolved.
        run : `str`
            Name of the RUN collection for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        resolved : `DatasetRef`
            Resolved dataset reference; if the input reference is already
            resolved it is returned without modification.

        Notes
        -----
        This method can only be used for predicted dataset references that do
        not exist yet in the database. It does not resolve existing dataset
        references already stored in the registry.
        """
        if ref.id is not None:
            return ref
        datasetId = self.makeDatasetId(run, ref.datasetType, ref.dataId, idGenerationMode)
        resolved = ref.resolved(datasetId, run)
        return resolved
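
    # Illustrative sketch (hypothetical ``predicted`` ref): resolving an
    # unresolved predicted reference assigns it a deterministic ID, while an
    # already-resolved reference passes through unchanged.
    #
    #     resolved = factory.resolveRef(predicted, "my_run", DatasetIdGenEnum.DATAID_TYPE_RUN)
    #     assert resolved.id is not None
    #     assert factory.resolveRef(resolved, "my_run") is resolved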


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added. The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with a new unique ID.
            With a non-`UNIQUE` mode the ID is computed from some combination
            of dataset type, data ID, and run collection name; if the same ID
            is already in the database then a new record is not inserted.

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
        """
        raise NotImplementedError()
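
    # Illustrative sketch (assumes a concrete ``storage`` implementation and a
    # ``runRecord`` obtained from the collection manager; names are
    # hypothetical):
    #
    #     refs = list(storage.insert(runRecord, expanded_data_ids))
    #     # Each returned DatasetRef is resolved with a newly generated ID.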

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted. Datasets can specify an ``id`` attribute
            which will be used for the inserted datasets. All dataset IDs
            must have the same type (`int` or `uuid.UUID`); if the type of
            the dataset IDs does not match the type supported by this class
            then the IDs will be ignored and new IDs will be generated by the
            backend.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with a new unique ID.
            With a non-`UNIQUE` mode the ID is computed from some combination
            of dataset type, data ID, and run collection name; if the same ID
            is already in the database then a new record is not inserted.
        reuseIds : `bool`, optional
            If `True` then force re-use of imported dataset IDs for integer
            IDs which are normally generated as auto-incremented; an
            exception will be raised if imported IDs clash with existing
            ones. This option has no effect on the use of globally-unique
            IDs, which are always re-used (or generated if integer IDs are
            being imported).

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the datasets are
        supposed to be identical across all datasets, but this is not checked
        and should be enforced by higher-level registry code. This method
        does not need to use those attributes from the datasets; only
        ``dataId`` and ``id`` are relevant.
        """
        raise NotImplementedError()

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be deleted. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
        """
        raise NotImplementedError()
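
    # Illustrative sketch (hypothetical ``tagged_record`` of TAGGED type):
    # tagging previously inserted datasets, then untagging them again.
    #
    #     storage.associate(tagged_record, refs)
    #     storage.disassociate(tagged_record, refs)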

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self,
        collection: CollectionRecord,
        datasets: Iterable[DatasetRef],
        timespan: Timespan,
        context: SqlQueryContext,
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
        """
        raise NotImplementedError()
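
    # Illustrative sketch (hypothetical ``calib_record`` of CALIBRATION type,
    # ``refs``, and ``context``): certifying datasets over a closed validity
    # range built from astropy times.
    #
    #     import astropy.time
    #
    #     span = Timespan(
    #         astropy.time.Time("2023-01-01", scale="tai"),
    #         astropy.time.Time("2023-06-01", scale="tai"),
    #     )
    #     storage.certify(calib_record, refs, span, context=context)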

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Iterable[DataCoordinate] | None = None,
        context: SqlQueryContext,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
        """
        raise NotImplementedError()

    @abstractmethod
    def make_relation(
        self,
        *collections: CollectionRecord,
        columns: Set[str],
        context: SqlQueryContext,
    ) -> Relation:
        """Return a `~lsst.daf.relation.Relation` that represents a query for
        this `DatasetType` in one or more collections.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type. Must include
            at least one collection.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation. See `Query.find_datasets` for
            most options, but this method supports one more:

            - ``rank``: a calculated integer column holding the index of the
              collection the dataset was found in, within the ``collections``
              sequence given.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Returns
        -------
        relation : `~lsst.daf.relation.Relation`
            Representation of the query.
        """
        raise NotImplementedError()
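
    # Illustrative sketch (hypothetical records and ``context``): building a
    # relation that searches two runs and records which one matched first.
    #
    #     relation = storage.make_relation(
    #         run_record_1,
    #         run_record_2,
    #         columns={"dataset_id", "run", "rank"},
    #         context=context,
    #     )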

    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`)."""


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in the registry.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, usually
        a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()
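
    # Illustrative sketch (hypothetical ``manager_cls``): checking mode
    # support before requesting deterministic IDs.
    #
    #     if manager_cls.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN):
    #         mode = DatasetIdGenEnum.DATAID_TYPE_RUN
    #     else:
    #         mode = DatasetIdGenEnum.UNIQUE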

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
        """
        raise NotImplementedError()

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises
        `MissingDatasetTypeError` when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        MissingDatasetTypeError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        result = self.find(name)
        if result is None:
            raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.")
        return result
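
    # Illustrative sketch (hypothetical ``manager`` instance): indexing raises
    # for unknown dataset types, while `find` returns `None`.
    #
    #     storage = manager["calexp"]             # MissingDatasetTypeError if absent
    #     maybe = manager.find("not_registered")  # None if absent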

    @abstractmethod
    def find(self, name: str) -> DatasetRecordStorage | None:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type, or
            `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
        """
        raise NotImplementedError()
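
    # Illustrative sketch (hypothetical ``manager`` and ``datasetType``):
    # registration is idempotent, with ``inserted`` reporting first creation.
    #
    #     storage, inserted = manager.register(datasetType)
    #     storage2, inserted2 = manager.register(datasetType)
    #     assert not inserted2  # second call found the existing type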

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        """Resolve a dataset type wildcard expression.

        Parameters
        ----------
        expression
            Expression to resolve. Will be passed to
            `DatasetTypeWildcard.from_expression`.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset type
            names as well. If `False`, never apply patterns to components. If
            `None` (default), apply patterns to components only if their parent
            datasets were not matched by the expression. Fully-specified
            component datasets (`str` or `DatasetType` instances) are always
            included.
        missing : `list` of `str`, optional
            String dataset type names that were explicitly given (i.e. not
            regular expression patterns) but not found will be appended to this
            list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str` names,
            with `re.Pattern` instances deprecated and ``...`` prohibited.
        components_deprecated : `bool`, optional
            If `True`, this is a context in which component dataset support is
            deprecated. This will result in a deprecation warning when
            ``components=True`` or ``components=None`` and a component dataset
            is matched. In the future this will become an error.

        Returns
        -------
        dataset_types : `dict` [ `DatasetType`, `list` [ `str` or `None` ] ]
            A mapping with resolved dataset types as keys and lists of
            matched component names as values, where `None` indicates the
            parent composite dataset type was matched.
        """
        raise NotImplementedError()
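
    # Illustrative sketch (hypothetical ``manager``): resolving a regular
    # expression against registered dataset type names.
    #
    #     import re
    #
    #     missing: list[str] = []
    #     matches = manager.resolve_wildcard(re.compile(r"calexp.*"), missing=missing)
    #     # ``matches`` maps each resolved DatasetType to matched component
    #     # names (None meaning the parent composite itself).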

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        """Return a `DatasetRef` for the given dataset primary key value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()

    @abstractmethod
    def ingest_date_dtype(self) -> type:
        """Return the type of the ``ingest_date`` column."""
        raise NotImplementedError()