Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 57%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdFactory", "DatasetIdGenEnum")

import enum
import uuid
from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Set
from typing import TYPE_CHECKING, Any

from lsst.daf.relation import Relation

from ...core import DataCoordinate, DatasetId, DatasetRef, DatasetType, Timespan, ddl
from .._exceptions import MissingDatasetTypeError
from ._versioning import VersionedExtension

if TYPE_CHECKING:
    from .._collection_summary import CollectionSummary
    from ..queries import SqlQueryContext
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetIdGenEnum(enum.Enum):
    """This enum is used to specify dataset ID generation options for the
    ``insert()`` method.
    """

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    one auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of the dataset type and data ID.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of the dataset type, data ID, and run collection name.
    """


class DatasetIdFactory:
    """Factory for dataset IDs (UUIDs).

    For now the logic is hard-coded and is controlled by the user-provided
    value of `DatasetIdGenEnum`. In the future we may implement configurable
    logic that can guess the `DatasetIdGenEnum` value from other parameters.
    """

    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    """Namespace UUID used for UUID5 generation. Do not change. This was
    produced by ``uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")``.
    """

    def makeDatasetId(
        self,
        run: str,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        idGenerationMode: DatasetIdGenEnum,
    ) -> uuid.UUID:
        """Generate dataset ID for a dataset.

        Parameters
        ----------
        run : `str`
            Name of the RUN collection for the dataset.
        datasetType : `DatasetType`
            Dataset type.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
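
        Examples
        --------
        A minimal sketch of the deterministic branch, using only this
        class's namespace constant and the standard library (real callers
        pass `DatasetType` and expanded `DataCoordinate` instances; the
        literal items below are illustrative):

        >>> import uuid
        >>> items = [("dataset_type", "raw"), ("run", "run1"), ("detector", "1")]
        >>> data = ",".join(f"{key}={value}" for key, value in items)
        >>> uuid.uuid5(DatasetIdFactory.NS_UUID, data) == uuid.uuid5(
        ...     DatasetIdFactory.NS_UUID, data
        ... )
        True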
109 """
110 if idGenerationMode is DatasetIdGenEnum.UNIQUE:
111 return uuid.uuid4()
112 else:
113 # WARNING: If you modify this code make sure that the order of
114 # items in the `items` list below never changes.
115 items: list[tuple[str, str]] = []
116 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
117 items = [
118 ("dataset_type", datasetType.name),
119 ]
120 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
121 items = [
122 ("dataset_type", datasetType.name),
123 ("run", run),
124 ]
125 else:
126 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")
128 for name, value in sorted(dataId.byName().items()):
129 items.append((name, str(value)))
130 data = ",".join(f"{key}={value}" for key, value in items)
131 return uuid.uuid5(self.NS_UUID, data)


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added. The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with its new unique ID.
            With a non-`UNIQUE` mode the ID is computed deterministically from
            some combination of the dataset type, data ID, and run collection
            name; if the same ID is already in the database then a new record
            is not inserted.

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
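
        Examples
        --------
        A hedged sketch of typical use by higher-level registry code;
        ``storage``, ``run_record``, and ``data_ids`` are assumed to already
        exist and are not part of this interface:

        >>> refs = list(storage.insert(run_record, data_ids))  # doctest: +SKIP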
175 """
176 raise NotImplementedError()

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted. Datasets may specify an ``id`` attribute,
            which will be used for the inserted datasets. All dataset IDs must
            have the same type (`int` or `uuid.UUID`); if the type of the
            dataset IDs does not match the type supported by this class, the
            IDs will be ignored and new IDs will be generated by the backend.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with its new unique ID.
            With a non-`UNIQUE` mode the ID is computed deterministically from
            some combination of the dataset type, data ID, and run collection
            name; if the same ID is already in the database then a new record
            is not inserted.
        reuseIds : `bool`, optional
            If `True`, force re-use of imported dataset IDs for integer IDs,
            which are normally generated as auto-incremented; an exception
            will be raised if imported IDs clash with existing ones. This
            option has no effect on globally unique IDs, which are always
            re-used (or generated if integer IDs are being imported).

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of datasets are expected to
        be identical across all datasets, but this is not checked; it should
        be enforced by higher-level registry code. This method does not need
        to use those attributes from datasets; only ``dataId`` and ``id`` are
        relevant.
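
        Examples
        --------
        A hedged sketch of typical use by higher-level registry code;
        ``storage``, ``run_record``, and ``refs_with_ids`` are assumed to
        already exist and are not part of this interface:

        >>> imported = list(storage.import_(run_record, refs_with_ids))  # doctest: +SKIP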
223 """
224 raise NotImplementedError()

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be deleted. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
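
        Examples
        --------
        A hedged sketch (``storage``, ``tagged_record``, and ``refs`` are
        assumed to exist; they are not part of this interface):

        >>> storage.associate(tagged_record, refs)  # doctest: +SKIP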
269 """
270 raise NotImplementedError()

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self,
        collection: CollectionRecord,
        datasets: Iterable[DatasetRef],
        timespan: Timespan,
        context: SqlQueryContext,
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
        """
        raise NotImplementedError()

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Iterable[DataCoordinate] | None = None,
        context: SqlQueryContext,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
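
        Examples
        --------
        A hedged sketch: clearing part of a validity range so that a new
        calibration can later be certified in its place (``storage``,
        ``calib_record``, ``new_timespan``, and ``ctx`` are assumed to exist;
        they are not part of this interface):

        >>> storage.decertify(calib_record, new_timespan, context=ctx)  # doctest: +SKIP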
360 """
361 raise NotImplementedError()

    @abstractmethod
    def make_relation(
        self,
        *collections: CollectionRecord,
        columns: Set[str],
        context: SqlQueryContext,
    ) -> Relation:
        """Return a `~lsst.daf.relation.Relation` that represents a query for
        this `DatasetType` in one or more collections.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type. Must include
            at least one collection.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation. See `Query.find_datasets` for
            most options, but this method supports one more:

            - ``rank``: a calculated integer column holding the index of the
              collection the dataset was found in, within the ``collections``
              sequence given.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Returns
        -------
        relation : `~lsst.daf.relation.Relation`
            Representation of the query.
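
        Examples
        --------
        A hedged sketch of building a relation that records the search order
        across two collections (``storage``, ``run1``, ``run2``, and ``ctx``
        are assumed to exist; they are not part of this interface):

        >>> relation = storage.make_relation(
        ...     run1, run2, columns={"dataset_id", "run", "rank"}, context=ctx
        ... )  # doctest: +SKIP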
396 """
397 raise NotImplementedError()
399 datasetType: DatasetType
400 """Dataset type whose records this object manages (`DatasetType`).
401 """


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, and is
        usually a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
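
        Examples
        --------
        A hedged sketch of declaring a table that references the dataset
        table; ``manager`` is assumed to be a concrete
        `DatasetRecordStorageManager` subclass and the table spec is
        illustrative only:

        >>> spec = ddl.TableSpec(fields=[])  # doctest: +SKIP
        >>> id_field = manager.addDatasetForeignKey(spec, onDelete="CASCADE")  # doctest: +SKIP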
514 """
515 raise NotImplementedError()

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises
        `MissingDatasetTypeError` when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        MissingDatasetTypeError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
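
        Examples
        --------
        A hedged sketch (``manager`` is assumed to be a concrete
        `DatasetRecordStorageManager` and ``"raw"`` an already-registered
        dataset type name):

        >>> storage = manager["raw"]  # doctest: +SKIP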
546 """
547 result = self.find(name)
548 if result is None:
549 raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.")
550 return result

    @abstractmethod
    def find(self, name: str) -> DatasetRecordStorage | None:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type, or
            `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
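
        Examples
        --------
        A hedged sketch (``manager`` and ``dataset_type`` are assumed to
        exist; they are not part of this interface):

        >>> storage, inserted = manager.register(dataset_type)  # doctest: +SKIP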
597 """
598 raise NotImplementedError()

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        """Resolve a dataset type wildcard expression.

        Parameters
        ----------
        expression
            Expression to resolve. Will be passed to
            `DatasetTypeWildcard.from_expression`.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset type
            names as well. If `False`, never apply patterns to components. If
            `None` (default), apply patterns to components only if their
            parent datasets were not matched by the expression.
            Fully-specified component datasets (`str` or `DatasetType`
            instances) are always included.
        missing : `list` of `str`, optional
            String dataset type names that were explicitly given (i.e. not
            regular expression patterns) but not found will be appended to
            this list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str`
            names, with `re.Pattern` instances deprecated and ``...``
            prohibited.
        components_deprecated : `bool`, optional
            If `True`, this is a context in which component dataset support is
            deprecated. This will result in a deprecation warning when
            ``components=True`` or ``components=None`` and a component dataset
            is matched. In the future this will become an error.

        Returns
        -------
        dataset_types : `dict` [ `DatasetType`, `list` [ `str` or `None` ] ]
            A mapping with resolved dataset types as keys and lists of
            matched component names as values, where `None` indicates the
            parent composite dataset type was matched.
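
        Examples
        --------
        A hedged sketch of resolving every registered dataset type whose name
        starts with ``"raw"`` (``manager`` is assumed to be a concrete
        `DatasetRecordStorageManager`):

        >>> import re
        >>> resolved = manager.resolve_wildcard(re.compile("raw.*"))  # doctest: +SKIP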
653 """
654 raise NotImplementedError()

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        """Return a `DatasetRef` for the given dataset primary key
        value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()