Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 94%
74 statements

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage")

from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Mapping, Set
from typing import TYPE_CHECKING, Any

from lsst.daf.relation import Relation

from ... import ddl
from ..._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ..._dataset_type import DatasetType
from ..._exceptions import MissingDatasetTypeError
from ..._timespan import Timespan
from ...dimensions import DataCoordinate
from ._versioning import VersionedExtension, VersionTuple

if TYPE_CHECKING:
    from .._caching_context import CachingContext
    from .._collection_summary import CollectionSummary
    from ..queries import SqlQueryContext
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added. The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with a new unique ID.
            With a non-`UNIQUE` mode the ID is computed deterministically from
            some combination of dataset type, data ID, and run collection
            name; if that ID is already in the database, no new record is
            inserted.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
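
        Examples
        --------
        A minimal usage sketch; ``storage``, ``run_record``, and ``data_id``
        stand in for a concrete implementation, an existing `RunRecord`, and
        an expanded `DataCoordinate`, none of which are defined here::

            # Insert a single dataset with a backend-generated unique ID.
            (ref,) = storage.insert(run_record, [data_id])
            assert ref.run == run_record.name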
97 """
98 raise NotImplementedError()
100 @abstractmethod
101 def import_(
102 self,
103 run: RunRecord,
104 datasets: Iterable[DatasetRef],
105 ) -> Iterator[DatasetRef]:
106 """Insert one or more dataset entries into the database.
108 Parameters
109 ----------
110 run : `RunRecord`
111 The record object describing the `~CollectionType.RUN` collection
112 this dataset will be associated with.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be inserted. Each dataset may specify an ``id``
            attribute, which will be used for the inserted dataset. All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            the type of the dataset IDs does not match the type supported by
            this class, the IDs are ignored and new IDs are generated by the
            backend.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes are expected to be
        identical across all given datasets, but this is not checked here; it
        must be enforced by higher-level registry code. This method does not
        need to use those attributes: only ``dataId`` and ``id`` are
        relevant.
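
        Examples
        --------
        A hedged sketch; ``storage``, ``run_record``, and ``refs`` (resolved
        `DatasetRef` objects, e.g. exported from another repository) are
        hypothetical::

            # IDs carried by the refs are reused when their type matches the
            # backend; otherwise new IDs are generated.
            imported = list(storage.import_(run_record, refs))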
132 """
133 raise NotImplementedError()
135 @abstractmethod
136 def delete(self, datasets: Iterable[DatasetRef]) -> None:
137 """Fully delete the given datasets from the registry.
139 Parameters
140 ----------
141 datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
142 Datasets to be deleted. All datasets must be resolved and have
143 the same `DatasetType` as ``self``.
145 Raises
146 ------
147 AmbiguousDatasetError
148 Raised if any of the given `DatasetRef` instances is unresolved.
149 """
150 raise NotImplementedError()
152 @abstractmethod
153 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
154 """Associate one or more datasets with a collection.
156 Parameters
157 ----------
158 collection : `CollectionRecord`
159 The record object describing the collection. ``collection.type``
160 must be `~CollectionType.TAGGED`.
161 datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
162 Datasets to be associated. All datasets must be resolved and have
163 the same `DatasetType` as ``self``.
165 Raises
166 ------
167 AmbiguousDatasetError
168 Raised if any of the given `DatasetRef` instances is unresolved.
170 Notes
171 -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
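
        Examples
        --------
        Illustrative only; ``storage``, ``tagged_record``, and ``refs`` are
        hypothetical::

            # Tag already-inserted datasets in a TAGGED collection; a ref
            # with the same dataset type and data ID replaces any existing
            # one in the collection.
            storage.associate(tagged_record, refs)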
178 """
179 raise NotImplementedError()
181 @abstractmethod
182 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
183 """Remove one or more datasets from a collection.
185 Parameters
186 ----------
187 collection : `CollectionRecord`
188 The record object describing the collection. ``collection.type``
189 must be `~CollectionType.TAGGED`.
190 datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
191 Datasets to be disassociated. All datasets must be resolved and
192 have the same `DatasetType` as ``self``.
194 Raises
195 ------
196 AmbiguousDatasetError
197 Raised if any of the given `DatasetRef` instances is unresolved.
198 """
199 raise NotImplementedError()
201 @abstractmethod
202 def certify(
203 self,
204 collection: CollectionRecord,
205 datasets: Iterable[DatasetRef],
206 timespan: Timespan,
207 context: SqlQueryContext,
208 ) -> None:
209 """Associate one or more datasets with a calibration collection and a
210 validity range within it.
212 Parameters
213 ----------
214 collection : `CollectionRecord`
215 The record object describing the collection. ``collection.type``
216 must be `~CollectionType.CALIBRATION`.
217 datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
218 Datasets to be associated. All datasets must be resolved and have
219 the same `DatasetType` as ``self``.
220 timespan : `Timespan`
221 The validity range for these datasets within the collection.
222 context : `SqlQueryContext`
223 The object that manages database connections, temporary tables and
224 relation engines for this query.
226 Raises
227 ------
228 AmbiguousDatasetError
229 Raised if any of the given `DatasetRef` instances is unresolved.
230 ConflictingDefinitionError
231 Raised if the collection already contains a different dataset with
232 the same `DatasetType` and data ID and an overlapping validity
233 range.
234 CollectionTypeError
235 Raised if
236 ``collection.type is not CollectionType.CALIBRATION`` or if
237 ``self.datasetType.isCalibration() is False``.
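
        Examples
        --------
        A sketch of certifying datasets over a validity range; ``storage``,
        ``calib_record``, ``refs``, ``context``, and the `astropy.time.Time`
        bounds ``t_start`` and ``t_end`` are all hypothetical::

            timespan = Timespan(begin=t_start, end=t_end)
            storage.certify(calib_record, refs, timespan, context)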
238 """
239 raise NotImplementedError()
241 @abstractmethod
242 def decertify(
243 self,
244 collection: CollectionRecord,
245 timespan: Timespan,
246 *,
247 dataIds: Iterable[DataCoordinate] | None = None,
248 context: SqlQueryContext,
249 ) -> None:
250 """Remove or adjust datasets to clear a validity range within a
251 calibration collection.
253 Parameters
254 ----------
255 collection : `CollectionRecord`
256 The record object describing the collection. ``collection.type``
257 must be `~CollectionType.CALIBRATION`.
258 timespan : `Timespan`
259 The validity range to remove datasets from within the collection.
260 Datasets that overlap this range but are not contained by it will
261 have their validity ranges adjusted to not overlap it, which may
262 split a single dataset validity range into two.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
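
        Examples
        --------
        A sketch using hypothetical ``storage``, ``calib_record``,
        ``timespan``, and ``context`` objects; with ``dataIds=None`` the
        range is cleared for every data ID of this dataset type::

            storage.decertify(
                calib_record, timespan, dataIds=None, context=context
            )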
275 """
276 raise NotImplementedError()
278 @abstractmethod
279 def make_relation(
280 self,
281 *collections: CollectionRecord,
282 columns: Set[str],
283 context: SqlQueryContext,
284 ) -> Relation:
285 """Return a `sql.Relation` that represents a query for for this
286 `DatasetType` in one or more collections.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type. Must include
            at least one collection.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation. See `Query.find_datasets` for
            most options, but this method supports one more:

            - ``rank``: a calculated integer column holding the index of the
              collection the dataset was found in, within the ``collections``
              sequence given.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Returns
        -------
        relation : `~lsst.daf.relation.Relation`
            Representation of the query.
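
        Examples
        --------
        A hedged sketch; the storage object, the collection records ``run1``
        and ``run2``, and ``context`` are hypothetical, and the column names
        assume the `Query.find_datasets` conventions mentioned above::

            # Search two collections, recording which one each dataset was
            # found in via the calculated ``rank`` column.
            relation = storage.make_relation(
                run1, run2, columns={"dataset_id", "rank"}, context=context
            )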
311 """
312 raise NotImplementedError()
314 datasetType: DatasetType
315 """Dataset type whose records this object manages (`DatasetType`).
316 """
319class DatasetRecordStorageManager(VersionedExtension):
320 """An interface that manages the tables that describe datasets.
322 `DatasetRecordStorageManager` primarily serves as a container and factory
323 for `DatasetRecordStorage` instances, which each provide access to the
324 records for a different `DatasetType`.
326 Parameters
327 ----------
328 registry_schema_version : `VersionTuple` or `None`, optional
329 Version of registry schema.
330 """
332 def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
333 super().__init__(registry_schema_version=registry_schema_version)

    @abstractmethod
    def clone(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        caching_context: CachingContext,
    ) -> DatasetRecordStorageManager:
        """Make an independent copy of this manager instance bound to new
        instances of `Database` and other managers.

        Parameters
        ----------
        db : `Database`
            New `Database` object to use when instantiating the manager.
        collections : `CollectionManager`
            New `CollectionManager` object to use when instantiating the
            manager.
        dimensions : `DimensionRecordStorageManager`
            New `DimensionRecordStorageManager` object to use when
            instantiating the manager.
        caching_context : `CachingContext`
            New `CachingContext` object to use when instantiating the manager.

        Returns
        -------
        instance : `DatasetRecordStorageManager`
            New manager instance with the same configuration as this instance,
            but bound to a new `Database` object.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        caching_context: CachingContext,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        caching_context : `CachingContext`
            Object controlling caching of information returned by managers.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in registry.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, usually
        a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
417 """
418 raise NotImplementedError()
420 @classmethod
421 @abstractmethod
422 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
423 """Test whether the given dataset ID generation mode is supported by
424 `insert`.
426 Parameters
427 ----------
428 mode : `DatasetIdGenEnum`
429 Enum value for the mode to test.
431 Returns
432 -------
433 supported : `bool`
434 Whether the given mode is supported.
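
        Examples
        --------
        A sketch with a hypothetical concrete manager class ``MyManager``::

            if not MyManager.supportsIdGenerationMode(
                DatasetIdGenEnum.DATAID_TYPE_RUN
            ):
                raise RuntimeError("deterministic IDs unsupported")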
435 """
436 raise NotImplementedError()
438 @classmethod
439 @abstractmethod
440 def addDatasetForeignKey(
441 cls,
442 tableSpec: ddl.TableSpec,
443 *,
444 name: str = "dataset",
445 constraint: bool = True,
446 onDelete: str | None = None,
447 **kwargs: Any,
448 ) -> ddl.FieldSpec:
449 """Add a foreign key (field and constraint) referencing the dataset
450 table.
452 Parameters
453 ----------
454 tableSpec : `ddl.TableSpec`
455 Specification for the table that should reference the dataset
456 table. Will be modified in place.
457 name : `str`, optional
458 A name to use for the prefix of the new field; the full name is
459 ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
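
        Examples
        --------
        A sketch of declaring a dependent table; ``MyManager`` is a
        hypothetical concrete subclass, and the empty `ddl.TableSpec` is only
        illustrative::

            spec = ddl.TableSpec(fields=[])
            id_spec = MyManager.addDatasetForeignKey(
                spec, onDelete="CASCADE"
            )
            # ``spec`` now contains a ``dataset_id`` field and a foreign key
            # constraint on the dataset table.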
476 """
477 raise NotImplementedError()
479 @abstractmethod
480 def refresh(self) -> None:
481 """Ensure all other operations on this manager are aware of any
482 dataset types that may have been registered by other clients since
483 it was initialized or last refreshed.
484 """
485 raise NotImplementedError()
487 def __getitem__(self, name: str) -> DatasetRecordStorage:
488 """Return the object that provides access to the records associated
489 with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises
        `MissingDatasetTypeError` (a subclass of `KeyError`) when the dataset
        type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        MissingDatasetTypeError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository since
        the last call to `initialize` or `refresh` may not be found.
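
        Examples
        --------
        Illustrative only, with a hypothetical ``manager`` instance::

            try:
                storage = manager["raw"]
            except KeyError:
                ...  # the "raw" dataset type has not been registered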
508 """
509 result = self.find(name)
510 if result is None:
511 raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.")
512 return result
514 @abstractmethod
515 def find(self, name: str) -> DatasetRecordStorage | None:
516 """Return an object that provides access to the records associated with
517 the given `DatasetType` name, if one exists.
519 Parameters
520 ----------
521 name : `str`
522 Name of the dataset type.
524 Returns
525 -------
526 records : `DatasetRecordStorage` or `None`
527 The object representing the records for the given dataset type, or
528 `None` if there are no records for that dataset type.
530 Notes
531 -----
532 Dataset types registered by another client of the same repository since
533 the last call to `initialize` or `refresh` may not be found.
534 """
535 raise NotImplementedError()
537 @abstractmethod
538 def register(self, datasetType: DatasetType) -> bool:
539 """Ensure that this `Registry` can hold records for the given
540 `DatasetType`, creating new tables as necessary.
542 Parameters
543 ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
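
        Examples
        --------
        A sketch with hypothetical ``manager`` and ``dataset_type`` objects::

            if manager.register(dataset_type):
                print(f"Registered {dataset_type.name} for the first time.")
            storage = manager[dataset_type.name]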
557 """
558 raise NotImplementedError()
560 @abstractmethod
561 def remove(self, name: str) -> None:
562 """Remove the dataset type.
564 Parameters
565 ----------
566 name : `str`
567 Name of the dataset type.
568 """
569 raise NotImplementedError()
571 @abstractmethod
572 def resolve_wildcard(
573 self,
574 expression: Any,
575 missing: list[str] | None = None,
576 explicit_only: bool = False,
577 ) -> list[DatasetType]:
578 """Resolve a dataset type wildcard expression.
580 Parameters
581 ----------
582 expression : `~typing.Any`
583 Expression to resolve. Will be passed to
584 `DatasetTypeWildcard.from_expression`.
585 missing : `list` of `str`, optional
586 String dataset type names that were explicitly given (i.e. not
587 regular expression patterns) but not found will be appended to this
588 list, if it is provided.
589 explicit_only : `bool`, optional
590 If `True`, require explicit `DatasetType` instances or `str` names,
591 with `re.Pattern` instances deprecated and ``...`` prohibited.
593 Returns
594 -------
595 dataset_types : `list` [ `DatasetType` ]
596 A list of resolved dataset types.
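
        Examples
        --------
        Illustrative only; ``manager`` is hypothetical and the dataset type
        names are arbitrary::

            missing: list[str] = []
            dataset_types = manager.resolve_wildcard(
                ["raw", "calexp"], missing=missing
            )
            # ``missing`` now holds any given names that are not registered.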
597 """
598 raise NotImplementedError()
600 @abstractmethod
601 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
602 """Return a `DatasetRef` for the given dataset primary key
603 value.
605 Parameters
606 ----------
607 id : `DatasetId`
608 Primary key value for the dataset.
610 Returns
611 -------
612 ref : `DatasetRef` or `None`
613 Object representing the dataset, or `None` if no dataset with the
614 given primary key values exists in this layer.
615 """
616 raise NotImplementedError()
618 @abstractmethod
619 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
620 """Return a summary for the given collection.
622 Parameters
623 ----------
624 collection : `CollectionRecord`
625 Record describing the collection for which a summary is to be
626 retrieved.
628 Returns
629 -------
630 summary : `CollectionSummary`
631 Summary of the dataset types and governor dimension values in
632 this collection.
633 """
634 raise NotImplementedError()
636 @abstractmethod
637 def fetch_summaries(
638 self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
639 ) -> Mapping[Any, CollectionSummary]:
640 """Fetch collection summaries given their names and dataset types.
642 Parameters
643 ----------
644 collections : `~collections.abc.Iterable` [`CollectionRecord`]
645 Collection records to query.
        dataset_types : `~collections.abc.Iterable` [ `DatasetType` ] or `None`
            Dataset types to include in the returned summaries. If `None`,
            all dataset types will be included.

        Returns
        -------
        summaries : `~collections.abc.Mapping` [ `Any`, `CollectionSummary` ]
            Collection summaries indexed by collection record key. The mapping
            also includes entries for all non-chained collections nested
            inside any chained collections that were given.
656 """
657 raise NotImplementedError()
659 @abstractmethod
660 def ingest_date_dtype(self) -> type:
661 """Return type of the ``ingest_date`` column."""
662 raise NotImplementedError()