Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 94%
72 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

from ... import ddl

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage")

from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Mapping, Set
from typing import TYPE_CHECKING, Any

from lsst.daf.relation import Relation

from ..._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ..._dataset_type import DatasetType
from ..._timespan import Timespan
from ...dimensions import DataCoordinate
from .._exceptions import MissingDatasetTypeError
from ._versioning import VersionedExtension, VersionTuple

if TYPE_CHECKING:
    from .._caching_context import CachingContext
    from .._collection_summary import CollectionSummary
    from ..queries import SqlQueryContext
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added. The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with its new unique ID.
            With a non-`UNIQUE` mode, the ID is computed from a combination
            of the dataset type, data ID, and run collection name; if the
            same ID is already in the database, no new record is inserted.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
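
        Examples
        --------
        An illustrative sketch only: ``storage``, ``run_record``, and
        ``data_ids`` are hypothetical placeholders for objects obtained from
        a concrete registry, so the snippet is not runnable standalone.

        >>> # Insert one dataset per expanded data ID into a RUN collection,
        >>> # letting the backend generate new unique IDs (the default mode).
        >>> refs = list(storage.insert(run_record, data_ids))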
        """
        raise NotImplementedError()

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be inserted. Each dataset may specify an ``id``
            attribute, which will be used for the inserted dataset. All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            the ID type does not match the type supported by this class, the
            IDs are ignored and new IDs are generated by the backend.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes are expected to be
        identical across all datasets, but this is not checked here; it
        should be enforced by higher-level registry code. This method does
        not need to use those attributes; only ``dataId`` and ``id`` are
        relevant.
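
        Examples
        --------
        A hedged sketch: ``storage``, ``run_record``, and ``refs`` (resolved
        `DatasetRef` instances, e.g. exported from another repository) are
        assumed to exist and are not defined here.

        >>> # Import datasets, preserving their existing UUIDs when the
        >>> # backend supports that ID type.
        >>> imported = list(storage.import_(run_record, refs))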
        """
        raise NotImplementedError()

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be deleted. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
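
        Examples
        --------
        A sketch using hypothetical placeholders: ``manager`` is a concrete
        `CollectionManager`, ``storage`` a `DatasetRecordStorage`, and
        ``refs`` resolved `DatasetRef` instances.

        >>> tagged = manager.find("my/tagged/collection")  # name is made up
        >>> storage.associate(tagged, refs)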
        """
        raise NotImplementedError()

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self,
        collection: CollectionRecord,
        datasets: Iterable[DatasetRef],
        timespan: Timespan,
        context: SqlQueryContext,
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``
            or if ``self.datasetType.isCalibration() is False``.
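
        Examples
        --------
        An illustrative sketch: ``storage``, ``calib_record``, ``refs``, and
        ``context`` are hypothetical placeholders, and the validity bounds
        are arbitrary.

        >>> from astropy.time import Time
        >>> from lsst.daf.butler import Timespan
        >>> span = Timespan(
        ...     Time("2024-01-01T00:00:00", scale="tai"),
        ...     Time("2024-02-01T00:00:00", scale="tai"),
        ... )
        >>> storage.certify(calib_record, refs, span, context=context)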
        """
        raise NotImplementedError()

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Iterable[DataCoordinate] | None = None,
        context: SqlQueryContext,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
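
        Examples
        --------
        A sketch with the same hypothetical placeholders as `certify`;
        `Timespan` bounds of `None` mean unbounded.

        >>> from lsst.daf.butler import Timespan
        >>> # Clear the full validity range for every data ID of this
        >>> # dataset type in the calibration collection.
        >>> storage.decertify(calib_record, Timespan(None, None), context=context)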
        """
        raise NotImplementedError()

    @abstractmethod
    def make_relation(
        self,
        *collections: CollectionRecord,
        columns: Set[str],
        context: SqlQueryContext,
    ) -> Relation:
        """Return a `~lsst.daf.relation.Relation` that represents a query
        for this `DatasetType` in one or more collections.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type. Must include
            at least one collection.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation. See `Query.find_datasets` for
            most options, but this method supports one more:

            - ``rank``: a calculated integer column holding the index of the
              collection the dataset was found in, within the ``collections``
              sequence given.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Returns
        -------
        relation : `~lsst.daf.relation.Relation`
            Representation of the query.
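
        Examples
        --------
        An illustrative sketch: ``storage``, ``run_a``, ``run_b``, and
        ``context`` are hypothetical placeholders, and the column names are
        examples rather than an exhaustive list.

        >>> # Search two RUN collections, recording which collection each
        >>> # dataset was found in via the calculated ``rank`` column.
        >>> relation = storage.make_relation(
        ...     run_a, run_b, columns={"dataset_id", "rank"}, context=context
        ... )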
        """
        raise NotImplementedError()

    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`)."""


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.

    Parameters
    ----------
    registry_schema_version : `VersionTuple` or `None`, optional
        Version of the registry schema.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        caching_context: CachingContext,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        caching_context : `CachingContext`
            Object controlling caching of information returned by managers.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in the registry.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, and is
        usually a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is the default), add a field that can be joined
            to the dataset primary key, but do not add a foreign key
            constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
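
        Examples
        --------
        A sketch of how a concrete manager might use this hook when declaring
        a table that references the dataset table; ``MyManager`` and the
        empty table spec are invented for illustration.

        >>> from lsst.daf.butler import ddl
        >>> spec = ddl.TableSpec(fields=[])
        >>> id_field = MyManager.addDatasetForeignKey(spec, onDelete="CASCADE")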
        """
        raise NotImplementedError()

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises `KeyError`
        when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        KeyError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
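
        Examples
        --------
        A sketch assuming ``manager`` is a concrete
        `DatasetRecordStorageManager` in a repository where the (made-up)
        dataset type name has been registered.

        >>> storage = manager["calexp"]
        >>> storage.datasetType.name
        'calexp'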
        """
        result = self.find(name)
        if result is None:
            raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.")
        return result

    @abstractmethod
    def find(self, name: str) -> DatasetRecordStorage | None:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type, or
            `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> bool:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
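
        Examples
        --------
        A sketch: ``manager`` and ``dataset_type`` are hypothetical
        placeholders. Registration is idempotent, so only the first call
        reports an insertion.

        >>> manager.register(dataset_type)
        True
        >>> manager.register(dataset_type)
        False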
        """
        raise NotImplementedError()

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        expression: Any,
        missing: list[str] | None = None,
        explicit_only: bool = False,
    ) -> list[DatasetType]:
        """Resolve a dataset type wildcard expression.

        Parameters
        ----------
        expression : `~typing.Any`
            Expression to resolve. Will be passed to
            `DatasetTypeWildcard.from_expression`.
        missing : `list` of `str`, optional
            String dataset type names that were explicitly given (i.e. not
            regular expression patterns) but not found will be appended to
            this list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str`
            names, with `re.Pattern` instances deprecated and ``...``
            prohibited.

        Returns
        -------
        dataset_types : `list` [ `DatasetType` ]
            A list of resolved dataset types.
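
        Examples
        --------
        A sketch assuming a concrete ``manager`` in which ``"calexp"`` is
        registered but ``"not_there"`` is not (both names are made up).

        >>> missing: list[str] = []
        >>> types = manager.resolve_wildcard(["calexp", "not_there"], missing=missing)
        >>> missing
        ['not_there']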
        """
        raise NotImplementedError()

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        """Return a `DatasetRef` for the given dataset primary key value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()

    @abstractmethod
    def fetch_summaries(
        self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
    ) -> Mapping[Any, CollectionSummary]:
        """Fetch collection summaries given their names and dataset types.

        Parameters
        ----------
        collections : `~collections.abc.Iterable` [`CollectionRecord`]
            Collection records to query.
        dataset_types : `~collections.abc.Iterable` [`DatasetType`] or `None`
            Dataset types to include in the returned summaries. If `None`,
            all dataset types will be included.

        Returns
        -------
        summaries : `~collections.abc.Mapping` [`Any`, `CollectionSummary`]
            Collection summaries indexed by collection record key. This
            mapping will also contain all nested non-chained collections of
            the chained collections.
        """
        raise NotImplementedError()

    @abstractmethod
    def ingest_date_dtype(self) -> type:
        """Return the type of the ``ingest_date`` column."""
        raise NotImplementedError()