Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 94%
76 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-13 09:58 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-13 09:58 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30from ... import ddl
32__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage")
34from abc import ABC, abstractmethod
35from collections.abc import Iterable, Iterator, Mapping, Sequence, Set
36from typing import TYPE_CHECKING, Any
38from lsst.daf.relation import Relation
40from ..._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
41from ..._dataset_type import DatasetType
42from ..._exceptions import MissingDatasetTypeError
43from ..._timespan import Timespan
44from ...dimensions import DataCoordinate
45from ._versioning import VersionedExtension, VersionTuple
47if TYPE_CHECKING:
48 from ...direct_query_driver import QueryJoiner # new query system, server+direct only
49 from .._caching_context import CachingContext
50 from .._collection_summary import CollectionSummary
51 from ..queries import SqlQueryContext # old registry query system
52 from ._collections import CollectionManager, CollectionRecord, RunRecord
53 from ._database import Database, StaticTablesContext
54 from ._dimensions import DimensionRecordStorageManager
57class DatasetRecordStorage(ABC):
58 """An interface that manages the records associated with a particular
59 `DatasetType`.
61 Parameters
62 ----------
63 datasetType : `DatasetType`
64 Dataset type whose records this object manages.
65 """
67 def __init__(self, datasetType: DatasetType):
68 self.datasetType = datasetType
70 @abstractmethod
71 def insert(
72 self,
73 run: RunRecord,
74 dataIds: Iterable[DataCoordinate],
75 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
76 ) -> Iterator[DatasetRef]:
77 """Insert one or more dataset entries into the database.
79 Parameters
80 ----------
81 run : `RunRecord`
82 The record object describing the `~CollectionType.RUN` collection
83 this dataset will be associated with.
84 dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ]
85 Expanded data IDs (`DataCoordinate` instances) for the
86 datasets to be added. The dimensions of all data IDs must be the
87 same as ``self.datasetType.dimensions``.
88 idGenerationMode : `DatasetIdGenEnum`
89 With `UNIQUE` each new dataset is inserted with its new unique ID.
90 With non-`UNIQUE` mode ID is computed from some combination of
91 dataset type, dataId, and run collection name; if the same ID is
92 already in the database then new record is not inserted.
94 Returns
95 -------
96 datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
97 References to the inserted datasets.
98 """
99 raise NotImplementedError()
101 @abstractmethod
102 def import_(
103 self,
104 run: RunRecord,
105 datasets: Iterable[DatasetRef],
106 ) -> Iterator[DatasetRef]:
107 """Insert one or more dataset entries into the database.
109 Parameters
110 ----------
111 run : `RunRecord`
112 The record object describing the `~CollectionType.RUN` collection
113 this dataset will be associated with.
114 datasets : `~collections.abc.Iterable` of `DatasetRef`
115 Datasets to be inserted. Datasets can specify ``id`` attribute
116 which will be used for inserted datasets. All dataset IDs must
117 have the same type (`int` or `uuid.UUID`), if type of dataset IDs
118 does not match type supported by this class then IDs will be
119 ignored and new IDs will be generated by backend.
121 Returns
122 -------
123 datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
124 References to the inserted or existing datasets.
126 Notes
127 -----
128 The ``datasetType`` and ``run`` attributes of datasets are supposed to
129 be identical across all datasets but this is not checked and it should
130 be enforced by higher level registry code. This method does not need
131 to use those attributes from datasets, only ``dataId`` and ``id`` are
132 relevant.
133 """
134 raise NotImplementedError()
136 @abstractmethod
137 def delete(self, datasets: Iterable[DatasetRef]) -> None:
138 """Fully delete the given datasets from the registry.
140 Parameters
141 ----------
142 datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
143 Datasets to be deleted. All datasets must be resolved and have
144 the same `DatasetType` as ``self``.
146 Raises
147 ------
148 AmbiguousDatasetError
149 Raised if any of the given `DatasetRef` instances is unresolved.
150 """
151 raise NotImplementedError()
153 @abstractmethod
154 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
155 """Associate one or more datasets with a collection.
157 Parameters
158 ----------
159 collection : `CollectionRecord`
160 The record object describing the collection. ``collection.type``
161 must be `~CollectionType.TAGGED`.
162 datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
163 Datasets to be associated. All datasets must be resolved and have
164 the same `DatasetType` as ``self``.
166 Raises
167 ------
168 AmbiguousDatasetError
169 Raised if any of the given `DatasetRef` instances is unresolved.
171 Notes
172 -----
173 Associating a dataset with into collection that already contains a
174 different dataset with the same `DatasetType` and data ID will remove
175 the existing dataset from that collection.
177 Associating the same dataset into a collection multiple times is a
178 no-op, but is still not permitted on read-only databases.
179 """
180 raise NotImplementedError()
182 @abstractmethod
183 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
184 """Remove one or more datasets from a collection.
186 Parameters
187 ----------
188 collection : `CollectionRecord`
189 The record object describing the collection. ``collection.type``
190 must be `~CollectionType.TAGGED`.
191 datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
192 Datasets to be disassociated. All datasets must be resolved and
193 have the same `DatasetType` as ``self``.
195 Raises
196 ------
197 AmbiguousDatasetError
198 Raised if any of the given `DatasetRef` instances is unresolved.
199 """
200 raise NotImplementedError()
202 @abstractmethod
203 def certify(
204 self,
205 collection: CollectionRecord,
206 datasets: Iterable[DatasetRef],
207 timespan: Timespan,
208 context: SqlQueryContext,
209 ) -> None:
210 """Associate one or more datasets with a calibration collection and a
211 validity range within it.
213 Parameters
214 ----------
215 collection : `CollectionRecord`
216 The record object describing the collection. ``collection.type``
217 must be `~CollectionType.CALIBRATION`.
218 datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
219 Datasets to be associated. All datasets must be resolved and have
220 the same `DatasetType` as ``self``.
221 timespan : `Timespan`
222 The validity range for these datasets within the collection.
223 context : `SqlQueryContext`
224 The object that manages database connections, temporary tables and
225 relation engines for this query.
227 Raises
228 ------
229 AmbiguousDatasetError
230 Raised if any of the given `DatasetRef` instances is unresolved.
231 ConflictingDefinitionError
232 Raised if the collection already contains a different dataset with
233 the same `DatasetType` and data ID and an overlapping validity
234 range.
235 CollectionTypeError
236 Raised if
237 ``collection.type is not CollectionType.CALIBRATION`` or if
238 ``self.datasetType.isCalibration() is False``.
239 """
240 raise NotImplementedError()
242 @abstractmethod
243 def decertify(
244 self,
245 collection: CollectionRecord,
246 timespan: Timespan,
247 *,
248 dataIds: Iterable[DataCoordinate] | None = None,
249 context: SqlQueryContext,
250 ) -> None:
251 """Remove or adjust datasets to clear a validity range within a
252 calibration collection.
254 Parameters
255 ----------
256 collection : `CollectionRecord`
257 The record object describing the collection. ``collection.type``
258 must be `~CollectionType.CALIBRATION`.
259 timespan : `Timespan`
260 The validity range to remove datasets from within the collection.
261 Datasets that overlap this range but are not contained by it will
262 have their validity ranges adjusted to not overlap it, which may
263 split a single dataset validity range into two.
264 dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ], optional
265 Data IDs that should be decertified within the given validity range
266 If `None`, all data IDs for ``self.datasetType`` will be
267 decertified.
268 context : `SqlQueryContext`
269 The object that manages database connections, temporary tables and
270 relation engines for this query.
272 Raises
273 ------
274 CollectionTypeError
275 Raised if ``collection.type is not CollectionType.CALIBRATION``.
276 """
277 raise NotImplementedError()
279 @abstractmethod
280 def make_relation(
281 self,
282 *collections: CollectionRecord,
283 columns: Set[str],
284 context: SqlQueryContext,
285 ) -> Relation:
286 """Return a `sql.Relation` that represents a query for for this
287 `DatasetType` in one or more collections.
289 Parameters
290 ----------
291 *collections : `CollectionRecord`
292 The record object(s) describing the collection(s) to query. May
293 not be of type `CollectionType.CHAINED`. If multiple collections
294 are passed, the query will search all of them in an unspecified
295 order, and all collections must have the same type. Must include
296 at least one collection.
297 columns : `~collections.abc.Set` [ `str` ]
298 Columns to include in the relation. See `Query.find_datasets` for
299 most options, but this method supports one more:
301 - ``rank``: a calculated integer column holding the index of the
302 collection the dataset was found in, within the ``collections``
303 sequence given.
304 context : `SqlQueryContext`
305 The object that manages database connections, temporary tables and
306 relation engines for this query.
308 Returns
309 -------
310 relation : `~lsst.daf.relation.Relation`
311 Representation of the query.
312 """
313 raise NotImplementedError()
315 @abstractmethod
316 def make_query_joiner(self, collections: Sequence[CollectionRecord], fields: Set[str]) -> QueryJoiner:
317 """Make a `..direct_query_driver.QueryJoiner` that represents a search
318 for datasets of this type.
320 Parameters
321 ----------
322 collections : `~collections.abc.Sequence` [ `CollectionRecord` ]
323 Collections to search, in order, after filtering out collections
324 with no datasets of this type via collection summaries.
325 fields : `~collections.abc.Set` [ `str` ]
326 Names of fields to make available in the joiner. Options include:
328 - ``dataset_id`` (UUID)
329 - ``run` (collection name, `str`)
330 - ``collection`` (collection name, `str`)
331 - ``collection_key`` (collection primary key, manager-dependent)
332 - ``timespan`` (validity range, or unbounded for non-calibrations)
333 - ``ingest_date`` (time dataset was ingested into repository)
335 Dimension keys for the dataset type's required dimensions are
336 always included.
338 Returns
339 -------
340 joiner : `..direct_query_driver.QueryJoiner`
341 A query-construction object representing a table or subquery. If
342 ``fields`` is empty or ``len(collections) <= 1``, this is
343 guaranteed to have rows that are unique over dimension keys.
344 """
345 raise NotImplementedError()
347 datasetType: DatasetType
348 """Dataset type whose records this object manages (`DatasetType`).
349 """
352class DatasetRecordStorageManager(VersionedExtension):
353 """An interface that manages the tables that describe datasets.
355 `DatasetRecordStorageManager` primarily serves as a container and factory
356 for `DatasetRecordStorage` instances, which each provide access to the
357 records for a different `DatasetType`.
359 Parameters
360 ----------
361 registry_schema_version : `VersionTuple` or `None`, optional
362 Version of registry schema.
363 """
365 def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
366 super().__init__(registry_schema_version=registry_schema_version)
368 @abstractmethod
369 def clone(
370 self,
371 *,
372 db: Database,
373 collections: CollectionManager,
374 dimensions: DimensionRecordStorageManager,
375 caching_context: CachingContext,
376 ) -> DatasetRecordStorageManager:
377 """Make an independent copy of this manager instance bound to new
378 instances of `Database` and other managers.
380 Parameters
381 ----------
382 db : `Database`
383 New `Database` object to use when instantiating the manager.
384 collections : `CollectionManager`
385 New `CollectionManager` object to use when instantiating the
386 manager.
387 dimensions : `DimensionRecordStorageManager`
388 New `DimensionRecordStorageManager` object to use when
389 instantiating the manager.
390 caching_context : `CachingContext`
391 New `CachingContext` object to use when instantiating the manager.
393 Returns
394 -------
395 instance : `DatasetRecordStorageManager`
396 New manager instance with the same configuration as this instance,
397 but bound to a new Database object.
398 """
399 raise NotImplementedError()
401 @classmethod
402 @abstractmethod
403 def initialize(
404 cls,
405 db: Database,
406 context: StaticTablesContext,
407 *,
408 collections: CollectionManager,
409 dimensions: DimensionRecordStorageManager,
410 caching_context: CachingContext,
411 registry_schema_version: VersionTuple | None = None,
412 ) -> DatasetRecordStorageManager:
413 """Construct an instance of the manager.
415 Parameters
416 ----------
417 db : `Database`
418 Interface to the underlying database engine and namespace.
419 context : `StaticTablesContext`
420 Context object obtained from `Database.declareStaticTables`; used
421 to declare any tables that should always be present.
422 collections : `CollectionManager`
423 Manager object for the collections in this `Registry`.
424 dimensions : `DimensionRecordStorageManager`
425 Manager object for the dimensions in this `Registry`.
426 caching_context : `CachingContext`
427 Object controlling caching of information returned by managers.
428 registry_schema_version : `VersionTuple` or `None`
429 Schema version of this extension as defined in registry.
431 Returns
432 -------
433 manager : `DatasetRecordStorageManager`
434 An instance of a concrete `DatasetRecordStorageManager` subclass.
435 """
436 raise NotImplementedError()
438 @classmethod
439 @abstractmethod
440 def getIdColumnType(cls) -> type:
441 """Return type used for columns storing dataset IDs.
443 This type is used for columns storing `DatasetRef.id` values, usually
444 a `type` subclass provided by SQLAlchemy.
446 Returns
447 -------
448 dtype : `type`
449 Type used for dataset identification in database.
450 """
451 raise NotImplementedError()
453 @classmethod
454 @abstractmethod
455 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
456 """Test whether the given dataset ID generation mode is supported by
457 `insert`.
459 Parameters
460 ----------
461 mode : `DatasetIdGenEnum`
462 Enum value for the mode to test.
464 Returns
465 -------
466 supported : `bool`
467 Whether the given mode is supported.
468 """
469 raise NotImplementedError()
471 @classmethod
472 @abstractmethod
473 def addDatasetForeignKey(
474 cls,
475 tableSpec: ddl.TableSpec,
476 *,
477 name: str = "dataset",
478 constraint: bool = True,
479 onDelete: str | None = None,
480 **kwargs: Any,
481 ) -> ddl.FieldSpec:
482 """Add a foreign key (field and constraint) referencing the dataset
483 table.
485 Parameters
486 ----------
487 tableSpec : `ddl.TableSpec`
488 Specification for the table that should reference the dataset
489 table. Will be modified in place.
490 name : `str`, optional
491 A name to use for the prefix of the new field; the full name is
492 ``{name}_id``.
493 constraint : `bool`, optional
494 If `False` (`True` is default), add a field that can be joined to
495 the dataset primary key, but do not add a foreign key constraint.
496 onDelete : `str`, optional
497 One of "CASCADE" or "SET NULL", indicating what should happen to
498 the referencing row if the collection row is deleted. `None`
499 indicates that this should be an integrity error.
500 **kwargs
501 Additional keyword arguments are forwarded to the `ddl.FieldSpec`
502 constructor (only the ``name`` and ``dtype`` arguments are
503 otherwise provided).
505 Returns
506 -------
507 idSpec : `ddl.FieldSpec`
508 Specification for the ID field.
509 """
510 raise NotImplementedError()
512 @abstractmethod
513 def refresh(self) -> None:
514 """Ensure all other operations on this manager are aware of any
515 dataset types that may have been registered by other clients since
516 it was initialized or last refreshed.
517 """
518 raise NotImplementedError()
520 def __getitem__(self, name: str) -> DatasetRecordStorage:
521 """Return the object that provides access to the records associated
522 with the given `DatasetType` name.
524 This is simply a convenience wrapper for `find` that raises `KeyError`
525 when the dataset type is not found.
527 Returns
528 -------
529 records : `DatasetRecordStorage`
530 The object representing the records for the given dataset type.
532 Raises
533 ------
534 KeyError
535 Raised if there is no dataset type with the given name.
537 Notes
538 -----
539 Dataset types registered by another client of the same repository since
540 the last call to `initialize` or `refresh` may not be found.
541 """
542 result = self.find(name)
543 if result is None:
544 raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.")
545 return result
547 @abstractmethod
548 def find(self, name: str) -> DatasetRecordStorage | None:
549 """Return an object that provides access to the records associated with
550 the given `DatasetType` name, if one exists.
552 Parameters
553 ----------
554 name : `str`
555 Name of the dataset type.
557 Returns
558 -------
559 records : `DatasetRecordStorage` or `None`
560 The object representing the records for the given dataset type, or
561 `None` if there are no records for that dataset type.
563 Notes
564 -----
565 Dataset types registered by another client of the same repository since
566 the last call to `initialize` or `refresh` may not be found.
567 """
568 raise NotImplementedError()
570 @abstractmethod
571 def register(self, datasetType: DatasetType) -> bool:
572 """Ensure that this `Registry` can hold records for the given
573 `DatasetType`, creating new tables as necessary.
575 Parameters
576 ----------
577 datasetType : `DatasetType`
578 Dataset type for which a table should created (as necessary) and
579 an associated `DatasetRecordStorage` returned.
581 Returns
582 -------
583 inserted : `bool`
584 `True` if the dataset type did not exist in the registry before.
586 Notes
587 -----
588 This operation may not be invoked within a `Database.transaction`
589 context.
590 """
591 raise NotImplementedError()
593 @abstractmethod
594 def remove(self, name: str) -> None:
595 """Remove the dataset type.
597 Parameters
598 ----------
599 name : `str`
600 Name of the dataset type.
601 """
602 raise NotImplementedError()
604 @abstractmethod
605 def resolve_wildcard(
606 self,
607 expression: Any,
608 missing: list[str] | None = None,
609 explicit_only: bool = False,
610 ) -> list[DatasetType]:
611 """Resolve a dataset type wildcard expression.
613 Parameters
614 ----------
615 expression : `~typing.Any`
616 Expression to resolve. Will be passed to
617 `DatasetTypeWildcard.from_expression`.
618 missing : `list` of `str`, optional
619 String dataset type names that were explicitly given (i.e. not
620 regular expression patterns) but not found will be appended to this
621 list, if it is provided.
622 explicit_only : `bool`, optional
623 If `True`, require explicit `DatasetType` instances or `str` names,
624 with `re.Pattern` instances deprecated and ``...`` prohibited.
626 Returns
627 -------
628 dataset_types : `list` [ `DatasetType` ]
629 A list of resolved dataset types.
630 """
631 raise NotImplementedError()
633 @abstractmethod
634 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
635 """Return a `DatasetRef` for the given dataset primary key
636 value.
638 Parameters
639 ----------
640 id : `DatasetId`
641 Primary key value for the dataset.
643 Returns
644 -------
645 ref : `DatasetRef` or `None`
646 Object representing the dataset, or `None` if no dataset with the
647 given primary key values exists in this layer.
648 """
649 raise NotImplementedError()
651 @abstractmethod
652 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
653 """Return a summary for the given collection.
655 Parameters
656 ----------
657 collection : `CollectionRecord`
658 Record describing the collection for which a summary is to be
659 retrieved.
661 Returns
662 -------
663 summary : `CollectionSummary`
664 Summary of the dataset types and governor dimension values in
665 this collection.
666 """
667 raise NotImplementedError()
669 @abstractmethod
670 def fetch_summaries(
671 self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
672 ) -> Mapping[Any, CollectionSummary]:
673 """Fetch collection summaries given their names and dataset types.
675 Parameters
676 ----------
677 collections : `~collections.abc.Iterable` [`CollectionRecord`]
678 Collection records to query.
679 dataset_types : `~collections.abc.Iterable` [`DatasetType`] or `None`
680 Dataset types to include into returned summaries. If `None` then
681 all dataset types will be included.
683 Returns
684 -------
685 summaries : `~collections.abc.Mapping` [`Any`, `CollectionSummary`]
686 Collection summaries indexed by collection record key. This mapping
687 will also contain all nested non-chained collections of the chained
688 collections.
689 """
690 raise NotImplementedError()
692 @abstractmethod
693 def ingest_date_dtype(self) -> type:
694 """Return type of the ``ingest_date`` column."""
695 raise NotImplementedError()