# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

from ... import ddl

__all__ = ("DatasetRecordStorageManager",)

from abc import abstractmethod
from collections.abc import Iterable, Mapping, Sequence, Set
from typing import TYPE_CHECKING, Any

from ..._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ..._dataset_type import DatasetType
from ..._exceptions import DatasetTypeError, DatasetTypeNotSupportedError
from ..._timespan import Timespan
from ...dimensions import DataCoordinate
from ...queries import QueryFactoryFunction
from ...queries.tree import AnyDatasetFieldName
from ._versioning import VersionedExtension, VersionTuple

if TYPE_CHECKING:
    from ...direct_query_driver import SqlJoinsBuilder  # new query system, server+direct only
    from .._caching_context import CachingContext
    from .._collection_summary import CollectionSummary
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.

    Parameters
    ----------
    registry_schema_version : `VersionTuple` or `None`, optional
        Version of registry schema.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)

    @abstractmethod
    def clone(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        caching_context: CachingContext,
    ) -> DatasetRecordStorageManager:
        """Make an independent copy of this manager instance bound to new
        instances of `Database` and other managers.

        Parameters
        ----------
        db : `Database`
            New `Database` object to use when instantiating the manager.
        collections : `CollectionManager`
            New `CollectionManager` object to use when instantiating the
            manager.
        dimensions : `DimensionRecordStorageManager`
            New `DimensionRecordStorageManager` object to use when
            instantiating the manager.
        caching_context : `CachingContext`
            New `CachingContext` object to use when instantiating the manager.

        Returns
        -------
        instance : `DatasetRecordStorageManager`
            New manager instance with the same configuration as this instance,
            but bound to a new `Database` object.
        """
        raise NotImplementedError()

    @abstractmethod
    def preload_cache(self) -> None:
        """Fetch data from the database and use it to pre-populate caches to
        speed up later operations.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        caching_context: CachingContext,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        caching_context : `CachingContext`
            Object controlling caching of information returned by managers.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in registry.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
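
        Examples
        --------
        A minimal sketch of the intended call pattern; ``MyDatasetManager``
        stands in for a hypothetical concrete subclass::

            spec = ddl.TableSpec(fields=[])
            id_spec = MyDatasetManager.addDatasetForeignKey(
                spec, name="dataset", onDelete="CASCADE"
            )
            # ``spec`` now has a ``dataset_id`` field and (because
            # ``constraint`` defaults to `True`) a foreign key constraint
            # referencing the dataset table.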
        """
        raise NotImplementedError()

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_dataset_type(self, name: str) -> DatasetType:
        """Look up a dataset type by name.

        Parameters
        ----------
        name : `str`
            Name of a parent dataset type.

        Returns
        -------
        dataset_type : `DatasetType`
            The object representing the records for the given dataset type.

        Raises
        ------
        MissingDatasetTypeError
            Raised if there is no dataset type with the given name.
        """
        raise NotImplementedError()

    def conform_exact_dataset_type(self, dataset_type: DatasetType | str) -> DatasetType:
        """Conform a value that may be a dataset type or dataset type name to
        the registered dataset type, checking that the dataset type is not a
        component and (if a `DatasetType` instance is given) that it has the
        exact same definition as the one in the registry.

        Parameters
        ----------
        dataset_type : `str` or `DatasetType`
            Dataset type object or name.

        Returns
        -------
        dataset_type : `DatasetType`
            The corresponding registered dataset type.

        Raises
        ------
        DatasetTypeError
            Raised if ``dataset_type`` is a component, or if its definition
            does not exactly match the registered dataset type.
        MissingDatasetTypeError
            Raised if this dataset type is not registered at all.
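
        Examples
        --------
        A minimal sketch, assuming ``manager`` is an instance of a concrete
        subclass and ``"raw"`` is a registered, non-component dataset type::

            dataset_type = manager.conform_exact_dataset_type("raw")
            # Passing a DatasetType instance instead additionally verifies
            # that its definition matches the registered one exactly.
            dataset_type = manager.conform_exact_dataset_type(dataset_type)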
        """
        if isinstance(dataset_type, DatasetType):
            dataset_type_name = dataset_type.name
            given_dataset_type = dataset_type
        else:
            dataset_type_name = dataset_type
            given_dataset_type = None
        parent_name, component = DatasetType.splitDatasetTypeName(dataset_type_name)
        if component is not None:
            raise DatasetTypeNotSupportedError(
                f"Component dataset {dataset_type_name!r} is not supported in this context."
            )
        registered_dataset_type = self.get_dataset_type(dataset_type_name)
        if given_dataset_type is not None and registered_dataset_type != given_dataset_type:
            raise DatasetTypeError(
                f"Given dataset type {given_dataset_type} is not identical to the "
                f"registered one {registered_dataset_type}."
            )
        return registered_dataset_type

    @abstractmethod
    def register_dataset_type(self, dataset_type: DatasetType) -> bool:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
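
        Examples
        --------
        A minimal sketch, assuming ``manager`` is a concrete instance and
        ``universe`` is the repository's `DimensionUniverse` (both
        hypothetical names)::

            dataset_type = DatasetType(
                "raw",
                dimensions=universe.conform(["exposure", "detector"]),
                storageClass="Exposure",
            )
            inserted = manager.register_dataset_type(dataset_type)
            # ``inserted`` is `True` only if the dataset type was not
            # already registered.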
        """
        raise NotImplementedError()

    @abstractmethod
    def remove_dataset_type(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        expression: Any,
        missing: list[str] | None = None,
        explicit_only: bool = False,
    ) -> list[DatasetType]:
        """Resolve a dataset type wildcard expression.

        Parameters
        ----------
        expression : `~typing.Any`
            Expression to resolve. Will be passed to
            `DatasetTypeWildcard.from_expression`.
        missing : `list` of `str`, optional
            String dataset type names that were explicitly given (i.e. not
            regular expression patterns) but not found will be appended to
            this list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str`
            names, with `re.Pattern` instances deprecated and ``...``
            prohibited.

        Returns
        -------
        dataset_types : `list` [ `DatasetType` ]
            A list of resolved dataset types.
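
        Examples
        --------
        A minimal sketch, assuming ``manager`` is a concrete instance::

            missing: list[str] = []
            dataset_types = manager.resolve_wildcard(
                ["raw", "calexp"], missing=missing
            )
            # ``missing`` now holds any explicitly named dataset types
            # that are not registered.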
        """
        raise NotImplementedError()

    @abstractmethod
    def get_dataset_refs(self, ids: list[DatasetId]) -> list[DatasetRef]:
        """Return a `DatasetRef` for each of the given dataset UUID values.

        Parameters
        ----------
        ids : `list` [ `DatasetId` ]
            List of UUID instances to look up.

        Returns
        -------
        refs : `list` [ `DatasetRef` ]
            A list containing a `DatasetRef` for each of the given UUIDs that
            was found in the database. If a dataset was not found, no error
            is raised -- it is just not included in the list. The returned
            datasets are in no particular order.
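
        Examples
        --------
        A minimal sketch, assuming ``manager`` is a concrete instance and
        ``uuids`` is a list of `uuid.UUID` values::

            refs = manager.get_dataset_refs(uuids)
            found = {ref.id for ref in refs}
            missing = [i for i in uuids if i not in found]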
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()

    @abstractmethod
    def fetch_summaries(
        self,
        collections: Iterable[CollectionRecord],
        dataset_types: Iterable[DatasetType] | Iterable[str] | None = None,
    ) -> Mapping[Any, CollectionSummary]:
        """Fetch collection summaries given their names and dataset types.

        Parameters
        ----------
        collections : `~collections.abc.Iterable` [`CollectionRecord`]
            Collection records to query.
        dataset_types : `~collections.abc.Iterable` [`DatasetType`] or \
                `~collections.abc.Iterable` [`str`] or `None`
            Dataset types to include in the returned summaries. If `None`
            then all dataset types will be included.

        Returns
        -------
        summaries : `~collections.abc.Mapping` [`typing.Any`, \
                `CollectionSummary`]
            Collection summaries indexed by collection record key. This
            mapping will also contain all nested non-chained collections of
            the chained collections.
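
        Examples
        --------
        A minimal sketch, assuming ``manager`` and a list of
        `CollectionRecord` objects ``records`` (both hypothetical names)::

            summaries = manager.fetch_summaries(records, dataset_types=["raw"])
            for key, summary in summaries.items():
                ...  # inspect each CollectionSummary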
        """
        raise NotImplementedError()

    @abstractmethod
    def fetch_run_dataset_ids(self, run: RunRecord) -> list[DatasetId]:
        """Return the IDs of all datasets in the given ``RUN`` collection.

        Parameters
        ----------
        run : `RunRecord`
            Record describing the collection.

        Returns
        -------
        dataset_ids : `list` [`uuid.UUID`]
            List of dataset IDs.
        """
        raise NotImplementedError()

    @abstractmethod
    def ingest_date_dtype(self) -> type:
        """Return type of the ``ingest_date`` column."""
        raise NotImplementedError()

    @abstractmethod
    def insert(
        self,
        dataset_type_name: str,
        run: RunRecord,
        data_ids: Iterable[DataCoordinate],
        id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> list[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        dataset_type_name : `str`
            Name of the dataset type.
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        data_ids : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added. The dimensions of all data IDs must be the
            same as ``dataset_type.dimensions``.
        id_generation_mode : `DatasetIdGenEnum`
            With `~DatasetIdGenEnum.UNIQUE` each new dataset is inserted with
            a new unique ID. With a non-`~DatasetIdGenEnum.UNIQUE` mode the
            ID is computed from some combination of dataset type, data ID,
            and run collection name; if the same ID is already in the
            database then the new record is not inserted.

        Returns
        -------
        datasets : `list` [ `DatasetRef` ]
            References to the inserted datasets.
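
        Examples
        --------
        A minimal sketch, assuming ``manager``, a `RunRecord` ``run``, and a
        list of expanded `DataCoordinate` instances ``data_ids`` (all
        hypothetical names)::

            refs = manager.insert(
                "raw",
                run,
                data_ids,
                id_generation_mode=DatasetIdGenEnum.UNIQUE,
            )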
        """
        raise NotImplementedError()

    @abstractmethod
    def import_(self, run: RunRecord, refs: list[DatasetRef], assume_new: bool = False) -> None:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        refs : `list` [ `DatasetRef` ]
            List of datasets to be inserted. All of the ``DatasetRef``
            ``run`` attributes must match the ``run`` parameter.
        assume_new : `bool`, optional
            If `True`, assume all datasets are new and skip conflict
            resolution logic.
        """
        raise NotImplementedError()

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetId | DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `~collections.abc.Iterable` [ `DatasetId` or `DatasetRef` ]
            Datasets to be deleted. If `DatasetRef` instances are passed,
            only the `DatasetRef.id` attribute is used.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(
        self, dataset_type: DatasetType, collection: CollectionRecord, datasets: Iterable[DatasetRef]
    ) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type of all datasets.
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must have the same
            `DatasetType` as ``dataset_type``, but this is not checked.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
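
        Examples
        --------
        A minimal sketch, assuming ``manager``, a ``TAGGED``
        `CollectionRecord` ``tagged``, and a list of resolved `DatasetRef`
        objects ``refs`` of a common dataset type (all hypothetical names)::

            manager.associate(refs[0].datasetType, tagged, refs)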
        """
        raise NotImplementedError()

    @abstractmethod
    def disassociate(
        self, dataset_type: DatasetType, collection: CollectionRecord, datasets: Iterable[DatasetRef]
    ) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type of all datasets.
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must have the same
            `DatasetType` as ``dataset_type``, but this is not checked.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self,
        dataset_type: DatasetType,
        collection: CollectionRecord,
        datasets: Iterable[DatasetRef],
        timespan: Timespan,
        query_func: QueryFactoryFunction,
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type of all datasets.
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must have the same
            `DatasetType` as ``dataset_type``, but this is not checked.
        timespan : `Timespan`
            The validity range for these datasets within the collection.
        query_func : `QueryFactoryFunction`
            Function returning a context manager that sets up a `Query`
            object for querying the registry. (That is, a function
            equivalent to ``Butler.query()``).

        Raises
        ------
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset
            with the same `DatasetType` and data ID and an overlapping
            validity range.
        DatasetTypeError
            Raised if ``dataset_type.isCalibration() is False``.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION``.
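
        Examples
        --------
        A minimal sketch, assuming ``manager``, a ``CALIBRATION``
        `CollectionRecord` ``calib``, resolved `DatasetRef` objects ``refs``
        for a calibration dataset type, astropy times ``begin`` and ``end``,
        and a Butler-provided ``query_func`` (all hypothetical names)::

            manager.certify(
                refs[0].datasetType,
                calib,
                refs,
                Timespan(begin, end),
                query_func,
            )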
        """
        raise NotImplementedError()

    @abstractmethod
    def decertify(
        self,
        dataset_type: DatasetType,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        data_ids: Iterable[DataCoordinate] | None = None,
        query_func: QueryFactoryFunction,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type of all datasets.
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        data_ids : `~collections.abc.Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``dataset_type`` in
            ``collection`` will be decertified.
        query_func : `QueryFactoryFunction`
            Function returning a context manager that sets up a `Query`
            object for querying the registry. (That is, a function
            equivalent to ``Butler.query()``).

        Raises
        ------
        DatasetTypeError
            Raised if ``dataset_type.isCalibration() is False``.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION``.
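
        Examples
        --------
        A minimal sketch, assuming the same hypothetical names as in
        `certify`; this clears the given range for every data ID of the
        dataset type::

            manager.decertify(
                dataset_type,
                calib,
                Timespan(begin, end),
                data_ids=None,
                query_func=query_func,
            )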
        """
        raise NotImplementedError()

    @abstractmethod
    def make_joins_builder(
        self,
        dataset_type: DatasetType,
        collections: Sequence[CollectionRecord],
        fields: Set[AnyDatasetFieldName],
        is_union: bool = False,
    ) -> SqlJoinsBuilder:
        """Make a `lsst.daf.butler.direct_query_driver.SqlJoinsBuilder`
        that represents a search for datasets of this type.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type of dataset to query for.
        collections : `~collections.abc.Sequence` [ `CollectionRecord` ]
            Collections to search, in order, after filtering out collections
            with no datasets of this type via collection summaries.
        fields : `~collections.abc.Set` [ `str` ]
            Names of fields to make available in the builder. Options
            include:

            - ``dataset_id`` (UUID)
            - ``run`` (collection name, `str`)
            - ``collection`` (collection name, `str`)
            - ``collection_key`` (collection primary key, manager-dependent)
            - ``timespan`` (validity range, or unbounded for non-calibrations)
            - ``ingest_date`` (time dataset was ingested into repository)

            Dimension keys for the dataset type's required dimensions are
            always included.
        is_union : `bool`, optional
            If `True`, this search is being joined in as part of one term in
            a union over all dataset types. This causes fields to be added
            to the builder via the special ``...`` instead of the dataset
            type name.

        Returns
        -------
        builder : `lsst.daf.butler.direct_query_driver.SqlJoinsBuilder`
            A query-construction object representing a table or subquery.
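
        Examples
        --------
        A minimal sketch, assuming ``manager``, a resolved ``dataset_type``,
        and an ordered list of `CollectionRecord` objects ``records`` (all
        hypothetical names)::

            builder = manager.make_joins_builder(
                dataset_type, records, fields={"dataset_id", "run"}
            )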
        """
        raise NotImplementedError()

    @abstractmethod
    def refresh_collection_summaries(self, dataset_type: DatasetType) -> None:
        """Make sure that collection summaries for this dataset type are
        consistent with the contents of the dataset tables.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Dataset type whose summary entries should be refreshed.
        """
        raise NotImplementedError()