Coverage for python/lsst/daf/butler/core/datasets/ref.py: 31%
195 statements
coverage.py v7.2.7, created at 2023-06-28 10:10 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = [
    "AmbiguousDatasetError",
    "DatasetId",
    "DatasetIdFactory",
    "DatasetIdGenEnum",
    "DatasetRef",
    "SerializedDatasetRef",
]

import enum
import uuid
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, ClassVar

from lsst.utils.classes import immutable
from pydantic import BaseModel, StrictStr, validator

from ..configSupport import LookupKey
from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate
from ..json import from_json_pydantic, to_json_pydantic
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType

if TYPE_CHECKING:
    from ...registry import Registry
    from ..storageClass import StorageClass


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class DatasetIdGenEnum(enum.Enum):
    """Enum used to specify dataset ID generation options."""

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of dataset type and data ID.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of dataset type, data ID, and run collection name.
    """


class DatasetIdFactory:
    """Factory for dataset IDs (UUIDs).

    For now the logic is hard-coded and is controlled by the user-provided
    value of `DatasetIdGenEnum`. In the future we may implement configurable
    logic that can guess the `DatasetIdGenEnum` value from other parameters.
    """

    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    """Namespace UUID used for UUID5 generation. Do not change. This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def makeDatasetId(
        self,
        run: str,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        idGenerationMode: DatasetIdGenEnum,
    ) -> uuid.UUID:
        """Generate a dataset ID for a dataset.

        Parameters
        ----------
        run : `str`
            Name of the RUN collection for the dataset.
        datasetType : `DatasetType`
            Dataset type.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
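
        Examples
        --------
        A minimal sketch of deterministic ID generation; ``dataset_type``
        and ``data_id`` are assumed to be an existing `DatasetType` and an
        expanded `DataCoordinate`:

        .. code-block:: py

            factory = DatasetIdFactory()
            dataset_id = factory.makeDatasetId(
                "my_run", dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN
            )
            # The same inputs always yield the same UUID5 value.
            assert dataset_id == factory.makeDatasetId(
                "my_run", dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN
            )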
120 """
121 if idGenerationMode is DatasetIdGenEnum.UNIQUE:
122 return uuid.uuid4()
123 else:
124 # WARNING: If you modify this code make sure that the order of
125 # items in the `items` list below never changes.
126 items: list[tuple[str, str]] = []
127 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
128 items = [
129 ("dataset_type", datasetType.name),
130 ]
131 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
132 items = [
133 ("dataset_type", datasetType.name),
134 ("run", run),
135 ]
136 else:
137 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")
139 for name, value in sorted(dataId.byName().items()):
140 items.append((name, str(value)))
141 data = ",".join(f"{key}={value}" for key, value in items)
142 return uuid.uuid5(self.NS_UUID, data)


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    id: uuid.UUID
    datasetType: SerializedDatasetType | None = None
    dataId: SerializedDataCoordinate | None = None
    run: StrictStr | None = None
    component: StrictStr | None = None

    @validator("dataId")
    def _check_dataId(cls, v: Any, values: dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Can not specify 'dataId' without specifying 'datasetType'")
        return v

    @validator("run")
    def _check_run(cls, v: Any, values: dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def _check_component(cls, v: Any, values: dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given.
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
        return v

    @classmethod
    def direct(
        cls,
        *,
        id: str,
        run: str,
        datasetType: dict[str, Any] | None = None,
        dataId: dict[str, Any] | None = None,
        component: str | None = None,
    ) -> SerializedDatasetRef:
        """Construct a `SerializedDatasetRef` directly without validators.

        Notes
        -----
        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will
        recurse through members, constructing them from their corresponding
        `direct` methods.

        The ``id`` parameter is a string representation of the dataset ID;
        it is converted to a UUID by this method.

        This method should only be called when the inputs are trusted.
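
        Examples
        --------
        An illustrative call; the UUID string and run name here are
        made up:

        .. code-block:: py

            ref = SerializedDatasetRef.direct(
                id="11111111-2222-3333-4444-555555555555",
                run="my_run",
            )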
195 """
196 node = SerializedDatasetRef.__new__(cls)
197 setter = object.__setattr__
198 setter(node, "id", uuid.UUID(id))
199 setter(
200 node,
201 "datasetType",
202 datasetType if datasetType is None else SerializedDatasetType.direct(**datasetType),
203 )
204 setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
205 setter(node, "run", run)
206 setter(node, "component", component)
207 setter(node, "__fields_set__", {"id", "datasetType", "dataId", "run", "component"})
208 return node


DatasetId = uuid.UUID
"""A type-annotation alias for dataset ID providing typing flexibility."""


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    run : `str`
        The name of the run this dataset was associated with when it was
        created.
    id : `DatasetId`, optional
        The unique identifier assigned when the dataset is created. If ``id``
        is not specified, a new unique ID will be created.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.
    id_generation_mode : `DatasetIdGenEnum`
        ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
        UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
        deterministic UUID5-type ID based on a dataset type name and
        ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
        deterministic UUID5-type ID based on a dataset type name, run
        collection name, and ``dataId``.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
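
    Examples
    --------
    A minimal construction sketch; ``dataset_type`` is assumed to be an
    existing `DatasetType` and ``data_id`` a matching `DataCoordinate`:

    .. code-block:: py

        ref = DatasetRef(dataset_type, data_id, run="my_run")
        # With no explicit ``id``, a random UUID4 is generated.
        print(ref.id)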
253 """
255 _serializedType = SerializedDatasetRef
256 __slots__ = (
257 "id",
258 "datasetType",
259 "dataId",
260 "run",
261 )
263 def __init__(
264 self,
265 datasetType: DatasetType,
266 dataId: DataCoordinate,
267 run: str,
268 *,
269 id: DatasetId | None = None,
270 conform: bool = True,
271 id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
272 ):
273 self.datasetType = datasetType
274 if conform:
275 self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
276 else:
277 self.dataId = dataId
278 self.run = run
279 if id is not None:
280 self.id = id
281 else:
282 self.id = DatasetIdFactory().makeDatasetId(
283 self.run, self.datasetType, self.dataId, id_generation_mode
284 )

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the
        # guidelines for __repr__ - is much harder for users to read, while
        # its __str__ just produces a dict that can also be passed to
        # DatasetRef's constructor.
        return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, run={self.run!r}, id={self.id})"

    def __str__(self) -> str:
        s = (
            f"{self.datasetType.name}@{self.dataId!s} [sc={self.datasetType.storageClass_name}]"
            f" (run={self.run} id={self.id})"
        )
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, DatasetType name, and then by DataCoordinate.
        # The __str__ representation is probably close enough, but we
        # need to ensure that sorting a DatasetRef matches what you would
        # get if you sorted DatasetType + DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; run takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this object to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to simplified form.
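
        Examples
        --------
        A round-trip sketch; ``ref`` is assumed to be an existing
        `DatasetRef` and ``universe`` a `DimensionUniverse`:

        .. code-block:: py

            simple = ref.to_simple()
            restored = DatasetRef.from_simple(simple, universe=universe)
            assert restored == ref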
344 """
345 if minimal:
346 # The only thing needed to uniquely define a DatasetRef is its id
347 # so that can be used directly if it is not a component DatasetRef.
348 # Store is in a dict to allow us to easily add the planned origin
349 # information later without having to support an int and dict in
350 # simple form.
351 simple: dict[str, Any] = {"id": self.id}
352 if self.isComponent():
353 # We can still be a little minimalist with a component
354 # but we will also need to record the datasetType component
355 simple["component"] = self.datasetType.component()
356 return SerializedDatasetRef(**simple)
358 return SerializedDatasetRef(
359 datasetType=self.datasetType.to_simple(minimal=minimal),
360 dataId=self.dataId.to_simple(),
361 run=self.run,
362 id=self.id,
363 )

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatasetRef,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
        datasetType: DatasetType | None = None,
    ) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetRef`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple form of a DatasetRef to
            a full `DatasetRef`. Can be `None` if a full description of
            the type is provided along with a universe.
        datasetType : `DatasetType`, optional
            If supplied, this will be used as the dataset type of the
            resulting `DatasetRef` instead of being read from the
            `SerializedDatasetRef`. This is useful when many refs share the
            same type, as memory can be saved. Defaults to `None`.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
398 """
399 # Minimalist component will just specify component and id and
400 # require registry to reconstruct
401 if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}):
402 if registry is None:
403 raise ValueError("Registry is required to construct component DatasetRef from integer id")
404 if simple.id is None:
405 raise ValueError("For minimal DatasetRef the ID must be defined.")
406 ref = registry.getDataset(simple.id)
407 if ref is None:
408 raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
409 if simple.component:
410 ref = ref.makeComponentRef(simple.component)
411 return ref
413 if universe is None and registry is None:
414 raise ValueError("One of universe or registry must be provided.")
416 if universe is None and registry is not None:
417 universe = registry.dimensions
419 if universe is None:
420 # this is for mypy
421 raise ValueError("Unable to determine a usable universe")
423 if simple.datasetType is None and datasetType is None:
424 # mypy
425 raise ValueError("The DatasetType must be specified to construct a DatasetRef")
426 if datasetType is None:
427 if simple.datasetType is None:
428 raise ValueError("Cannot determine Dataset type of this serialized class")
429 datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)
431 if simple.dataId is None:
432 # mypy
433 raise ValueError("The DataId must be specified to construct a DatasetRef")
434 dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)
436 # Check that simple ref is resolved.
437 if simple.run is None:
438 dstr = ""
439 if simple.datasetType is None:
440 dstr = f" (datasetType={datasetType.name!r})"
441 raise ValueError(
442 "Run collection name is missing from serialized representation. "
443 f"Encountered with {simple!r}{dstr}."
444 )
446 return cls(datasetType, dataId, id=simple.id, run=simple.run)

    to_json = to_json_pydantic
    from_json: ClassVar = classmethod(from_json_pydantic)

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: DatasetId,
        run: str,
    ) -> DatasetRef:
        """Create new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(
            datasetType=self.datasetType, dataId=dataId, id=self.id, run=self.run, conform=False
        )

    def isComponent(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Indicate whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a
        configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: tuple[LookupKey, ...] = self.datasetType._lookupNames()

        if "instrument" in self.dataId:
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) for n in names) + names

        return names

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
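
        Examples
        --------
        An illustrative loop over grouped refs; ``refs`` is assumed to be
        an iterable of existing `DatasetRef` instances:

        .. code-block:: py

            for dataset_type, grouped in DatasetRef.groupByType(refs).items():
                print(dataset_type.name, len(grouped))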
549 """
550 result: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict()
551 for ref in refs:
552 result.setdefault(ref.datasetType, []).append(ref)
553 return result

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run.
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeCompositeDatasetType(), self.dataId, id=self.id, run=self.run, conform=False
        )

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run.
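
        Examples
        --------
        An illustrative sketch; ``ref`` is assumed to refer to a composite
        dataset whose storage class defines a ``"wcs"`` component:

        .. code-block:: py

            wcs_ref = ref.makeComponentRef("wcs")
            assert wcs_ref.isComponent()
            # The parent ref can be recovered from the component ref.
            assert wcs_ref.makeCompositeRef() == ref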
587 """
588 # Assume that the data ID does not need to be standardized
589 # and should match whatever this ref already has.
590 return DatasetRef(
591 self.datasetType.makeComponentDatasetType(name),
592 self.dataId,
593 id=self.id,
594 run=self.run,
595 conform=False,
596 )

    def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetRef:
        """Create a new `DatasetRef` from this one, but with a modified
        `DatasetType` that has a different `StorageClass`.

        Parameters
        ----------
        storageClass : `str` or `StorageClass`
            The new storage class.

        Returns
        -------
        modified : `DatasetRef`
            A new dataset reference that is the same as the current one but
            with a different storage class in the `DatasetType`.
        """
        return DatasetRef(
            datasetType=self.datasetType.overrideStorageClass(storageClass),
            dataId=self.dataId,
            id=self.id,
            run=self.run,
            conform=False,
        )

    def is_compatible_with(self, ref: DatasetRef) -> bool:
        """Determine if the given `DatasetRef` is compatible with this one.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset ref to check.

        Returns
        -------
        is_compatible : `bool`
            Returns `True` if the other dataset ref is either the same as
            this one, or the dataset type associated with the other is
            compatible with this one and the dataId and dataset ID match.

        Notes
        -----
        Compatibility requires that the dataId and dataset ID match and the
        `DatasetType` is compatible. Compatibility is defined as the storage
        class associated with the dataset type of the other ref being
        convertible to this storage class.

        Specifically this means that if you have done:

        .. code-block:: py

            new_ref = ref.overrideStorageClass(sc)

        and this is successful, then the guarantee is that:

        .. code-block:: py

            assert ref.is_compatible_with(new_ref) is True

        since we know that the python type associated with the new ref can
        be converted to the original python type. The reverse is not
        guaranteed and depends on whether bidirectional converters have been
        registered.
        """
        if self.id != ref.id:
            return False
        if self.dataId != ref.dataId:
            return False
        if self.run != ref.run:
            return False
        return self.datasetType.is_compatible_with(ref.datasetType)

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: str
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed.
    """

    id: DatasetId
    """Primary key of the dataset (`DatasetId`).

    Cannot be changed after a `DatasetRef` is constructed.
    """