# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = [
    "AmbiguousDatasetError",
    "DatasetId",
    "DatasetIdFactory",
    "DatasetIdGenEnum",
    "DatasetRef",
    "SerializedDatasetRef",
]

import enum
import sys
import uuid
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, ClassVar

from lsst.utils.classes import immutable

try:
    from pydantic.v1 import BaseModel, StrictStr, validator
except ModuleNotFoundError:
    from pydantic import BaseModel, StrictStr, validator  # type: ignore

from ..configSupport import LookupKey
from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate
from ..json import from_json_pydantic, to_json_pydantic
from ..named import NamedKeyDict
from ..persistenceContext import PersistenceContextVars
from .type import DatasetType, SerializedDatasetType

if TYPE_CHECKING:
    from ...registry import Registry
    from ..storageClass import StorageClass


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class DatasetIdGenEnum(enum.Enum):
    """Enum used to specify dataset ID generation options."""

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination of
    dataset type and dataId.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination of
    dataset type, dataId, and run collection name.
    """


class DatasetIdFactory:
    """Factory for dataset IDs (UUIDs).

    For now the logic is hard-coded and is controlled by the user-provided
    value of `DatasetIdGenEnum`. In the future we may implement configurable
    logic that can guess the `DatasetIdGenEnum` value from other parameters.
    """

    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    """Namespace UUID used for UUID5 generation. Do not change. This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def makeDatasetId(
        self,
        run: str,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        idGenerationMode: DatasetIdGenEnum,
    ) -> uuid.UUID:
        """Generate a dataset ID for a dataset.

        Parameters
        ----------
        run : `str`
            Name of the RUN collection for the dataset.
        datasetType : `DatasetType`
            Dataset type.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
        """
        if idGenerationMode is DatasetIdGenEnum.UNIQUE:
            return uuid.uuid4()
        else:
            # WARNING: If you modify this code make sure that the order of
            # items in the `items` list below never changes.
            items: list[tuple[str, str]] = []
            if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
                items = [
                    ("dataset_type", datasetType.name),
                ]
            elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
                items = [
                    ("dataset_type", datasetType.name),
                    ("run", run),
                ]
            else:
                raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

            for name, value in sorted(dataId.byName().items()):
                items.append((name, str(value)))
            data = ",".join(f"{key}={value}" for key, value in items)
            return uuid.uuid5(self.NS_UUID, data)
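

# Usage sketch (illustrative, not part of the original module; ``flat_type``
# and ``data_id`` are hypothetical DatasetType/DataCoordinate objects): the
# deterministic modes hash a stable "key=value" string with uuid5, so
# identical inputs always reproduce the same ID.
#
#     factory = DatasetIdFactory()
#     a = factory.makeDatasetId("demo/run", flat_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN)
#     b = factory.makeDatasetId("demo/run", flat_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN)
#     assert a == b          # deterministic: same type, data ID, and run
#     assert a.version == 5  # UUID5
#     u = factory.makeDatasetId("demo/run", flat_type, data_id, DatasetIdGenEnum.UNIQUE)
#     assert u.version == 4  # random UUID4, different on every call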


# This is constant, so don't recreate a set for each instance
_serializedDatasetRefFieldsSet = {"id", "datasetType", "dataId", "run", "component"}


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    id: uuid.UUID
    datasetType: SerializedDatasetType | None = None
    dataId: SerializedDataCoordinate | None = None
    run: StrictStr | None = None
    component: StrictStr | None = None

    @validator("dataId")
    def _check_dataId(cls, v: Any, values: dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Can not specify 'dataId' without specifying 'datasetType'")
        return v

    @validator("run")
    def _check_run(cls, v: Any, values: dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def _check_component(cls, v: Any, values: dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
        return v

    @classmethod
    def direct(
        cls,
        *,
        id: str,
        run: str,
        datasetType: dict[str, Any] | None = None,
        dataId: dict[str, Any] | None = None,
        component: str | None = None,
    ) -> SerializedDatasetRef:
        """Construct a `SerializedDatasetRef` directly without validators.

        Notes
        -----
        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        The ``id`` parameter is a string representation of the dataset ID; it
        is converted to a UUID by this method.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedDatasetRef.__new__(cls)
        setter = object.__setattr__
        setter(node, "id", uuid.UUID(id))
        setter(
            node,
            "datasetType",
            datasetType if datasetType is None else SerializedDatasetType.direct(**datasetType),
        )
        setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
        setter(node, "run", sys.intern(run))
        setter(node, "component", component)
        setter(node, "__fields_set__", _serializedDatasetRefFieldsSet)
        return node
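

# Usage sketch (illustrative, not part of the original module; the UUID
# string and run name are made up): `direct` skips pydantic validation, so it
# should only be given already-validated data, e.g. a record previously
# produced by `DatasetRef.to_simple()`.
#
#     sref = SerializedDatasetRef.direct(
#         id="6f6fcb8d-03d2-4b25-8d31-7b2e8a576bf2",
#         run="demo/run",
#     )
#     assert isinstance(sref.id, uuid.UUID)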


DatasetId = uuid.UUID
"""A type-annotation alias for dataset ID providing typing flexibility.
"""


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    run : `str`
        The name of the run this dataset was associated with when it was
        created.
    id : `DatasetId`, optional
        The unique identifier assigned when the dataset is created. If ``id``
        is not specified, a new unique ID will be created.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.
    id_generation_mode : `DatasetIdGenEnum`
        ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
        UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
        deterministic UUID5-type ID based on a dataset type name and
        ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
        deterministic UUID5-type ID based on a dataset type name, run
        collection name, and ``dataId``.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
    """

    _serializedType = SerializedDatasetRef
    __slots__ = (
        "_id",
        "datasetType",
        "dataId",
        "run",
    )

    def __init__(
        self,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        run: str,
        *,
        id: DatasetId | None = None,
        conform: bool = True,
        id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ):
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        self.run = run
        if id is not None:
            self._id = id.int
        else:
            self._id = (
                DatasetIdFactory()
                .makeDatasetId(self.run, self.datasetType, self.dataId, id_generation_mode)
                .int
            )

    @property
    def id(self) -> DatasetId:
        """Primary key of the dataset (`DatasetId`).

        Cannot be changed after a `DatasetRef` is constructed.
        """
        return uuid.UUID(int=self._id)
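
    # Usage sketch (illustrative, not part of the original module;
    # ``flat_type`` and ``data_id`` are hypothetical objects built from a
    # configured DimensionUniverse):
    #
    #     ref = DatasetRef(flat_type, data_id, run="demo/run",
    #                      id_generation_mode=DatasetIdGenEnum.DATAID_TYPE_RUN)
    #     ref.id        # a uuid.UUID, reconstructed on demand from the stored int
    #     ref.id = ...  # AttributeError: the ref is immutable once constructed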

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder for users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, run={self.run!r}, id={self.id})"

    def __str__(self) -> str:
        s = (
            f"{self.datasetType.name}@{self.dataId!s} [sc={self.datasetType.storageClass_name}]"
            f" (run={self.run} id={self.id})"
        )
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, then DatasetType name, and then by DataCoordinate.
        # The __str__ representation is probably close enough but we
        # need to ensure that sorting a DatasetRef matches what you would
        # get if you sorted DatasetType+DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; run takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in the priority order
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)
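
    # Usage sketch (illustrative, not part of the original module; the refs
    # are hypothetical): sorting orders by run first, then dataset type, then
    # data ID.
    #
    #     refs = [ref_in_run_b, ref_in_run_a]
    #     sorted(refs)  # -> [ref_in_run_a, ref_in_run_b]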

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to simple form.
        """
        if minimal:
            # The only thing needed to uniquely define a DatasetRef is its id
            # so that can be used directly if it is not a component DatasetRef.
            # Store it in a dict to allow us to easily add the planned origin
            # information later without having to support an int and dict in
            # simple form.
            simple: dict[str, Any] = {"id": self.id}
            if self.isComponent():
                # We can still be a little minimalist with a component
                # but we will also need to record the datasetType component
                simple["component"] = self.datasetType.component()
            return SerializedDatasetRef(**simple)

        return SerializedDatasetRef(
            datasetType=self.datasetType.to_simple(minimal=minimal),
            dataId=self.dataId.to_simple(),
            run=self.run,
            id=self.id,
        )

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatasetRef,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
        datasetType: DatasetType | None = None,
    ) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetRef`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple form of a DatasetRef to
            a full `DatasetRef`. Can be `None` if a full description of
            the type is provided along with a universe.
        datasetType : `DatasetType`, optional
            If supplied, use this as the dataset type of the resulting
            `DatasetRef` instead of reading it from the
            `SerializedDatasetRef`. This is useful when many refs share the
            same type, since memory can be saved. Defaults to `None`.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
        """
        cache = PersistenceContextVars.datasetRefs.get()
        localName = sys.intern(
            datasetType.name
            if datasetType is not None
            else (x.name if (x := simple.datasetType) is not None else "")
        )
        key = (simple.id.int, localName)
        if cache is not None and (cachedRef := cache.get(key, None)) is not None:
            return cachedRef
        # A minimalist ref specifies just the id (and possibly a component)
        # and requires a registry to reconstruct.
        if simple.datasetType is None and simple.dataId is None and simple.run is None:
            if registry is None:
                raise ValueError("Registry is required to construct component DatasetRef from integer id")
            if simple.id is None:
                raise ValueError("For minimal DatasetRef the ID must be defined.")
            ref = registry.getDataset(simple.id)
            if ref is None:
                raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
            if simple.component:
                ref = ref.makeComponentRef(simple.component)
            if cache is not None:
                cache[key] = ref
            return ref

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            universe = registry.dimensions

        if universe is None:
            # this is for mypy
            raise ValueError("Unable to determine a usable universe")

        if simple.datasetType is None and datasetType is None:
            # mypy
            raise ValueError("The DatasetType must be specified to construct a DatasetRef")
        if datasetType is None:
            if simple.datasetType is None:
                raise ValueError("Cannot determine Dataset type of this serialized class")
            datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)

        if simple.dataId is None:
            # mypy
            raise ValueError("The DataId must be specified to construct a DatasetRef")
        dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)

        # Check that the simple ref is resolved.
        if simple.run is None:
            dstr = ""
            if simple.datasetType is None:
                dstr = f" (datasetType={datasetType.name!r})"
            raise ValueError(
                "Run collection name is missing from serialized representation. "
                f"Encountered with {simple!r}{dstr}."
            )

        newRef = cls(datasetType, dataId, id=simple.id, run=simple.run)
        if cache is not None:
            cache[key] = newRef
        return newRef
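
    # Usage sketch (illustrative, not part of the original module; ``ref``,
    # ``universe``, and ``registry`` are hypothetical pre-existing objects):
    # a serialization round trip.
    #
    #     simple = ref.to_simple()  # full SerializedDatasetRef
    #     assert DatasetRef.from_simple(simple, universe=universe) == ref
    #     # minimal=True keeps only the id (plus any component name), so a
    #     # Registry is needed to reconstruct the full ref:
    #     tiny = ref.to_simple(minimal=True)
    #     again = DatasetRef.from_simple(tiny, registry=registry)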

    to_json = to_json_pydantic
    from_json: ClassVar = classmethod(from_json_pydantic)

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: DatasetId,
        run: str,
    ) -> DatasetRef:
        """Create a new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(
            datasetType=self.datasetType, dataId=dataId, id=self.id, run=self.run, conform=False
        )

    def isComponent(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: tuple[LookupKey, ...] = self.datasetType._lookupNames()

        if "instrument" in self.dataId:
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) for n in names) + names

        return names
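
    # Usage sketch (illustrative, not part of the original module): for a
    # hypothetical ref whose data ID contains instrument="HSC", the
    # instrument-qualified keys come first, followed by the generic ones:
    #
    #     ref._lookupNames()
    #     # -> (name key w/ instrument=HSC, storage-class key w/ instrument=HSC,
    #     #     name key, storage-class key)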

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result
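
    # Usage sketch (illustrative, not part of the original module;
    # ``flat_ref1``, ``flat_ref2``, and ``bias_ref`` are hypothetical):
    #
    #     grouped = DatasetRef.groupByType([flat_ref1, bias_ref, flat_ref2])
    #     for dataset_type, refs_of_type in grouped.items():
    #         ...  # e.g. flat -> [flat_ref1, flat_ref2], bias -> [bias_ref]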

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run as
            ``self``.
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeCompositeDatasetType(), self.dataId, id=self.id, run=self.run, conform=False
        )

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run as ``self``.
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeComponentDatasetType(name),
            self.dataId,
            id=self.id,
            run=self.run,
            conform=False,
        )
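
    # Usage sketch (illustrative, not part of the original module;
    # ``exposure_ref`` is a hypothetical ref to a composite dataset with a
    # "wcs" component):
    #
    #     wcs_ref = exposure_ref.makeComponentRef("wcs")
    #     assert wcs_ref.isComponent()
    #     # Same ID, run, and data ID, so the round trip compares equal:
    #     assert wcs_ref.makeCompositeRef() == exposure_ref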

    def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetRef:
        """Create a new `DatasetRef` from this one, but with a modified
        `DatasetType` that has a different `StorageClass`.

        Parameters
        ----------
        storageClass : `str` or `StorageClass`
            The new storage class.

        Returns
        -------
        modified : `DatasetRef`
            A new dataset reference that is the same as the current one but
            with a different storage class in the `DatasetType`.
        """
        return DatasetRef(
            datasetType=self.datasetType.overrideStorageClass(storageClass),
            dataId=self.dataId,
            id=self.id,
            run=self.run,
            conform=False,
        )

    def is_compatible_with(self, ref: DatasetRef) -> bool:
        """Determine if the given `DatasetRef` is compatible with this one.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset ref to check.

        Returns
        -------
        is_compatible : `bool`
            Returns `True` if the other dataset ref is either the same as this
            one, or its dataset type is compatible with this one's and the
            dataId and dataset ID match.

        Notes
        -----
        Compatibility requires that the dataId, run, and dataset ID match and
        the `DatasetType` is compatible. Compatibility is defined as the
        storage class associated with the dataset type of the other ref being
        convertible to this storage class.

        Specifically this means that if you have done:

        .. code-block:: py

            new_ref = ref.overrideStorageClass(sc)

        and this is successful, then the guarantee is that:

        .. code-block:: py

            assert ref.is_compatible_with(new_ref) is True

        since we know that the python type associated with the new ref can
        be converted to the original python type. The reverse is not
        guaranteed and depends on whether bidirectional converters have been
        registered.
        """
        if self.id != ref.id:
            return False
        if self.dataId != ref.dataId:
            return False
        if self.run != ref.run:
            return False
        return self.datasetType.is_compatible_with(ref.datasetType)

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: str
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed.
    """