Coverage for python/lsst/daf/butler/core/datasets/ref.py: 33% (180 statements)
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetId", "DatasetRef", "SerializedDatasetRef"]

import uuid
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union

from lsst.utils.classes import immutable
from pydantic import BaseModel, ConstrainedInt, StrictStr, validator

from ..configSupport import LookupKey
from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate
from ..json import from_json_pydantic, to_json_pydantic
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType

if TYPE_CHECKING:
    from ...registry import Registry


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class PositiveInt(ConstrainedInt):
    ge = 0
    strict = True


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    # DO NOT change order in the Union, pydantic is sensitive to that!
    id: Optional[Union[uuid.UUID, PositiveInt]] = None
    datasetType: Optional[SerializedDatasetType] = None
    dataId: Optional[SerializedDataCoordinate] = None
    run: Optional[StrictStr] = None
    component: Optional[StrictStr] = None

    @validator("dataId")
    def _check_dataId(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Cannot specify 'dataId' without specifying 'datasetType'.")
        return v

    @validator("run")
    def _check_run(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def _check_component(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given.
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) cannot be set if component is given ({v}).")
        return v
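
    # A quick sketch of how these validators interact; the values below are
    # illustrative assumptions, not part of this module (``some_data_id``
    # stands for an already-serialized data ID mapping):
    #
    #     SerializedDatasetRef(id=42)                # fine: a bare id is allowed
    #     SerializedDatasetRef(run="my_run")         # ValueError: 'run' needs 'id'
    #     SerializedDatasetRef(dataId=some_data_id)  # ValueError: 'dataId' needs
    #                                                # 'datasetType'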

    @classmethod
    def direct(
        cls,
        *,
        id: Optional[Union[str, int]] = None,
        datasetType: Optional[Dict[str, Any]] = None,
        dataId: Optional[Dict[str, Any]] = None,
        run: Optional[str] = None,
        component: Optional[str] = None,
    ) -> SerializedDatasetRef:
        """Construct a `SerializedDatasetRef` directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedDatasetRef.__new__(cls)
        setter = object.__setattr__
        setter(node, "id", uuid.UUID(id) if isinstance(id, str) else id)
        setter(
            node,
            "datasetType",
            datasetType if datasetType is None else SerializedDatasetType.direct(**datasetType),
        )
        setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
        setter(node, "run", run)
        setter(node, "component", component)
        setter(node, "__fields_set__", {"id", "datasetType", "dataId", "run", "component"})
        return node
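
    # Usage sketch for ``direct``: because validators are skipped, it should
    # only be fed values previously produced by serialization. The UUID and
    # run name below are hypothetical:
    #
    #     node = SerializedDatasetRef.direct(
    #         id="30b2e3a0-6a4f-4f42-b7a7-0e4a72f3a001",
    #         run="my_run",
    #     )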


DatasetId = Union[int, uuid.UUID]
"""A type-annotation alias for a dataset ID, which may be either an `int` or
a `uuid.UUID`.
"""


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `DatasetId`, optional
        The unique identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created. Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
    """

    _serializedType = SerializedDatasetRef
    __slots__ = (
        "id",
        "datasetType",
        "dataId",
        "run",
    )

    def __init__(
        self,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        *,
        id: Optional[DatasetId] = None,
        run: Optional[str] = None,
        conform: bool = True,
    ):
        self.id = id
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            if run is None:
                raise ValueError(
                    f"Cannot provide id without run for dataset with id={id}, "
                    f"type={datasetType}, and dataId={dataId}."
                )
            self.run = run
        else:
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
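
    # Construction sketch; ``datasetType`` and ``dataId`` stand for pre-built
    # DatasetType and DataCoordinate objects and are assumptions here:
    #
    #     ref = DatasetRef(datasetType, dataId)                      # unresolved
    #     ref = DatasetRef(datasetType, dataId, id=uuid.uuid4(), run="my_run")
    #     DatasetRef(datasetType, dataId, id=uuid.uuid4())           # ValueError:
    #                                                                # id without run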

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the
        # guidelines for __repr__ - is much harder for users to read, while
        # its __str__ just produces a dict that can also be passed to
        # DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}, sc={self.datasetType.storageClass.name}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, then DatasetType name, then DataCoordinate.
        # The __str__ representation is probably close enough, but we
        # need to ensure that sorting a DatasetRef matches what you would
        # get if you sorted DatasetType+DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; it takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)
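
    # Sorting sketch: ``sorted(refs)`` orders primarily by run, so all refs
    # from run "a" precede those from run "b" regardless of dataset type
    # (``refs`` is an assumed list of DatasetRef instances):
    #
    #     ordered = sorted(refs)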

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to simplified form.
        """
        if minimal and self.id is not None:
            # The only thing needed to uniquely define a DatasetRef is its
            # id, so that can be used directly if it is resolved and is not
            # a component DatasetRef. Store it in a dict to allow us to
            # easily add the planned origin information later without having
            # to support both an int and a dict in simple form.
            simple: Dict[str, Any] = {"id": self.id}
            if self.isComponent():
                # We can still be a little minimalist with a component,
                # but we will also need to record the datasetType component.
                simple["component"] = self.datasetType.component()
            return SerializedDatasetRef(**simple)

        # Convert to a dict form.
        as_dict: Dict[str, Any] = {
            "datasetType": self.datasetType.to_simple(minimal=minimal),
            "dataId": self.dataId.to_simple(),
        }

        # Only include the id entry if it is defined.
        if self.id is not None:
            as_dict["run"] = self.run
            as_dict["id"] = self.id

        return SerializedDatasetRef(**as_dict)
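
    # Round-trip sketch, assuming ``ref`` is resolved and ``universe`` is the
    # matching DimensionUniverse:
    #
    #     minimal = ref.to_simple(minimal=True)  # id (+ component) only
    #     full = ref.to_simple()                 # datasetType, dataId, id, run
    #     same = DatasetRef.from_simple(full, universe=universe)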

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatasetRef,
        universe: Optional[DimensionUniverse] = None,
        registry: Optional[Registry] = None,
        datasetType: Optional[DatasetType] = None,
    ) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetRef`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The universe of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple form of a DatasetRef to
            a full `DatasetRef`. Can be `None` if a full description of
            the type is provided along with a universe.
        datasetType : `DatasetType`, optional
            If supplied, this will be used as the dataset type of the
            resulting `DatasetRef` instead of being read from the
            `SerializedDatasetRef`. This is useful when many refs share the
            same type, since memory can be saved. Defaults to `None`.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
        """
        # A minimalist form will just specify id (and possibly component)
        # and requires a registry to reconstruct.
        if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}):
            if registry is None:
                raise ValueError("Registry is required to construct a DatasetRef from an id alone.")
            if simple.id is None:
                raise ValueError("For a minimal DatasetRef the ID must be defined.")
            ref = registry.getDataset(simple.id)
            if ref is None:
                raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
            if simple.component:
                ref = ref.makeComponentRef(simple.component)
            return ref

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            universe = registry.dimensions

        if universe is None:
            # this is for mypy
            raise ValueError("Unable to determine a usable universe")

        if simple.datasetType is None and datasetType is None:
            # mypy
            raise ValueError("The DatasetType must be specified to construct a DatasetRef")
        if datasetType is None:
            if simple.datasetType is None:
                raise ValueError("Cannot determine the dataset type of this serialized class")
            datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)

        if simple.dataId is None:
            # mypy
            raise ValueError("The DataId must be specified to construct a DatasetRef")
        dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)
        return cls(datasetType, dataId, id=simple.id, run=simple.run)

    to_json = to_json_pydantic
    from_json = classmethod(from_json_pydantic)
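
    # JSON sketch: ``to_json``/``from_json`` wrap ``to_simple``/``from_simple``
    # via pydantic (``ref`` and ``universe`` are assumed objects):
    #
    #     blob = ref.to_json()
    #     again = DatasetRef.from_json(blob, universe=universe)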

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: Optional[DatasetId],
        run: Optional[str],
    ) -> DatasetRef:
        """Create a new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def resolved(self, id: DatasetId, run: str) -> DatasetRef:
        """Return a resolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `DatasetId`
            The unique identifier assigned when the dataset is created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, id=id, run=run, conform=False)
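
    # Sketch: ``resolved`` and ``unresolved`` are complementary; the id and
    # run below are illustrative:
    #
    #     new_ref = ref.unresolved().resolved(id=uuid.uuid4(), run="my_run")
    #     assert new_ref.unresolved() == ref.unresolved()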

    def unresolved(self) -> DatasetRef:
        """Return an unresolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(
            datasetType=self.datasetType, dataId=dataId, id=self.id, run=self.run, conform=False
        )

    def isComponent(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Indicate whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a
        configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded to
        # assume it will return False if the type doesn't match the key type
        # of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) for n in names) + names

        return names

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result
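
    # Grouping sketch (``refs`` is an assumed iterable of DatasetRef and
    # ``process`` a hypothetical per-type handler):
    #
    #     for datasetType, typed_refs in DatasetRef.groupByType(refs).items():
    #         process(datasetType, typed_refs)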

    def getCheckedId(self) -> DatasetId:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise
        be natural list comprehensions to check that the ID is not `None` as
        well.

        Returns
        -------
        id : `DatasetId`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; a resolved reference is required.")
        return self.id
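
    # Sketch of the comprehension use case named above; raises
    # AmbiguousDatasetError if any ref is unresolved (``refs`` is assumed):
    #
    #     ids = [ref.getCheckedId() for ref in refs]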

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeCompositeDatasetType(), self.dataId, id=self.id, run=self.run, conform=False
        )

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeComponentDatasetType(name),
            self.dataId,
            id=self.id,
            run=self.run,
            conform=False,
        )
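
    # Component sketch, assuming ``ref`` has a composite storage class with
    # an "image" component (the component name is illustrative):
    #
    #     image_ref = ref.makeComponentRef("image")
    #     assert image_ref.makeCompositeRef() == ref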

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[DatasetId]
    """Primary key of the dataset (`DatasetId` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """