Coverage for python/lsst/daf/butler/core/datasets/ref.py: 31%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetId", "DatasetRef", "SerializedDatasetRef"]

import uuid
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
    Union,
)

from pydantic import BaseModel, StrictStr, ConstrainedInt, validator

from lsst.utils.classes import immutable
from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate
from ..configSupport import LookupKey
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType
from ..json import from_json_pydantic, to_json_pydantic

if TYPE_CHECKING:
    from ...registry import Registry


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class PositiveInt(ConstrainedInt):
    ge = 0
    strict = True


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    # DO NOT change order in the Union, pydantic is sensitive to that!
    id: Optional[Union[uuid.UUID, PositiveInt]] = None
    datasetType: Optional[SerializedDatasetType] = None
    dataId: Optional[SerializedDataCoordinate] = None
    run: Optional[StrictStr] = None
    component: Optional[StrictStr] = None

    @validator("dataId")
    def _check_dataId(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Can not specify 'dataId' without specifying 'datasetType'")
        return v

    @validator("run")
    def _check_run(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def _check_component(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
        return v


DatasetId = Union[int, uuid.UUID]
"""A type-annotation alias for dataset ID which could be either integer or
UUID.
"""


@immutable
class DatasetRef:
101 """Reference to a Dataset in a `Registry`.
103 A `DatasetRef` may point to a Dataset that currently does not yet exist
104 (e.g., because it is a predicted input for provenance).
106 Parameters
107 ----------
108 datasetType : `DatasetType`
109 The `DatasetType` for this Dataset.
110 dataId : `DataCoordinate`
111 A mapping of dimensions that labels the Dataset within a Collection.
112 id : `DatasetId`, optional
113 The unique identifier assigned when the dataset is created.
114 run : `str`, optional
115 The name of the run this dataset was associated with when it was
116 created. Must be provided if ``id`` is.
117 conform : `bool`, optional
118 If `True` (default), call `DataCoordinate.standardize` to ensure that
119 the data ID's dimensions are consistent with the dataset type's.
120 `DatasetRef` instances for which those dimensions are not equal should
121 not be created in new code, but are still supported for backwards
122 compatibility. New code should only pass `False` if it can guarantee
123 that the dimensions are already consistent.
125 Raises
126 ------
127 ValueError
128 Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
129 provided but ``run`` is not.
131 See Also
132 --------
133 :ref:`daf_butler_organizing_datasets`
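
    Examples
    --------
    An illustrative sketch only; it assumes a pre-existing `DatasetType`
    (``flatType``) and `DataCoordinate` (``dataId``), and the run name and
    integer ID are hypothetical::

        ref = DatasetRef(flatType, dataId, id=42, run="calib/flats")
        assert ref.run == "calib/flats"
        assert ref.unresolved().id is None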
134 """
136 _serializedType = SerializedDatasetRef
137 __slots__ = ("id", "datasetType", "dataId", "run",)
139 def __init__(
140 self,
141 datasetType: DatasetType, dataId: DataCoordinate, *,
142 id: Optional[DatasetId] = None,
143 run: Optional[str] = None,
144 conform: bool = True
145 ):
146 self.id = id
147 self.datasetType = datasetType
148 if conform:
149 self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
150 else:
151 self.dataId = dataId
152 if self.id is not None:
153 if run is None:
154 raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
155 f"type={datasetType}, and dataId={dataId}.")
156 self.run = run
157 else:
158 if run is not None:
159 raise ValueError("'run' cannot be provided unless 'id' is.")
160 self.run = None
162 def __eq__(self, other: Any) -> bool:
163 try:
164 return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
165 except AttributeError:
166 return NotImplemented
168 def __hash__(self) -> int:
169 return hash((self.datasetType, self.dataId, self.id))
171 @property
172 def dimensions(self) -> DimensionGraph:
173 """Dimensions associated with the underlying `DatasetType`."""
174 return self.datasetType.dimensions
176 def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder for users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        if self.id is not None:
            return (f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})")
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
187 s = f"{self.datasetType.name}@{self.dataId!s}, sc={self.datasetType.storageClass.name}]"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, DatasetType name and then by DataCoordinate
        # The __str__ representation is probably close enough but we
        # need to ensure that sorting a DatasetRef matches what you would
        # get if you sorted DatasetType+DataCoordinate
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined, takes precedence over DatasetType
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in the priority order
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to a simple, serializable form.
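
        Examples
        --------
        A rough sketch of a round trip; ``ref`` and ``universe`` are assumed
        to exist already (a resolved `DatasetRef` and its
        `DimensionUniverse`)::

            simple = ref.to_simple()
            restored = DatasetRef.from_simple(simple, universe=universe)
            assert restored == ref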
222 """
223 if minimal and self.id is not None:
224 # The only thing needed to uniquely define a DatasetRef
225 # is its id so that can be used directly if it is
226 # resolved and if it is not a component DatasetRef.
            # Store it in a dict to allow us to easily add the planned
            # origin information later without having to support
            # an int and dict in simple form.
            simple: Dict[str, Any] = {"id": self.id}
            if self.isComponent():
                # We can still be a little minimalist with a component
                # but we will also need to record the datasetType component
                simple["component"] = self.datasetType.component()
            return SerializedDatasetRef(**simple)

        # Convert to a dict form
        as_dict: Dict[str, Any] = {"datasetType": self.datasetType.to_simple(minimal=minimal),
                                   "dataId": self.dataId.to_simple(),
                                   }

        # Only include the id entry if it is defined
        if self.id is not None:
            as_dict["run"] = self.run
            as_dict["id"] = self.id

        return SerializedDatasetRef(**as_dict)

    @classmethod
    def from_simple(cls, simple: SerializedDatasetRef,
                    universe: Optional[DimensionUniverse] = None,
                    registry: Optional[Registry] = None) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetRef`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert simple form of a DatasetRef to
            a full `DatasetRef`. Can be `None` if a full description of
            the type is provided along with a universe.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
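
        Examples
        --------
        A hedged sketch, assuming ``simple`` came from `to_simple` and that a
        `Registry` instance ``registry`` is available to supply the dimension
        universe::

            ref = DatasetRef.from_simple(simple, registry=registry)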
273 """
274 # Minimalist component will just specify component and id and
275 # require registry to reconstruct
276 if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}):
277 if registry is None:
278 raise ValueError("Registry is required to construct component DatasetRef from integer id")
279 if simple.id is None:
280 raise ValueError("For minimal DatasetRef the ID must be defined.")
281 ref = registry.getDataset(simple.id)
282 if ref is None:
283 raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
284 if simple.component:
285 ref = ref.makeComponentRef(simple.component)
286 return ref
288 if universe is None and registry is None:
289 raise ValueError("One of universe or registry must be provided.")
291 if universe is None and registry is not None:
292 universe = registry.dimensions
294 if universe is None:
295 # this is for mypy
296 raise ValueError("Unable to determine a usable universe")
298 if simple.datasetType is None:
299 # mypy
300 raise ValueError("The DatasetType must be specified to construct a DatasetRef")
301 datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)
303 if simple.dataId is None:
304 # mypy
305 raise ValueError("The DataId must be specified to construct a DatasetRef")
306 dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)
307 return cls(datasetType, dataId,
308 id=simple.id, run=simple.run)
310 to_json = to_json_pydantic
311 from_json = classmethod(from_json_pydantic)
313 @classmethod
314 def _unpickle(
315 cls,
316 datasetType: DatasetType,
317 dataId: DataCoordinate,
318 id: Optional[DatasetId],
319 run: Optional[str],
320 ) -> DatasetRef:
321 """Create new `DatasetRef`.
323 A custom factory method for use by `__reduce__` as a workaround for
324 its lack of support for keyword arguments.
325 """
326 return cls(datasetType, dataId, id=id, run=run)
328 def __reduce__(self) -> tuple:
329 return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))
331 def __deepcopy__(self, memo: dict) -> DatasetRef:
332 # DatasetRef is recursively immutable; see note in @immutable
333 # decorator.
334 return self
336 def resolved(self, id: DatasetId, run: str) -> DatasetRef:
337 """Return resolved `DatasetRef`.
339 This is a new `DatasetRef` with the same data ID and dataset type
340 and the given ID and run.
342 Parameters
343 ----------
344 id : `DatasetId`
345 The unique identifier assigned when the dataset is created.
346 run : `str`
347 The run this dataset was associated with when it was created.
349 Returns
350 -------
351 ref : `DatasetRef`
352 A new `DatasetRef`.
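
        Examples
        --------
        A sketch with a hypothetical integer ID and run name; ``ref`` is
        assumed to be an unresolved `DatasetRef`::

            resolved_ref = ref.resolved(id=42, run="ingest/run1")
            assert resolved_ref.getCheckedId() == 42
            assert resolved_ref.unresolved() == ref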
353 """
354 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
355 id=id, run=run, conform=False)
357 def unresolved(self) -> DatasetRef:
358 """Return unresolved `DatasetRef`.
360 This is a new `DatasetRef` with the same data ID and dataset type,
361 but no ID or run.
363 Returns
364 -------
365 ref : `DatasetRef`
366 A new `DatasetRef`.
368 Notes
369 -----
370 This can be used to compare only the data ID and dataset type of a
371 pair of `DatasetRef` instances, regardless of whether either is
372 resolved::
374 if ref1.unresolved() == ref2.unresolved():
375 ...
376 """
377 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)
379 def expanded(self, dataId: DataCoordinate) -> DatasetRef:
380 """Return a new `DatasetRef` with the given expanded data ID.
382 Parameters
383 ----------
384 dataId : `DataCoordinate`
385 Data ID for the new `DatasetRef`. Must compare equal to the
386 original data ID.
388 Returns
389 -------
390 ref : `DatasetRef`
391 A new `DatasetRef` with the given data ID.
392 """
393 assert dataId == self.dataId
394 return DatasetRef(datasetType=self.datasetType, dataId=dataId,
395 id=self.id, run=self.run,
396 conform=False)
398 def isComponent(self) -> bool:
399 """Indicate whether this `DatasetRef` refers to a component.
401 Returns
402 -------
403 isComponent : `bool`
404 `True` if this `DatasetRef` is a component, `False` otherwise.
405 """
406 return self.datasetType.isComponent()
408 def isComposite(self) -> bool:
409 """Boolean indicating whether this `DatasetRef` is a composite type.
411 Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded to
        # assume it will return False if the type doesn't match the key type
        # of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
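
        Examples
        --------
        A sketch, assuming ``refs`` is an existing iterable of `DatasetRef`
        instances with a mix of dataset types::

            grouped = DatasetRef.groupByType(refs)
            for datasetType, refsOfType in grouped.items():
                print(datasetType.name, len(refsOfType))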
459 """
460 result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
461 for ref in refs:
462 result.setdefault(ref.datasetType, []).append(ref)
463 return result
465 def getCheckedId(self) -> DatasetId:
466 """Return ``self.id``, or raise if it is `None`.
468 This trivial method exists to allow operations that would otherwise be
469 natural list comprehensions to check that the ID is not `None` as well.
471 Returns
        -------
        id : `DatasetId`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
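
        Examples
        --------
        A sketch of the intended list-comprehension use, assuming ``refs``
        holds only resolved references::

            ids = [ref.getCheckedId() for ref in refs]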
480 """
481 if self.id is None:
482 raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
483 f"a resolved reference is required.")
484 return self.id
486 def makeCompositeRef(self) -> DatasetRef:
487 """Create a `DatasetRef` of the composite from a component ref.
489 Requires that this `DatasetRef` is a component.
491 Returns
492 -------
493 ref : `DatasetRef`
494 A `DatasetRef` with a dataset type that corresponds to the
495 composite parent of this component, and the same ID and run
496 (which may be `None`, if they are `None` in ``self``).
497 """
498 # Assume that the data ID does not need to be standardized
499 # and should match whatever this ref already has.
500 return DatasetRef(self.datasetType.makeCompositeDatasetType(), self.dataId,
501 id=self.id, run=self.run, conform=False)
503 def makeComponentRef(self, name: str) -> DatasetRef:
504 """Create a `DatasetRef` that corresponds to a component.
506 Parameters
507 ----------
508 name : `str`
509 Name of the component.
511 Returns
512 -------
513 ref : `DatasetRef`
514 A `DatasetRef` with a dataset type that corresponds to the given
515 component, and the same ID and run
516 (which may be `None`, if they are `None` in ``self``).
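
        Examples
        --------
        A sketch with a hypothetical component name; the parent dataset type
        is assumed to define a ``wcs`` component::

            wcs_ref = ref.makeComponentRef("wcs")
            assert wcs_ref.isComponent()
            assert wcs_ref.id == ref.id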
517 """
518 # Assume that the data ID does not need to be standardized
519 # and should match whatever this ref already has.
520 return DatasetRef(self.datasetType.makeComponentDatasetType(name), self.dataId,
521 id=self.id, run=self.run, conform=False)
523 datasetType: DatasetType
524 """The definition of this dataset (`DatasetType`).
526 Cannot be changed after a `DatasetRef` is constructed.
527 """
529 dataId: DataCoordinate
530 """A mapping of `Dimension` primary key values that labels the dataset
531 within a Collection (`DataCoordinate`).
533 Cannot be changed after a `DatasetRef` is constructed.
534 """
536 run: Optional[str]
537 """The name of the run that produced the dataset.
539 Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
540 `unresolved` to add or remove this information when creating a new
541 `DatasetRef`.
542 """
544 id: Optional[DatasetId]
545 """Primary key of the dataset (`DatasetId` or `None`).
547 Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
548 `unresolved` to add or remove this information when creating a new
549 `DatasetRef`.
550 """