Coverage for python/lsst/daf/butler/core/datasets/ref.py: 32%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetId", "DatasetRef", "SerializedDatasetRef"]

import uuid
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
    Union,
)

from pydantic import BaseModel, StrictStr, ConstrainedInt, validator

from lsst.utils.classes import immutable
from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate
from ..configSupport import LookupKey
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType
from ..json import from_json_pydantic, to_json_pydantic

if TYPE_CHECKING:
    from ...registry import Registry


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class PositiveInt(ConstrainedInt):
    ge = 0
    strict = True


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    # DO NOT change order in the Union, pydantic is sensitive to that!
    id: Optional[Union[uuid.UUID, PositiveInt]] = None
    datasetType: Optional[SerializedDatasetType] = None
    dataId: Optional[SerializedDataCoordinate] = None
    run: Optional[StrictStr] = None
    component: Optional[StrictStr] = None

    @validator("dataId")
    def _check_dataId(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Can not specify 'dataId' without specifying 'datasetType'")
        return v

    @validator("run")
    def _check_run(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def _check_component(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given.
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
        return v

    @classmethod
    def direct(cls, *, id: Optional[Union[str, int]] = None, datasetType: Optional[Dict[str, Any]] = None,
               dataId: Optional[Dict[str, Any]] = None, run: Optional[str] = None,
               component: Optional[str] = None) -> SerializedDatasetRef:
        """Construct a `SerializedDatasetRef` directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedDatasetRef.__new__(cls)
        setter = object.__setattr__
        setter(node, 'id', uuid.UUID(id) if isinstance(id, str) else id)
        setter(node, 'datasetType',
               datasetType if datasetType is None else SerializedDatasetType.direct(**datasetType))
        setter(node, 'dataId', dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
        setter(node, 'run', run)
        setter(node, 'component', component)
        setter(node, '__fields_set__', {'id', 'datasetType', 'dataId', 'run', 'component'})
        return node
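

# A minimal sketch of validator-free construction via
# ``SerializedDatasetRef.direct`` (the UUID string and run name below are
# illustrative placeholders, not real data; because validation is skipped,
# the inputs must already be trusted):
#
#     sref = SerializedDatasetRef.direct(
#         id="11111111-2222-3333-4444-555555555555",
#         run="hypothetical_run",
#     )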


DatasetId = Union[int, uuid.UUID]
"""A type-annotation alias for a dataset ID, which may be either an integer
or a UUID.
"""


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `DatasetId`, optional
        The unique identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created.  Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility.  New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
    """

    _serializedType = SerializedDatasetRef
    __slots__ = ("id", "datasetType", "dataId", "run",)

    def __init__(
        self,
        datasetType: DatasetType, dataId: DataCoordinate, *,
        id: Optional[DatasetId] = None,
        run: Optional[str] = None,
        conform: bool = True
    ):
        self.id = id
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
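
    # A minimal usage sketch (``datasetType`` and ``dataId`` stand for
    # existing `DatasetType` and `DataCoordinate` instances; the run name is
    # a hypothetical placeholder):
    #
    #     unresolved = DatasetRef(datasetType, dataId)
    #     resolved = DatasetRef(datasetType, dataId, id=uuid.uuid4(),
    #                           run="hypothetical_run")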

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder for users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s} [sc={self.datasetType.storageClass.name}]"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, then DatasetType name, and then by DataCoordinate.
        # The __str__ representation is probably close enough, but we need to
        # ensure that sorting a DatasetRef matches what you would get if you
        # sorted DatasetType+DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; it takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)
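
    # Hence ``sorted(refs)`` over an iterable of `DatasetRef` orders by run,
    # then dataset type, then data ID (a sketch; ``refs`` is assumed to
    # exist):
    #
    #     for ref in sorted(refs):
    #         print(ref)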

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization.  Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to simplified form.
        """
        if minimal and self.id is not None:
            # The only thing needed to uniquely define a DatasetRef
            # is its id, so that can be used directly if it is
            # resolved and if it is not a component DatasetRef.
            # Store it in a dict to allow us to easily add the planned
            # origin information later without having to support
            # both an int and a dict in simple form.
            simple: Dict[str, Any] = {"id": self.id}
            if self.isComponent():
                # We can still be a little minimalist with a component,
                # but we will also need to record the datasetType component.
                simple["component"] = self.datasetType.component()
            return SerializedDatasetRef(**simple)

        # Convert to a dict form.
        as_dict: Dict[str, Any] = {"datasetType": self.datasetType.to_simple(minimal=minimal),
                                   "dataId": self.dataId.to_simple(),
                                   }

        # Only include the id entry if it is defined.
        if self.id is not None:
            as_dict["run"] = self.run
            as_dict["id"] = self.id

        return SerializedDatasetRef(**as_dict)

    @classmethod
    def from_simple(cls, simple: SerializedDatasetRef,
                    universe: Optional[DimensionUniverse] = None,
                    registry: Optional[Registry] = None,
                    datasetType: Optional[DatasetType] = None) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetRef`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple form of a DatasetRef to
            a full `DatasetRef`.  Can be `None` if a full description of
            the type is provided along with a universe.
        datasetType : `DatasetType`, optional
            If supplied, this will be used as the dataset type of the
            resulting `DatasetRef` instead of being read from the
            `SerializedDatasetRef`.  This is useful when many refs share the
            same type, since memory can be saved.  Defaults to `None`.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
        """
        # A minimalist component will just specify component and id and
        # require the registry to reconstruct the rest.
        if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}):
            if registry is None:
                raise ValueError("Registry is required to construct component DatasetRef from integer id")
            if simple.id is None:
                raise ValueError("For minimal DatasetRef the ID must be defined.")
            ref = registry.getDataset(simple.id)
            if ref is None:
                raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
            if simple.component:
                ref = ref.makeComponentRef(simple.component)
            return ref

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            universe = registry.dimensions

        if universe is None:
            # this is for mypy
            raise ValueError("Unable to determine a usable universe")

        if simple.datasetType is None and datasetType is None:
            # mypy
            raise ValueError("The DatasetType must be specified to construct a DatasetRef")
        if datasetType is None:
            if simple.datasetType is None:
                raise ValueError("Cannot determine Dataset type of this serialized class")
            datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)

        if simple.dataId is None:
            # mypy
            raise ValueError("The DataId must be specified to construct a DatasetRef")
        dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)
        return cls(datasetType, dataId,
                   id=simple.id, run=simple.run)
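
    # A round-trip sketch (``ref`` stands for an existing resolved
    # `DatasetRef` and ``universe`` for its `DimensionUniverse`; both are
    # assumed, not defined here):
    #
    #     simple = ref.to_simple()
    #     same_ref = DatasetRef.from_simple(simple, universe=universe)
    #     assert same_ref == ref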

    to_json = to_json_pydantic
    from_json = classmethod(from_json_pydantic)

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: Optional[DatasetId],
        run: Optional[str],
    ) -> DatasetRef:
        """Create new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def resolved(self, id: DatasetId, run: str) -> DatasetRef:
        """Return resolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `DatasetId`
            The unique identifier assigned when the dataset is created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
                          id=id, run=run, conform=False)
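
    # A sketch of resolving a ref (the UUID and run name below are
    # illustrative placeholders):
    #
    #     resolved_ref = ref.resolved(id=uuid.uuid4(), run="hypothetical_run")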

    def unresolved(self) -> DatasetRef:
        """Return unresolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`.  Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run,
                          conform=False)

    def isComponent(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Indicate whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special-case the instrument Dimension, since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded to
        # assume it will return False if the type doesn't match the key type
        # of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result
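
    # A grouping sketch (``refs`` is an assumed iterable of `DatasetRef`):
    #
    #     for datasetType, refs_for_type in DatasetRef.groupByType(refs).items():
    #         print(datasetType.name, len(refs_for_type))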

    def getCheckedId(self) -> DatasetId:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise
        be natural list comprehensions to check that the ID is not `None` as
        well.

        Returns
        -------
        id : `DatasetId`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
                                        "a resolved reference is required.")
        return self.id
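
    # A sketch of the list-comprehension use case the docstring mentions
    # (``refs`` is assumed; this raises `AmbiguousDatasetError` if any ref
    # is unresolved):
    #
    #     ids = [ref.getCheckedId() for ref in refs]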

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(self.datasetType.makeCompositeDatasetType(), self.dataId,
                          id=self.id, run=self.run, conform=False)

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(self.datasetType.makeComponentDatasetType(name), self.dataId,
                          id=self.id, run=self.run, conform=False)
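
    # A component round-trip sketch (the component name "wcs" is illustrative
    # and assumes the composite dataset type defines such a component; the
    # reconstructed parent should compare equal to the original ref):
    #
    #     component_ref = ref.makeComponentRef("wcs")
    #     assert component_ref.makeCompositeRef() == ref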

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[DatasetId]
    """Primary key of the dataset (`DatasetId` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """