# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["DatasetRef"]

import hashlib
from typing import Any, Dict, Iterable, Iterator, Mapping, Optional, Tuple
from types import MappingProxyType

from ..dimensions import DataCoordinate, DimensionGraph, ExpandedDataCoordinate
from ..configSupport import LookupKey
from ..utils import immutable
from .type import DatasetType


@immutable
class DatasetRef:
37 """Reference to a Dataset in a `Registry`.
39 A `DatasetRef` may point to a Dataset that currently does not yet exist
40 (e.g., because it is a predicted input for provenance).
42 Parameters
43 ----------
44 datasetType : `DatasetType`
45 The `DatasetType` for this Dataset.
46 dataId : `DataCoordinate`
47 A mapping of dimensions that labels the Dataset within a Collection.
48 id : `int`, optional
49 The unique integer identifier assigned when the dataset is created.
50 run : `str`, optional
51 The name of the run this dataset was associated with when it was
52 created.
53 hash : `bytes`, optional
54 A hash of the dataset type and data ID. Should only be provided if
55 copying from another `DatasetRef` with the same dataset type and data
56 ID.
57 components : `dict`, optional
58 A dictionary mapping component name to a `DatasetRef` for that
59 component. Should not be passed unless ``id`` is also provided (i.e.
60 if this is a "resolved" reference).
61 conform : `bool`, optional
62 If `True` (default), call `DataCoordinate.standardize` to ensure that
63 the data ID's dimensions are consistent with the dataset type's.
64 `DatasetRef` instances for which those dimensions are not equal should
65 not be created in new code, but are still supported for backwards
66 compatibility. New code should only pass `False` if it can guarantee
67 that the dimensions are already consistent.
69 Raises
70 ------
71 ValueError
72 Raised if ``run`` or ``components`` is provided but ``id`` is not, or
73 if a component dataset is inconsistent with the storage class.
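
    Examples
    --------
    A minimal sketch of constructing an unresolved reference (assumes
    ``datasetType`` is an existing `DatasetType` whose dimensions are
    ``{instrument, visit}``; the data ID values are placeholders)::

        ref = DatasetRef(datasetType, {"instrument": "HSC", "visit": 42})
        assert ref.id is None  # unresolved until assigned by a `Registry`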
74 """

    __slots__ = ("id", "datasetType", "dataId", "run", "_hash", "_components")

    def __new__(cls, datasetType: DatasetType, dataId: DataCoordinate, *,
                id: Optional[int] = None,
                run: Optional[str] = None, hash: Optional[bytes] = None,
                components: Optional[Mapping[str, DatasetRef]] = None, conform: bool = True) -> DatasetRef:
        self = super().__new__(cls)
        assert isinstance(datasetType, DatasetType)
        self.id = id
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            self._components = dict()
            if components is not None:
                self._components.update(components)
            for k, v in self._components.items():
                expectedStorageClass = self.datasetType.storageClass.components.get(k)
                if expectedStorageClass is None:
                    raise ValueError(f"{k} is not a valid component for "
                                     f"storage class {self.datasetType.storageClass.name}.")
                if not isinstance(v, DatasetRef):
                    # It's easy to accidentally pass DatasetType or
                    # StorageClass; make that error message friendly.
                    raise ValueError(f"Component {k}={v} is not a DatasetRef.")
                if v.id is None:
                    raise ValueError(f"DatasetRef components must be resolved ({k}={v} isn't).")
                if expectedStorageClass != v.datasetType.storageClass:
                    raise ValueError(f"Storage class mismatch for component {k}: "
                                     f"{v.datasetType.storageClass.name} != {expectedStorageClass.name}")
            # TODO: it would be nice to guarantee that id and run should be
            # either both None or not None together. We can't easily do that
            # yet because the Query infrastructure has a hard time obtaining
            # run strings, so we allow run to be `None` here, but that will
            # change.
            self.run = run
        else:
            self._components = None
            if components:
                raise ValueError("'components' cannot be provided unless 'id' is.")
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
        if hash is not None:
            # We only set self._hash if we know it; this plays nicely with
            # the @immutable decorator, which allows an attribute to be set
            # only one time.
            self._hash = hash
        return self

    def __eq__(self, other: DatasetRef):
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def hash(self) -> bytes:
        """Secure hash of the `DatasetType` name and data ID (`bytes`).
140 """
141 if not hasattr(self, "_hash"):
142 message = hashlib.blake2b(digest_size=32)
143 message.update(self.datasetType.name.encode("utf8"))
144 self.dataId.fingerprint(message.update)
145 self._hash = message.digest()
146 return self._hash

    @property
    def components(self) -> Optional[Mapping[str, DatasetRef]]:
        """Named `DatasetRef` components (`~collections.abc.Mapping` or
        `None`).

        For resolved `DatasetRef` instances, this is a read-only view of a
        mapping whose contents may still be updated in-place via
        `Registry.attachComponent()`. For unresolved instances, this is
        always `None`.
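
        Examples
        --------
        A hypothetical sketch for inspecting a resolved composite
        reference (``ref`` is illustrative, not a real object here)::

            if ref.components is not None:
                for name, componentRef in ref.components.items():
                    print(name, componentRef.id)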
156 """
157 if self._components is None:
158 return None
159 return MappingProxyType(self._components)

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions associated with the underlying `DatasetType`
        (`DimensionGraph`).
        """
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the
        # guidelines for __repr__ - is much harder for users to read, while
        # its __str__ just produces a dict that can also be passed to
        # DatasetRef's constructor.
        if self.id is not None:
            return (f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r}, "
                    f"components={self._components})")
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s
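
    # Pickle support: DatasetRef is constructed entirely in __new__ and is
    # immutable afterwards, so unpickling must go back through __new__ with
    # the original arguments rather than setting attributes on an empty
    # instance.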
    def __getnewargs_ex__(self) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
        return ((self.datasetType, self.dataId),
                {"id": self.id, "run": self.run, "components": self._components})

    def resolved(self, id: int, run: str, components: Optional[Mapping[str, DatasetRef]] = None
                 ) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is
            created.
        run : `str`
            The run this dataset was associated with when it was created.
        components : `dict`, optional
            A dictionary mapping component name to a `DatasetRef` for that
            component. If ``self`` is already a resolved `DatasetRef`,
            its components will be merged with this dictionary, with this
            dictionary taking precedence.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
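
        Examples
        --------
        A hypothetical sketch; in practice the ``id`` and ``run`` values
        come from a `Registry` when the dataset is inserted (the literals
        below are placeholders)::

            resolvedRef = ref.resolved(id=42, run="example/run")
            assert resolvedRef.unresolved() == ref.unresolved()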
209 """
210 if self._components is not None:
211 newComponents = self._components.copy()
212 else:
213 newComponents = {}
214 if components:
215 newComponents.update(components)
216 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
217 id=id, run=run, hash=self.hash, components=newComponents, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type,
        but no ID, run, or components.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, hash=self.hash, conform=False)

    def expanded(self, dataId: ExpandedDataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `ExpandedDataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
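
        Examples
        --------
        A hypothetical sketch, assuming a `Registry` named ``registry`` is
        available to expand the data ID::

            expandedRef = ref.expanded(registry.expandDataId(ref.dataId))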
252 """
253 assert dataId == self.dataId
254 return DatasetRef(datasetType=self.datasetType, dataId=dataId,
255 id=self.id, run=self.run, hash=self.hash, components=self.components,
256 conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a
        configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
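
        Examples
        --------
        A sketch of the expected ordering (keys shown informally; the
        dataset type and storage class names are made up)::

            # For a dataset type "calexp" with storage class "ExposureF"
            # and a data ID containing instrument="HSC", the keys come
            # back instrument-specific first:
            #   calexp (instrument=HSC), ExposureF (instrument=HSC),
            #   calexp, ExposureF
            names = ref._lookupNames()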
292 """
293 # Special case the instrument Dimension since we allow configs
294 # to include the instrument name in the hierarchy.
295 names = self.datasetType._lookupNames()
297 if "instrument" in self.dataId:
298 names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
299 for n in names) + names
301 return names

    @staticmethod
    def flatten(refs: Iterable[DatasetRef], *, parents: bool = True) -> Iterator[DatasetRef]:
        """Recursively transform an iterable over `DatasetRef` to include
        nested component `DatasetRef` instances.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            Input iterable to process. Must contain only resolved
            `DatasetRef` instances (i.e. with `DatasetRef.components` not
            `None`).
        parents : `bool`, optional
            If `True` (default) include the given datasets in the output
            iterable. If `False`, include only their components. This does
            not propagate recursively - only the outermost level of parents
            is ignored if ``parents`` is `False`.

        Yields
        ------
        ref : `DatasetRef`
            Either one of the given `DatasetRef` instances (only if
            ``parents`` is `True`) or one of its (recursive) children.

        Notes
        -----
        If ``parents`` is `True`, components are guaranteed to be yielded
        before their parents.
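
        Examples
        --------
        A hypothetical sketch; ``parentRef`` stands for a resolved
        composite `DatasetRef`::

            everything = list(DatasetRef.flatten([parentRef]))
            componentsOnly = list(DatasetRef.flatten([parentRef], parents=False))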
329 """
330 for ref in refs:
331 if ref.components is None:
332 raise TypeError(f"Unresolved ref '{ref} passed to 'flatten'.")
333 yield from DatasetRef.flatten(ref.components.values(), parents=True)
334 if parents:
335 yield ref

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset (`str` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """