Coverage for python/lsst/daf/butler/core/datasets/ref.py : 22%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetRef"]

import hashlib
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Tuple,
)

from types import MappingProxyType
from ..dimensions import DataCoordinate, DimensionGraph, ExpandedDataCoordinate
from ..configSupport import LookupKey
from ..utils import immutable, NamedKeyDict
from .type import DatasetType


class AmbiguousDatasetError(Exception):
    """Exception raised when a `DatasetRef` is not resolved (has no ID, run,
    or components), but the requested operation requires one of them.
    """


@immutable
class DatasetRef:
52 """Reference to a Dataset in a `Registry`.
54 A `DatasetRef` may point to a Dataset that currently does not yet exist
55 (e.g., because it is a predicted input for provenance).
57 Parameters
58 ----------
59 datasetType : `DatasetType`
60 The `DatasetType` for this Dataset.
61 dataId : `DataCoordinate`
62 A mapping of dimensions that labels the Dataset within a Collection.
63 id : `int`, optional
64 The unique integer identifier assigned when the dataset is created.
65 run : `str`, optional
66 The name of the run this dataset was associated with when it was
67 created. Must be provided if ``id`` is.
68 hash : `bytes`, optional
69 A hash of the dataset type and data ID. Should only be provided if
70 copying from another `DatasetRef` with the same dataset type and data
71 ID.
72 components : `dict`, optional
73 A dictionary mapping component name to a `DatasetRef` for that
74 component. Should not be passed unless ``id`` is also provided (i.e.
75 if this is a "resolved" reference).
76 conform : `bool`, optional
77 If `True` (default), call `DataCoordinate.standardize` to ensure that
78 the data ID's dimensions are consistent with the dataset type's.
79 `DatasetRef` instances for which those dimensions are not equal should
80 not be created in new code, but are still supported for backwards
81 compatibility. New code should only pass `False` if it can guarantee
82 that the dimensions are already consistent.
83 hasParentId : `bool`, optional
84 If `True` this `DatasetRef` is a component that has the ``id``
85 of the composite parent. This is set if the registry does not
86 know about individual components but does know about the composite.
88 Raises
89 ------
90 ValueError
91 Raised if ``run`` or ``components`` is provided but ``id`` is not, or
92 if a component dataset is inconsistent with the storage class, or if
93 ``id`` is provided but ``run`` is not.
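
    Examples
    --------
    A minimal sketch of constructing unresolved and resolved references (the
    dataset type name, dimensions, storage class, and run name here are
    hypothetical)::

        datasetType = DatasetType("calexp", dimensions, storageClass)
        ref = DatasetRef(datasetType, {"instrument": "HSC", "visit": 42})
        assert ref.id is None                      # unresolved
        resolvedRef = DatasetRef(datasetType, ref.dataId, id=1,
                                 run="ingest/run1")
        assert resolvedRef.id == 1                 # resolved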
94 """

    __slots__ = ("id", "datasetType", "dataId", "run", "_hash", "_components", "hasParentId")

    def __new__(cls, datasetType: DatasetType, dataId: DataCoordinate, *,
                id: Optional[int] = None,
                run: Optional[str] = None, hash: Optional[bytes] = None,
                components: Optional[Mapping[str, DatasetRef]] = None, conform: bool = True,
                hasParentId: bool = False) -> DatasetRef:
        self = super().__new__(cls)
        assert isinstance(datasetType, DatasetType)
        self.id = id
        self.datasetType = datasetType
        self.hasParentId = hasParentId
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            self._components = dict()
            if components is not None:
                self._components.update(components)
            for k, v in self._components.items():
                expectedStorageClass = self.datasetType.storageClass.components.get(k)
                if expectedStorageClass is None:
                    raise ValueError(f"{k} is not a valid component for "
                                     f"storage class {self.datasetType.storageClass.name}.")
                if not isinstance(v, DatasetRef):
                    # It's easy to accidentally pass DatasetType or
                    # StorageClass; make that error message friendly.
                    raise ValueError(f"Component {k}={v} is not a DatasetRef.")
                if v.id is None:
                    raise ValueError(f"DatasetRef components must be resolved ({k}={v} isn't).")
                if expectedStorageClass != v.datasetType.storageClass:
                    raise ValueError(f"Storage class mismatch for component {k}: "
                                     f"{v.datasetType.storageClass.name} != {expectedStorageClass.name}")
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            self._components = None
            if components:
                raise ValueError("'components' cannot be provided unless 'id' is.")
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
        if hash is not None:
            # We only set self._hash if we know it; this plays nicely with
            # the @immutable decorator, which allows an attribute to be set
            # only one time.
            self._hash = hash
        return self

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def hash(self) -> bytes:
        """Secure hash of the `DatasetType` name and data ID (`bytes`).
160 """
161 if not hasattr(self, "_hash"):
162 message = hashlib.blake2b(digest_size=32)
163 message.update(self.datasetType.name.encode("utf8"))
164 self.dataId.fingerprint(message.update)
165 self._hash = message.digest()
166 return self._hash

    @property
    def components(self) -> Optional[Mapping[str, DatasetRef]]:
        """Named `DatasetRef` components (`~collections.abc.Mapping` or
        `None`).

        For resolved `DatasetRef` instances, this is a read-only mapping. For
        unresolved instances, this is always `None`.
        """
        if self._components is None:
            return None
        return MappingProxyType(self._components)

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions associated with the underlying `DatasetType`
        (`DimensionGraph`).
        """
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the
        # guidelines for __repr__ - is much harder for users to read, while
        # its __str__ just produces a dict that can also be passed to
        # DatasetRef's constructor.
        if self.id is not None:
            return (f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r}, "
                    f"components={self._components})")
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __getnewargs_ex__(self) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
        return ((self.datasetType, self.dataId),
                {"id": self.id, "run": self.run, "components": self._components})

    def resolved(self, id: int, run: str, components: Optional[Mapping[str, DatasetRef]] = None
                 ) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is
            created.
        run : `str`
            The run this dataset was associated with when it was created.
        components : `dict`, optional
            A dictionary mapping component name to a `DatasetRef` for that
            component. If ``self`` is already a resolved `DatasetRef`,
            its components will be merged with this dictionary, with this
            dictionary taking precedence.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
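
        Examples
        --------
        A sketch, assuming ``ref`` is an unresolved reference and the ID and
        run name are hypothetical::

            resolvedRef = ref.resolved(id=42, run="ingest/run1")
            assert resolvedRef.id == 42
            assert resolvedRef.run == "ingest/run1"
            assert resolvedRef.unresolved() == ref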
228 """
229 if self._components is not None:
230 newComponents = self._components.copy()
231 else:
232 newComponents = {}
233 if components:
234 newComponents.update(components)
235 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
236 id=id, run=run, hash=self.hash, components=newComponents, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type,
        but no ID, run, or components.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, hash=self.hash, conform=False)

    def expanded(self, dataId: ExpandedDataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `ExpandedDataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
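
        Examples
        --------
        A sketch, assuming ``registry`` is a `Registry` whose
        `~Registry.expandDataId` method can expand this data ID::

            expandedRef = ref.expanded(registry.expandDataId(ref.dataId))
            assert expandedRef == ref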
271 """
272 assert dataId == self.dataId
273 return DatasetRef(datasetType=self.datasetType, dataId=dataId,
274 id=self.id, run=self.run, hash=self.hash, components=self.components,
275 conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a
        configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
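
        Notes
        -----
        A sketch of the ordering (the reference and its data ID here are
        hypothetical)::

            names = ref._lookupNames()
            # Instrument-specific variants of the dataset-type and
            # storage-class keys come first, followed by the general
            # dataset-type and storage-class keys.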
311 """
312 # Special case the instrument Dimension since we allow configs
313 # to include the instrument name in the hierarchy.
314 names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()
316 if "instrument" in self.dataId:
317 names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
318 for n in names) + names
320 return names

    def allRefs(self, parents: bool = True) -> Iterator[DatasetRef]:
        """Return all the nested component `DatasetRef` instances and
        optionally the parent.

        Parameters
        ----------
        parents : `bool`, optional
            If `True` (default) include the given dataset in the output
            iterable. If `False`, include only its components. This does
            not propagate recursively - only the outermost level of parents
            is ignored if ``parents`` is `False`.

        Yields
        ------
        ref : `DatasetRef`
            Itself (only if ``parents`` is `True`) or one of its (recursive)
            children.

        Notes
        -----
        If ``parents`` is `True`, components are guaranteed to be yielded
        before their parents.
        """
        if self.components is None:
            raise AmbiguousDatasetError(f"Unresolved ref {self} cannot be flattened.")
        yield from DatasetRef.flatten(self.components.values(), parents=True)
        if parents:
            yield self

    @staticmethod
    def flatten(refs: Iterable[DatasetRef], *, parents: bool = True) -> Iterator[DatasetRef]:
        """Recursively transform an iterable over `DatasetRef` to include
        nested component `DatasetRef` instances.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            Input iterable to process. Must contain only resolved
            `DatasetRef` instances (i.e. with `DatasetRef.components` not
            `None`).
        parents : `bool`, optional
            If `True` (default) include the given datasets in the output
            iterable. If `False`, include only their components. This does
            not propagate recursively - only the outermost level of parents
            is ignored if ``parents`` is `False`.

        Yields
        ------
        ref : `DatasetRef`
            Either one of the given `DatasetRef` instances (only if
            ``parents`` is `True`) or one of its (recursive) children.

        Notes
        -----
        If ``parents`` is `True`, components are guaranteed to be yielded
        before their parents.
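
        Examples
        --------
        A sketch, assuming ``ref`` is a resolved composite reference::

            refs = list(DatasetRef.flatten([ref]))
            # Components are yielded before their parents, so the composite
            # itself comes last.
            assert refs[-1] == ref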
377 """
378 for ref in refs:
379 for subref in ref.allRefs(parents):
380 yield subref

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef], *, recursive: bool = True
                    ) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.
        recursive : `bool`, optional
            If `True` (default), also group any `DatasetRef` instances found
            in the `DatasetRef.components` dictionaries of ``refs``,
            recursively. `True` also checks that references are "resolved"
            (unresolved references never have components).

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``recursive`` is `True` and one or more refs has
            ``DatasetRef.components is None`` (as is always the case for
            unresolved `DatasetRef` objects).
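
        Examples
        --------
        A sketch, assuming ``refs`` is an iterable of resolved references::

            grouped = DatasetRef.groupByType(refs)
            for datasetType, refsOfType in grouped.items():
                print(datasetType.name, len(refsOfType))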
408 """
409 result = NamedKeyDict()
410 iter = DatasetRef.flatten(refs) if recursive else refs
411 for ref in iter:
412 result.setdefault(ref.datasetType, []).append(ref)
413 return result

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise
        be natural list comprehensions to check that the ID is not `None` as
        well.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``self.id`` is `None`.
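
        Examples
        --------
        A sketch of the intended use in a comprehension, assuming every
        reference in ``refs`` must be resolved::

            ids = [ref.getCheckedId() for ref in refs]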
430 """
431 if self.id is None:
432 raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
433 f"a resolved reference is required.")
434 return self.id

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset (`str` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """