Coverage for python/lsst/daf/butler/core/datasets/ref.py: 26%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetRef", "FakeDatasetRef"]

import hashlib
from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Tuple

from types import MappingProxyType
from ..dimensions import DataCoordinate, DimensionGraph, ExpandedDataCoordinate
from ..configSupport import LookupKey
from ..utils import immutable, NamedKeyDict
from .type import DatasetType


class AmbiguousDatasetError(Exception):
    """Exception raised when a `DatasetRef` is not resolved (has no ID, run,
    or components), but the requested operation requires one of them.
    """


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created. Must be provided if ``id`` is.
    hash : `bytes`, optional
        A hash of the dataset type and data ID. Should only be provided if
        copying from another `DatasetRef` with the same dataset type and data
        ID.
    components : `dict`, optional
        A dictionary mapping component name to a `DatasetRef` for that
        component. Should not be passed unless ``id`` is also provided (i.e.
        if this is a "resolved" reference).
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` or ``components`` is provided but ``id`` is not, or
        if a component dataset is inconsistent with the storage class, or if
        ``id`` is provided but ``run`` is not.
    """

    __slots__ = ("id", "datasetType", "dataId", "run", "_hash", "_components")

    def __new__(cls, datasetType: DatasetType, dataId: DataCoordinate, *,
                id: Optional[int] = None,
                run: Optional[str] = None, hash: Optional[bytes] = None,
                components: Optional[Mapping[str, DatasetRef]] = None, conform: bool = True) -> DatasetRef:
        self = super().__new__(cls)
        assert isinstance(datasetType, DatasetType)
        self.id = id
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            self._components = dict()
            if components is not None:
                self._components.update(components)
            for k, v in self._components.items():
                expectedStorageClass = self.datasetType.storageClass.components.get(k)
                if expectedStorageClass is None:
                    raise ValueError(f"{k} is not a valid component for "
                                     f"storage class {self.datasetType.storageClass.name}.")
                if not isinstance(v, DatasetRef):
                    # It's easy to accidentally pass DatasetType or
                    # StorageClass; make that error message friendly.
                    raise ValueError(f"Component {k}={v} is not a DatasetRef.")
                if v.id is None:
                    raise ValueError(f"DatasetRef components must be resolved ({k}={v} isn't).")
                if expectedStorageClass != v.datasetType.storageClass:
                    raise ValueError(f"Storage class mismatch for component {k}: "
                                     f"{v.datasetType.storageClass.name} != {expectedStorageClass.name}")
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            self._components = None
            if components:
                raise ValueError("'components' cannot be provided unless 'id' is.")
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
        if hash is not None:
            # We only set self._hash if we know it; this plays nicely with
            # the @immutable decorator, which allows an attribute to be set
            # only one time.
            self._hash = hash
        return self
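
    # Construction sketch (illustrative comment, not part of the original
    # module; assumes ``flatType`` is a `DatasetType` and ``dataId`` a
    # `DataCoordinate` obtained from an already-configured `Registry`):
    #
    #     ref = DatasetRef(flatType, dataId)                        # unresolved
    #     ref = DatasetRef(flatType, dataId, id=42, run="raw/all")  # resolved
    #
    # Providing ``id`` without ``run`` (or ``run``/``components`` without
    # ``id``) raises ValueError, per the checks above.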

    def __eq__(self, other: DatasetRef):
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def hash(self) -> bytes:
        """Secure hash of the `DatasetType` name and data ID (`bytes`).
        """
        if not hasattr(self, "_hash"):
            message = hashlib.blake2b(digest_size=32)
            message.update(self.datasetType.name.encode("utf8"))
            self.dataId.fingerprint(message.update)
            self._hash = message.digest()
        return self._hash
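
    # Note (sketch): the hash covers only the dataset type name and data ID,
    # never ``id`` or ``run``, so resolving or unresolving a ref preserves it:
    #
    #     assert ref.unresolved().hash == ref.hash   # ``ref`` assumed resolved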

    @property
    def components(self) -> Optional[Mapping[str, DatasetRef]]:
        """Named `DatasetRef` components (`~collections.abc.Mapping` or
        `None`).

        For resolved `DatasetRef` instances, this is a read-only mapping. For
        unresolved instances, this is always `None`.
        """
        if self._components is None:
            return None
        return MappingProxyType(self._components)

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions associated with the underlying `DatasetType`
        (`DimensionGraph`).
        """
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the
        # guidelines for __repr__ - is much harder for users to read, while
        # its __str__ just produces a dict that can also be passed to
        # DatasetRef's constructor.
        if self.id is not None:
            return (f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r}, "
                    f"components={self._components})")
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __getnewargs_ex__(self) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
        return ((self.datasetType, self.dataId),
                {"id": self.id, "run": self.run, "components": self._components})

    def resolved(self, id: int, run: str, components: Optional[Mapping[str, DatasetRef]] = None
                 ) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is
            created.
        run : `str`
            The run this dataset was associated with when it was created.
        components : `dict`, optional
            A dictionary mapping component name to a `DatasetRef` for that
            component. If ``self`` is already a resolved `DatasetRef`,
            its components will be merged with this dictionary, with this
            dictionary taking precedence.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        if self._components is not None:
            newComponents = self._components.copy()
        else:
            newComponents = {}
        if components:
            newComponents.update(components)
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
                          id=id, run=run, hash=self.hash, components=newComponents, conform=False)
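
    # Usage sketch (hypothetical names; ``ref`` is an unresolved DatasetRef
    # and ``wcsRef`` a resolved ref for its "wcs" component):
    #
    #     newRef = ref.resolved(id=101, run="ingest/run1",
    #                           components={"wcs": wcsRef})
    #     assert newRef.id == 101 and newRef.run == "ingest/run1"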

    def unresolved(self) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type,
        but no ID, run, or components.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, hash=self.hash, conform=False)

    def expanded(self, dataId: ExpandedDataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `ExpandedDataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run, hash=self.hash, components=self.components,
                          conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names = self.datasetType._lookupNames()

        if "instrument" in self.dataId:
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names
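
    # Priority sketch (hypothetical values): for a dataset type named
    # "calexp" with instrument="HSC" in its data ID, the instrument-specific
    # clones of the dataset type and storage class keys come first, so
    # configuration lookups prefer them:
    #
    #     (calexp+HSC, <storageClass>+HSC, calexp, <storageClass>)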

    @staticmethod
    def flatten(refs: Iterable[DatasetRef], *, parents: bool = True) -> Iterator[DatasetRef]:
        """Recursively transform an iterable over `DatasetRef` to include
        nested component `DatasetRef` instances.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            Input iterable to process. Must contain only resolved `DatasetRef`
            instances (i.e. with `DatasetRef.components` not `None`).
        parents : `bool`, optional
            If `True` (default) include the given datasets in the output
            iterable. If `False`, include only their components. This does
            not propagate recursively - only the outermost level of parents
            is ignored if ``parents`` is `False`.

        Yields
        ------
        ref : `DatasetRef`
            Either one of the given `DatasetRef` instances (only if
            ``parents`` is `True`) or one of its (recursive) children.

        Notes
        -----
        If ``parents`` is `True`, components are guaranteed to be yielded
        before their parents.
        """
        for ref in refs:
            if ref.components is None:
                raise AmbiguousDatasetError(f"Unresolved ref {ref} passed to 'flatten'.")
            yield from DatasetRef.flatten(ref.components.values(), parents=True)
            if parents:
                yield ref
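
    # Ordering sketch (hypothetical refs): for a resolved composite ``parent``
    # whose components dict is {"image": image, "mask": mask},
    #
    #     list(DatasetRef.flatten([parent])) == [image, mask, parent]
    #
    # while ``parents=False`` would drop only the outermost ``parent``, not
    # any nested composites.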

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef], *, recursive: bool = True
                    ) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.
        recursive : `bool`, optional
            If `True` (default), also group any `DatasetRef` instances found
            in the `DatasetRef.components` dictionaries of ``refs``,
            recursively. `True` also checks that references are "resolved"
            (unresolved references never have components).

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``recursive`` is `True` and one or more refs has
            ``DatasetRef.components is None`` (as is always the case for
            unresolved `DatasetRef` objects).
        """
        result = NamedKeyDict()
        # Avoid shadowing the built-in ``iter`` for the flattened iterable.
        flattened = DatasetRef.flatten(refs) if recursive else refs
        for ref in flattened:
            result.setdefault(ref.datasetType, []).append(ref)
        return result
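
    # Grouping sketch (hypothetical ``refs`` holding resolved refs of mixed
    # dataset types):
    #
    #     for datasetType, refsOfType in DatasetRef.groupByType(refs).items():
    #         print(datasetType.name, len(refsOfType))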

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise
        be natural list comprehensions to check that the ID is not `None` as
        well.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
                                        "a resolved reference is required.")
        return self.id
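
    # The intended pattern (sketch): keep a plain comprehension while still
    # failing fast on any unresolved ref:
    #
    #     ids = [ref.getCheckedId() for ref in refs]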

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset (`str` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """


@immutable
class FakeDatasetRef:
    """A fake `DatasetRef` that can be used internally by butler where
    only the dataset ID is available.

    Should only be used when the registry cannot be used to create a full
    `DatasetRef` from the ID. A particular use case is during dataset
    deletion when solely the ID is available.

    Parameters
    ----------
    id : `int`
        The dataset ID.
    """

    __slots__ = ("id",)

    def __new__(cls, id: int):
        self = super().__new__(cls)
        self.id = id
        return self

    def __str__(self):
        return f"dataset_id={self.id}"

    def __repr__(self):
        return f"FakeDatasetRef({self.id})"

    def __eq__(self, other: FakeDatasetRef):
        try:
            return self.id == other.id
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash(self.id)

    @property
    def components(self):
        # Fake refs carry no component information; report an empty mapping
        # so that `DatasetRef.flatten` treats them as resolved.
        return {}

    @staticmethod
    def flatten(refs: Iterable[FakeDatasetRef], *, parents: bool = True) -> Iterator[DatasetRef]:
        return DatasetRef.flatten(refs, parents=parents)
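
    # Note (sketch): because ``components`` is always empty here, flattening
    # FakeDatasetRef instances just yields them back unchanged:
    #
    #     list(FakeDatasetRef.flatten([FakeDatasetRef(1), FakeDatasetRef(2)]))
    #     # -> [FakeDatasetRef(1), FakeDatasetRef(2)]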