Coverage for python/lsst/daf/butler/core/datasets/type.py: 24%
235 statements
coverage.py v7.2.7, created at 2023-08-05 01:26 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["DatasetType", "SerializedDatasetType"]

import re
from collections.abc import Callable, Iterable, Mapping
from copy import deepcopy
from types import MappingProxyType
from typing import TYPE_CHECKING, Any, ClassVar

from lsst.daf.butler._compat import _BaseModelCompat
from pydantic import StrictBool, StrictStr

from ..configSupport import LookupKey
from ..dimensions import DimensionGraph, SerializedDimensionGraph
from ..json import from_json_pydantic, to_json_pydantic
from ..persistenceContext import PersistenceContextVars
from ..storageClass import StorageClass, StorageClassFactory

if TYPE_CHECKING:
    from ...registry import Registry
    from ..dimensions import Dimension, DimensionUniverse


def _safeMakeMappingProxyType(data: Mapping | None) -> Mapping:
    if data is None:
        data = {}
    return MappingProxyType(data)


class SerializedDatasetType(_BaseModelCompat):
    """Simplified model of a `DatasetType` suitable for serialization."""

    name: StrictStr
    storageClass: StrictStr | None = None
    dimensions: SerializedDimensionGraph | None = None
    parentStorageClass: StrictStr | None = None
    isCalibration: StrictBool = False

    @classmethod
    def direct(
        cls,
        *,
        name: str,
        storageClass: str | None = None,
        dimensions: dict | None = None,
        parentStorageClass: str | None = None,
        isCalibration: bool = False,
    ) -> SerializedDatasetType:
71 """Construct a `SerializedDatasetType` directly without validators.
73 This differs from PyDantics construct method in that the arguments are
74 explicitly what the model requires, and it will recurse through
75 members, constructing them from their corresponding `direct` methods.
77 This method should only be called when the inputs are trusted.
78 """
        cache = PersistenceContextVars.serializedDatasetTypeMapping.get()
        key = (name, storageClass or "")
        if cache is not None and (type_ := cache.get(key, None)) is not None:
            return type_

        serialized_dimensions = (
            SerializedDimensionGraph.direct(**dimensions) if dimensions is not None else None
        )

        node = cls.model_construct(
            name=name,
            storageClass=storageClass,
            dimensions=serialized_dimensions,
            parentStorageClass=parentStorageClass,
            isCalibration=isCalibration,
        )

        if cache is not None:
            cache[key] = node
        return node
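
# Illustrative sketch (not part of the original source): building a trusted,
# validator-free model with ``direct``; the field values are hypothetical.
#
#     serialized = SerializedDatasetType.direct(
#         name="calexp.wcs",
#         storageClass="Wcs",
#         parentStorageClass="ExposureF",
#         isCalibration=False,
#     )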


class DatasetType:
    r"""A named category of Datasets.

    Defines how they are organized, related, and stored.

    A concrete, final class whose instances represent `DatasetType`\ s.
    `DatasetType` instances may be constructed without a `Registry`,
    but they must be registered
    via `Registry.registerDatasetType()` before corresponding Datasets
    may be added.
    `DatasetType` instances are immutable.

    Parameters
    ----------
    name : `str`
        A string name for the Dataset; must correspond to the same
        `DatasetType` across all Registries. Names must start with an
        upper or lowercase letter, and may contain only letters, numbers,
        and underscores. Component dataset types should contain a single
        period separating the base dataset type name from the component name
        (and may be recursive).
    dimensions : `DimensionGraph` or iterable of `Dimension` or `str`
        Dimensions used to label and relate instances of this `DatasetType`.
        If not a `DimensionGraph`, ``universe`` must be provided as well.
    storageClass : `StorageClass` or `str`
        Instance of a `StorageClass` or name of `StorageClass` that defines
        how this `DatasetType` is persisted.
    parentStorageClass : `StorageClass` or `str`, optional
        Instance of a `StorageClass` or name of `StorageClass` that defines
        how the composite parent is persisted. Must be `None` if this
        is not a component.
    universe : `DimensionUniverse`, optional
        Set of all known dimensions, used to normalize ``dimensions`` if it
        is not already a `DimensionGraph`.
    isCalibration : `bool`, optional
        If `True`, this dataset type may be included in
        `~CollectionType.CALIBRATION` collections.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
    """

    __slots__ = (
        "_name",
        "_dimensions",
        "_storageClass",
        "_storageClassName",
        "_parentStorageClass",
        "_parentStorageClassName",
        "_isCalibration",
    )

    _serializedType = SerializedDatasetType

    VALID_NAME_REGEX = re.compile("^[a-zA-Z_][a-zA-Z0-9_]*(\\.[a-zA-Z_][a-zA-Z0-9_]*)*$")
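
    # Illustrative sketch (not part of the original source): names the
    # validation regex accepts and rejects (dataset type names here are
    # hypothetical).
    #
    #     DatasetType.VALID_NAME_REGEX.match("calexp")       # match
    #     DatasetType.VALID_NAME_REGEX.match("calexp.wcs")   # match (component)
    #     DatasetType.VALID_NAME_REGEX.match("2calexp")      # None: leading digit
    #     DatasetType.VALID_NAME_REGEX.match("cal-exp")      # None: hyphen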

    @staticmethod
    def nameWithComponent(datasetTypeName: str, componentName: str) -> str:
        """Form a valid DatasetTypeName from a parent and component.

        No validation is performed.

        Parameters
        ----------
        datasetTypeName : `str`
            Base type name.
        componentName : `str`
            Name of component.

        Returns
        -------
        compTypeName : `str`
            Name to use for component DatasetType.
        """
        return f"{datasetTypeName}.{componentName}"

    def __init__(
        self,
        name: str,
        dimensions: DimensionGraph | Iterable[Dimension | str],
        storageClass: StorageClass | str,
        parentStorageClass: StorageClass | str | None = None,
        *,
        universe: DimensionUniverse | None = None,
        isCalibration: bool = False,
    ):
        if self.VALID_NAME_REGEX.match(name) is None:
            raise ValueError(f"DatasetType name '{name}' is invalid.")
        self._name = name
        if not isinstance(dimensions, DimensionGraph):
            if universe is None:
                raise ValueError(
                    "If dimensions is not a normalized DimensionGraph, a universe must be provided."
                )
            dimensions = universe.extract(dimensions)
        self._dimensions = dimensions
        if name in self._dimensions.universe.getGovernorDimensions().names:
            raise ValueError(f"Governor dimension name {name} cannot be used as a dataset type name.")
        if not isinstance(storageClass, StorageClass | str):
            raise ValueError(f"StorageClass argument must be StorageClass or str. Got {storageClass}")
        self._storageClass: StorageClass | None
        if isinstance(storageClass, StorageClass):
            self._storageClass = storageClass
            self._storageClassName = storageClass.name
        else:
            self._storageClass = None
            self._storageClassName = storageClass

        self._parentStorageClass: StorageClass | None = None
        self._parentStorageClassName: str | None = None
        if parentStorageClass is not None:
            if not isinstance(parentStorageClass, StorageClass | str):
                raise ValueError(
                    f"Parent StorageClass argument must be StorageClass or str. Got {parentStorageClass}"
                )

            # Only allowed for a component dataset type.
            _, componentName = self.splitDatasetTypeName(self._name)
            if componentName is None:
                raise ValueError(
                    f"Can not specify a parent storage class if this is not a component ({self._name})"
                )
            if isinstance(parentStorageClass, StorageClass):
                self._parentStorageClass = parentStorageClass
                self._parentStorageClassName = parentStorageClass.name
            else:
                self._parentStorageClassName = parentStorageClass

        # Ensure that parent storage class is specified when we have
        # a component and is not specified when we don't.
        _, componentName = self.splitDatasetTypeName(self._name)
        if parentStorageClass is None and componentName is not None:
            raise ValueError(
                f"Component dataset type '{self._name}' constructed without parent storage class"
            )
        if parentStorageClass is not None and componentName is None:
            raise ValueError(
                f"Parent storage class specified but dataset type {self._name} is not a component"
            )
        self._isCalibration = isCalibration
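
    # Illustrative sketch (not part of the original source): constructing a
    # dataset type. ``universe`` is assumed to come from an initialized
    # repository (e.g. ``butler.dimensions``) and the names are hypothetical.
    #
    #     pvi = DatasetType(
    #         "pvi",
    #         dimensions=["instrument", "visit", "detector"],
    #         storageClass="ExposureF",
    #         universe=universe,
    #     )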

    def __repr__(self) -> str:
        extra = ""
        if self._parentStorageClassName:
            extra = f", parentStorageClass={self._parentStorageClassName}"
        if self._isCalibration:
            extra += ", isCalibration=True"
        return f"DatasetType({self.name!r}, {self.dimensions}, {self._storageClassName}{extra})"

    def _equal_ignoring_storage_class(self, other: Any) -> bool:
        """Check everything is equal except the storage class.

        Parameters
        ----------
        other : `Any`
            Object to check against this one.

        Returns
        -------
        mostly : `bool`
            Returns `True` if everything except the storage class is equal.
        """
        if not isinstance(other, type(self)):
            return False
        if self._name != other._name:
            return False
        if self._dimensions != other._dimensions:
            return False
        if self._isCalibration != other._isCalibration:
            return False
        if self._parentStorageClass is not None and other._parentStorageClass is not None:
            return self._parentStorageClass == other._parentStorageClass
        else:
            return self._parentStorageClassName == other._parentStorageClassName

    def __eq__(self, other: Any) -> bool:
        mostly_equal = self._equal_ignoring_storage_class(other)
        if not mostly_equal:
            return False

        # Be careful not to force a storage class to import the corresponding
        # python code.
        if self._storageClass is not None and other._storageClass is not None:
            if self._storageClass != other._storageClass:
                return False
        else:
            if self._storageClassName != other._storageClassName:
                return False
        return True

    def is_compatible_with(self, other: DatasetType) -> bool:
        """Determine if the given `DatasetType` is compatible with this one.

        Compatibility requires a matching name and dimensions and a storage
        class for this dataset type that can convert the python type
        associated with the other storage class to this python type.

        Parameters
        ----------
        other : `DatasetType`
            Dataset type to check.

        Returns
        -------
        is_compatible : `bool`
            Returns `True` if the other dataset type is either the same as
            this or the storage class associated with the other can be
            converted to this.
        """
        mostly_equal = self._equal_ignoring_storage_class(other)
        if not mostly_equal:
            return False

        # If the storage class names match then they are compatible.
        if self._storageClassName == other._storageClassName:
            return True

        # Now required to check the full storage class.
        self_sc = self.storageClass
        other_sc = other.storageClass

        return self_sc.can_convert(other_sc)
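
    # Illustrative sketch (not part of the original source): two dataset
    # types differing only in storage class are compatible when one storage
    # class can convert the other's python type (names here are
    # hypothetical).
    #
    #     df_type = catalog_type                           # storageClass="DataFrame"
    #     aa_type = df_type.overrideStorageClass("ArrowAstropy")
    #     df_type.is_compatible_with(aa_type)              # True if conversion defined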

    def __hash__(self) -> int:
        """Hash DatasetType instance.

        This only uses StorageClass name, which is consistent with the
        implementation of the StorageClass hash method.
        """
        return hash((self._name, self._dimensions, self._storageClassName, self._parentStorageClassName))

    def __lt__(self, other: Any) -> bool:
        """Sort using the dataset type name."""
        if not isinstance(other, type(self)):
            return NotImplemented
        return self.name < other.name

    @property
    def name(self) -> str:
        """Return a string name for the Dataset.

        Must correspond to the same `DatasetType` across all Registries.
        """
        return self._name

    @property
    def dimensions(self) -> DimensionGraph:
        r"""Return the `Dimension`\ s for this dataset type.

        The dimensions label and relate instances of this
        `DatasetType` (`DimensionGraph`).
        """
        return self._dimensions

    @property
    def storageClass(self) -> StorageClass:
        """Return `StorageClass` instance associated with this dataset type.

        The `StorageClass` defines how this `DatasetType`
        is persisted. Note that if DatasetType was constructed with a name
        of a StorageClass then Butler has to be initialized before using
        this property.
        """
        if self._storageClass is None:
            self._storageClass = StorageClassFactory().getStorageClass(self._storageClassName)
        return self._storageClass

    @property
    def storageClass_name(self) -> str:
        """Return the storage class name.

        This will never force the storage class to be imported.
        """
        return self._storageClassName

    @property
    def parentStorageClass(self) -> StorageClass | None:
        """Return the storage class of the composite containing this component.

        Note that if DatasetType was constructed with a name of a
        StorageClass then Butler has to be initialized before using this
        property. Can be `None` if this is not a component of a composite.
        Must be defined if this is a component.
        """
        if self._parentStorageClass is None and self._parentStorageClassName is None:
            return None
        if self._parentStorageClass is None and self._parentStorageClassName is not None:
            self._parentStorageClass = StorageClassFactory().getStorageClass(self._parentStorageClassName)
        return self._parentStorageClass

    def isCalibration(self) -> bool:
        """Return if datasets of this type can be in calibration collections.

        Returns
        -------
        flag : `bool`
            `True` if datasets of this type may be included in calibration
            collections.
        """
        return self._isCalibration

    @staticmethod
    def splitDatasetTypeName(datasetTypeName: str) -> tuple[str, str | None]:
        """Return the root name and the component from a composite name.

        Parameters
        ----------
        datasetTypeName : `str`
            The name of the dataset type, can include a component using
            a "."-separator.

        Returns
        -------
        rootName : `str`
            Root name without any components.
        componentName : `str`
            The component if it has been specified, else `None`.

        Notes
        -----
        If the dataset type name is ``a.b.c`` this method will return a
        root name of ``a`` and a component name of ``b.c``.
        """
        comp = None
        root = datasetTypeName
        if "." in root:
            # If there is doubt, the component is after the first "."
            root, comp = root.split(".", maxsplit=1)
        return root, comp
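
    # Illustrative sketch (not part of the original source), following the
    # Notes above (dataset type names are hypothetical):
    #
    #     DatasetType.splitDatasetTypeName("calexp")       # ("calexp", None)
    #     DatasetType.splitDatasetTypeName("calexp.wcs")   # ("calexp", "wcs")
    #     DatasetType.splitDatasetTypeName("a.b.c")        # ("a", "b.c")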

    def nameAndComponent(self) -> tuple[str, str | None]:
        """Return the root name of this dataset type and any component.

        Returns
        -------
        rootName : `str`
            Root name for this `DatasetType` without any components.
        componentName : `str`
            The component if it has been specified, else `None`.
        """
        return self.splitDatasetTypeName(self.name)

    def component(self) -> str | None:
        """Return the component name (if defined).

        Returns
        -------
        comp : `str`
            Name of component part of DatasetType name. `None` if this
            `DatasetType` is not associated with a component.
        """
        _, comp = self.nameAndComponent()
        return comp

    def componentTypeName(self, component: str) -> str:
        """Derive a component dataset type from a composite.

        Parameters
        ----------
        component : `str`
            Name of component.

        Returns
        -------
        derived : `str`
            Compound name of this `DatasetType` and the component.

        Raises
        ------
        KeyError
            Requested component is not supported by this `DatasetType`.
        """
        if component in self.storageClass.allComponents():
            return self.nameWithComponent(self.name, component)
        raise KeyError(f"Requested component ({component}) not understood by this DatasetType ({self})")

    def makeCompositeDatasetType(self) -> DatasetType:
        """Return a composite dataset type from the component.

        Returns
        -------
        composite : `DatasetType`
            The composite dataset type.

        Raises
        ------
        RuntimeError
            Raised if this dataset type is not a component dataset type.
        """
        if not self.isComponent():
            raise RuntimeError(f"DatasetType {self.name} must be a component to form the composite")
        composite_name, _ = self.nameAndComponent()
        if self.parentStorageClass is None:
            raise ValueError(
                f"Parent storage class is not set. Unable to create composite type from {self.name}"
            )
        return DatasetType(
            composite_name,
            dimensions=self.dimensions,
            storageClass=self.parentStorageClass,
            isCalibration=self.isCalibration(),
        )

    def makeComponentDatasetType(self, component: str) -> DatasetType:
        """Return a component dataset type from a composite.

        Assumes the same dimensions as the parent.

        Parameters
        ----------
        component : `str`
            Name of component.

        Returns
        -------
        datasetType : `DatasetType`
            A new DatasetType instance.
        """
        # The component could be a read/write or read component.
        return DatasetType(
            self.componentTypeName(component),
            dimensions=self.dimensions,
            storageClass=self.storageClass.allComponents()[component],
            parentStorageClass=self.storageClass,
            isCalibration=self.isCalibration(),
        )
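
    # Illustrative sketch (not part of the original source): deriving a
    # component dataset type and recovering its composite, using the ``pvi``
    # example from the constructor sketch above ("wcs" is a hypothetical
    # component of its storage class).
    #
    #     wcs_type = pvi.makeComponentDatasetType("wcs")
    #     wcs_type.name                                # "pvi.wcs"
    #     wcs_type.makeCompositeDatasetType() == pvi   # True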

    def makeAllComponentDatasetTypes(self) -> list[DatasetType]:
        """Return all component dataset types for this composite.

        Returns
        -------
        all : `list` of `DatasetType`
            All the component dataset types. If this is not a composite
            then returns an empty list.
        """
        return [
            self.makeComponentDatasetType(componentName)
            for componentName in self.storageClass.allComponents()
        ]

    def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetType:
        """Create a new `DatasetType` from this one but with an updated
        `StorageClass`.

        Parameters
        ----------
        storageClass : `str` or `StorageClass`
            The new storage class.

        Returns
        -------
        modified : `DatasetType`
            A dataset type that is the same as the current one but with a
            different storage class. Will be ``self`` if the given storage
            class is the current one.

        Notes
        -----
        If this is a component dataset type, the parent storage class will be
        retained.
        """
        if storageClass == self._storageClassName or storageClass == self._storageClass:
            return self
        parent = self._parentStorageClass if self._parentStorageClass else self._parentStorageClassName
        new = DatasetType(
            self.name,
            dimensions=self.dimensions,
            storageClass=storageClass,
            parentStorageClass=parent,
            isCalibration=self.isCalibration(),
        )
        # Check validity.
        if new.is_compatible_with(self) or self.is_compatible_with(new):
            return new
        raise ValueError(
            f"The new storage class ({new.storageClass}) is not compatible with the "
            f"existing storage class ({self.storageClass})."
        )

    def isComponent(self) -> bool:
        """Return whether this `DatasetType` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetType` is a component, `False` otherwise.
        """
        if self.component():
            return True
        return False

    def isComposite(self) -> bool:
        """Return whether this `DatasetType` is a composite.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetType` is a composite type, `False`
            otherwise.
        """
        return self.storageClass.isComposite()

    def _lookupNames(self) -> tuple[LookupKey, ...]:
        """Return name keys to use for lookups in configurations.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If the name includes a component the name with the component
            is first, then the name without the component and finally
            the storage class name and the storage class name of the
            composite.
        """
        rootName, componentName = self.nameAndComponent()
        lookups: tuple[LookupKey, ...] = (LookupKey(name=self.name),)
        if componentName is not None:
            lookups = lookups + (LookupKey(name=rootName),)

        if self.dimensions:
            # Dimensions are a lower priority than dataset type name.
            lookups = lookups + (LookupKey(dimensions=self.dimensions),)

        storageClasses = self.storageClass._lookupNames()
        if componentName is not None and self.parentStorageClass is not None:
            storageClasses += self.parentStorageClass._lookupNames()

        return lookups + storageClasses
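
    # Illustrative sketch (not part of the original source): for a
    # hypothetical component type "pvi.wcs" the priority order would be
    # roughly:
    #
    #     LookupKey(name="pvi.wcs")      # full name first
    #     LookupKey(name="pvi")          # then the composite name
    #     LookupKey(dimensions=...)      # then the dimensions
    #     ...                            # then the storage class lookup names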

    def to_simple(self, minimal: bool = False) -> SerializedDatasetType:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetType`
            The object converted to a class suitable for serialization.
        """
        as_dict: dict[str, Any]
        if minimal:
            # Only needs the name.
            as_dict = {"name": self.name}
        else:
            # Convert to a dict form.
            as_dict = {
                "name": self.name,
                "storageClass": self._storageClassName,
                "isCalibration": self._isCalibration,
                "dimensions": self.dimensions.to_simple(),
            }

            if self._parentStorageClassName is not None:
                as_dict["parentStorageClass"] = self._parentStorageClassName
        return SerializedDatasetType(**as_dict)

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatasetType,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
    ) -> DatasetType:
        """Construct a new object from the simplified form.

        This is usually data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetType`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions of which this graph
            will be a subset. Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert simple name of a DatasetType to
            a full `DatasetType`. Can be `None` if a full description of
            the type is provided along with a universe.

        Returns
        -------
        datasetType : `DatasetType`
            Newly-constructed object.
        """
        # Check to see if there is a cache and, if there is, whether it
        # holds a cached dataset type.
        cache = PersistenceContextVars.loadedTypes.get()
        key = (simple.name, simple.storageClass or "")
        if cache is not None and (type_ := cache.get(key, None)) is not None:
            return type_

        if simple.storageClass is None:
            # Treat this as minimalist representation.
            if registry is None:
                raise ValueError(
                    f"Unable to convert a DatasetType name '{simple}' to DatasetType without a Registry"
                )
            return registry.getDatasetType(simple.name)

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            # registry should not be none by now but test helps mypy.
            universe = registry.dimensions

        if universe is None:
            # This is for mypy.
            raise ValueError("Unable to determine a usable universe")

        if simple.dimensions is None:
            # mypy hint.
            raise ValueError(f"Dimensions must be specified in {simple}")

        newType = cls(
            name=simple.name,
            dimensions=DimensionGraph.from_simple(simple.dimensions, universe=universe),
            storageClass=simple.storageClass,
            isCalibration=simple.isCalibration,
            parentStorageClass=simple.parentStorageClass,
            universe=universe,
        )
        if cache is not None:
            cache[key] = newType
        return newType

    to_json = to_json_pydantic
    from_json: ClassVar = classmethod(from_json_pydantic)
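
    # Illustrative sketch (not part of the original source): a serialization
    # round trip, assuming ``universe`` from an initialized repository and
    # that ``from_json`` accepts the same ``universe``/``registry`` keywords
    # as `from_simple`.
    #
    #     simple = pvi.to_simple()
    #     DatasetType.from_simple(simple, universe=universe) == pvi   # True
    #
    #     json_str = pvi.to_json()
    #     DatasetType.from_json(json_str, universe=universe) == pvi   # True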

    def __reduce__(
        self,
    ) -> tuple[
        Callable, tuple[type[DatasetType], tuple[str, DimensionGraph, str, str | None], dict[str, bool]]
    ]:
        """Support pickling.

        StorageClass instances can not normally be pickled, so we pickle
        StorageClass name instead of instance.
        """
        return _unpickle_via_factory, (
            self.__class__,
            (self.name, self.dimensions, self._storageClassName, self._parentStorageClassName),
            {"isCalibration": self._isCalibration},
        )

    def __deepcopy__(self, memo: Any) -> DatasetType:
        """Support for deep copy method.

        Normally ``deepcopy`` will use the pickle mechanism to make copies.
        We want to avoid that to support the (possibly degenerate) use case
        where a DatasetType is constructed with a StorageClass instance that
        is not registered with the StorageClassFactory (this happens in unit
        tests). Instead we re-implement the ``__deepcopy__`` method.
        """
        return DatasetType(
            name=deepcopy(self.name, memo),
            dimensions=deepcopy(self.dimensions, memo),
            storageClass=deepcopy(self._storageClass or self._storageClassName, memo),
            parentStorageClass=deepcopy(self._parentStorageClass or self._parentStorageClassName, memo),
            isCalibration=deepcopy(self._isCalibration, memo),
        )


def _unpickle_via_factory(factory: Callable, args: Any, kwargs: Any) -> DatasetType:
    """Unpickle something by calling a factory.

    Allows subclasses to unpickle using `__reduce__` with keyword
    arguments as well as positional arguments.
    """
    return factory(*args, **kwargs)
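
# Illustrative sketch (not part of the original source): pickling round-trips
# through ``_unpickle_via_factory``, rebuilding the storage class from its
# name on load (``pvi`` as in the constructor sketch above).
#
#     import pickle
#     assert pickle.loads(pickle.dumps(pvi)) == pvi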