Coverage for python/lsst/daf/butler/core/datasets/type.py: 24%
243 statements
coverage.py v7.2.7, created at 2023-07-14 19:21 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["DatasetType", "SerializedDatasetType"]

import re
from collections.abc import Callable, Iterable, Mapping
from copy import deepcopy
from types import MappingProxyType
from typing import TYPE_CHECKING, Any, ClassVar

try:
    from pydantic.v1 import BaseModel, StrictBool, StrictStr
except ModuleNotFoundError:
    from pydantic import BaseModel, StrictBool, StrictStr  # type: ignore

from ..configSupport import LookupKey
from ..dimensions import DimensionGraph, SerializedDimensionGraph
from ..json import from_json_pydantic, to_json_pydantic
from ..persistenceContext import PersistenceContextVars
from ..storageClass import StorageClass, StorageClassFactory

if TYPE_CHECKING:
    from ...registry import Registry
    from ..dimensions import Dimension, DimensionUniverse


def _safeMakeMappingProxyType(data: Mapping | None) -> Mapping:
    """Return a read-only view of the given mapping, substituting an empty
    mapping for `None`.
    """
    if data is None:
        data = {}
    return MappingProxyType(data)


class SerializedDatasetType(BaseModel):
    """Simplified model of a `DatasetType` suitable for serialization."""

    name: StrictStr
    storageClass: StrictStr | None = None
    dimensions: SerializedDimensionGraph | None = None
    parentStorageClass: StrictStr | None = None
    isCalibration: StrictBool = False

    @classmethod
    def direct(
        cls,
        *,
        name: str,
        storageClass: str | None = None,
        dimensions: dict | None = None,
        parentStorageClass: str | None = None,
        isCalibration: bool = False,
    ) -> SerializedDatasetType:
        """Construct a `SerializedDatasetType` directly without validators.

        This differs from Pydantic's ``construct`` method in that the
        arguments are explicitly what the model requires, and it will
        recurse through members, constructing them from their corresponding
        `direct` methods.

        This method should only be called when the inputs are trusted.
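
        Examples
        --------
        A hypothetical sketch of trusted, validator-free construction (the
        dataset type and storage class names here are illustrative)::

            serialized = SerializedDatasetType.direct(
                name="calexp",
                storageClass="ExposureF",
            )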
80 """
81 cache = PersistenceContextVars.serializedDatasetTypeMapping.get()
82 key = (name, storageClass or "")
83 if cache is not None and (type_ := cache.get(key, None)) is not None:
84 return type_
85 node = SerializedDatasetType.__new__(cls)
86 setter = object.__setattr__
87 setter(node, "name", name)
88 setter(node, "storageClass", storageClass)
89 setter(
90 node,
91 "dimensions",
92 dimensions if dimensions is None else SerializedDimensionGraph.direct(**dimensions),
93 )
94 setter(node, "parentStorageClass", parentStorageClass)
95 setter(node, "isCalibration", isCalibration)
96 setter(
97 node,
98 "__fields_set__",
99 {"name", "storageClass", "dimensions", "parentStorageClass", "isCalibration"},
100 )
101 if cache is not None:
102 cache[key] = node
103 return node


class DatasetType:
    r"""A named category of Datasets.

    Defines how they are organized, related, and stored.

    A concrete, final class whose instances represent `DatasetType`\ s.
    `DatasetType` instances may be constructed without a `Registry`, but
    they must be registered via `Registry.registerDatasetType()` before
    corresponding Datasets may be added.  `DatasetType` instances are
    immutable.

    Parameters
    ----------
    name : `str`
        A string name for the Dataset; must correspond to the same
        `DatasetType` across all Registries.  Names must start with an
        upper or lowercase letter, and may contain only letters, numbers,
        and underscores.  Component dataset types should contain a single
        period separating the base dataset type name from the component name
        (and may be recursive).
    dimensions : `DimensionGraph` or iterable of `Dimension` or `str`
        Dimensions used to label and relate instances of this `DatasetType`.
        If not a `DimensionGraph`, ``universe`` must be provided as well.
    storageClass : `StorageClass` or `str`
        Instance of a `StorageClass` or name of `StorageClass` that defines
        how this `DatasetType` is persisted.
    parentStorageClass : `StorageClass` or `str`, optional
        Instance of a `StorageClass` or name of `StorageClass` that defines
        how the composite parent is persisted.  Must be `None` if this
        is not a component.
    universe : `DimensionUniverse`, optional
        Set of all known dimensions, used to normalize ``dimensions`` if it
        is not already a `DimensionGraph`.
    isCalibration : `bool`, optional
        If `True`, this dataset type may be included in
        `~CollectionType.CALIBRATION` collections.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
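
    Examples
    --------
    A minimal sketch (``registry`` is assumed to be an existing `Registry`;
    the dataset type name, dimension names, and storage class name are
    illustrative)::

        datasetType = DatasetType(
            "calexp",
            dimensions={"instrument", "visit", "detector"},
            storageClass="ExposureF",
            universe=registry.dimensions,
        )
        registry.registerDatasetType(datasetType)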
147 """
149 __slots__ = (
150 "_name",
151 "_dimensions",
152 "_storageClass",
153 "_storageClassName",
154 "_parentStorageClass",
155 "_parentStorageClassName",
156 "_isCalibration",
157 )
159 _serializedType = SerializedDatasetType
161 VALID_NAME_REGEX = re.compile("^[a-zA-Z_][a-zA-Z0-9_]*(\\.[a-zA-Z_][a-zA-Z0-9_]*)*$")
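    # For example (illustrative names): "calexp" and "calexp.wcs" match
    # this pattern, while "2fast" (leading digit) and "bad-name" (hyphen)
    # do not.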

    @staticmethod
    def nameWithComponent(datasetTypeName: str, componentName: str) -> str:
        """Form a valid DatasetTypeName from a parent and component.

        No validation is performed.

        Parameters
        ----------
        datasetTypeName : `str`
            Base type name.
        componentName : `str`
            Name of component.

        Returns
        -------
        compTypeName : `str`
            Name to use for component DatasetType.
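
        Examples
        --------
        The component name is joined to the parent name with a period
        (the names here are illustrative):

        >>> DatasetType.nameWithComponent("coadd", "psf")
        'coadd.psf'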
180 """
181 return f"{datasetTypeName}.{componentName}"

    def __init__(
        self,
        name: str,
        dimensions: DimensionGraph | Iterable[Dimension | str],
        storageClass: StorageClass | str,
        parentStorageClass: StorageClass | str | None = None,
        *,
        universe: DimensionUniverse | None = None,
        isCalibration: bool = False,
    ):
        if self.VALID_NAME_REGEX.match(name) is None:
            raise ValueError(f"DatasetType name '{name}' is invalid.")
        self._name = name
        if not isinstance(dimensions, DimensionGraph):
            if universe is None:
                raise ValueError(
                    "If dimensions is not a normalized DimensionGraph, a universe must be provided."
                )
            dimensions = universe.extract(dimensions)
        self._dimensions = dimensions
        if name in self._dimensions.universe.getGovernorDimensions().names:
            raise ValueError(f"Governor dimension name {name} cannot be used as a dataset type name.")
        if not isinstance(storageClass, (StorageClass, str)):
            raise ValueError(f"StorageClass argument must be StorageClass or str. Got {storageClass}")
        self._storageClass: StorageClass | None
        if isinstance(storageClass, StorageClass):
            self._storageClass = storageClass
            self._storageClassName = storageClass.name
        else:
            self._storageClass = None
            self._storageClassName = storageClass

        self._parentStorageClass: StorageClass | None = None
        self._parentStorageClassName: str | None = None
        if parentStorageClass is not None:
            if not isinstance(parentStorageClass, (StorageClass, str)):
                raise ValueError(
                    f"Parent StorageClass argument must be StorageClass or str. Got {parentStorageClass}"
                )

            # Only allowed for a component dataset type.
            _, componentName = self.splitDatasetTypeName(self._name)
            if componentName is None:
                raise ValueError(
                    f"Can not specify a parent storage class if this is not a component ({self._name})"
                )
            if isinstance(parentStorageClass, StorageClass):
                self._parentStorageClass = parentStorageClass
                self._parentStorageClassName = parentStorageClass.name
            else:
                self._parentStorageClassName = parentStorageClass

        # Ensure that the parent storage class is specified when we have
        # a component and is not specified when we don't.
        _, componentName = self.splitDatasetTypeName(self._name)
        if parentStorageClass is None and componentName is not None:
            raise ValueError(
                f"Component dataset type '{self._name}' constructed without parent storage class"
            )
        if parentStorageClass is not None and componentName is None:
            raise ValueError(f"Parent storage class specified but {self._name} is not a composite")
        self._isCalibration = isCalibration

    def __repr__(self) -> str:
        extra = ""
        if self._parentStorageClassName:
            extra = f", parentStorageClass={self._parentStorageClassName}"
        if self._isCalibration:
            extra += ", isCalibration=True"
        return f"DatasetType({self.name!r}, {self.dimensions}, {self._storageClassName}{extra})"

    def _equal_ignoring_storage_class(self, other: Any) -> bool:
        """Check everything is equal except the storage class.

        Parameters
        ----------
        other : `Any`
            Object to check against this one.

        Returns
        -------
        mostly : `bool`
            Returns `True` if everything except the storage class is equal.
        """
        if not isinstance(other, type(self)):
            return False
        if self._name != other._name:
            return False
        if self._dimensions != other._dimensions:
            return False
        if self._isCalibration != other._isCalibration:
            return False
        if self._parentStorageClass is not None and other._parentStorageClass is not None:
            return self._parentStorageClass == other._parentStorageClass
        else:
            return self._parentStorageClassName == other._parentStorageClassName

    def __eq__(self, other: Any) -> bool:
        mostly_equal = self._equal_ignoring_storage_class(other)
        if not mostly_equal:
            return False

        # Be careful not to force a storage class to import the
        # corresponding python code.
        if self._storageClass is not None and other._storageClass is not None:
            if self._storageClass != other._storageClass:
                return False
        else:
            if self._storageClassName != other._storageClassName:
                return False
        return True

    def is_compatible_with(self, other: DatasetType) -> bool:
        """Determine if the given `DatasetType` is compatible with this one.

        Compatibility requires a matching name and dimensions and a storage
        class for this dataset type that can convert the python type
        associated with the other storage class to this python type.

        Parameters
        ----------
        other : `DatasetType`
            Dataset type to check.

        Returns
        -------
        is_compatible : `bool`
            Returns `True` if the other dataset type is either the same as
            this or the storage class associated with the other can be
            converted to this.
        """
        mostly_equal = self._equal_ignoring_storage_class(other)
        if not mostly_equal:
            return False

        # If the storage class names match then they are compatible.
        if self._storageClassName == other._storageClassName:
            return True

        # Now required to check the full storage class.
        self_sc = self.storageClass
        other_sc = other.storageClass

        return self_sc.can_convert(other_sc)

    def __hash__(self) -> int:
        """Hash DatasetType instance.

        This only uses the StorageClass name, which is consistent with the
        implementation of the StorageClass hash method.
        """
        return hash((self._name, self._dimensions, self._storageClassName, self._parentStorageClassName))

    def __lt__(self, other: Any) -> bool:
        """Sort using the dataset type name."""
        if not isinstance(other, type(self)):
            return NotImplemented
        return self.name < other.name

    @property
    def name(self) -> str:
        """Return a string name for the Dataset.

        Must correspond to the same `DatasetType` across all Registries.
        """
        return self._name

    @property
    def dimensions(self) -> DimensionGraph:
        r"""Return the `Dimension`\ s for this dataset type.

        The dimensions label and relate instances of this
        `DatasetType` (`DimensionGraph`).
        """
        return self._dimensions

    @property
    def storageClass(self) -> StorageClass:
        """Return `StorageClass` instance associated with this dataset type.

        The `StorageClass` defines how this `DatasetType`
        is persisted.  Note that if the DatasetType was constructed with the
        name of a StorageClass, then Butler has to be initialized before
        using this property.
        """
        if self._storageClass is None:
            self._storageClass = StorageClassFactory().getStorageClass(self._storageClassName)
        return self._storageClass

    @property
    def storageClass_name(self) -> str:
        """Return the storage class name.

        This will never force the storage class to be imported.
        """
        return self._storageClassName

    @property
    def parentStorageClass(self) -> StorageClass | None:
        """Return the storage class of the composite containing this
        component.

        Note that if the DatasetType was constructed with the name of a
        StorageClass, then Butler has to be initialized before using this
        property.  Can be `None` if this is not a component of a composite.
        Must be defined if this is a component.
        """
        if self._parentStorageClass is None and self._parentStorageClassName is None:
            return None
        if self._parentStorageClass is None and self._parentStorageClassName is not None:
            self._parentStorageClass = StorageClassFactory().getStorageClass(self._parentStorageClassName)
        return self._parentStorageClass

    def isCalibration(self) -> bool:
        """Return if datasets of this type can be in calibration collections.

        Returns
        -------
        flag : `bool`
            `True` if datasets of this type may be included in calibration
            collections.
        """
        return self._isCalibration

    @staticmethod
    def splitDatasetTypeName(datasetTypeName: str) -> tuple[str, str | None]:
        """Return the root name and the component from a composite name.

        Parameters
        ----------
        datasetTypeName : `str`
            The name of the dataset type; it can include a component using
            a "."-separator.

        Returns
        -------
        rootName : `str`
            Root name without any components.
        componentName : `str`
            The component if it has been specified, else `None`.

        Notes
        -----
        If the dataset type name is ``a.b.c`` this method will return a
        root name of ``a`` and a component name of ``b.c``.
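
        Examples
        --------
        The split happens at the first period, if one is present:

        >>> DatasetType.splitDatasetTypeName("a.b.c")
        ('a', 'b.c')
        >>> DatasetType.splitDatasetTypeName("a")
        ('a', None)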
427 """
428 comp = None
429 root = datasetTypeName
430 if "." in root:
431 # If there is doubt, the component is after the first "."
432 root, comp = root.split(".", maxsplit=1)
433 return root, comp

    def nameAndComponent(self) -> tuple[str, str | None]:
        """Return the root name of this dataset type and any component.

        Returns
        -------
        rootName : `str`
            Root name for this `DatasetType` without any components.
        componentName : `str`
            The component if it has been specified, else `None`.
        """
        return self.splitDatasetTypeName(self.name)

    def component(self) -> str | None:
        """Return the component name (if defined).

        Returns
        -------
        comp : `str`
            Name of component part of DatasetType name.  `None` if this
            `DatasetType` is not associated with a component.
        """
        _, comp = self.nameAndComponent()
        return comp

    def componentTypeName(self, component: str) -> str:
        """Derive a component dataset type from a composite.

        Parameters
        ----------
        component : `str`
            Name of component.

        Returns
        -------
        derived : `str`
            Compound name of this `DatasetType` and the component.

        Raises
        ------
        KeyError
            Requested component is not supported by this `DatasetType`.
        """
        if component in self.storageClass.allComponents():
            return self.nameWithComponent(self.name, component)
        raise KeyError(f"Requested component ({component}) not understood by this DatasetType ({self})")

    def makeCompositeDatasetType(self) -> DatasetType:
        """Return a composite dataset type from the component.

        Returns
        -------
        composite : `DatasetType`
            The composite dataset type.

        Raises
        ------
        RuntimeError
            Raised if this dataset type is not a component dataset type.
        """
        if not self.isComponent():
            raise RuntimeError(f"DatasetType {self.name} must be a component to form the composite")
        composite_name, _ = self.nameAndComponent()
        if self.parentStorageClass is None:
            raise ValueError(
                f"Parent storage class is not set. Unable to create composite type from {self.name}"
            )
        return DatasetType(
            composite_name,
            dimensions=self.dimensions,
            storageClass=self.parentStorageClass,
            isCalibration=self.isCalibration(),
        )

    def makeComponentDatasetType(self, component: str) -> DatasetType:
        """Return a component dataset type from a composite.

        Assumes the same dimensions as the parent.

        Parameters
        ----------
        component : `str`
            Name of component.

        Returns
        -------
        datasetType : `DatasetType`
            A new DatasetType instance.
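
        Examples
        --------
        A hypothetical sketch (``parent`` is assumed to be a composite
        `DatasetType` whose storage class defines a ``psf`` component)::

            component = parent.makeComponentDatasetType("psf")
            assert component.makeCompositeDatasetType() == parent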
522 """
523 # The component could be a read/write or read component
524 return DatasetType(
525 self.componentTypeName(component),
526 dimensions=self.dimensions,
527 storageClass=self.storageClass.allComponents()[component],
528 parentStorageClass=self.storageClass,
529 isCalibration=self.isCalibration(),
530 )

    def makeAllComponentDatasetTypes(self) -> list[DatasetType]:
        """Return all component dataset types for this composite.

        Returns
        -------
        all : `list` of `DatasetType`
            All the component dataset types.  If this is not a composite
            then returns an empty list.
        """
        return [
            self.makeComponentDatasetType(componentName)
            for componentName in self.storageClass.allComponents()
        ]

    def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetType:
        """Create a new `DatasetType` from this one but with an updated
        `StorageClass`.

        Parameters
        ----------
        storageClass : `str` or `StorageClass`
            The new storage class.

        Returns
        -------
        modified : `DatasetType`
            A dataset type that is the same as the current one but with a
            different storage class.  Will be ``self`` if the given storage
            class is the current one.

        Notes
        -----
        If this is a component dataset type, the parent storage class will be
        retained.
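
        Examples
        --------
        A hedged sketch (``datasetType`` is assumed to exist and
        ``"NewStorageClass"`` is a placeholder name; the two storage classes
        must be convertible for this to succeed)::

            new_type = datasetType.overrideStorageClass("NewStorageClass")
            assert new_type.name == datasetType.name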
566 """
567 if storageClass == self._storageClassName or storageClass == self._storageClass:
568 return self
569 parent = self._parentStorageClass if self._parentStorageClass else self._parentStorageClassName
570 new = DatasetType(
571 self.name,
572 dimensions=self.dimensions,
573 storageClass=storageClass,
574 parentStorageClass=parent,
575 isCalibration=self.isCalibration(),
576 )
577 # Check validity.
578 if new.is_compatible_with(self) or self.is_compatible_with(new):
579 return new
580 raise ValueError(
581 f"The new storage class ({new.storageClass}) is not compatible with the "
582 f"existing storage class ({self.storageClass})."
583 )

    def isComponent(self) -> bool:
        """Return whether this `DatasetType` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetType` is a component, `False` otherwise.
        """
        if self.component():
            return True
        return False

    def isComposite(self) -> bool:
        """Return whether this `DatasetType` is a composite.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetType` is a composite type, `False`
            otherwise.
        """
        return self.storageClass.isComposite()

    def _lookupNames(self) -> tuple[LookupKey, ...]:
        """Return name keys to use for lookups in configurations.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If the name includes a component the name with the component
            is first, then the name without the component and finally
            the storage class name and the storage class name of the
            composite.
        """
        rootName, componentName = self.nameAndComponent()
        lookups: tuple[LookupKey, ...] = (LookupKey(name=self.name),)
        if componentName is not None:
            lookups = lookups + (LookupKey(name=rootName),)

        if self.dimensions:
            # Dimensions are a lower priority than dataset type name.
            lookups = lookups + (LookupKey(dimensions=self.dimensions),)

        storageClasses = self.storageClass._lookupNames()
        if componentName is not None and self.parentStorageClass is not None:
            storageClasses += self.parentStorageClass._lookupNames()

        return lookups + storageClasses

    def to_simple(self, minimal: bool = False) -> SerializedDatasetType:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization.  Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetType`
            The object converted to a class suitable for serialization.
        """
        as_dict: dict[str, Any]
        if minimal:
            # Only needs the name.
            as_dict = {"name": self.name}
        else:
            # Convert to a dict form.
            as_dict = {
                "name": self.name,
                "storageClass": self._storageClassName,
                "isCalibration": self._isCalibration,
                "dimensions": self.dimensions.to_simple(),
            }

        if self._parentStorageClassName is not None:
            as_dict["parentStorageClass"] = self._parentStorageClassName
        return SerializedDatasetType(**as_dict)

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatasetType,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
    ) -> DatasetType:
        """Construct a new object from the simplified form.

        This is usually data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetType`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions of which this graph
            will be a subset.  Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple name of a DatasetType to
            a full `DatasetType`.  Can be `None` if a full description of
            the type is provided along with a universe.

        Returns
        -------
        datasetType : `DatasetType`
            Newly-constructed object.
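
        Examples
        --------
        A hedged round-trip sketch (``datasetType`` and ``universe`` are
        assumed to exist)::

            simple = datasetType.to_simple()
            restored = DatasetType.from_simple(simple, universe=universe)
            assert restored == datasetType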
697 """
698 # check to see if there is a cache, and if there is, if there is a
699 # cached dataset type
700 cache = PersistenceContextVars.loadedTypes.get()
701 key = (simple.name, simple.storageClass or "")
702 if cache is not None and (type_ := cache.get(key, None)) is not None:
703 return type_
705 if simple.storageClass is None:
706 # Treat this as minimalist representation
707 if registry is None:
708 raise ValueError(
709 f"Unable to convert a DatasetType name '{simple}' to DatasetType without a Registry"
710 )
711 return registry.getDatasetType(simple.name)
713 if universe is None and registry is None:
714 raise ValueError("One of universe or registry must be provided.")
716 if universe is None and registry is not None:
717 # registry should not be none by now but test helps mypy
718 universe = registry.dimensions
720 if universe is None:
721 # this is for mypy
722 raise ValueError("Unable to determine a usable universe")
724 if simple.dimensions is None:
725 # mypy hint
726 raise ValueError(f"Dimensions must be specified in {simple}")
728 newType = cls(
729 name=simple.name,
730 dimensions=DimensionGraph.from_simple(simple.dimensions, universe=universe),
731 storageClass=simple.storageClass,
732 isCalibration=simple.isCalibration,
733 parentStorageClass=simple.parentStorageClass,
734 universe=universe,
735 )
736 if cache is not None:
737 cache[key] = newType
738 return newType

    to_json = to_json_pydantic
    from_json: ClassVar = classmethod(from_json_pydantic)

    def __reduce__(
        self,
    ) -> tuple[
        Callable, tuple[type[DatasetType], tuple[str, DimensionGraph, str, str | None], dict[str, bool]]
    ]:
        """Support pickling.

        StorageClass instances can not normally be pickled, so we pickle
        the StorageClass name instead of the instance.
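
        A hedged sketch of the resulting round trip (``datasetType`` is
        assumed to be an existing `DatasetType`)::

            import pickle

            assert pickle.loads(pickle.dumps(datasetType)) == datasetType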
752 """
753 return _unpickle_via_factory, (
754 self.__class__,
755 (self.name, self.dimensions, self._storageClassName, self._parentStorageClassName),
756 {"isCalibration": self._isCalibration},
757 )

    def __deepcopy__(self, memo: Any) -> DatasetType:
        """Support for the deep copy method.

        Normally ``deepcopy`` will use the pickle mechanism to make copies.
        We want to avoid that to support the (possibly degenerate) use case
        where a DatasetType is constructed with a StorageClass instance that
        is not registered with the StorageClassFactory (this happens in unit
        tests).  Instead we re-implement the ``__deepcopy__`` method.
        """
        return DatasetType(
            name=deepcopy(self.name, memo),
            dimensions=deepcopy(self.dimensions, memo),
            storageClass=deepcopy(self._storageClass or self._storageClassName, memo),
            parentStorageClass=deepcopy(self._parentStorageClass or self._parentStorageClassName, memo),
            isCalibration=deepcopy(self._isCalibration, memo),
        )


def _unpickle_via_factory(factory: Callable, args: Any, kwargs: Any) -> DatasetType:
    """Unpickle something by calling a factory.

    Allows subclasses to unpickle using `__reduce__` with keyword
    arguments as well as positional arguments.
    """
    return factory(*args, **kwargs)