Coverage for python/lsst/daf/butler/core/datasets/type.py: 20% (226 statements)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["DatasetType", "SerializedDatasetType"]

import re
from collections.abc import Callable, Iterable, Mapping
from copy import deepcopy
from types import MappingProxyType
from typing import TYPE_CHECKING, Any, ClassVar

from pydantic import BaseModel, StrictBool, StrictStr

from ..configSupport import LookupKey
from ..dimensions import DimensionGraph, SerializedDimensionGraph
from ..json import from_json_pydantic, to_json_pydantic
from ..storageClass import StorageClass, StorageClassFactory

if TYPE_CHECKING:
    from ...registry import Registry
    from ..dimensions import Dimension, DimensionUniverse


def _safeMakeMappingProxyType(data: Mapping | None) -> Mapping:
    if data is None:
        data = {}
    return MappingProxyType(data)
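
# Illustrative behaviour of the helper above (the values are arbitrary):
# MappingProxyType gives a read-only view, and None becomes an empty mapping.
#
#     >>> _safeMakeMappingProxyType({"a": 1})["a"]
#     1
#     >>> len(_safeMakeMappingProxyType(None))
#     0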


class SerializedDatasetType(BaseModel):
    """Simplified model of a `DatasetType` suitable for serialization."""

    name: StrictStr
    storageClass: StrictStr | None = None
    dimensions: SerializedDimensionGraph | None = None
    parentStorageClass: StrictStr | None = None
    isCalibration: StrictBool = False

    @classmethod
    def direct(
        cls,
        *,
        name: str,
        storageClass: str | None = None,
        dimensions: dict | None = None,
        parentStorageClass: str | None = None,
        isCalibration: bool = False,
    ) -> SerializedDatasetType:
69 """Construct a `SerializedDatasetType` directly without validators.
71 This differs from PyDantics construct method in that the arguments are
72 explicitly what the model requires, and it will recurse through
73 members, constructing them from their corresponding `direct` methods.
75 This method should only be called when the inputs are trusted.
76 """
        node = SerializedDatasetType.__new__(cls)
        setter = object.__setattr__
        setter(node, "name", name)
        setter(node, "storageClass", storageClass)
        setter(
            node,
            "dimensions",
            dimensions if dimensions is None else SerializedDimensionGraph.direct(**dimensions),
        )
        setter(node, "parentStorageClass", parentStorageClass)
        setter(node, "isCalibration", isCalibration)
        setter(
            node,
            "__fields_set__",
            {"name", "storageClass", "dimensions", "parentStorageClass", "isCalibration"},
        )
        return node
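
    # A minimal sketch of trusted-input construction; the name and storage
    # class strings here are illustrative:
    #
    #     serialized = SerializedDatasetType.direct(
    #         name="calexp",
    #         storageClass="ExposureF",
    #         isCalibration=False,
    #     )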


class DatasetType:
    r"""A named category of Datasets.

    Defines how they are organized, related, and stored.

    A concrete, final class whose instances represent `DatasetType`\ s.
    `DatasetType` instances may be constructed without a `Registry`,
    but they must be registered via `Registry.registerDatasetType()`
    before corresponding Datasets may be added.
    `DatasetType` instances are immutable.

    Parameters
    ----------
    name : `str`
        A string name for the Dataset; must correspond to the same
        `DatasetType` across all Registries.  Names must start with an
        upper or lowercase letter, and may contain only letters, numbers,
        and underscores.  Component dataset types should contain a single
        period separating the base dataset type name from the component name
        (and may be recursive).
    dimensions : `DimensionGraph` or iterable of `Dimension` or `str`
        Dimensions used to label and relate instances of this `DatasetType`.
        If not a `DimensionGraph`, ``universe`` must be provided as well.
    storageClass : `StorageClass` or `str`
        Instance of a `StorageClass` or name of `StorageClass` that defines
        how this `DatasetType` is persisted.
    parentStorageClass : `StorageClass` or `str`, optional
        Instance of a `StorageClass` or name of `StorageClass` that defines
        how the composite parent is persisted.  Must be `None` if this
        is not a component.
    universe : `DimensionUniverse`, optional
        Set of all known dimensions, used to normalize ``dimensions`` if it
        is not already a `DimensionGraph`.
    isCalibration : `bool`, optional
        If `True`, this dataset type may be included in
        `~CollectionType.CALIBRATION` collections.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
    """

    __slots__ = (
        "_name",
        "_dimensions",
        "_storageClass",
        "_storageClassName",
        "_parentStorageClass",
        "_parentStorageClassName",
        "_isCalibration",
    )

    _serializedType = SerializedDatasetType

    VALID_NAME_REGEX = re.compile("^[a-zA-Z_][a-zA-Z0-9_]*(\\.[a-zA-Z_][a-zA-Z0-9_]*)*$")
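
    # Illustrative matches for the pattern above (the names are made up):
    # each "."-separated term must begin with a letter or underscore, so
    # "calexp", "deepCoadd_skyMap", and "calexp.wcs" are valid, while
    # "2calexp" and "bad-name" are not.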

    @staticmethod
    def nameWithComponent(datasetTypeName: str, componentName: str) -> str:
        """Form a valid DatasetTypeName from a parent and component.

        No validation is performed.

        Parameters
        ----------
        datasetTypeName : `str`
            Base type name.
        componentName : `str`
            Name of component.

        Returns
        -------
        compTypeName : `str`
            Name to use for component DatasetType.
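
        Examples
        --------
        Pure string concatenation, so any names work (these are
        illustrative):

        >>> DatasetType.nameWithComponent("calexp", "wcs")
        'calexp.wcs'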
170 """
171 return f"{datasetTypeName}.{componentName}"

    def __init__(
        self,
        name: str,
        dimensions: DimensionGraph | Iterable[Dimension | str],
        storageClass: StorageClass | str,
        parentStorageClass: StorageClass | str | None = None,
        *,
        universe: DimensionUniverse | None = None,
        isCalibration: bool = False,
    ):
        if self.VALID_NAME_REGEX.match(name) is None:
            raise ValueError(f"DatasetType name '{name}' is invalid.")
        self._name = name
        if not isinstance(dimensions, DimensionGraph):
            if universe is None:
                raise ValueError(
                    "If dimensions is not a normalized DimensionGraph, a universe must be provided."
                )
            dimensions = universe.extract(dimensions)
        self._dimensions = dimensions
        if name in self._dimensions.universe.getGovernorDimensions().names:
            raise ValueError(f"Governor dimension name {name} cannot be used as a dataset type name.")
        if not isinstance(storageClass, (StorageClass, str)):
            raise ValueError(f"StorageClass argument must be StorageClass or str. Got {storageClass}")
        self._storageClass: StorageClass | None
        if isinstance(storageClass, StorageClass):
            self._storageClass = storageClass
            self._storageClassName = storageClass.name
        else:
            self._storageClass = None
            self._storageClassName = storageClass

        self._parentStorageClass: StorageClass | None = None
        self._parentStorageClassName: str | None = None
        if parentStorageClass is not None:
            if not isinstance(parentStorageClass, (StorageClass, str)):
                raise ValueError(
                    f"Parent StorageClass argument must be StorageClass or str. Got {parentStorageClass}"
                )

            # Only allowed for a component dataset type
            _, componentName = self.splitDatasetTypeName(self._name)
            if componentName is None:
                raise ValueError(
                    f"Can not specify a parent storage class if this is not a component ({self._name})"
                )
            if isinstance(parentStorageClass, StorageClass):
                self._parentStorageClass = parentStorageClass
                self._parentStorageClassName = parentStorageClass.name
            else:
                self._parentStorageClassName = parentStorageClass

        # Ensure that parent storage class is specified when we have
        # a component and is not specified when we don't
        _, componentName = self.splitDatasetTypeName(self._name)
        if parentStorageClass is None and componentName is not None:
            raise ValueError(
                f"Component dataset type '{self._name}' constructed without parent storage class"
            )
        if parentStorageClass is not None and componentName is None:
            raise ValueError(
                f"Parent storage class specified but dataset type {self._name} is not a component"
            )
        self._isCalibration = isCalibration
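
    # A minimal construction sketch, assuming ``butler`` is an initialized
    # Butler; the dimension and storage class names are illustrative and
    # must exist in the repository's universe and StorageClassFactory:
    #
    #     datasetType = DatasetType(
    #         "calexp",
    #         dimensions=["instrument", "visit", "detector"],
    #         storageClass="ExposureF",
    #         universe=butler.dimensions,
    #     )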

    def __repr__(self) -> str:
        extra = ""
        if self._parentStorageClassName:
            extra = f", parentStorageClass={self._parentStorageClassName}"
        if self._isCalibration:
            extra += ", isCalibration=True"
        return f"DatasetType({self.name!r}, {self.dimensions}, {self._storageClassName}{extra})"

    def _equal_ignoring_storage_class(self, other: Any) -> bool:
        """Check everything is equal except the storage class.

        Parameters
        ----------
        other : Any
            Object to check against this one.

        Returns
        -------
        mostly : `bool`
            Returns `True` if everything except the storage class is equal.
        """
        if not isinstance(other, type(self)):
            return False
        if self._name != other._name:
            return False
        if self._dimensions != other._dimensions:
            return False
        if self._isCalibration != other._isCalibration:
            return False
        if self._parentStorageClass is not None and other._parentStorageClass is not None:
            return self._parentStorageClass == other._parentStorageClass
        else:
            return self._parentStorageClassName == other._parentStorageClassName

    def __eq__(self, other: Any) -> bool:
        mostly_equal = self._equal_ignoring_storage_class(other)
        if not mostly_equal:
            return False

        # Be careful not to force a storage class to import the corresponding
        # python code.
        if self._storageClass is not None and other._storageClass is not None:
            if self._storageClass != other._storageClass:
                return False
        else:
            if self._storageClassName != other._storageClassName:
                return False
        return True

    def is_compatible_with(self, other: DatasetType) -> bool:
        """Determine if the given `DatasetType` is compatible with this one.

        Compatibility requires a matching name and dimensions, and a storage
        class for this dataset type that can convert the Python type
        associated with the other dataset type's storage class to this one's
        Python type.

        Parameters
        ----------
        other : `DatasetType`
            Dataset type to check.

        Returns
        -------
        is_compatible : `bool`
            Returns `True` if the other dataset type is either the same as
            this one or the storage class associated with the other can be
            converted to this one.
        """
        mostly_equal = self._equal_ignoring_storage_class(other)
        if not mostly_equal:
            return False

        # If the storage class names match then they are compatible.
        if self._storageClassName == other._storageClassName:
            return True

        # Now required to check the full storage class.
        self_sc = self.storageClass
        other_sc = other.storageClass

        return self_sc.can_convert(other_sc)
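
    # A sketch of the asymmetry, assuming ``dsA`` and ``dsB`` are
    # hypothetical DatasetType instances sharing a name and dimensions but
    # using different storage classes:
    #
    #     dsA.is_compatible_with(dsB)  # True only if dsA's storage class
    #                                  # can convert from dsB's Python type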

    def __hash__(self) -> int:
        """Hash DatasetType instance.

        This only uses the StorageClass name, which is consistent with the
        implementation of the StorageClass hash method.
        """
        return hash((self._name, self._dimensions, self._storageClassName, self._parentStorageClassName))

    def __lt__(self, other: Any) -> bool:
        """Sort using the dataset type name."""
        if not isinstance(other, type(self)):
            return NotImplemented
        return self.name < other.name

    @property
    def name(self) -> str:
        """Return a string name for the Dataset.

        Must correspond to the same `DatasetType` across all Registries.
        """
        return self._name

    @property
    def dimensions(self) -> DimensionGraph:
        r"""Return the `Dimension`\ s for this dataset type.

        The dimensions label and relate instances of this
        `DatasetType` (`DimensionGraph`).
        """
        return self._dimensions

    @property
    def storageClass(self) -> StorageClass:
        """Return `StorageClass` instance associated with this dataset type.

        The `StorageClass` defines how this `DatasetType` is persisted.
        Note that if the DatasetType was constructed with only the name of a
        StorageClass, then Butler has to be initialized before using this
        property.
        """
        if self._storageClass is None:
            self._storageClass = StorageClassFactory().getStorageClass(self._storageClassName)
        return self._storageClass

    @property
    def storageClass_name(self) -> str:
        """Return the storage class name.

        This will never force the storage class to be imported.
        """
        return self._storageClassName

    @property
    def parentStorageClass(self) -> StorageClass | None:
        """Return the storage class of the composite containing this
        component.

        Note that if the DatasetType was constructed with the name of a
        StorageClass then Butler has to be initialized before using this
        property.  Can be `None` if this is not a component of a composite.
        Must be defined if this is a component.
        """
        if self._parentStorageClass is None and self._parentStorageClassName is None:
            return None
        if self._parentStorageClass is None and self._parentStorageClassName is not None:
            self._parentStorageClass = StorageClassFactory().getStorageClass(self._parentStorageClassName)
        return self._parentStorageClass

    def isCalibration(self) -> bool:
        """Return if datasets of this type can be in calibration collections.

        Returns
        -------
        flag : `bool`
            `True` if datasets of this type may be included in calibration
            collections.
        """
        return self._isCalibration

    @staticmethod
    def splitDatasetTypeName(datasetTypeName: str) -> tuple[str, str | None]:
        """Return the root name and the component from a composite name.

        Parameters
        ----------
        datasetTypeName : `str`
            The name of the dataset type; it can include a component using
            a "."-separator.

        Returns
        -------
        rootName : `str`
            Root name without any components.
        componentName : `str`
            The component if it has been specified, else `None`.

        Notes
        -----
        If the dataset type name is ``a.b.c`` this method will return a
        root name of ``a`` and a component name of ``b.c``.
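
        Examples
        --------
        The names here are illustrative:

        >>> DatasetType.splitDatasetTypeName("calexp.wcs")
        ('calexp', 'wcs')
        >>> DatasetType.splitDatasetTypeName("calexp")
        ('calexp', None)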
417 """
418 comp = None
419 root = datasetTypeName
420 if "." in root:
421 # If there is doubt, the component is after the first "."
422 root, comp = root.split(".", maxsplit=1)
423 return root, comp

    def nameAndComponent(self) -> tuple[str, str | None]:
        """Return the root name of this dataset type and any component.

        Returns
        -------
        rootName : `str`
            Root name for this `DatasetType` without any components.
        componentName : `str`
            The component if it has been specified, else `None`.
        """
        return self.splitDatasetTypeName(self.name)

    def component(self) -> str | None:
        """Return the component name (if defined).

        Returns
        -------
        comp : `str`
            Name of component part of DatasetType name.  `None` if this
            `DatasetType` is not associated with a component.
        """
        _, comp = self.nameAndComponent()
        return comp

    def componentTypeName(self, component: str) -> str:
        """Derive a component dataset type from a composite.

        Parameters
        ----------
        component : `str`
            Name of component.

        Returns
        -------
        derived : `str`
            Compound name of this `DatasetType` and the component.

        Raises
        ------
        KeyError
            Requested component is not supported by this `DatasetType`.
        """
        if component in self.storageClass.allComponents():
            return self.nameWithComponent(self.name, component)
        raise KeyError(f"Requested component ({component}) not understood by this DatasetType ({self})")

    def makeCompositeDatasetType(self) -> DatasetType:
        """Return a composite dataset type from the component.

        Returns
        -------
        composite : `DatasetType`
            The composite dataset type.

        Raises
        ------
        RuntimeError
            Raised if this dataset type is not a component dataset type.
        """
        if not self.isComponent():
            raise RuntimeError(f"DatasetType {self.name} must be a component to form the composite")
        composite_name, _ = self.nameAndComponent()
        if self.parentStorageClass is None:
            raise ValueError(
                f"Parent storage class is not set. Unable to create composite type from {self.name}"
            )
        return DatasetType(
            composite_name,
            dimensions=self.dimensions,
            storageClass=self.parentStorageClass,
            isCalibration=self.isCalibration(),
        )

    def makeComponentDatasetType(self, component: str) -> DatasetType:
        """Return a component dataset type from a composite.

        Assumes the same dimensions as the parent.

        Parameters
        ----------
        component : `str`
            Name of component.

        Returns
        -------
        datasetType : `DatasetType`
            A new DatasetType instance.
        """
        # The component could be a read/write or read component
        return DatasetType(
            self.componentTypeName(component),
            dimensions=self.dimensions,
            storageClass=self.storageClass.allComponents()[component],
            parentStorageClass=self.storageClass,
            isCalibration=self.isCalibration(),
        )
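
    # A sketch, assuming ``pvi`` is a hypothetical composite DatasetType
    # whose storage class defines a "wcs" component:
    #
    #     pvi_wcs = pvi.makeComponentDatasetType("wcs")
    #     pvi_wcs.name           # "pvi.wcs"
    #     pvi_wcs.isComponent()  # True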

    def makeAllComponentDatasetTypes(self) -> list[DatasetType]:
        """Return all component dataset types for this composite.

        Returns
        -------
        all : `list` of `DatasetType`
            All the component dataset types.  If this is not a composite
            then returns an empty list.
        """
        return [
            self.makeComponentDatasetType(componentName)
            for componentName in self.storageClass.allComponents()
        ]

    def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetType:
        """Create a new `DatasetType` from this one but with an updated
        `StorageClass`.

        Parameters
        ----------
        storageClass : `str` or `StorageClass`
            The new storage class.

        Returns
        -------
        modified : `DatasetType`
            A dataset type that is the same as the current one but with a
            different storage class.  Will be ``self`` if the given storage
            class is the current one.

        Notes
        -----
        If this is a component dataset type, the parent storage class will be
        retained.
        """
        if storageClass == self._storageClassName or storageClass == self._storageClass:
            return self
        parent = self._parentStorageClass if self._parentStorageClass else self._parentStorageClassName
        new = DatasetType(
            self.name,
            dimensions=self.dimensions,
            storageClass=storageClass,
            parentStorageClass=parent,
            isCalibration=self.isCalibration(),
        )
        # Check validity.
        if new.is_compatible_with(self) or self.is_compatible_with(new):
            return new
        raise ValueError(
            f"The new storage class ({new.storageClass}) is not compatible with the "
            f"existing storage class ({self.storageClass})."
        )
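
    # A sketch, assuming ``datasetType`` exists and "ArrowTable" names a
    # registered storage class convertible with the current one:
    #
    #     tableType = datasetType.overrideStorageClass("ArrowTable")
    #     tableType.name == datasetType.name  # True; only the class changes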

    def isComponent(self) -> bool:
        """Return whether this `DatasetType` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetType` is a component, `False` otherwise.
        """
        if self.component():
            return True
        return False

    def isComposite(self) -> bool:
        """Return whether this `DatasetType` is a composite.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetType` is a composite type, `False`
            otherwise.
        """
        return self.storageClass.isComposite()

    def _lookupNames(self) -> tuple[LookupKey, ...]:
        """Return name keys to use for lookups in configurations.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If the name includes a component the name with the component
            is first, then the name without the component and finally
            the storage class name and the storage class name of the
            composite.
        """
        rootName, componentName = self.nameAndComponent()
        lookups: tuple[LookupKey, ...] = (LookupKey(name=self.name),)
        if componentName is not None:
            lookups = lookups + (LookupKey(name=rootName),)

        if self.dimensions:
            # Dimensions are a lower priority than dataset type name
            lookups = lookups + (LookupKey(dimensions=self.dimensions),)

        storageClasses = self.storageClass._lookupNames()
        if componentName is not None and self.parentStorageClass is not None:
            storageClasses += self.parentStorageClass._lookupNames()

        return lookups + storageClasses
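
    # Illustrative priority order for a hypothetical component dataset type
    # named "calexp.wcs": the "calexp.wcs" key first, then "calexp", then a
    # dimensions-based key, then the storage class lookup names.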

    def to_simple(self, minimal: bool = False) -> SerializedDatasetType:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization.  Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetType`
            The object converted to a class suitable for serialization.
        """
        as_dict: dict[str, Any]
        if minimal:
            # Only needs the name.
            as_dict = {"name": self.name}
        else:
            # Convert to a dict form
            as_dict = {
                "name": self.name,
                "storageClass": self._storageClassName,
                "isCalibration": self._isCalibration,
                "dimensions": self.dimensions.to_simple(),
            }

        if self._parentStorageClassName is not None:
            as_dict["parentStorageClass"] = self._parentStorageClassName
        return SerializedDatasetType(**as_dict)

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatasetType,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
    ) -> DatasetType:
        """Construct a new object from the simplified form.

        This is usually data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetType`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions of which this graph
            will be a subset.  Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple name of a DatasetType to
            a full `DatasetType`.  Can be `None` if a full description of
            the type is provided along with a universe.

        Returns
        -------
        datasetType : `DatasetType`
            Newly-constructed object.
        """
        if simple.storageClass is None:
            # Treat this as a minimalist representation.
            if registry is None:
                raise ValueError(
                    f"Unable to convert a DatasetType name '{simple}' to DatasetType without a Registry"
                )
            return registry.getDatasetType(simple.name)

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            # Registry should not be None by now but the test helps mypy.
            universe = registry.dimensions

        if universe is None:
            # This is for mypy.
            raise ValueError("Unable to determine a usable universe")

        if simple.dimensions is None:
            # mypy hint
            raise ValueError(f"Dimensions must be specified in {simple}")

        return cls(
            name=simple.name,
            dimensions=DimensionGraph.from_simple(simple.dimensions, universe=universe),
            storageClass=simple.storageClass,
            isCalibration=simple.isCalibration,
            parentStorageClass=simple.parentStorageClass,
            universe=universe,
        )
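
    # A round-trip sketch, assuming ``datasetType`` exists and ``universe``
    # is the repository's DimensionUniverse:
    #
    #     simple = datasetType.to_simple()
    #     restored = DatasetType.from_simple(simple, universe=universe)
    #     assert restored == datasetType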

    to_json = to_json_pydantic
    from_json: ClassVar = classmethod(from_json_pydantic)

    def __reduce__(
        self,
    ) -> tuple[
        Callable, tuple[type[DatasetType], tuple[str, DimensionGraph, str, str | None], dict[str, bool]]
    ]:
        """Support pickling.

        StorageClass instances cannot normally be pickled, so we pickle the
        StorageClass name instead of the instance.
        """
        return _unpickle_via_factory, (
            self.__class__,
            (self.name, self.dimensions, self._storageClassName, self._parentStorageClassName),
            {"isCalibration": self._isCalibration},
        )
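
    # A pickling sketch, assuming ``datasetType`` exists: only the storage
    # class *name* travels with the pickle, and it is re-resolved lazily on
    # first use of ``storageClass``:
    #
    #     import pickle
    #     clone = pickle.loads(pickle.dumps(datasetType))
    #     assert clone == datasetType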

    def __deepcopy__(self, memo: Any) -> DatasetType:
        """Support for deep copy method.

        Normally ``deepcopy`` will use the pickle mechanism to make copies.
        We want to avoid that to support the (possibly degenerate) use case
        of a DatasetType constructed with a StorageClass instance that is
        not registered with the StorageClassFactory (this happens in unit
        tests).  Instead we re-implement the ``__deepcopy__`` method.
        """
        return DatasetType(
            name=deepcopy(self.name, memo),
            dimensions=deepcopy(self.dimensions, memo),
            storageClass=deepcopy(self._storageClass or self._storageClassName, memo),
            parentStorageClass=deepcopy(self._parentStorageClass or self._parentStorageClassName, memo),
            isCalibration=deepcopy(self._isCalibration, memo),
        )


def _unpickle_via_factory(factory: Callable, args: Any, kwargs: Any) -> DatasetType:
    """Unpickle something by calling a factory.

    Allows subclasses to unpickle using `__reduce__` with keyword
    arguments as well as positional arguments.
    """
    return factory(*args, **kwargs)