# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["DatasetType", "SerializedDatasetType"]

import re
from collections.abc import Callable, Iterable, Mapping
from copy import deepcopy
from types import MappingProxyType
from typing import TYPE_CHECKING, Any, ClassVar

from lsst.daf.butler._compat import _BaseModelCompat
from pydantic import StrictBool, StrictStr

from ..configSupport import LookupKey
from ..dimensions import DimensionGraph, SerializedDimensionGraph
from ..json import from_json_pydantic, to_json_pydantic
from ..persistenceContext import PersistenceContextVars
from ..storageClass import StorageClass, StorageClassFactory

if TYPE_CHECKING:
    from ...registry import Registry
    from ..dimensions import Dimension, DimensionUniverse


def _safeMakeMappingProxyType(data: Mapping | None) -> Mapping:
    """Return a read-only view of ``data``, substituting an empty mapping
    for `None`.
    """
    if data is None:
        data = {}
    return MappingProxyType(data)


class SerializedDatasetType(_BaseModelCompat):
    """Simplified model of a `DatasetType` suitable for serialization."""

    name: StrictStr
    storageClass: StrictStr | None = None
    dimensions: SerializedDimensionGraph | None = None
    parentStorageClass: StrictStr | None = None
    isCalibration: StrictBool = False

    @classmethod
    def direct(
        cls,
        *,
        name: str,
        storageClass: str | None = None,
        dimensions: dict | None = None,
        parentStorageClass: str | None = None,
        isCalibration: bool = False,
    ) -> SerializedDatasetType:
        """Construct a `SerializedDatasetType` directly without validators.

        This differs from Pydantic's ``construct`` method in that the
        arguments are explicitly what the model requires, and it will
        recurse through members, constructing them from their corresponding
        `direct` methods.

        This method should only be called when the inputs are trusted.
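
        Examples
        --------
        A minimal sketch of trusted use; the name and storage class here
        are hypothetical:

        >>> s = SerializedDatasetType.direct(
        ...     name="calexp", storageClass="ExposureF"
        ... )
        >>> s.name
        'calexp'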
84 """
85 cache = PersistenceContextVars.serializedDatasetTypeMapping.get()
86 key = (name, storageClass or "")
87 if cache is not None and (type_ := cache.get(key, None)) is not None:
88 return type_
90 serialized_dimensions = (
91 SerializedDimensionGraph.direct(**dimensions) if dimensions is not None else None
92 )
94 node = cls.model_construct(
95 name=name,
96 storageClass=storageClass,
97 dimensions=serialized_dimensions,
98 parentStorageClass=parentStorageClass,
99 isCalibration=isCalibration,
100 )
102 if cache is not None:
103 cache[key] = node
104 return node


class DatasetType:
    r"""A named category of Datasets.

    Defines how they are organized, related, and stored.

    A concrete, final class whose instances represent `DatasetType`\ s.
    `DatasetType` instances may be constructed without a `Registry`, but
    they must be registered via `Registry.registerDatasetType()` before
    corresponding Datasets may be added.  `DatasetType` instances are
    immutable.

    Parameters
    ----------
    name : `str`
        A string name for the Dataset; must correspond to the same
        `DatasetType` across all Registries.  Names must start with an
        upper or lowercase letter, and may contain only letters, numbers,
        and underscores.  Component dataset types should contain a single
        period separating the base dataset type name from the component
        name (and may be recursive).
    dimensions : `DimensionGraph` or iterable of `Dimension` or `str`
        Dimensions used to label and relate instances of this `DatasetType`.
        If not a `DimensionGraph`, ``universe`` must be provided as well.
    storageClass : `StorageClass` or `str`
        Instance of a `StorageClass` or name of `StorageClass` that defines
        how this `DatasetType` is persisted.
    parentStorageClass : `StorageClass` or `str`, optional
        Instance of a `StorageClass` or name of `StorageClass` that defines
        how the composite parent is persisted.  Must be `None` if this
        is not a component.
    universe : `DimensionUniverse`, optional
        Set of all known dimensions, used to normalize ``dimensions`` if it
        is not already a `DimensionGraph`.
    isCalibration : `bool`, optional
        If `True`, this dataset type may be included in
        `~CollectionType.CALIBRATION` collections.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
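
    Examples
    --------
    A hedged construction sketch; assumes ``universe`` is a
    `DimensionUniverse` and that an ``"ExposureF"`` storage class has been
    registered (both names here are illustrative)::

        flatType = DatasetType(
            "flat",
            dimensions=("instrument", "detector"),
            storageClass="ExposureF",
            universe=universe,
        )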
148 """
150 __slots__ = (
151 "_name",
152 "_dimensions",
153 "_storageClass",
154 "_storageClassName",
155 "_parentStorageClass",
156 "_parentStorageClassName",
157 "_isCalibration",
158 )
160 _serializedType = SerializedDatasetType
162 VALID_NAME_REGEX = re.compile("^[a-zA-Z_][a-zA-Z0-9_]*(\\.[a-zA-Z_][a-zA-Z0-9_]*)*$")

    @staticmethod
    def nameWithComponent(datasetTypeName: str, componentName: str) -> str:
        """Form a valid DatasetTypeName from a parent and component.

        No validation is performed.

        Parameters
        ----------
        datasetTypeName : `str`
            Base type name.
        componentName : `str`
            Name of component.

        Returns
        -------
        compTypeName : `str`
            Name to use for component DatasetType.
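
        Examples
        --------
        >>> DatasetType.nameWithComponent("calexp", "wcs")
        'calexp.wcs'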
181 """
182 return f"{datasetTypeName}.{componentName}"

    def __init__(
        self,
        name: str,
        dimensions: DimensionGraph | Iterable[Dimension | str],
        storageClass: StorageClass | str,
        parentStorageClass: StorageClass | str | None = None,
        *,
        universe: DimensionUniverse | None = None,
        isCalibration: bool = False,
    ):
        if self.VALID_NAME_REGEX.match(name) is None:
            raise ValueError(f"DatasetType name '{name}' is invalid.")
        self._name = name
        if not isinstance(dimensions, DimensionGraph):
            if universe is None:
                raise ValueError(
                    "If dimensions is not a normalized DimensionGraph, a universe must be provided."
                )
            dimensions = universe.extract(dimensions)
        self._dimensions = dimensions
        if name in self._dimensions.universe.getGovernorDimensions().names:
            raise ValueError(f"Governor dimension name {name} cannot be used as a dataset type name.")
        if not isinstance(storageClass, StorageClass | str):
            raise ValueError(f"StorageClass argument must be StorageClass or str. Got {storageClass}")
        self._storageClass: StorageClass | None
        if isinstance(storageClass, StorageClass):
            self._storageClass = storageClass
            self._storageClassName = storageClass.name
        else:
            self._storageClass = None
            self._storageClassName = storageClass

        self._parentStorageClass: StorageClass | None = None
        self._parentStorageClassName: str | None = None
        if parentStorageClass is not None:
            if not isinstance(parentStorageClass, StorageClass | str):
                raise ValueError(
                    f"Parent StorageClass argument must be StorageClass or str. Got {parentStorageClass}"
                )

            # Only allowed for a component dataset type.
            _, componentName = self.splitDatasetTypeName(self._name)
            if componentName is None:
                raise ValueError(
                    f"Can not specify a parent storage class if this is not a component ({self._name})"
                )
            if isinstance(parentStorageClass, StorageClass):
                self._parentStorageClass = parentStorageClass
                self._parentStorageClassName = parentStorageClass.name
            else:
                self._parentStorageClassName = parentStorageClass

        # Ensure that a parent storage class is specified when we have a
        # component, and is not specified when we do not.
        _, componentName = self.splitDatasetTypeName(self._name)
        if parentStorageClass is None and componentName is not None:
            raise ValueError(
                f"Component dataset type '{self._name}' constructed without parent storage class"
            )
        if parentStorageClass is not None and componentName is None:
            raise ValueError(
                f"Parent storage class specified but dataset type {self._name} is not a component"
            )
        self._isCalibration = isCalibration

    def __repr__(self) -> str:
        extra = ""
        if self._parentStorageClassName:
            extra = f", parentStorageClass={self._parentStorageClassName}"
        if self._isCalibration:
            extra += ", isCalibration=True"
        return f"DatasetType({self.name!r}, {self.dimensions}, {self._storageClassName}{extra})"

    def _equal_ignoring_storage_class(self, other: Any) -> bool:
        """Check that everything is equal except the storage class.

        Parameters
        ----------
        other : `Any`
            Object to check against this one.

        Returns
        -------
        mostly : `bool`
            Returns `True` if everything except the storage class is equal.
        """
        if not isinstance(other, type(self)):
            return False
        if self._name != other._name:
            return False
        if self._dimensions != other._dimensions:
            return False
        if self._isCalibration != other._isCalibration:
            return False
        if self._parentStorageClass is not None and other._parentStorageClass is not None:
            return self._parentStorageClass == other._parentStorageClass
        else:
            return self._parentStorageClassName == other._parentStorageClassName

    def __eq__(self, other: Any) -> bool:
        mostly_equal = self._equal_ignoring_storage_class(other)
        if not mostly_equal:
            return False

        # Be careful not to force a storage class to import the corresponding
        # python code.
        if self._storageClass is not None and other._storageClass is not None:
            if self._storageClass != other._storageClass:
                return False
        else:
            if self._storageClassName != other._storageClassName:
                return False
        return True

    def is_compatible_with(self, other: DatasetType) -> bool:
        """Determine if the given `DatasetType` is compatible with this one.

        Compatibility requires a matching name and dimensions, and a storage
        class for this dataset type that can convert the Python type
        associated with the other storage class to this dataset type's
        Python type.

        Parameters
        ----------
        other : `DatasetType`
            Dataset type to check.

        Returns
        -------
        is_compatible : `bool`
            Returns `True` if the other dataset type is either the same as
            this one, or if the storage class associated with the other can
            be converted to this one.
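
        Examples
        --------
        A hedged sketch; the dataset types named here are hypothetical, and
        the result depends on the converters declared by the registered
        storage classes::

            # True if dfType's storage class can convert astropyType's
            # python type to its own.
            dfType.is_compatible_with(astropyType)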
314 """
315 mostly_equal = self._equal_ignoring_storage_class(other)
316 if not mostly_equal:
317 return False
319 # If the storage class names match then they are compatible.
320 if self._storageClassName == other._storageClassName:
321 return True
323 # Now required to check the full storage class.
324 self_sc = self.storageClass
325 other_sc = other.storageClass
327 return self_sc.can_convert(other_sc)

    def __hash__(self) -> int:
        """Hash the DatasetType instance.

        This uses only the StorageClass name, which is consistent with the
        implementation of the StorageClass hash method.
        """
        return hash((self._name, self._dimensions, self._storageClassName, self._parentStorageClassName))

    def __lt__(self, other: Any) -> bool:
        """Sort using the dataset type name."""
        if not isinstance(other, type(self)):
            return NotImplemented
        return self.name < other.name

    @property
    def name(self) -> str:
        """Return a string name for the Dataset.

        Must correspond to the same `DatasetType` across all Registries.
        """
        return self._name

    @property
    def dimensions(self) -> DimensionGraph:
        r"""Return the `Dimension`\ s for this dataset type.

        The dimensions label and relate instances of this
        `DatasetType` (`DimensionGraph`).
        """
        return self._dimensions

    @property
    def storageClass(self) -> StorageClass:
        """Return the `StorageClass` instance associated with this dataset
        type.

        The `StorageClass` defines how this `DatasetType` is persisted.
        Note that if the DatasetType was constructed with only the name of
        a StorageClass, then the Butler has to be initialized before this
        property can be used.
        """
        if self._storageClass is None:
            self._storageClass = StorageClassFactory().getStorageClass(self._storageClassName)
        return self._storageClass

    @property
    def storageClass_name(self) -> str:
        """Return the storage class name.

        This will never force the storage class to be imported.
        """
        return self._storageClassName

    @property
    def parentStorageClass(self) -> StorageClass | None:
        """Return the storage class of the composite containing this
        component.

        Note that if the DatasetType was constructed with only the name of
        a StorageClass, then the Butler has to be initialized before this
        property can be used.  Can be `None` if this is not a component of
        a composite.  Must be defined if this is a component.
        """
        if self._parentStorageClass is None and self._parentStorageClassName is None:
            return None
        if self._parentStorageClass is None and self._parentStorageClassName is not None:
            self._parentStorageClass = StorageClassFactory().getStorageClass(self._parentStorageClassName)
        return self._parentStorageClass

    def isCalibration(self) -> bool:
        """Return whether datasets of this type can be in calibration
        collections.

        Returns
        -------
        flag : `bool`
            `True` if datasets of this type may be included in calibration
            collections.
        """
        return self._isCalibration

    @staticmethod
    def splitDatasetTypeName(datasetTypeName: str) -> tuple[str, str | None]:
        """Return the root name and the component from a composite name.

        Parameters
        ----------
        datasetTypeName : `str`
            The name of the dataset type; it can include a component using
            a "."-separator.

        Returns
        -------
        rootName : `str`
            Root name without any components.
        componentName : `str`
            The component if it has been specified, else `None`.

        Notes
        -----
        If the dataset type name is ``a.b.c`` this method will return a
        root name of ``a`` and a component name of ``b.c``.
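
        Examples
        --------
        >>> DatasetType.splitDatasetTypeName("calexp.wcs")
        ('calexp', 'wcs')
        >>> DatasetType.splitDatasetTypeName("deepCoadd")
        ('deepCoadd', None)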
428 """
429 comp = None
430 root = datasetTypeName
431 if "." in root:
432 # If there is doubt, the component is after the first "."
433 root, comp = root.split(".", maxsplit=1)
434 return root, comp

    def nameAndComponent(self) -> tuple[str, str | None]:
        """Return the root name of this dataset type and any component.

        Returns
        -------
        rootName : `str`
            Root name for this `DatasetType` without any components.
        componentName : `str`
            The component if it has been specified, else `None`.
        """
        return self.splitDatasetTypeName(self.name)

    def component(self) -> str | None:
        """Return the component name (if defined).

        Returns
        -------
        comp : `str`
            Name of the component part of the DatasetType name.  `None` if
            this `DatasetType` is not associated with a component.
        """
        _, comp = self.nameAndComponent()
        return comp

    def componentTypeName(self, component: str) -> str:
        """Derive a component dataset type name from a composite.

        Parameters
        ----------
        component : `str`
            Name of the component.

        Returns
        -------
        derived : `str`
            Compound name of this `DatasetType` and the component.

        Raises
        ------
        KeyError
            Raised if the requested component is not supported by this
            `DatasetType`.
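
        Examples
        --------
        A hedged sketch; assumes ``calexpType`` is a `DatasetType` whose
        storage class defines a ``wcs`` component::

            calexpType.componentTypeName("wcs")  # -> "calexp.wcs"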
477 """
478 if component in self.storageClass.allComponents():
479 return self.nameWithComponent(self.name, component)
480 raise KeyError(f"Requested component ({component}) not understood by this DatasetType ({self})")

    def makeCompositeDatasetType(self) -> DatasetType:
        """Return the composite dataset type from this component.

        Returns
        -------
        composite : `DatasetType`
            The composite dataset type.

        Raises
        ------
        RuntimeError
            Raised if this dataset type is not a component dataset type.
        """
        if not self.isComponent():
            raise RuntimeError(f"DatasetType {self.name} must be a component to form the composite")
        composite_name, _ = self.nameAndComponent()
        if self.parentStorageClass is None:
            raise ValueError(
                f"Parent storage class is not set. Unable to create composite type from {self.name}"
            )
        return DatasetType(
            composite_name,
            dimensions=self.dimensions,
            storageClass=self.parentStorageClass,
            isCalibration=self.isCalibration(),
        )

    def makeComponentDatasetType(self, component: str) -> DatasetType:
        """Return a component dataset type from this composite.

        Assumes the same dimensions as the parent.

        Parameters
        ----------
        component : `str`
            Name of the component.

        Returns
        -------
        datasetType : `DatasetType`
            A new DatasetType instance.
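
        Examples
        --------
        A hedged sketch; assumes ``calexpType`` uses a composite storage
        class with a ``psf`` component::

            psfType = calexpType.makeComponentDatasetType("psf")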
523 """
524 # The component could be a read/write or read component
525 return DatasetType(
526 self.componentTypeName(component),
527 dimensions=self.dimensions,
528 storageClass=self.storageClass.allComponents()[component],
529 parentStorageClass=self.storageClass,
530 isCalibration=self.isCalibration(),
531 )

    def makeAllComponentDatasetTypes(self) -> list[DatasetType]:
        """Return all component dataset types for this composite.

        Returns
        -------
        all : `list` of `DatasetType`
            All the component dataset types.  If this is not a composite
            then an empty list is returned.
        """
        return [
            self.makeComponentDatasetType(componentName)
            for componentName in self.storageClass.allComponents()
        ]

    def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetType:
        """Create a new `DatasetType` from this one but with an updated
        `StorageClass`.

        Parameters
        ----------
        storageClass : `str` or `StorageClass`
            The new storage class.

        Returns
        -------
        modified : `DatasetType`
            A dataset type that is the same as the current one but with a
            different storage class.  Will be ``self`` if the given storage
            class is the current one.

        Notes
        -----
        If this is a component dataset type, the parent storage class will be
        retained.
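
        Examples
        --------
        A hedged sketch; assumes ``tableType`` currently uses a hypothetical
        ``"ArrowTable"`` storage class that is convertible to
        ``"DataFrame"``::

            dfType = tableType.overrideStorageClass("DataFrame")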
567 """
568 if storageClass == self._storageClassName or storageClass == self._storageClass:
569 return self
570 parent = self._parentStorageClass if self._parentStorageClass else self._parentStorageClassName
571 new = DatasetType(
572 self.name,
573 dimensions=self.dimensions,
574 storageClass=storageClass,
575 parentStorageClass=parent,
576 isCalibration=self.isCalibration(),
577 )
578 # Check validity.
579 if new.is_compatible_with(self) or self.is_compatible_with(new):
580 return new
581 raise ValueError(
582 f"The new storage class ({new.storageClass}) is not compatible with the "
583 f"existing storage class ({self.storageClass})."
584 )

    def isComponent(self) -> bool:
        """Return whether this `DatasetType` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetType` is a component, `False` otherwise.
        """
        return self.component() is not None

    def isComposite(self) -> bool:
        """Return whether this `DatasetType` is a composite.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetType` is a composite type, `False`
            otherwise.
        """
        return self.storageClass.isComposite()

    def _lookupNames(self) -> tuple[LookupKey, ...]:
        """Return name keys to use for lookups in configurations.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If the name includes a component, the name with the component
            comes first, then the name without the component, and finally
            the storage class name and the storage class name of the
            composite.
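
        Examples
        --------
        For a hypothetical component dataset type ``"calexp.wcs"``, the keys
        would be ordered roughly as: ``calexp.wcs``, ``calexp``, a dimension
        lookup key, then the storage class names of the component and of the
        parent composite.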
622 """
623 rootName, componentName = self.nameAndComponent()
624 lookups: tuple[LookupKey, ...] = (LookupKey(name=self.name),)
625 if componentName is not None:
626 lookups = lookups + (LookupKey(name=rootName),)
628 if self.dimensions:
629 # Dimensions are a lower priority than dataset type name
630 lookups = lookups + (LookupKey(dimensions=self.dimensions),)
632 storageClasses = self.storageClass._lookupNames()
633 if componentName is not None and self.parentStorageClass is not None:
634 storageClasses += self.parentStorageClass._lookupNames()
636 return lookups + storageClasses

    def to_simple(self, minimal: bool = False) -> SerializedDatasetType:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization.  Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetType`
            The object converted to a class suitable for serialization.
        """
        as_dict: dict[str, Any]
        if minimal:
            # Only needs the name.
            as_dict = {"name": self.name}
        else:
            # Convert to a dict form.
            as_dict = {
                "name": self.name,
                "storageClass": self._storageClassName,
                "isCalibration": self._isCalibration,
                "dimensions": self.dimensions.to_simple(),
            }

        if self._parentStorageClassName is not None:
            as_dict["parentStorageClass"] = self._parentStorageClassName
        return SerializedDatasetType(**as_dict)

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatasetType,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
    ) -> DatasetType:
        """Construct a new object from the simplified form.

        This is usually data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetType`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The set of all known dimensions, of which this dataset type's
            dimensions will be a subset.  Can be `None` if a registry is
            provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple name of a DatasetType to
            a full `DatasetType`.  Can be `None` if a full description of
            the type is provided along with a universe.

        Returns
        -------
        datasetType : `DatasetType`
            Newly-constructed object.
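
        Examples
        --------
        A hedged round-trip sketch; assumes ``datasetType`` is a fully
        defined `DatasetType` and ``universe`` its `DimensionUniverse`::

            simple = datasetType.to_simple()
            restored = DatasetType.from_simple(simple, universe=universe)
            assert restored == datasetType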
698 """
699 # check to see if there is a cache, and if there is, if there is a
700 # cached dataset type
701 cache = PersistenceContextVars.loadedTypes.get()
702 key = (simple.name, simple.storageClass or "")
703 if cache is not None and (type_ := cache.get(key, None)) is not None:
704 return type_
706 if simple.storageClass is None:
707 # Treat this as minimalist representation
708 if registry is None:
709 raise ValueError(
710 f"Unable to convert a DatasetType name '{simple}' to DatasetType without a Registry"
711 )
712 return registry.getDatasetType(simple.name)
714 if universe is None and registry is None:
715 raise ValueError("One of universe or registry must be provided.")
717 if universe is None and registry is not None:
718 # registry should not be none by now but test helps mypy
719 universe = registry.dimensions
721 if universe is None:
722 # this is for mypy
723 raise ValueError("Unable to determine a usable universe")
725 if simple.dimensions is None:
726 # mypy hint
727 raise ValueError(f"Dimensions must be specified in {simple}")
729 newType = cls(
730 name=simple.name,
731 dimensions=DimensionGraph.from_simple(simple.dimensions, universe=universe),
732 storageClass=simple.storageClass,
733 isCalibration=simple.isCalibration,
734 parentStorageClass=simple.parentStorageClass,
735 universe=universe,
736 )
737 if cache is not None:
738 cache[key] = newType
739 return newType

    to_json = to_json_pydantic
    from_json: ClassVar = classmethod(from_json_pydantic)

    def __reduce__(
        self,
    ) -> tuple[
        Callable, tuple[type[DatasetType], tuple[str, DimensionGraph, str, str | None], dict[str, bool]]
    ]:
        """Support pickling.

        StorageClass instances cannot normally be pickled, so we pickle the
        StorageClass name instead of the instance.
        """
        return _unpickle_via_factory, (
            self.__class__,
            (self.name, self.dimensions, self._storageClassName, self._parentStorageClassName),
            {"isCalibration": self._isCalibration},
        )

    def __deepcopy__(self, memo: Any) -> DatasetType:
        """Support for the deep copy method.

        Normally ``deepcopy`` uses the pickle mechanism to make copies.
        We want to avoid that in order to support the (possibly degenerate)
        use case in which a DatasetType is constructed with a StorageClass
        instance that is not registered with the StorageClassFactory (this
        happens in unit tests).  Instead we re-implement the
        ``__deepcopy__`` method.
        """
        return DatasetType(
            name=deepcopy(self.name, memo),
            dimensions=deepcopy(self.dimensions, memo),
            storageClass=deepcopy(self._storageClass or self._storageClassName, memo),
            parentStorageClass=deepcopy(self._parentStorageClass or self._parentStorageClassName, memo),
            isCalibration=deepcopy(self._isCalibration, memo),
        )


def _unpickle_via_factory(factory: Callable, args: Any, kwargs: Any) -> DatasetType:
    """Unpickle something by calling a factory.

    Allows subclasses to unpickle using `__reduce__` with keyword
    arguments as well as positional arguments.
    """
    return factory(*args, **kwargs)