Coverage for python/lsst/daf/butler/core/datasets/type.py: 22% (202 statements)


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["DatasetType", "SerializedDatasetType"]

from copy import deepcopy
import re

from types import MappingProxyType

from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Iterable,
    List,
    Mapping,
    Optional,
    Tuple,
    Type,
    Union,
)

from pydantic import BaseModel, StrictStr, StrictBool

from ..storageClass import StorageClass, StorageClassFactory
from ..dimensions import DimensionGraph, SerializedDimensionGraph
from ..configSupport import LookupKey
from ..json import from_json_pydantic, to_json_pydantic

if TYPE_CHECKING:
    from ..dimensions import Dimension, DimensionUniverse
    from ...registry import Registry


def _safeMakeMappingProxyType(data: Optional[Mapping]) -> Mapping:
    if data is None:
        data = {}
    return MappingProxyType(data)


class SerializedDatasetType(BaseModel):
    """Simplified model of a `DatasetType` suitable for serialization."""

    name: StrictStr
    storageClass: Optional[StrictStr] = None
    dimensions: Optional[SerializedDimensionGraph] = None
    parentStorageClass: Optional[StrictStr] = None
    isCalibration: StrictBool = False


class DatasetType:
    r"""A named category of Datasets.

    Defines how they are organized, related, and stored.

    A concrete, final class whose instances represent `DatasetType`\ s.
    `DatasetType` instances may be constructed without a `Registry`,
    but they must be registered
    via `Registry.registerDatasetType()` before corresponding Datasets
    may be added.
    `DatasetType` instances are immutable.

    Parameters
    ----------
    name : `str`
        A string name for the Dataset; must correspond to the same
        `DatasetType` across all Registries.  Names must start with an
        upper or lowercase letter, and may contain only letters, numbers,
        and underscores.  Component dataset types should contain a single
        period separating the base dataset type name from the component
        name (and may be recursive).
    dimensions : `DimensionGraph` or iterable of `Dimension`
        Dimensions used to label and relate instances of this `DatasetType`.
        If not a `DimensionGraph`, ``universe`` must be provided as well.
    storageClass : `StorageClass` or `str`
        Instance of a `StorageClass` or name of `StorageClass` that defines
        how this `DatasetType` is persisted.
    parentStorageClass : `StorageClass` or `str`, optional
        Instance of a `StorageClass` or name of `StorageClass` that defines
        how the composite parent is persisted.  Must be `None` if this
        is not a component.  Mandatory if it is a component but can be the
        special temporary placeholder
        (`DatasetType.PlaceholderParentStorageClass`) to allow
        construction with an intent to finalize later.
    universe : `DimensionUniverse`, optional
        Set of all known dimensions, used to normalize ``dimensions`` if it
        is not already a `DimensionGraph`.
    isCalibration : `bool`, optional
        If `True`, this dataset type may be included in
        `~CollectionType.CALIBRATION` collections.
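
    Examples
    --------
    A minimal construction sketch.  The dimension names and the
    ``"ExposureF"`` storage class name are illustrative and assume the
    default dimension configuration; the storage class itself is only
    looked up lazily, when the ``storageClass`` property is first read.

    >>> from lsst.daf.butler import DatasetType, DimensionUniverse
    >>> universe = DimensionUniverse()
    >>> calexp = DatasetType("calexp", ("instrument", "visit", "detector"),
    ...                      "ExposureF", universe=universe)
    >>> calexp.name
    'calexp'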

    """

    __slots__ = ("_name", "_dimensions", "_storageClass", "_storageClassName",
                 "_parentStorageClass", "_parentStorageClassName",
                 "_isCalibration")

    _serializedType = SerializedDatasetType

    VALID_NAME_REGEX = re.compile("^[a-zA-Z][a-zA-Z0-9_]*(\\.[a-zA-Z][a-zA-Z0-9_]*)*$")

    PlaceholderParentStorageClass = StorageClass("PlaceHolder")
    """Placeholder StorageClass that can be used temporarily for a
    component.

    This can be useful in pipeline construction where we are creating
    dataset types without a registry.
    """

    @staticmethod
    def nameWithComponent(datasetTypeName: str, componentName: str) -> str:
        """Form a valid DatasetTypeName from a parent and component.

        No validation is performed.

        Parameters
        ----------
        datasetTypeName : `str`
            Base type name.
        componentName : `str`
            Name of component.

        Returns
        -------
        compTypeName : `str`
            Name to use for component DatasetType.
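
        Examples
        --------
        This is plain string formatting, so no registry is needed:

        >>> DatasetType.nameWithComponent("calexp", "wcs")
        'calexp.wcs'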

        """
        return "{}.{}".format(datasetTypeName, componentName)

    def __init__(self, name: str, dimensions: Union[DimensionGraph, Iterable[Dimension]],
                 storageClass: Union[StorageClass, str],
                 parentStorageClass: Optional[Union[StorageClass, str]] = None, *,
                 universe: Optional[DimensionUniverse] = None,
                 isCalibration: bool = False):
        if self.VALID_NAME_REGEX.match(name) is None:
            raise ValueError(f"DatasetType name '{name}' is invalid.")
        self._name = name
        if not isinstance(dimensions, DimensionGraph):
            if universe is None:
                raise ValueError("If dimensions is not a normalized DimensionGraph, "
                                 "a universe must be provided.")
            dimensions = universe.extract(dimensions)
        self._dimensions = dimensions
        if name in self._dimensions.universe.getGovernorDimensions().names:
            raise ValueError(f"Governor dimension name {name} cannot be used as a dataset type name.")
        if not isinstance(storageClass, (StorageClass, str)):
            raise ValueError("StorageClass argument must be StorageClass or str. "
                             f"Got {storageClass}")
        self._storageClass: Optional[StorageClass]
        if isinstance(storageClass, StorageClass):
            self._storageClass = storageClass
            self._storageClassName = storageClass.name
        else:
            self._storageClass = None
            self._storageClassName = storageClass

        self._parentStorageClass: Optional[StorageClass] = None
        self._parentStorageClassName: Optional[str] = None
        if parentStorageClass is not None:
            if not isinstance(parentStorageClass, (StorageClass, str)):
                raise ValueError("Parent StorageClass argument must be StorageClass or str. "
                                 f"Got {parentStorageClass}")

            # Only allowed for a component dataset type
            _, componentName = self.splitDatasetTypeName(self._name)
            if componentName is None:
                raise ValueError("Can not specify a parent storage class if this is not a component"
                                 f" ({self._name})")
            if isinstance(parentStorageClass, StorageClass):
                self._parentStorageClass = parentStorageClass
                self._parentStorageClassName = parentStorageClass.name
            else:
                self._parentStorageClassName = parentStorageClass

        # Ensure that parent storage class is specified when we have
        # a component and is not specified when we don't
        _, componentName = self.splitDatasetTypeName(self._name)
        if parentStorageClass is None and componentName is not None:
            raise ValueError(f"Component dataset type '{self._name}' constructed without parent"
                             " storage class")
        if parentStorageClass is not None and componentName is None:
            raise ValueError(f"Parent storage class specified by {self._name} is not a composite")
        self._isCalibration = isCalibration

    def __repr__(self) -> str:
        extra = ""
        if self._parentStorageClassName:
            extra = f", parentStorageClass={self._parentStorageClassName}"
        if self._isCalibration:
            extra += ", isCalibration=True"
        return f"DatasetType({self.name!r}, {self.dimensions}, {self._storageClassName}{extra})"

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, type(self)):
            return False
        if self._name != other._name:
            return False
        if self._dimensions != other._dimensions:
            return False
        if self._storageClass is not None and other._storageClass is not None:
            if self._storageClass != other._storageClass:
                return False
        else:
            if self._storageClassName != other._storageClassName:
                return False
        if self._isCalibration != other._isCalibration:
            return False
        if self._parentStorageClass is not None and other._parentStorageClass is not None:
            return self._parentStorageClass == other._parentStorageClass
        else:
            return self._parentStorageClassName == other._parentStorageClassName

    def __hash__(self) -> int:
        """Hash DatasetType instance.

        This only uses the StorageClass name, which is consistent with the
        implementation of the StorageClass hash method.
        """
        return hash((self._name, self._dimensions, self._storageClassName,
                     self._parentStorageClassName))

    def __lt__(self, other: Any) -> bool:
        """Sort using the dataset type name."""
        if not isinstance(other, type(self)):
            return NotImplemented
        return self.name < other.name

    @property
    def name(self) -> str:
        """Return a string name for the Dataset.

        Must correspond to the same `DatasetType` across all Registries.
        """
        return self._name

    @property
    def dimensions(self) -> DimensionGraph:
        r"""Return the `Dimension`\ s for this dataset type.

        The dimensions label and relate instances of this
        `DatasetType` (`DimensionGraph`).
        """
        return self._dimensions

    @property
    def storageClass(self) -> StorageClass:
        """Return `StorageClass` instance associated with this dataset type.

        The `StorageClass` defines how this `DatasetType`
        is persisted.  Note that if DatasetType was constructed with a name
        of a StorageClass then Butler has to be initialized before using
        this property.
        """
        if self._storageClass is None:
            self._storageClass = StorageClassFactory().getStorageClass(self._storageClassName)
        return self._storageClass

    @property
    def parentStorageClass(self) -> Optional[StorageClass]:
        """Return the storage class of the composite containing this
        component.

        Note that if DatasetType was constructed with a name of a
        StorageClass then Butler has to be initialized before using this
        property.  Can be `None` if this is not a component of a composite.
        Must be defined if this is a component.
        """
        if self._parentStorageClass is None and self._parentStorageClassName is None:
            return None
        if self._parentStorageClass is None and self._parentStorageClassName is not None:
            self._parentStorageClass = StorageClassFactory().getStorageClass(self._parentStorageClassName)
        return self._parentStorageClass

    def isCalibration(self) -> bool:
        """Return whether datasets of this type can be in calibration
        collections.

        Returns
        -------
        flag : `bool`
            `True` if datasets of this type may be included in calibration
            collections.
        """
        return self._isCalibration

    def finalizeParentStorageClass(self, newParent: StorageClass) -> None:
        """Finalize the parent storage class definition.

        Replaces the current placeholder parent storage class with
        the real parent.

        Parameters
        ----------
        newParent : `StorageClass`
            The new parent to be associated with this composite dataset
            type.  This replaces the temporary placeholder parent that
            was specified during construction.

        Raises
        ------
        ValueError
            Raised if this dataset type is not a component of a composite.
            Raised if a StorageClass is not given.
            Raised if the parent currently associated with the dataset
            type is not a placeholder.
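
        Examples
        --------
        A sketch of the placeholder workflow; the storage class names are
        illustrative and ``universe`` is assumed to be a `DimensionUniverse`:

        >>> from lsst.daf.butler import DatasetType, StorageClass
        >>> comp = DatasetType(
        ...     "metric.summary", universe.empty, StorageClass("MetricSummary"),
        ...     parentStorageClass=DatasetType.PlaceholderParentStorageClass,
        ...     universe=universe)
        >>> comp.finalizeParentStorageClass(StorageClass("Metric"))
        >>> comp.parentStorageClass.name
        'Metric'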

        """
        if not self.isComponent():
            raise ValueError("Can not set a parent storage class if this is not a component"
                             f" ({self.name})")
        if self._parentStorageClass != self.PlaceholderParentStorageClass:
            raise ValueError(f"This DatasetType has a parent of {self._parentStorageClassName} and"
                             " is not a placeholder.")
        if not isinstance(newParent, StorageClass):
            raise ValueError(f"Supplied parent must be a StorageClass. Got {newParent!r}")
        self._parentStorageClass = newParent
        self._parentStorageClassName = newParent.name

    @staticmethod
    def splitDatasetTypeName(datasetTypeName: str) -> Tuple[str, Optional[str]]:
        """Return the root name and the component from a composite name.

        Parameters
        ----------
        datasetTypeName : `str`
            The name of the dataset type; it can include a component using
            a "."-separator.

        Returns
        -------
        rootName : `str`
            Root name without any components.
        componentName : `str`
            The component if it has been specified, else `None`.

        Notes
        -----
        If the dataset type name is ``a.b.c`` this method will return a
        root name of ``a`` and a component name of ``b.c``.
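
        Examples
        --------
        Pure string manipulation, so no registry is needed:

        >>> DatasetType.splitDatasetTypeName("calexp.wcs")
        ('calexp', 'wcs')
        >>> DatasetType.splitDatasetTypeName("calexp")
        ('calexp', None)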

        """
        comp = None
        root = datasetTypeName
        if "." in root:
            # If there is doubt, the component is after the first "."
            root, comp = root.split(".", maxsplit=1)
        return root, comp

    def nameAndComponent(self) -> Tuple[str, Optional[str]]:
        """Return the root name of this dataset type and any component.

        Returns
        -------
        rootName : `str`
            Root name for this `DatasetType` without any components.
        componentName : `str`
            The component if it has been specified, else `None`.
        """
        return self.splitDatasetTypeName(self.name)

    def component(self) -> Optional[str]:
        """Return the component name (if defined).

        Returns
        -------
        comp : `str`
            Name of component part of DatasetType name.  `None` if this
            `DatasetType` is not associated with a component.
        """
        _, comp = self.nameAndComponent()
        return comp

    def componentTypeName(self, component: str) -> str:
        """Derive a component dataset type name from a composite.

        Parameters
        ----------
        component : `str`
            Name of component.

        Returns
        -------
        derived : `str`
            Compound name of this `DatasetType` and the component.

        Raises
        ------
        KeyError
            Requested component is not supported by this `DatasetType`.
        """
        if component in self.storageClass.allComponents():
            return self.nameWithComponent(self.name, component)
        raise KeyError("Requested component ({}) not understood by this DatasetType".format(component))

    def makeCompositeDatasetType(self) -> DatasetType:
        """Return the composite dataset type from this component.

        Returns
        -------
        composite : `DatasetType`
            The composite dataset type.

        Raises
        ------
        RuntimeError
            Raised if this dataset type is not a component dataset type.
        ValueError
            Raised if the parent storage class has not been set.
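
        Examples
        --------
        A sketch with illustrative storage class names; ``universe`` is
        assumed to be a `DimensionUniverse`:

        >>> from lsst.daf.butler import DatasetType, StorageClass
        >>> wcs = DatasetType("calexp.wcs", universe.empty, StorageClass("Wcs"),
        ...                   parentStorageClass=StorageClass("ExposureF"),
        ...                   universe=universe)
        >>> wcs.makeCompositeDatasetType().name
        'calexp'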

        """
        if not self.isComponent():
            raise RuntimeError(f"DatasetType {self.name} must be a component to form the composite")
        composite_name, _ = self.nameAndComponent()
        if self.parentStorageClass is None:
            raise ValueError("Parent storage class is not set. "
                             f"Unable to create composite type from {self.name}")
        return DatasetType(composite_name, dimensions=self.dimensions,
                           storageClass=self.parentStorageClass)

    def makeComponentDatasetType(self, component: str) -> DatasetType:
        """Return a component dataset type from a composite.

        Assumes the same dimensions as the parent.

        Parameters
        ----------
        component : `str`
            Name of component.

        Returns
        -------
        datasetType : `DatasetType`
            A new DatasetType instance.
        """
        # The component could be a read/write or read component
        return DatasetType(self.componentTypeName(component), dimensions=self.dimensions,
                           storageClass=self.storageClass.allComponents()[component],
                           parentStorageClass=self.storageClass)

    def makeAllComponentDatasetTypes(self) -> List[DatasetType]:
        """Return all component dataset types for this composite.

        Returns
        -------
        all : `list` of `DatasetType`
            All the component dataset types.  If this is not a composite
            then returns an empty list.
        """
        return [self.makeComponentDatasetType(componentName)
                for componentName in self.storageClass.allComponents()]

    def isComponent(self) -> bool:
        """Return whether this `DatasetType` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetType` is a component, `False` otherwise.
        """
        if self.component():
            return True
        return False

    def isComposite(self) -> bool:
        """Return whether this `DatasetType` is a composite.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetType` is a composite type, `False`
            otherwise.
        """
        return self.storageClass.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Return name keys to use for lookups in configurations.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If the name includes a component the name with the component
            is first, then the name without the component and finally
            the storage class name and the storage class name of the
            composite.
        """
        rootName, componentName = self.nameAndComponent()
        lookups: Tuple[LookupKey, ...] = (LookupKey(name=self.name),)
        if componentName is not None:
            lookups = lookups + (LookupKey(name=rootName),)

        if self.dimensions:
            # Dimensions are a lower priority than dataset type name
            lookups = lookups + (LookupKey(dimensions=self.dimensions),)

        storageClasses = self.storageClass._lookupNames()
        if componentName is not None and self.parentStorageClass is not None:
            storageClasses += self.parentStorageClass._lookupNames()

        return lookups + storageClasses

    def to_simple(self, minimal: bool = False) -> SerializedDatasetType:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization.  Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetType`
            The object converted to a class suitable for serialization.
        """
        as_dict: Dict[str, Any]
        if minimal:
            # Only needs the name.
            as_dict = {"name": self.name}
        else:
            # Convert to a dict form
            as_dict = {"name": self.name,
                       "storageClass": self._storageClassName,
                       "isCalibration": self._isCalibration,
                       "dimensions": self.dimensions.to_simple(),
                       }

            if self._parentStorageClassName is not None:
                as_dict["parentStorageClass"] = self._parentStorageClassName
        return SerializedDatasetType(**as_dict)

    @classmethod
    def from_simple(cls, simple: SerializedDatasetType,
                    universe: Optional[DimensionUniverse] = None,
                    registry: Optional[Registry] = None) -> DatasetType:
        """Construct a new object from the simplified form.

        This is usually data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetType`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions of which this graph
            will be a subset.  Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple name of a DatasetType to
            a full `DatasetType`.  Can be `None` if a full description of
            the type is provided along with a universe.

        Returns
        -------
        datasetType : `DatasetType`
            Newly-constructed object.
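
        Examples
        --------
        A round trip through the serialized form, assuming ``calexp`` and
        ``universe`` from the class-level example:

        >>> simple = calexp.to_simple()
        >>> DatasetType.from_simple(simple, universe=universe) == calexp
        True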

        """
        if simple.storageClass is None:
            # Treat this as minimalist representation
            if registry is None:
                raise ValueError(f"Unable to convert a DatasetType name '{simple}' to DatasetType"
                                 " without a Registry")
            return registry.getDatasetType(simple.name)

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            # registry should not be None by now but this test helps mypy
            universe = registry.dimensions

        if universe is None:
            # this is for mypy
            raise ValueError("Unable to determine a usable universe")

        if simple.dimensions is None:
            # mypy hint
            raise ValueError(f"Dimensions must be specified in {simple}")

        return cls(name=simple.name,
                   dimensions=DimensionGraph.from_simple(simple.dimensions, universe=universe),
                   storageClass=simple.storageClass,
                   isCalibration=simple.isCalibration,
                   parentStorageClass=simple.parentStorageClass,
                   universe=universe)

    to_json = to_json_pydantic
    from_json = classmethod(from_json_pydantic)

    def __reduce__(self) -> Tuple[Callable, Tuple[Type[DatasetType],
                                                  Tuple[str, DimensionGraph, str, Optional[str]],
                                                  Dict[str, bool]]]:
        """Support pickling.

        StorageClass instances can not normally be pickled, so we pickle
        the StorageClass name instead of the instance.
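
        Examples
        --------
        A pickle round trip, assuming ``calexp`` from the class-level
        example:

        >>> import pickle
        >>> pickle.loads(pickle.dumps(calexp)) == calexp
        True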

        """
        return _unpickle_via_factory, (self.__class__, (self.name, self.dimensions, self._storageClassName,
                                                        self._parentStorageClassName),
                                       {"isCalibration": self._isCalibration})

    def __deepcopy__(self, memo: Any) -> DatasetType:
        """Support for deep copy method.

        Normally ``deepcopy`` will use the pickle mechanism to make copies.
        We want to avoid that, to support the (possibly degenerate) use case
        where a DatasetType is constructed with a StorageClass instance that
        is not registered with the StorageClassFactory (this happens in unit
        tests).  Instead we re-implement the ``__deepcopy__`` method.
        """
        return DatasetType(name=deepcopy(self.name, memo),
                           dimensions=deepcopy(self.dimensions, memo),
                           storageClass=deepcopy(self._storageClass or self._storageClassName, memo),
                           parentStorageClass=deepcopy(self._parentStorageClass
                                                       or self._parentStorageClassName, memo),
                           isCalibration=deepcopy(self._isCalibration, memo))


def _unpickle_via_factory(factory: Callable, args: Any, kwargs: Any) -> DatasetType:
    """Unpickle something by calling a factory.

    Allows subclasses to unpickle using `__reduce__` with keyword
    arguments as well as positional arguments.
    """
    return factory(*args, **kwargs)