Coverage for python/lsst/daf/butler/core/datasets/ref.py: 31%

183 statements  

coverage.py v6.5.0, created at 2023-02-01 02:05 -0800

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetId", "DatasetRef", "SerializedDatasetRef"]

import uuid
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union

from lsst.utils.classes import immutable
from pydantic import BaseModel, ConstrainedInt, StrictStr, validator

from ..configSupport import LookupKey
from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate
from ..json import from_json_pydantic, to_json_pydantic
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType

if TYPE_CHECKING:
    from ...registry import Registry
    from ..storageClass import StorageClass


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class PositiveInt(ConstrainedInt):
    ge = 0
    strict = True


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    # DO NOT change order in the Union, pydantic is sensitive to that!
    id: Optional[Union[uuid.UUID, PositiveInt]] = None
    datasetType: Optional[SerializedDatasetType] = None
    dataId: Optional[SerializedDataCoordinate] = None
    run: Optional[StrictStr] = None
    component: Optional[StrictStr] = None

    @validator("dataId")
    def _check_dataId(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Can not specify 'dataId' without specifying 'datasetType'")
        return v

    @validator("run")
    def _check_run(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def _check_component(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
        return v
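
    # A minimal sketch (not part of the original source) of what these
    # validators reject; the run name is hypothetical:
    #
    #     SerializedDatasetRef(run="some_run")
    #     # ValidationError, from ValueError:
    #     # 'run' cannot be provided unless 'id' is.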

    @classmethod
    def direct(
        cls,
        *,
        id: Optional[Union[str, int]] = None,
        datasetType: Optional[Dict[str, Any]] = None,
        dataId: Optional[Dict[str, Any]] = None,
        run: str | None = None,
        component: Optional[str] = None,
    ) -> SerializedDatasetRef:
        """Construct a `SerializedDatasetRef` directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedDatasetRef.__new__(cls)
        setter = object.__setattr__
        setter(node, "id", uuid.UUID(id) if isinstance(id, str) else id)
        setter(
            node,
            "datasetType",
            datasetType if datasetType is None else SerializedDatasetType.direct(**datasetType),
        )
        setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
        setter(node, "run", run)
        setter(node, "component", component)
        setter(node, "__fields_set__", {"id", "datasetType", "dataId", "run", "component"})
        return node
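
    # Illustrative sketch (not part of the original source): constructing a
    # SerializedDatasetRef from trusted, already-validated values, bypassing
    # the pydantic validators.  The UUID and run name are hypothetical.
    #
    #     sref = SerializedDatasetRef.direct(
    #         id="6f6bcbc0-9676-4c64-9240-183d6a5e8fcb",
    #         run="HSC/runs/example",
    #     )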


DatasetId = Union[int, uuid.UUID]
"""A type-annotation alias for a dataset ID, which may be either an `int` or
a `uuid.UUID`.
"""


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `DatasetId`, optional
        The unique identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created. Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
    """

    _serializedType = SerializedDatasetRef
    __slots__ = (
        "id",
        "datasetType",
        "dataId",
        "run",
    )

    def __init__(
        self,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        *,
        id: Optional[DatasetId] = None,
        run: Optional[str] = None,
        conform: bool = True,
    ):
        self.id = id
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            if run is None:
                raise ValueError(
                    f"Cannot provide id without run for dataset with id={id}, "
                    f"type={datasetType}, and dataId={dataId}."
                )
            self.run = run
        else:
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
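
    # Illustrative sketch (not part of the original source): constructing
    # unresolved and resolved refs.  The dataset type, dimensions, and data
    # ID values below are hypothetical, and ``universe`` is assumed to be an
    # existing `DimensionUniverse`.
    #
    #     flat = DatasetType(
    #         "flat", dimensions=["instrument", "detector"],
    #         storageClass="ExposureF", universe=universe,
    #     )
    #     data_id = DataCoordinate.standardize(
    #         {"instrument": "HSC", "detector": 42}, universe=universe
    #     )
    #     unresolved = DatasetRef(flat, data_id)
    #     resolved = DatasetRef(flat, data_id, id=uuid.uuid4(), run="HSC/runs/example")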

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder for users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}, sc={self.datasetType.storageClass_name}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, DatasetType name, and then by DataCoordinate.
        # The __str__ representation is probably close enough, but we
        # need to ensure that sorting a DatasetRef matches what you would
        # get if you sorted DatasetType+DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; it takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in the priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)
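
    # For example (a sketch, not in the original source), sorted(refs) orders
    # refs primarily by run (with unresolved refs, run=None, sorting first as
    # ""), then by dataset type, then by data ID.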

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to simplified form.
        """
        if minimal and self.id is not None:
            # The only thing needed to uniquely define a DatasetRef
            # is its id, so that can be used directly if it is
            # resolved and if it is not a component DatasetRef.
            # Store it in a dict to allow us to easily add the planned
            # origin information later without having to support
            # an int and dict in simple form.
            simple: Dict[str, Any] = {"id": self.id}
            if self.isComponent():
                # We can still be a little minimalist with a component,
                # but we will also need to record the datasetType component.
                simple["component"] = self.datasetType.component()
            return SerializedDatasetRef(**simple)

        # Convert to a dict form.
        as_dict: Dict[str, Any] = {
            "datasetType": self.datasetType.to_simple(minimal=minimal),
            "dataId": self.dataId.to_simple(),
        }

        # Only include the id entry if it is defined.
        if self.id is not None:
            as_dict["run"] = self.run
            as_dict["id"] = self.id

        return SerializedDatasetRef(**as_dict)
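
    # Round-trip sketch (not part of the original source), assuming ``ref``
    # is an existing resolved DatasetRef and ``universe`` is its
    # DimensionUniverse:
    #
    #     simple = ref.to_simple()
    #     restored = DatasetRef.from_simple(simple, universe=universe)
    #     assert restored == ref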

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatasetRef,
        universe: Optional[DimensionUniverse] = None,
        registry: Optional[Registry] = None,
        datasetType: Optional[DatasetType] = None,
    ) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetRef`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple form of a DatasetRef to
            a full `DatasetRef`. Can be `None` if a full description of
            the type is provided along with a universe.
        datasetType : `DatasetType`, optional
            If supplied, this will be used as the datasetType object in the
            resulting DatasetRef instead of being read from the
            `SerializedDatasetRef`. This is useful when many refs share the
            same type, since memory can be saved. Defaults to `None`.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
        """
        # A minimalist component will just specify component and id and
        # require registry to reconstruct.
        if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}):
            if registry is None:
                raise ValueError("Registry is required to construct component DatasetRef from integer id")
            if simple.id is None:
                raise ValueError("For minimal DatasetRef the ID must be defined.")
            ref = registry.getDataset(simple.id)
            if ref is None:
                raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
            if simple.component:
                ref = ref.makeComponentRef(simple.component)
            return ref

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            universe = registry.dimensions

        if universe is None:
            # This is for mypy.
            raise ValueError("Unable to determine a usable universe")

        if simple.datasetType is None and datasetType is None:
            # mypy
            raise ValueError("The DatasetType must be specified to construct a DatasetRef")
        if datasetType is None:
            if simple.datasetType is None:
                raise ValueError("Cannot determine Dataset type of this serialized class")
            datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)

        if simple.dataId is None:
            # mypy
            raise ValueError("The DataId must be specified to construct a DatasetRef")
        dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)
        return cls(datasetType, dataId, id=simple.id, run=simple.run)

    to_json = to_json_pydantic
    from_json = classmethod(from_json_pydantic)
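
    # JSON sketch (not part of the original source), with the same
    # assumptions as the to_simple example above:
    #
    #     json_str = ref.to_json()
    #     restored = DatasetRef.from_json(json_str, universe=universe)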

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: Optional[DatasetId],
        run: Optional[str],
    ) -> DatasetRef:
        """Create new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def resolved(self, id: DatasetId, run: str) -> DatasetRef:
        """Return resolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `DatasetId`
            The unique identifier assigned when the dataset is created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, id=id, run=run, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return unresolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(
            datasetType=self.datasetType, dataId=dataId, id=self.id, run=self.run, conform=False
        )

    def isComponent(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a
        configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special-case the instrument Dimension, since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded to
        # assume it will return False if the type doesn't match the key type
        # of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) for n in names) + names

        return names
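
    # Sketch of the resulting priority order (not in the original source):
    # for a hypothetical dataset type "flat" with storage class "ExposureF"
    # and a data ID containing instrument="HSC", the instrument-qualified
    # keys (for "flat", then "ExposureF") come first, followed by the
    # unqualified keys in the same order.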

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result
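
    # Usage sketch (not part of the original source), assuming ``refs`` is an
    # existing iterable of DatasetRef:
    #
    #     for dataset_type, refs_of_type in DatasetRef.groupByType(refs).items():
    #         print(dataset_type.name, len(refs_of_type))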

    def getCheckedId(self) -> DatasetId:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists so that operations that would otherwise be
        natural list comprehensions can also check that the ID is not `None`.

        Returns
        -------
        id : `DatasetId`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; a resolved reference is required.")
        return self.id
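
    # Usage sketch (not part of the original source): collect IDs while
    # guaranteeing every ref is resolved:
    #
    #     ids = [ref.getCheckedId() for ref in refs]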

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeCompositeDatasetType(), self.dataId, id=self.id, run=self.run, conform=False
        )

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeComponentDatasetType(name),
            self.dataId,
            id=self.id,
            run=self.run,
            conform=False,
        )
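
    # Sketch (not part of the original source): moving between a composite
    # ref and one of its component refs; "wcs" is a hypothetical component
    # name here.
    #
    #     wcs_ref = ref.makeComponentRef("wcs")
    #     assert wcs_ref.isComponent()
    #     assert wcs_ref.makeCompositeRef().datasetType == ref.datasetType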

    def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetRef:
        """Create a new `DatasetRef` from this one, but with a modified
        `DatasetType` that has a different `StorageClass`.

        Parameters
        ----------
        storageClass : `str` or `StorageClass`
            The new storage class.

        Returns
        -------
        modified : `DatasetRef`
            A new dataset reference that is the same as the current one but
            with a different storage class in the `DatasetType`.
        """
        return DatasetRef(
            datasetType=self.datasetType.overrideStorageClass(storageClass),
            dataId=self.dataId,
            id=self.id,
            run=self.run,
            conform=False,
        )
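
    # Sketch (not part of the original source): request a different in-memory
    # representation by swapping the storage class; "ArrowAstropy" is a
    # hypothetical storage class name here.
    #
    #     table_ref = ref.overrideStorageClass("ArrowAstropy")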

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[DatasetId]
    """Primary key of the dataset (`DatasetId` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """