Coverage for python/lsst/daf/butler/core/datasets/ref.py: 31%

183 statements  

coverage.py v6.5.0, created at 2023-02-14 02:05 -0800

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetId", "DatasetRef", "SerializedDatasetRef"]

import uuid
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union

from lsst.utils.classes import immutable
from pydantic import BaseModel, ConstrainedInt, StrictStr, validator

from ..configSupport import LookupKey
from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate
from ..json import from_json_pydantic, to_json_pydantic
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType


if TYPE_CHECKING:
    from ...registry import Registry
    from ..storageClass import StorageClass


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class PositiveInt(ConstrainedInt):
    ge = 0
    strict = True


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    # DO NOT change order in the Union, pydantic is sensitive to that!
    id: Optional[Union[uuid.UUID, PositiveInt]] = None
    datasetType: Optional[SerializedDatasetType] = None
    dataId: Optional[SerializedDataCoordinate] = None
    run: Optional[StrictStr] = None
    component: Optional[StrictStr] = None

    @validator("dataId")
    def _check_dataId(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Can not specify 'dataId' without specifying 'datasetType'")
        return v

    @validator("run")
    def _check_run(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def _check_component(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
        return v

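    # Illustrative sketch of how the validators above combine (the inputs
    # here are hypothetical, and pydantic surfaces each ValueError wrapped
    # in a ValidationError):
    #
    #   SerializedDatasetRef()                    # fine: every field optional
    #   SerializedDatasetRef(run="some_run")      # rejected: 'run' without 'id'
    #   SerializedDatasetRef(dataId=a_data_id)    # rejected: 'dataId' without
    #                                             # 'datasetType'
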

    @classmethod
    def direct(
        cls,
        *,
        id: Optional[Union[str, int]] = None,
        datasetType: Optional[Dict[str, Any]] = None,
        dataId: Optional[Dict[str, Any]] = None,
        run: Optional[str] = None,
        component: Optional[str] = None,
    ) -> SerializedDatasetRef:
        """Construct a `SerializedDatasetRef` directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedDatasetRef.__new__(cls)
        setter = object.__setattr__
        setter(node, "id", uuid.UUID(id) if isinstance(id, str) else id)
        setter(
            node,
            "datasetType",
            datasetType if datasetType is None else SerializedDatasetType.direct(**datasetType),
        )
        setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
        setter(node, "run", run)
        setter(node, "component", component)
        setter(node, "__fields_set__", {"id", "datasetType", "dataId", "run", "component"})
        return node


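# Illustrative use of `direct` on trusted, pre-validated content (the values
# shown are hypothetical; real payloads come from `to_simple`/`to_json`):
#
#   sref = SerializedDatasetRef.direct(
#       id="6f4ee00f-1b66-4c98-b4a9-4e46b0a9f81f",
#       run="some_run",
#   )
#
# No validators run here, so inconsistent combinations are not caught.
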

DatasetId = Union[int, uuid.UUID]
"""A type-annotation alias for a dataset ID, which may be either an `int` or
a `uuid.UUID`.
"""



@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `DatasetId`, optional
        The unique identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created. Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
    """

    _serializedType = SerializedDatasetRef
    __slots__ = (
        "id",
        "datasetType",
        "dataId",
        "run",
    )

    def __init__(
        self,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        *,
        id: Optional[DatasetId] = None,
        run: Optional[str] = None,
        conform: bool = True,
    ):
        self.id = id
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            if run is None:
                raise ValueError(
                    f"Cannot provide id without run for dataset with id={id}, "
                    f"type={datasetType}, and dataId={dataId}."
                )
            self.run = run
        else:
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None

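    # Illustrative construction sketch (``pvi_type`` and ``data_id`` are
    # hypothetical, assumed to be a `DatasetType` and a matching
    # `DataCoordinate`):
    #
    #   unresolved = DatasetRef(pvi_type, data_id)
    #   resolved = DatasetRef(pvi_type, data_id, id=uuid.uuid4(), run="my_run")
    #   DatasetRef(pvi_type, data_id, run="my_run")     # ValueError: run without id
    #   DatasetRef(pvi_type, data_id, id=uuid.uuid4())  # ValueError: id without run
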

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

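    # Note that equality and hashing compare (datasetType, dataId, id) only;
    # ``run`` does not participate. An illustrative consequence, assuming a
    # hypothetical unresolved ``ref`` and dataset ID ``i``:
    #
    #   ref.resolved(id=i, run="a") == ref.resolved(id=i, run="b")  # -> True
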

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e., use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the
        # guidelines for __repr__ - is much harder for users to read, while
        # its __str__ just produces a dict that can also be passed to
        # DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s} [sc={self.datasetType.storageClass_name}]"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, then DatasetType name, then DataCoordinate.
        # The __str__ representation is probably close enough, but we need to
        # ensure that sorting a DatasetRef matches what you would get if you
        # sorted by DatasetType and DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; run takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in the priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)

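    # Illustrative consequence of the ordering above (hypothetical refs):
    # sorted output groups by run first, then dataset type, then data ID:
    #
    #   for ref in sorted(refs):
    #       print(ref.run, ref.datasetType.name, ref.dataId)
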

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this object to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to simplified form.
        """
        if minimal and self.id is not None:
            # The only thing needed to uniquely define a DatasetRef is its
            # id, so that can be used directly if it is resolved and if it
            # is not a component DatasetRef. Store it in a dict to allow us
            # to easily add the planned origin information later without
            # having to support an int and dict in simple form.
            simple: Dict[str, Any] = {"id": self.id}
            if self.isComponent():
                # We can still be a little minimalist with a component,
                # but we will also need to record the datasetType component.
                simple["component"] = self.datasetType.component()
            return SerializedDatasetRef(**simple)

        # Convert to a dict form.
        as_dict: Dict[str, Any] = {
            "datasetType": self.datasetType.to_simple(minimal=minimal),
            "dataId": self.dataId.to_simple(),
        }

        # Only include the id entry if it is defined.
        if self.id is not None:
            as_dict["run"] = self.run
            as_dict["id"] = self.id

        return SerializedDatasetRef(**as_dict)

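    # Illustrative round trips (hypothetical resolved ``ref`` and
    # ``registry``). The minimal form is just the id (plus component, if
    # any) and needs a Registry to reconstruct; the full form carries the
    # dataset type and data ID, so a DimensionUniverse suffices:
    #
    #   same = DatasetRef.from_simple(ref.to_simple(minimal=True), registry=registry)
    #   same = DatasetRef.from_simple(ref.to_simple(), universe=registry.dimensions)
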

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatasetRef,
        universe: Optional[DimensionUniverse] = None,
        registry: Optional[Registry] = None,
        datasetType: Optional[DatasetType] = None,
    ) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetRef`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert simple form of a DatasetRef to
            a full `DatasetRef`. Can be `None` if a full description of
            the type is provided along with a universe.
        datasetType : `DatasetType`, optional
            If supplied, this is used as the dataset type of the resulting
            `DatasetRef` instead of being read from the
            `SerializedDatasetRef`. This is useful when many refs share the
            same type, since memory can be saved. Defaults to `None`.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
        """
        # Minimalist component will just specify component and id and
        # require registry to reconstruct.
        if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}):
            if registry is None:
                raise ValueError("Registry is required to construct component DatasetRef from integer id")
            if simple.id is None:
                raise ValueError("For minimal DatasetRef the ID must be defined.")
            ref = registry.getDataset(simple.id)
            if ref is None:
                raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
            if simple.component:
                ref = ref.makeComponentRef(simple.component)
            return ref

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            universe = registry.dimensions

        if universe is None:
            # This is for mypy.
            raise ValueError("Unable to determine a usable universe")

        if simple.datasetType is None and datasetType is None:
            # mypy
            raise ValueError("The DatasetType must be specified to construct a DatasetRef")
        if datasetType is None:
            if simple.datasetType is None:
                raise ValueError("Cannot determine Dataset type of this serialized class")
            datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)

        if simple.dataId is None:
            # mypy
            raise ValueError("The DataId must be specified to construct a DatasetRef")
        dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)
        return cls(datasetType, dataId, id=simple.id, run=simple.run)

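    # Illustrative sketch of the memory-saving ``datasetType`` argument
    # described above (hypothetical ``serialized_refs``, all of one dataset
    # type):
    #
    #   shared_type = DatasetType.from_simple(serialized_refs[0].datasetType, universe=universe)
    #   refs = [
    #       DatasetRef.from_simple(s, universe=universe, datasetType=shared_type)
    #       for s in serialized_refs
    #   ]
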

    to_json = to_json_pydantic
    from_json = classmethod(from_json_pydantic)

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: Optional[DatasetId],
        run: Optional[str],
    ) -> DatasetRef:
        """Create new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def resolved(self, id: DatasetId, run: str) -> DatasetRef:
        """Return resolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `DatasetId`
            The unique identifier assigned when the dataset is created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, id=id, run=run, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return unresolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)

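    # Illustrative resolve/unresolve round trip (hypothetical ``ref``):
    #
    #   resolved = ref.unresolved().resolved(id=uuid.uuid4(), run="my_run")
    #   assert resolved.unresolved() == ref.unresolved()
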

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(
            datasetType=self.datasetType, dataId=dataId, id=self.id, run=self.run, conform=False
        )

    def isComponent(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Indicate whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a
        configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        if "instrument" in self.dataId:
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) for n in names) + names

        return names

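    # Illustrative priority order from the method above, for a hypothetical
    # "pvi" dataset type whose data ID contains instrument="HSC": all the
    # instrument-tagged keys come first, then the untagged ones, roughly
    #
    #   (pvi+HSC, <storage class>+HSC, pvi, <storage class>)
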

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result

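    # Illustrative usage (hypothetical iterable ``refs``):
    #
    #   for dataset_type, refs_of_type in DatasetRef.groupByType(refs).items():
    #       print(dataset_type.name, len(refs_of_type))
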

    def getCheckedId(self) -> DatasetId:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise
        be natural list comprehensions to also check that the ID is not
        `None`.

        Returns
        -------
        id : `DatasetId`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; a resolved reference is required.")
        return self.id

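    # Illustrative use in a comprehension (hypothetical ``refs``); any
    # unresolved ref raises AmbiguousDatasetError instead of silently
    # yielding `None`:
    #
    #   ids = [ref.getCheckedId() for ref in refs]
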

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeCompositeDatasetType(), self.dataId, id=self.id, run=self.run, conform=False
        )

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeComponentDatasetType(name),
            self.dataId,
            id=self.id,
            run=self.run,
            conform=False,
        )

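    # Illustrative component round trip (hypothetical composite ``ref``
    # with a "wcs" component):
    #
    #   wcs_ref = ref.makeComponentRef("wcs")
    #   assert wcs_ref.isComponent()
    #   assert wcs_ref.makeCompositeRef().datasetType == ref.datasetType
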

    def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetRef:
        """Create a new `DatasetRef` from this one, but with a modified
        `DatasetType` that has a different `StorageClass`.

        Parameters
        ----------
        storageClass : `str` or `StorageClass`
            The new storage class.

        Returns
        -------
        modified : `DatasetRef`
            A new dataset reference that is the same as the current one but
            with a different storage class in the `DatasetType`.
        """
        return DatasetRef(
            datasetType=self.datasetType.overrideStorageClass(storageClass),
            dataId=self.dataId,
            id=self.id,
            run=self.run,
            conform=False,
        )

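    # Illustrative sketch (hypothetical ``ref``; "ArrowAstropy" stands in
    # for any storage class the original one can be converted to):
    #
    #   table_ref = ref.overrideStorageClass("ArrowAstropy")
    #   assert table_ref.dataId == ref.dataId and table_ref.id == ref.id
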

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[DatasetId]
    """Primary key of the dataset (`DatasetId` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

617 """