Coverage for python/lsst/daf/butler/core/datasets/ref.py: 33%

180 statements

coverage.py v6.4.4, created at 2022-09-30 02:19 -0700

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetId", "DatasetRef", "SerializedDatasetRef"]

import uuid
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union

from lsst.utils.classes import immutable
from pydantic import BaseModel, ConstrainedInt, StrictStr, validator

from ..configSupport import LookupKey
from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate
from ..json import from_json_pydantic, to_json_pydantic
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType

if TYPE_CHECKING:
    from ...registry import Registry


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class PositiveInt(ConstrainedInt):
    ge = 0
    strict = True


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    # DO NOT change the order in the Union; pydantic is sensitive to it!
    id: Optional[Union[uuid.UUID, PositiveInt]] = None
    datasetType: Optional[SerializedDatasetType] = None
    dataId: Optional[SerializedDataCoordinate] = None
    run: Optional[StrictStr] = None
    component: Optional[StrictStr] = None

    @validator("dataId")
    def _check_dataId(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Can not specify 'dataId' without specifying 'datasetType'")
        return v

    @validator("run")
    def _check_run(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def _check_component(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given.
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
        return v

    @classmethod
    def direct(
        cls,
        *,
        id: Optional[Union[str, int]] = None,
        datasetType: Optional[Dict[str, Any]] = None,
        dataId: Optional[Dict[str, Any]] = None,
        run: Optional[str] = None,
        component: Optional[str] = None,
    ) -> SerializedDatasetRef:
        """Construct a `SerializedDatasetRef` directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedDatasetRef.__new__(cls)
        setter = object.__setattr__
        setter(node, "id", uuid.UUID(id) if isinstance(id, str) else id)
        setter(
            node,
            "datasetType",
            datasetType if datasetType is None else SerializedDatasetType.direct(**datasetType),
        )
        setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
        setter(node, "run", run)
        setter(node, "component", component)
        setter(node, "__fields_set__", {"id", "datasetType", "dataId", "run", "component"})
        return node
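
    # A minimal usage sketch, assuming trusted inputs (the UUID string and
    # run name below are illustrative, not values from this package):
    #
    #     sref = SerializedDatasetRef.direct(
    #         id="9a1b2c3d-0000-4000-8000-000000000000",
    #         run="hypothetical_run",
    #     )
    #
    # Because the validators are skipped, callers must guarantee the same
    # invariants they enforce (e.g., ``run`` only alongside ``id``).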


DatasetId = Union[int, uuid.UUID]
"""A type-annotation alias for a dataset ID, which may be either an `int` or
a `uuid.UUID`.
"""


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that currently does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `DatasetId`, optional
        The unique identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created. Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
    """

    _serializedType = SerializedDatasetRef
    __slots__ = (
        "id",
        "datasetType",
        "dataId",
        "run",
    )

    def __init__(
        self,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        *,
        id: Optional[DatasetId] = None,
        run: Optional[str] = None,
        conform: bool = True,
    ):
        self.id = id
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            if run is None:
                raise ValueError(
                    f"Cannot provide id without run for dataset with id={id}, "
                    f"type={datasetType}, and dataId={dataId}."
                )
            self.run = run
        else:
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
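
    # Construction sketch (``datasetType`` and ``dataId`` stand in for real
    # `DatasetType` and `DataCoordinate` instances; the run name is a
    # hypothetical example):
    #
    #     unresolved = DatasetRef(datasetType, dataId)
    #     resolved = DatasetRef(datasetType, dataId, id=uuid.uuid4(), run="my_run")
    #
    # Providing ``id`` without ``run``, or ``run`` without ``id``, raises
    # ValueError.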

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e., use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the guidelines
        # for __repr__ - is much harder for users to read, while its __str__
        # just produces a dict that can also be passed to DatasetRef's
        # constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s} [sc={self.datasetType.storageClass_name}]"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, then DatasetType name, then DataCoordinate.
        # The __str__ representation is probably close enough, but we need to
        # ensure that sorting a DatasetRef matches what you would get if you
        # sorted DatasetType + DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; it takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)
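
    # Ordering sketch: because run is compared first (with None mapped to
    # ""), unresolved refs sort ahead of resolved ones:
    #
    #     ordered = sorted(refs)  # ``refs`` is a hypothetical iterable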

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to simplified form.
        """
        if minimal and self.id is not None:
            # The only thing needed to uniquely define a DatasetRef is its id,
            # so that can be used directly if it is resolved and is not a
            # component DatasetRef. Store it in a dict to allow us to easily
            # add the planned origin information later without having to
            # support both an int and a dict in simple form.
            simple: Dict[str, Any] = {"id": self.id}
            if self.isComponent():
                # We can still be a little minimalist with a component, but we
                # will also need to record the datasetType component.
                simple["component"] = self.datasetType.component()
            return SerializedDatasetRef(**simple)

        # Convert to a dict form.
        as_dict: Dict[str, Any] = {
            "datasetType": self.datasetType.to_simple(minimal=minimal),
            "dataId": self.dataId.to_simple(),
        }

        # Only include the id entry if it is defined.
        if self.id is not None:
            as_dict["run"] = self.run
            as_dict["id"] = self.id

        return SerializedDatasetRef(**as_dict)

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatasetRef,
        universe: Optional[DimensionUniverse] = None,
        registry: Optional[Registry] = None,
        datasetType: Optional[DatasetType] = None,
    ) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetRef`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple form of a DatasetRef to
            a full `DatasetRef`. Can be `None` if a full description of
            the type is provided along with a universe.
        datasetType : `DatasetType`, optional
            If supplied, this will be used as the datasetType object in the
            resulting DatasetRef instead of being read from the
            `SerializedDatasetRef`. This is useful when many refs share the
            same type, as memory can be saved. Defaults to `None`.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
        """
        # A minimalist component will just specify component and id and
        # require registry to reconstruct.
        if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}):
            if registry is None:
                raise ValueError("Registry is required to construct component DatasetRef from integer id")
            if simple.id is None:
                raise ValueError("For minimal DatasetRef the ID must be defined.")
            ref = registry.getDataset(simple.id)
            if ref is None:
                raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
            if simple.component:
                ref = ref.makeComponentRef(simple.component)
            return ref

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            universe = registry.dimensions

        if universe is None:
            # This is for mypy.
            raise ValueError("Unable to determine a usable universe")

        if simple.datasetType is None and datasetType is None:
            # mypy
            raise ValueError("The DatasetType must be specified to construct a DatasetRef")
        if datasetType is None:
            if simple.datasetType is None:
                raise ValueError("Cannot determine Dataset type of this serialized class")
            datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)

        if simple.dataId is None:
            # mypy
            raise ValueError("The DataId must be specified to construct a DatasetRef")
        dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)
        return cls(datasetType, dataId, id=simple.id, run=simple.run)

    to_json = to_json_pydantic
    from_json = classmethod(from_json_pydantic)
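
    # Serialization round-trip sketch (``ref`` and ``universe`` are assumed
    # to already exist; the minimal form would need a registry instead):
    #
    #     simple = ref.to_simple()
    #     assert DatasetRef.from_simple(simple, universe=universe) == ref
    #
    # to_json/from_json wrap the same conversion in a JSON string.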

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: Optional[DatasetId],
        run: Optional[str],
    ) -> DatasetRef:
        """Create a new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self
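
    # Pickling sketch (``ref`` is an assumed existing instance):
    #
    #     import pickle
    #     assert pickle.loads(pickle.dumps(ref)) == ref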

    def resolved(self, id: DatasetId, run: str) -> DatasetRef:
        """Return a resolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `DatasetId`
            The unique identifier assigned when the dataset is created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, id=id, run=run, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return an unresolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(
            datasetType=self.datasetType, dataId=dataId, id=self.id, run=self.run, conform=False
        )

    def isComponent(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Indicate whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a
        configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special-case the instrument Dimension, since we allow configs to
        # include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded to
        # assume it will return False if the type doesn't match the key type
        # of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) for n in names) + names

        return names

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result
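
    # Usage sketch (``refs`` is a hypothetical iterable of DatasetRef, and
    # ``process`` a hypothetical callable):
    #
    #     for dataset_type, type_refs in DatasetRef.groupByType(refs).items():
    #         process(dataset_type, type_refs)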

    def getCheckedId(self) -> DatasetId:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise
        be natural list comprehensions to check that the ID is not `None` as
        well.

        Returns
        -------
        id : `DatasetId`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; a resolved reference is required.")
        return self.id
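
    # Sketch of the list-comprehension use case from the docstring (``refs``
    # is a hypothetical iterable; raises AmbiguousDatasetError on any
    # unresolved ref):
    #
    #     ids = [ref.getCheckedId() for ref in refs]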

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeCompositeDatasetType(), self.dataId, id=self.id, run=self.run, conform=False
        )

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeComponentDatasetType(name),
            self.dataId,
            id=self.id,
            run=self.run,
            conform=False,
        )
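
    # Component round-trip sketch ("wcs" is a hypothetical component name;
    # ``ref`` is an assumed composite ref):
    #
    #     component_ref = ref.makeComponentRef("wcs")
    #     assert component_ref.isComponent()
    #     assert component_ref.makeCompositeRef().datasetType == ref.datasetType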

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[DatasetId]
    """Primary key of the dataset (`DatasetId` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """