Coverage for python/lsst/daf/butler/core/datasets/ref.py: 32% (176 statements)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetId", "DatasetRef", "SerializedDatasetRef"]

import uuid
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
    Union,
)

from pydantic import BaseModel, StrictStr, ConstrainedInt, validator

from lsst.utils.classes import immutable
from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate
from ..configSupport import LookupKey
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType
from ..json import from_json_pydantic, to_json_pydantic

if TYPE_CHECKING:
    from ...registry import Registry


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class PositiveInt(ConstrainedInt):
    ge = 0
    strict = True


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    # DO NOT change order in the Union, pydantic is sensitive to that!
    id: Optional[Union[uuid.UUID, PositiveInt]] = None
    datasetType: Optional[SerializedDatasetType] = None
    dataId: Optional[SerializedDataCoordinate] = None
    run: Optional[StrictStr] = None
    component: Optional[StrictStr] = None

    @validator("dataId")
    def _check_dataId(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Can not specify 'dataId' without specifying 'datasetType'")
        return v

    @validator("run")
    def _check_run(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def _check_component(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
        return v
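
    # Illustrative sketch (not part of the original module): the validators
    # above enforce inter-field consistency, so supplying ``run`` without
    # ``id`` is rejected at construction time::
    #
    #     >>> SerializedDatasetRef(run="some_run")
    #     Traceback (most recent call last):
    #     ...
    #     pydantic.ValidationError: ... 'run' cannot be provided unless 'id' is.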

    @classmethod
    def direct(cls, *, id: Optional[Union[str, int]] = None, datasetType: Optional[Dict[str, Any]] = None,
               dataId: Optional[Dict[str, Any]] = None, run: Optional[str] = None,
               component: Optional[str] = None
               ) -> SerializedDatasetRef:
        """Construct a `SerializedDatasetRef` directly without validators.

        This differs from the pydantic "construct" method in that the arguments
        are explicitly what the model requires, and it will recurse through
        members, constructing them from their corresponding `direct` methods.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedDatasetRef.__new__(cls)
        setter = object.__setattr__
        setter(node, 'id', uuid.UUID(id) if isinstance(id, str) else id)
        setter(node, 'datasetType',
               datasetType if datasetType is None else SerializedDatasetType.direct(**datasetType))
        setter(node, 'dataId', dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
        setter(node, 'run', run)
        setter(node, 'component', component)
        setter(node, '__fields_set__', {'id', 'datasetType', 'dataId', 'run', 'component'})
        return node
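
    # Hedged usage sketch (the field values here are invented for
    # illustration): ``direct`` skips validation, so it should only see data
    # that already round-tripped through ``to_simple``/``dict()``::
    #
    #     >>> sref = SerializedDatasetRef.direct(
    #     ...     id="aa8bd2d0-1111-2222-3333-444455556666", run="some_run")
    #     >>> sref.id  # doctest: +ELLIPSIS
    #     UUID('aa8bd2d0-...')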


DatasetId = Union[int, uuid.UUID]
"""A type-annotation alias for dataset ID which could be either integer or
UUID.
"""


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `DatasetId`, optional
        The unique identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created. Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
    """

    _serializedType = SerializedDatasetRef
    __slots__ = ("id", "datasetType", "dataId", "run",)

    def __init__(
        self,
        datasetType: DatasetType, dataId: DataCoordinate, *,
        id: Optional[DatasetId] = None,
        run: Optional[str] = None,
        conform: bool = True
    ):
        self.id = id
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
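
    # Hedged construction sketch (``flat_type`` and ``data_id`` stand in for
    # a real ``DatasetType`` and ``DataCoordinate``; they are not defined
    # here)::
    #
    #     >>> ref = DatasetRef(flat_type, data_id)                # unresolved
    #     >>> ref = DatasetRef(flat_type, data_id,
    #     ...                  id=uuid.uuid4(), run="some_run")   # resolved
    #     >>> DatasetRef(flat_type, data_id, run="some_run")      # ValueError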

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder for users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s} [sc={self.datasetType.storageClass.name}]"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, DatasetType name and then by DataCoordinate.
        # The __str__ representation is probably close enough but we
        # need to ensure that sorting a DatasetRef matches what you would
        # get if you sorted DatasetType+DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined, takes precedence over DatasetType
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in the priority order
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)
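
    # Hedged sorting sketch (``refs`` is assumed to be a list of resolved
    # ``DatasetRef`` instances): ordering groups by run first, then dataset
    # type, then data ID::
    #
    #     >>> for ref in sorted(refs):
    #     ...     print(ref.run, ref.datasetType.name, ref.dataId)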

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to simplified form.
        """
        if minimal and self.id is not None:
            # The only thing needed to uniquely define a DatasetRef
            # is its id so that can be used directly if it is
            # resolved and if it is not a component DatasetRef.
            # Store it in a dict to allow us to easily add the planned
            # origin information later without having to support
            # an int and dict in simple form.
            simple: Dict[str, Any] = {"id": self.id}
            if self.isComponent():
                # We can still be a little minimalist with a component
                # but we will also need to record the datasetType component
                simple["component"] = self.datasetType.component()
            return SerializedDatasetRef(**simple)

        # Convert to a dict form
        as_dict: Dict[str, Any] = {"datasetType": self.datasetType.to_simple(minimal=minimal),
                                   "dataId": self.dataId.to_simple(),
                                   }

        # Only include the id entry if it is defined
        if self.id is not None:
            as_dict["run"] = self.run
            as_dict["id"] = self.id

        return SerializedDatasetRef(**as_dict)
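
    # Hedged sketch of the two serialization shapes (``ref`` is assumed to be
    # a resolved, non-component ``DatasetRef``): minimal form keeps only the
    # id, while the full form carries the dataset type and data ID as well::
    #
    #     >>> ref.to_simple(minimal=True).dict(exclude_defaults=True)
    #     {'id': ...}
    #     >>> sorted(ref.to_simple().dict(exclude_none=True))
    #     ['dataId', 'datasetType', 'id', 'run']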

    @classmethod
    def from_simple(cls, simple: SerializedDatasetRef,
                    universe: Optional[DimensionUniverse] = None,
                    registry: Optional[Registry] = None,
                    datasetType: Optional[DatasetType] = None) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetRef`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert simple form of a DatasetRef to
            a full `DatasetRef`. Can be `None` if a full description of
            the type is provided along with a universe.
        datasetType : `DatasetType`, optional
            If datasetType is supplied, this will be used as the datasetType
            object in the resulting DatasetRef instead of being read from
            the `SerializedDatasetRef`. This is useful when many refs share
            the same type, since memory can be saved. Defaults to `None`.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
        """
        # Minimalist component will just specify component and id and
        # require registry to reconstruct
        if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}):
            if registry is None:
                raise ValueError("Registry is required to construct component DatasetRef from integer id")
            if simple.id is None:
                raise ValueError("For minimal DatasetRef the ID must be defined.")
            ref = registry.getDataset(simple.id)
            if ref is None:
                raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
            if simple.component:
                ref = ref.makeComponentRef(simple.component)
            return ref

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            universe = registry.dimensions

        if universe is None:
            # this is for mypy
            raise ValueError("Unable to determine a usable universe")

        if simple.datasetType is None and datasetType is None:
            # mypy
            raise ValueError("The DatasetType must be specified to construct a DatasetRef")
        if datasetType is None:
            if simple.datasetType is None:
                raise ValueError("Cannot determine Dataset type of this serialized class")
            datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)

        if simple.dataId is None:
            # mypy
            raise ValueError("The DataId must be specified to construct a DatasetRef")
        dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)
        return cls(datasetType, dataId,
                   id=simple.id, run=simple.run)

    to_json = to_json_pydantic
    from_json = classmethod(from_json_pydantic)
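
    # Hedged round-trip sketch (``ref`` is an existing ``DatasetRef`` and
    # ``universe`` its ``DimensionUniverse``): `to_json`/`from_json` wrap the
    # `to_simple`/`from_simple` pair with JSON (de)serialization::
    #
    #     >>> json_str = ref.to_json()
    #     >>> DatasetRef.from_json(json_str, universe=universe) == ref
    #     True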

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: Optional[DatasetId],
        run: Optional[str],
    ) -> DatasetRef:
        """Create new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))
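
    # Hedged sketch: because __reduce__ routes through _unpickle, a pickle
    # round trip preserves equality (``ref`` is any existing ``DatasetRef``)::
    #
    #     >>> import pickle
    #     >>> pickle.loads(pickle.dumps(ref)) == ref
    #     True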

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def resolved(self, id: DatasetId, run: str) -> DatasetRef:
        """Return resolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `DatasetId`
            The unique identifier assigned when the dataset is created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
                          id=id, run=run, conform=False)
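
    # Hedged sketch (``ref`` is an unresolved ``DatasetRef``): `resolved` and
    # `unresolved` are inverses with respect to dataset type and data ID::
    #
    #     >>> resolved = ref.resolved(id=uuid.uuid4(), run="some_run")
    #     >>> resolved.unresolved() == ref
    #     True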

    def unresolved(self) -> DatasetRef:
        """Return unresolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run,
                          conform=False)

    def isComponent(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded to
        # assume it will return False if the type doesn't match the key type
        # of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names
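
    # Hedged sketch (``ref`` has ``instrument`` in its data ID): the returned
    # keys start with instrument-qualified clones of the dataset-type keys,
    # followed by the unqualified keys themselves::
    #
    #     >>> base = ref.datasetType._lookupNames()
    #     >>> ref._lookupNames()[len(base):] == base
    #     True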

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result
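
    # Hedged sketch (``refs`` is an iterable of ``DatasetRef``): grouping
    # preserves insertion order within each dataset type::
    #
    #     >>> for datasetType, grouped in DatasetRef.groupByType(refs).items():
    #     ...     print(datasetType.name, len(grouped))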

    def getCheckedId(self) -> DatasetId:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise be
        natural list comprehensions to check that the ID is not `None` as well.

        Returns
        -------
        id : `DatasetId`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
                                        "a resolved reference is required.")
        return self.id
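
    # Hedged sketch: `getCheckedId` lets a comprehension assert resolution in
    # one pass (``refs`` is an iterable of ``DatasetRef``)::
    #
    #     >>> ids = [ref.getCheckedId() for ref in refs]
    #     >>> # raises AmbiguousDatasetError if any ref is unresolved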

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(self.datasetType.makeCompositeDatasetType(), self.dataId,
                          id=self.id, run=self.run, conform=False)

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(self.datasetType.makeComponentDatasetType(name), self.dataId,
                          id=self.id, run=self.run, conform=False)
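
    # Hedged round-trip sketch (``ref`` is a composite ``DatasetRef`` with a
    # component named, say, "wcs")::
    #
    #     >>> component = ref.makeComponentRef("wcs")
    #     >>> component.isComponent()
    #     True
    #     >>> component.makeCompositeRef() == ref
    #     True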

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[DatasetId]
    """Primary key of the dataset (`DatasetId` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """