Coverage for python/lsst/daf/butler/core/datasets/ref.py: 31%


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetId", "DatasetRef", "SerializedDatasetRef"]

import uuid
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
    Union,
)

from pydantic import BaseModel, StrictStr, ConstrainedInt, validator

from lsst.utils.classes import immutable
from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate
from ..configSupport import LookupKey
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType
from ..json import from_json_pydantic, to_json_pydantic

if TYPE_CHECKING:
    from ...registry import Registry


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class PositiveInt(ConstrainedInt):
    ge = 0
    strict = True


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    # DO NOT change order in the Union, pydantic is sensitive to that!
    id: Optional[Union[uuid.UUID, PositiveInt]] = None
    datasetType: Optional[SerializedDatasetType] = None
    dataId: Optional[SerializedDataCoordinate] = None
    run: Optional[StrictStr] = None
    component: Optional[StrictStr] = None

    @validator("dataId")
    def _check_dataId(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Can not specify 'dataId' without specifying 'datasetType'")
        return v

    @validator("run")
    def _check_run(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def _check_component(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
        return v
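

# A minimal round-trip sketch for SerializedDatasetRef (hedged; the field
# values below are illustrative only, not taken from any real repository):
#
#     sdr = SerializedDatasetRef(id=uuid.uuid4(), run="demo_run")
#     assert SerializedDatasetRef.parse_raw(sdr.json()) == sdr
#
# The validators above enforce field interdependence: providing ``run``
# without ``id``, or ``dataId`` without ``datasetType``, raises
# ``pydantic.ValidationError``.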

DatasetId = Union[int, uuid.UUID]
"""A type-annotation alias for dataset ID, which may be either an `int` or a
`uuid.UUID`.
"""


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `DatasetId`, optional
        The unique identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created.  Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility.  New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
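
    Examples
    --------
    A hedged construction sketch; ``datasetType`` and ``dataId`` are assumed
    to have been obtained from an existing `Registry`::

        ref = DatasetRef(datasetType, dataId)
        resolved_ref = ref.resolved(id=uuid.uuid4(), run="ingest/run")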

134 """ 

135 

136 _serializedType = SerializedDatasetRef 

137 __slots__ = ("id", "datasetType", "dataId", "run",) 

138 

139 def __init__( 

140 self, 

141 datasetType: DatasetType, dataId: DataCoordinate, *, 

142 id: Optional[DatasetId] = None, 

143 run: Optional[str] = None, 

144 conform: bool = True 

145 ): 

146 self.id = id 

147 self.datasetType = datasetType 

148 if conform: 

149 self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions) 

150 else: 

151 self.dataId = dataId 

152 if self.id is not None: 

153 if run is None: 

154 raise ValueError(f"Cannot provide id without run for dataset with id={id}, " 

155 f"type={datasetType}, and dataId={dataId}.") 

156 self.run = run 

157 else: 

158 if run is not None: 

159 raise ValueError("'run' cannot be provided unless 'id' is.") 

160 self.run = None 

161 

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder for users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}, sc={self.datasetType.storageClass.name}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, DatasetType name, and then by DataCoordinate.
        # The __str__ representation is probably close enough, but we
        # need to ensure that sorting a DatasetRef matches what you would
        # get if you sorted DatasetType+DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; this takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in the priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization.  Requires a `Registry` to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to simplified form.
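
        Notes
        -----
        A hedged sketch of the two output forms (the comments are
        illustrative, not exhaustive)::

            ref.to_simple()              # full SerializedDatasetRef
            ref.to_simple(minimal=True)  # id (plus component, if any) only,
                                         # provided ``ref`` is resolved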

222 """ 

223 if minimal and self.id is not None: 

224 # The only thing needed to uniquely define a DatasetRef 

225 # is its id so that can be used directly if it is 

226 # resolved and if it is not a component DatasetRef. 

227 # Store is in a dict to allow us to easily add the planned 

228 # origin information later without having to support 

229 # an int and dict in simple form. 

230 simple: Dict[str, Any] = {"id": self.id} 

231 if self.isComponent(): 

232 # We can still be a little minimalist with a component 

233 # but we will also need to record the datasetType component 

234 simple["component"] = self.datasetType.component() 

235 return SerializedDatasetRef(**simple) 

236 

237 # Convert to a dict form 

238 as_dict: Dict[str, Any] = {"datasetType": self.datasetType.to_simple(minimal=minimal), 

239 "dataId": self.dataId.to_simple(), 

240 } 

241 

242 # Only include the id entry if it is defined 

243 if self.id is not None: 

244 as_dict["run"] = self.run 

245 as_dict["id"] = self.id 

246 

247 return SerializedDatasetRef(**as_dict) 

248 

249 @classmethod 

250 def from_simple(cls, simple: SerializedDatasetRef, 

251 universe: Optional[DimensionUniverse] = None, 

252 registry: Optional[Registry] = None) -> DatasetRef: 

253 """Construct a new object from simplified form. 

254 

255 Generally this is data returned from the `to_simple` method. 

256 

257 Parameters 

258 ---------- 

259 simple : `dict` of [`str`, `Any`] 

260 The value returned by `to_simple()`. 

261 universe : `DimensionUniverse` 

262 The special graph of all known dimensions. 

263 Can be `None` if a registry is provided. 

264 registry : `lsst.daf.butler.Registry`, optional 

265 Registry to use to convert simple form of a DatasetRef to 

266 a full `DatasetRef`. Can be `None` if a full description of 

267 the type is provided along with a universe. 

268 

269 Returns 

270 ------- 

271 ref : `DatasetRef` 

272 Newly-constructed object. 
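
        Notes
        -----
        Round-trip sketch (hedged; assumes a `Registry` instance named
        ``registry`` is available and ``ref`` was serialized in full form)::

            simple = ref.to_simple()
            same = DatasetRef.from_simple(simple, universe=registry.dimensions)
            assert same == ref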

273 """ 

274 # Minimalist component will just specify component and id and 

275 # require registry to reconstruct 

276 if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}): 

277 if registry is None: 

278 raise ValueError("Registry is required to construct component DatasetRef from integer id") 

279 if simple.id is None: 

280 raise ValueError("For minimal DatasetRef the ID must be defined.") 

281 ref = registry.getDataset(simple.id) 

282 if ref is None: 

283 raise RuntimeError(f"No matching dataset found in registry for id {simple.id}") 

284 if simple.component: 

285 ref = ref.makeComponentRef(simple.component) 

286 return ref 

287 

288 if universe is None and registry is None: 

289 raise ValueError("One of universe or registry must be provided.") 

290 

291 if universe is None and registry is not None: 

292 universe = registry.dimensions 

293 

294 if universe is None: 

295 # this is for mypy 

296 raise ValueError("Unable to determine a usable universe") 

297 

298 if simple.datasetType is None: 

299 # mypy 

300 raise ValueError("The DatasetType must be specified to construct a DatasetRef") 

301 datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry) 

302 

303 if simple.dataId is None: 

304 # mypy 

305 raise ValueError("The DataId must be specified to construct a DatasetRef") 

306 dataId = DataCoordinate.from_simple(simple.dataId, universe=universe) 

307 return cls(datasetType, dataId, 

308 id=simple.id, run=simple.run) 

309 

310 to_json = to_json_pydantic 

311 from_json = classmethod(from_json_pydantic) 

312 

313 @classmethod 

314 def _unpickle( 

315 cls, 

316 datasetType: DatasetType, 

317 dataId: DataCoordinate, 

318 id: Optional[DatasetId], 

319 run: Optional[str], 

320 ) -> DatasetRef: 

321 """Create new `DatasetRef`. 

322 

323 A custom factory method for use by `__reduce__` as a workaround for 

324 its lack of support for keyword arguments. 

325 """ 

326 return cls(datasetType, dataId, id=id, run=run) 

327 

328 def __reduce__(self) -> tuple: 

329 return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run)) 

330 

331 def __deepcopy__(self, memo: dict) -> DatasetRef: 

332 # DatasetRef is recursively immutable; see note in @immutable 

333 # decorator. 

334 return self 

335 

336 def resolved(self, id: DatasetId, run: str) -> DatasetRef: 

337 """Return resolved `DatasetRef`. 

338 

339 This is a new `DatasetRef` with the same data ID and dataset type 

340 and the given ID and run. 

341 

342 Parameters 

343 ---------- 

344 id : `DatasetId` 

345 The unique identifier assigned when the dataset is created. 

346 run : `str` 

347 The run this dataset was associated with when it was created. 

348 

349 Returns 

350 ------- 

351 ref : `DatasetRef` 

352 A new `DatasetRef`. 
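
        Notes
        -----
        `resolved` and `unresolved` are inverses in the following hedged
        sense, for an unresolved ``ref`` (the id value is illustrative)::

            ref2 = ref.resolved(id=uuid.uuid4(), run="some_run")
            assert ref2.unresolved() == ref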

353 """ 

354 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, 

355 id=id, run=run, conform=False) 

356 

357 def unresolved(self) -> DatasetRef: 

358 """Return unresolved `DatasetRef`. 

359 

360 This is a new `DatasetRef` with the same data ID and dataset type, 

361 but no ID or run. 

362 

363 Returns 

364 ------- 

365 ref : `DatasetRef` 

366 A new `DatasetRef`. 

367 

368 Notes 

369 ----- 

370 This can be used to compare only the data ID and dataset type of a 

371 pair of `DatasetRef` instances, regardless of whether either is 

372 resolved:: 

373 

374 if ref1.unresolved() == ref2.unresolved(): 

375 ... 

376 """ 

377 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False) 

378 

379 def expanded(self, dataId: DataCoordinate) -> DatasetRef: 

380 """Return a new `DatasetRef` with the given expanded data ID. 

381 

382 Parameters 

383 ---------- 

384 dataId : `DataCoordinate` 

385 Data ID for the new `DatasetRef`. Must compare equal to the 

386 original data ID. 

387 

388 Returns 

389 ------- 

390 ref : `DatasetRef` 

391 A new `DatasetRef` with the given data ID. 

392 """ 

393 assert dataId == self.dataId 

394 return DatasetRef(datasetType=self.datasetType, dataId=dataId, 

395 id=self.id, run=self.run, 

396 conform=False) 

397 

398 def isComponent(self) -> bool: 

399 """Indicate whether this `DatasetRef` refers to a component. 

400 

401 Returns 

402 ------- 

403 isComponent : `bool` 

404 `True` if this `DatasetRef` is a component, `False` otherwise. 

405 """ 

406 return self.datasetType.isComponent() 

407 

408 def isComposite(self) -> bool: 

409 """Boolean indicating whether this `DatasetRef` is a composite type. 

410 

411 Returns 

412 ------- 

413 isComposite : `bool` 

414 `True` if this `DatasetRef` is a composite type, `False` 

415 otherwise. 

416 """ 

417 return self.datasetType.isComposite() 

418 

419 def _lookupNames(self) -> Tuple[LookupKey, ...]: 

420 """Name keys to use when looking up this DatasetRef in a configuration. 

421 

422 The names are returned in order of priority. 

423 

424 Returns 

425 ------- 

426 names : `tuple` of `LookupKey` 

427 Tuple of the `DatasetType` name and the `StorageClass` name. 

428 If ``instrument`` is defined in the dataId, each of those names 

429 is added to the start of the tuple with a key derived from the 

430 value of ``instrument``. 

431 """ 

432 # Special case the instrument Dimension since we allow configs 

433 # to include the instrument name in the hierarchy. 

434 names: Tuple[LookupKey, ...] = self.datasetType._lookupNames() 

435 

436 # mypy doesn't think this could return True, because even though 

437 # __contains__ can take an object of any type, it seems hard-coded to 

438 # assume it will return False if the type doesn't match the key type 

439 # of the Mapping. 

440 if "instrument" in self.dataId: # type: ignore 

441 names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) 

442 for n in names) + names 

443 

444 return names 

445 

446 @staticmethod 

447 def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

448 """Group an iterable of `DatasetRef` by `DatasetType`. 

449 

450 Parameters 

451 ---------- 

452 refs : `Iterable` [ `DatasetRef` ] 

453 `DatasetRef` instances to group. 

454 

455 Returns 

456 ------- 

457 grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ] 

458 Grouped `DatasetRef` instances. 
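
        Examples
        --------
        A hedged sketch (``refs`` is assumed to be an iterable mixing more
        than one dataset type)::

            for datasetType, group in DatasetRef.groupByType(refs).items():
                print(datasetType.name, len(group))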

459 """ 

460 result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict() 

461 for ref in refs: 

462 result.setdefault(ref.datasetType, []).append(ref) 

463 return result 

464 

465 def getCheckedId(self) -> DatasetId: 

466 """Return ``self.id``, or raise if it is `None`. 

467 

468 This trivial method exists to allow operations that would otherwise be 

469 natural list comprehensions to check that the ID is not `None` as well. 

470 

471 Returns 

472 ------- 

473 id : `DatasetId` 

474 ``self.id`` if it is not `None`. 

475 

476 Raises 

477 ------ 

478 AmbiguousDatasetError 

479 Raised if ``ref.id`` is `None`. 
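
        Notes
        -----
        Typical comprehension usage (a sketch; every ref in ``refs`` is
        assumed to be resolved)::

            ids = [ref.getCheckedId() for ref in refs]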

480 """ 

481 if self.id is None: 

482 raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; " 

483 f"a resolved reference is required.") 

484 return self.id 

485 

486 def makeCompositeRef(self) -> DatasetRef: 

487 """Create a `DatasetRef` of the composite from a component ref. 

488 

489 Requires that this `DatasetRef` is a component. 

490 

491 Returns 

492 ------- 

493 ref : `DatasetRef` 

494 A `DatasetRef` with a dataset type that corresponds to the 

495 composite parent of this component, and the same ID and run 

496 (which may be `None`, if they are `None` in ``self``). 

497 """ 

498 # Assume that the data ID does not need to be standardized 

499 # and should match whatever this ref already has. 

500 return DatasetRef(self.datasetType.makeCompositeDatasetType(), self.dataId, 

501 id=self.id, run=self.run, conform=False) 

502 

503 def makeComponentRef(self, name: str) -> DatasetRef: 

504 """Create a `DatasetRef` that corresponds to a component. 

505 

506 Parameters 

507 ---------- 

508 name : `str` 

509 Name of the component. 

510 

511 Returns 

512 ------- 

513 ref : `DatasetRef` 

514 A `DatasetRef` with a dataset type that corresponds to the given 

515 component, and the same ID and run 

516 (which may be `None`, if they are `None` in ``self``). 
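
        Notes
        -----
        Component and composite refs are expected to round-trip in the
        following hedged sense (the component name ``"wcs"`` is
        illustrative)::

            wcs_ref = ref.makeComponentRef("wcs")
            assert wcs_ref.makeCompositeRef() == ref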

517 """ 

518 # Assume that the data ID does not need to be standardized 

519 # and should match whatever this ref already has. 

520 return DatasetRef(self.datasetType.makeComponentDatasetType(name), self.dataId, 

521 id=self.id, run=self.run, conform=False) 

522 

523 datasetType: DatasetType 

524 """The definition of this dataset (`DatasetType`). 

525 

526 Cannot be changed after a `DatasetRef` is constructed. 

527 """ 

528 

529 dataId: DataCoordinate 

530 """A mapping of `Dimension` primary key values that labels the dataset 

531 within a Collection (`DataCoordinate`). 

532 

533 Cannot be changed after a `DatasetRef` is constructed. 

534 """ 

535 

536 run: Optional[str] 

537 """The name of the run that produced the dataset. 

538 

539 Cannot be changed after a `DatasetRef` is constructed; use `resolved` or 

540 `unresolved` to add or remove this information when creating a new 

541 `DatasetRef`. 

542 """ 

543 

544 id: Optional[DatasetId] 

545 """Primary key of the dataset (`DatasetId` or `None`). 

546 

547 Cannot be changed after a `DatasetRef` is constructed; use `resolved` or 

548 `unresolved` to add or remove this information when creating a new 

549 `DatasetRef`. 

550 """