# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetId", "DatasetRef", "SerializedDatasetRef"]

import uuid
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
    Union,
)

from pydantic import BaseModel, StrictStr, ConstrainedInt, validator

from lsst.utils.classes import immutable
from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate
from ..configSupport import LookupKey
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType
from ..json import from_json_pydantic, to_json_pydantic

if TYPE_CHECKING:
    from ...registry import Registry


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class PositiveInt(ConstrainedInt):
    ge = 0
    strict = True


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    # DO NOT change order in the Union, pydantic is sensitive to that!
    id: Optional[Union[uuid.UUID, PositiveInt]] = None
    datasetType: Optional[SerializedDatasetType] = None
    dataId: Optional[SerializedDataCoordinate] = None
    run: Optional[StrictStr] = None
    component: Optional[StrictStr] = None

    @validator("dataId")
    def _check_dataId(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Can not specify 'dataId' without specifying 'datasetType'")
        return v

    @validator("run")
    def _check_run(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def _check_component(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given.
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
        return v


DatasetId = Union[int, uuid.UUID]
"""A type-annotation alias for a dataset ID, which may be either an `int`
or a `uuid.UUID`.
"""


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `DatasetId`, optional
        The unique identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created. Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.
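
    Examples
    --------
    A minimal construction sketch; ``datasetType`` and ``universe`` are
    assumed to exist already, and the dimension names used here are
    illustrative only::

        dataId = DataCoordinate.standardize(
            instrument="DummyCam", detector=1, universe=universe
        )
        ref = DatasetRef(datasetType, dataId)  # unresolved
        ref = ref.resolved(id=uuid.uuid4(), run="my_run")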

130 """ 

131 

132 _serializedType = SerializedDatasetRef 

133 __slots__ = ("id", "datasetType", "dataId", "run",) 

134 

135 def __init__( 

136 self, 

137 datasetType: DatasetType, dataId: DataCoordinate, *, 

138 id: Optional[DatasetId] = None, 

139 run: Optional[str] = None, 

140 conform: bool = True 

141 ): 

142 self.id = id 

143 self.datasetType = datasetType 

144 if conform: 

145 self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions) 

146 else: 

147 self.dataId = dataId 

148 if self.id is not None: 

149 if run is None: 

150 raise ValueError(f"Cannot provide id without run for dataset with id={id}, " 

151 f"type={datasetType}, and dataId={dataId}.") 

152 self.run = run 

153 else: 

154 if run is not None: 

155 raise ValueError("'run' cannot be provided unless 'id' is.") 

156 self.run = None 

157 

158 def __eq__(self, other: Any) -> bool: 

159 try: 

160 return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id) 

161 except AttributeError: 

162 return NotImplemented 

163 

164 def __hash__(self) -> int: 

165 return hash((self.datasetType, self.dataId, self.id)) 

166 

167 @property 

168 def dimensions(self) -> DimensionGraph: 

169 """Dimensions associated with the underlying `DatasetType`.""" 

170 return self.datasetType.dimensions 

171 

172 def __repr__(self) -> str: 

173 # We delegate to __str__ (i.e use "!s") for the data ID) below because 

174 # DataCoordinate's __repr__ - while adhering to the guidelines for 

175 # __repr__ - is much harder to users to read, while its __str__ just 

176 # produces a dict that can also be passed to DatasetRef's constructor. 

177 if self.id is not None: 

178 return (f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})") 

179 else: 

180 return f"DatasetRef({self.datasetType!r}, {self.dataId!s})" 

181 

182 def __str__(self) -> str: 

183 s = f"{self.datasetType.name}@{self.dataId!s}, sc={self.datasetType.storageClass.name}]" 

184 if self.id is not None: 

185 s += f" (id={self.id})" 

186 return s 

187 

188 def __lt__(self, other: Any) -> bool: 

189 # Sort by run, DatasetType name and then by DataCoordinate 

190 # The __str__ representation is probably close enough but we 

191 # need to ensure that sorting a DatasetRef matches what you would 

192 # get if you sorted DatasetType+DataCoordinate 

193 if not isinstance(other, type(self)): 

194 return NotImplemented 

195 

196 # Group by run if defined, takes precedence over DatasetType 

197 self_run = "" if self.run is None else self.run 

198 other_run = "" if other.run is None else other.run 

199 

200 # Compare tuples in the priority order 

201 return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId) 

202 

203 def to_simple(self, minimal: bool = False) -> SerializedDatasetRef: 

204 """Convert this class to a simple python type. 

205 

206 This makes it suitable for serialization. 

207 

208 Parameters 

209 ---------- 

210 minimal : `bool`, optional 

211 Use minimal serialization. Requires Registry to convert 

212 back to a full type. 

213 

214 Returns 

215 ------- 

216 simple : `dict` or `int` 

217 The object converted to a dictionary. 
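
        Examples
        --------
        A round-trip sketch; ``universe`` is assumed to be the dimension
        universe that ``ref`` was created with::

            simple = ref.to_simple()
            same_ref = DatasetRef.from_simple(simple, universe=universe)
            assert same_ref == ref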

218 """ 

219 if minimal and self.id is not None: 

220 # The only thing needed to uniquely define a DatasetRef 

221 # is its id so that can be used directly if it is 

222 # resolved and if it is not a component DatasetRef. 

223 # Store is in a dict to allow us to easily add the planned 

224 # origin information later without having to support 

225 # an int and dict in simple form. 

226 simple: Dict[str, Any] = {"id": self.id} 

227 if self.isComponent(): 

228 # We can still be a little minimalist with a component 

229 # but we will also need to record the datasetType component 

230 simple["component"] = self.datasetType.component() 

231 return SerializedDatasetRef(**simple) 

232 

233 # Convert to a dict form 

234 as_dict: Dict[str, Any] = {"datasetType": self.datasetType.to_simple(minimal=minimal), 

235 "dataId": self.dataId.to_simple(), 

236 } 

237 

238 # Only include the id entry if it is defined 

239 if self.id is not None: 

240 as_dict["run"] = self.run 

241 as_dict["id"] = self.id 

242 

243 return SerializedDatasetRef(**as_dict) 

244 

245 @classmethod 

246 def from_simple(cls, simple: SerializedDatasetRef, 

247 universe: Optional[DimensionUniverse] = None, 

248 registry: Optional[Registry] = None) -> DatasetRef: 

249 """Construct a new object from simplified form. 

250 

251 Generally this is data returned from the `to_simple` method. 

252 

253 Parameters 

254 ---------- 

255 simple : `dict` of [`str`, `Any`] 

256 The value returned by `to_simple()`. 

257 universe : `DimensionUniverse` 

258 The special graph of all known dimensions. 

259 Can be `None` if a registry is provided. 

260 registry : `lsst.daf.butler.Registry`, optional 

261 Registry to use to convert simple form of a DatasetRef to 

262 a full `DatasetRef`. Can be `None` if a full description of 

263 the type is provided along with a universe. 

264 

265 Returns 

266 ------- 

267 ref : `DatasetRef` 

268 Newly-constructed object. 
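
        Examples
        --------
        A sketch of reconstructing a resolved ``ref`` from its minimal
        (id-only) form, which requires a `Registry` that knows the dataset
        (``registry`` is assumed to exist)::

            minimal = ref.to_simple(minimal=True)
            same_ref = DatasetRef.from_simple(minimal, registry=registry)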

269 """ 

270 # Minimalist component will just specify component and id and 

271 # require registry to reconstruct 

272 if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}): 

273 if registry is None: 

274 raise ValueError("Registry is required to construct component DatasetRef from integer id") 

275 if simple.id is None: 

276 raise ValueError("For minimal DatasetRef the ID must be defined.") 

277 ref = registry.getDataset(simple.id) 

278 if ref is None: 

279 raise RuntimeError(f"No matching dataset found in registry for id {simple.id}") 

280 if simple.component: 

281 ref = ref.makeComponentRef(simple.component) 

282 return ref 

283 

284 if universe is None and registry is None: 

285 raise ValueError("One of universe or registry must be provided.") 

286 

287 if universe is None and registry is not None: 

288 universe = registry.dimensions 

289 

290 if universe is None: 

291 # this is for mypy 

292 raise ValueError("Unable to determine a usable universe") 

293 

294 if simple.datasetType is None: 

295 # mypy 

296 raise ValueError("The DatasetType must be specified to construct a DatasetRef") 

297 datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry) 

298 

299 if simple.dataId is None: 

300 # mypy 

301 raise ValueError("The DataId must be specified to construct a DatasetRef") 

302 dataId = DataCoordinate.from_simple(simple.dataId, universe=universe) 

303 return cls(datasetType, dataId, 

304 id=simple.id, run=simple.run) 

305 

306 to_json = to_json_pydantic 

307 from_json = classmethod(from_json_pydantic) 
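
    # Usage sketch (illustrative only): JSON round-trip, assuming ``ref``
    # is a DatasetRef and ``universe`` is its dimension universe:
    #
    #     json_str = ref.to_json()
    #     same_ref = DatasetRef.from_json(json_str, universe=universe)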

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: Optional[DatasetId],
        run: Optional[str],
    ) -> DatasetRef:
        """Create new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def resolved(self, id: DatasetId, run: str) -> DatasetRef:
        """Return resolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `DatasetId`
            The unique identifier assigned when the dataset is created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
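
        Examples
        --------
        A sketch; the run name and the random UUID are illustrative only::

            resolved_ref = ref.resolved(id=uuid.uuid4(), run="my_run")
            assert resolved_ref.unresolved() == ref.unresolved()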

349 """ 

350 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, 

351 id=id, run=run, conform=False) 

352 

353 def unresolved(self) -> DatasetRef: 

354 """Return unresolved `DatasetRef`. 

355 

356 This is a new `DatasetRef` with the same data ID and dataset type, 

357 but no ID or run. 

358 

359 Returns 

360 ------- 

361 ref : `DatasetRef` 

362 A new `DatasetRef`. 

363 

364 Notes 

365 ----- 

366 This can be used to compare only the data ID and dataset type of a 

367 pair of `DatasetRef` instances, regardless of whether either is 

368 resolved:: 

369 

370 if ref1.unresolved() == ref2.unresolved(): 

371 ... 

372 """ 

373 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False) 

374 

375 def expanded(self, dataId: DataCoordinate) -> DatasetRef: 

376 """Return a new `DatasetRef` with the given expanded data ID. 

377 

378 Parameters 

379 ---------- 

380 dataId : `DataCoordinate` 

381 Data ID for the new `DatasetRef`. Must compare equal to the 

382 original data ID. 

383 

384 Returns 

385 ------- 

386 ref : `DatasetRef` 

387 A new `DatasetRef` with the given data ID. 

388 """ 

389 assert dataId == self.dataId 

390 return DatasetRef(datasetType=self.datasetType, dataId=dataId, 

391 id=self.id, run=self.run, 

392 conform=False) 

393 

394 def isComponent(self) -> bool: 

395 """Indicate whether this `DatasetRef` refers to a component. 

396 

397 Returns 

398 ------- 

399 isComponent : `bool` 

400 `True` if this `DatasetRef` is a component, `False` otherwise. 

401 """ 

402 return self.datasetType.isComponent() 

403 

404 def isComposite(self) -> bool: 

405 """Boolean indicating whether this `DatasetRef` is a composite type. 

406 

407 Returns 

408 ------- 

409 isComposite : `bool` 

410 `True` if this `DatasetRef` is a composite type, `False` 

411 otherwise. 

412 """ 

413 return self.datasetType.isComposite() 

414 

415 def _lookupNames(self) -> Tuple[LookupKey, ...]: 

416 """Name keys to use when looking up this DatasetRef in a configuration. 

417 

418 The names are returned in order of priority. 

419 

420 Returns 

421 ------- 

422 names : `tuple` of `LookupKey` 

423 Tuple of the `DatasetType` name and the `StorageClass` name. 

424 If ``instrument`` is defined in the dataId, each of those names 

425 is added to the start of the tuple with a key derived from the 

426 value of ``instrument``. 

427 """ 

428 # Special case the instrument Dimension since we allow configs 

429 # to include the instrument name in the hierarchy. 

430 names: Tuple[LookupKey, ...] = self.datasetType._lookupNames() 

431 

432 # mypy doesn't think this could return True, because even though 

433 # __contains__ can take an object of any type, it seems hard-coded to 

434 # assume it will return False if the type doesn't match the key type 

435 # of the Mapping. 

436 if "instrument" in self.dataId: # type: ignore 

437 names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) 

438 for n in names) + names 

439 

440 return names 

441 

442 @staticmethod 

443 def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

444 """Group an iterable of `DatasetRef` by `DatasetType`. 

445 

446 Parameters 

447 ---------- 

448 refs : `Iterable` [ `DatasetRef` ] 

449 `DatasetRef` instances to group. 

450 

451 Returns 

452 ------- 

453 grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ] 

454 Grouped `DatasetRef` instances. 
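
        Examples
        --------
        Iterating over the groups (``refs`` may be any iterable of
        `DatasetRef`)::

            for datasetType, refsOfType in DatasetRef.groupByType(refs).items():
                print(datasetType.name, len(refsOfType))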

455 """ 

456 result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict() 

457 for ref in refs: 

458 result.setdefault(ref.datasetType, []).append(ref) 

459 return result 

460 

461 def getCheckedId(self) -> DatasetId: 

462 """Return ``self.id``, or raise if it is `None`. 

463 

464 This trivial method exists to allow operations that would otherwise be 

465 natural list comprehensions to check that the ID is not `None` as well. 

466 

467 Returns 

468 ------- 

469 id : `DatasetId` 

470 ``self.id`` if it is not `None`. 

471 

472 Raises 

473 ------ 

474 AmbiguousDatasetError 

475 Raised if ``ref.id`` is `None`. 
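
        Examples
        --------
        The intended list-comprehension use; raises `AmbiguousDatasetError`
        if any ref in ``refs`` is unresolved::

            ids = [ref.getCheckedId() for ref in refs]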

476 """ 

477 if self.id is None: 

478 raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; " 

479 f"a resolved reference is required.") 

480 return self.id 

481 

482 def makeCompositeRef(self) -> DatasetRef: 

483 """Create a `DatasetRef` of the composite from a component ref. 

484 

485 Requires that this `DatasetRef` is a component. 

486 

487 Returns 

488 ------- 

489 ref : `DatasetRef` 

490 A `DatasetRef` with a dataset type that corresponds to the 

491 composite parent of this component, and the same ID and run 

492 (which may be `None`, if they are `None` in ``self``). 

493 """ 

494 # Assume that the data ID does not need to be standardized 

495 # and should match whatever this ref already has. 

496 return DatasetRef(self.datasetType.makeCompositeDatasetType(), self.dataId, 

497 id=self.id, run=self.run, conform=False) 

498 

499 def makeComponentRef(self, name: str) -> DatasetRef: 

500 """Create a `DatasetRef` that corresponds to a component. 

501 

502 Parameters 

503 ---------- 

504 name : `str` 

505 Name of the component. 

506 

507 Returns 

508 ------- 

509 ref : `DatasetRef` 

510 A `DatasetRef` with a dataset type that corresponds to the given 

511 component, and the same ID and run 

512 (which may be `None`, if they are `None` in ``self``). 
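
        Examples
        --------
        A sketch using a hypothetical ``wcs`` component (valid component
        names depend on the dataset type's storage class)::

            wcs_ref = ref.makeComponentRef("wcs")
            assert wcs_ref.isComponent()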

513 """ 

514 # Assume that the data ID does not need to be standardized 

515 # and should match whatever this ref already has. 

516 return DatasetRef(self.datasetType.makeComponentDatasetType(name), self.dataId, 

517 id=self.id, run=self.run, conform=False) 

518 

519 datasetType: DatasetType 

520 """The definition of this dataset (`DatasetType`). 

521 

522 Cannot be changed after a `DatasetRef` is constructed. 

523 """ 

524 

525 dataId: DataCoordinate 

526 """A mapping of `Dimension` primary key values that labels the dataset 

527 within a Collection (`DataCoordinate`). 

528 

529 Cannot be changed after a `DatasetRef` is constructed. 

530 """ 

531 

532 run: Optional[str] 

533 """The name of the run that produced the dataset. 

534 

535 Cannot be changed after a `DatasetRef` is constructed; use `resolved` or 

536 `unresolved` to add or remove this information when creating a new 

537 `DatasetRef`. 

538 """ 

539 

540 id: Optional[DatasetId] 

541 """Primary key of the dataset (`DatasetId` or `None`). 

542 

543 Cannot be changed after a `DatasetRef` is constructed; use `resolved` or 

544 `unresolved` to add or remove this information when creating a new 

545 `DatasetRef`. 

546 """