Coverage for python/lsst/daf/butler/core/datasets/ref.py: 33% (189 statements)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = [
    "AmbiguousDatasetError",
    "DatasetId",
    "DatasetIdFactory",
    "DatasetIdGenEnum",
    "DatasetRef",
    "SerializedDatasetRef",
]

import enum
import uuid
from typing import TYPE_CHECKING, Any, ClassVar, Dict, Iterable, List, Optional, Tuple

from lsst.utils.classes import immutable
from pydantic import BaseModel, ConstrainedInt, StrictStr, validator

from ..configSupport import LookupKey
from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate
from ..json import from_json_pydantic, to_json_pydantic
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType

if TYPE_CHECKING:
    from ...registry import Registry
    from ..storageClass import StorageClass


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class PositiveInt(ConstrainedInt):
    ge = 0
    strict = True


class DatasetIdGenEnum(enum.Enum):
    """This enum is used to specify dataset ID generation options."""

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of dataset type and data ID.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of dataset type, data ID, and run collection name.
    """


class DatasetIdFactory:
    """Factory for dataset IDs (UUIDs).

    For now the logic is hard-coded and is controlled by the user-provided
    value of `DatasetIdGenEnum`. In the future we may implement configurable
    logic that can guess the `DatasetIdGenEnum` value from other parameters.
    """

    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    """Namespace UUID used for UUID5 generation. Do not change. This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
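
    A consistency check, added here as an illustrative doctest that simply
    restates the provenance claim above:

    >>> import uuid
    >>> uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org") == DatasetIdFactory.NS_UUID
    True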

93 """ 

94 

95 def makeDatasetId( 

96 self, 

97 run: str, 

98 datasetType: DatasetType, 

99 dataId: DataCoordinate, 

100 idGenerationMode: DatasetIdGenEnum, 

101 ) -> uuid.UUID: 

102 """Generate dataset ID for a dataset. 

103 

104 Parameters 

105 ---------- 

106 run : `str` 

107 Name of the RUN collection for the dataset. 

108 datasetType : `DatasetType` 

109 Dataset type. 

110 dataId : `DataCoordinate` 

111 Expanded data ID for the dataset. 

112 idGenerationMode : `DatasetIdGenEnum` 

113 ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random 

114 UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a 

115 deterministic UUID5-type ID based on a dataset type name and 

116 ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a 

117 deterministic UUID5-type ID based on a dataset type name, run 

118 collection name, and ``dataId``. 

119 

120 Returns 

121 ------- 

122 datasetId : `uuid.UUID` 

123 Dataset identifier. 
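
        Examples
        --------
        A minimal sketch, not from the original source, assuming
        ``datasetType`` and ``dataId`` are a pre-existing `DatasetType`
        and expanded `DataCoordinate`:

        >>> factory = DatasetIdFactory()
        >>> a = factory.makeDatasetId(
        ...     "run1", datasetType, dataId, DatasetIdGenEnum.DATAID_TYPE_RUN
        ... )
        >>> b = factory.makeDatasetId(
        ...     "run1", datasetType, dataId, DatasetIdGenEnum.DATAID_TYPE_RUN
        ... )
        >>> a == b  # deterministic mode: same inputs give the same UUID5
        True
        >>> a.version
        5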

124 """ 

125 if idGenerationMode is DatasetIdGenEnum.UNIQUE: 

126 return uuid.uuid4() 

127 else: 

128 # WARNING: If you modify this code make sure that the order of 

129 # items in the `items` list below never changes. 

130 items: list[tuple[str, str]] = [] 

131 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE: 

132 items = [ 

133 ("dataset_type", datasetType.name), 

134 ] 

135 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN: 

136 items = [ 

137 ("dataset_type", datasetType.name), 

138 ("run", run), 

139 ] 

140 else: 

141 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}") 

142 

143 for name, value in sorted(dataId.byName().items()): 

144 items.append((name, str(value))) 

145 data = ",".join(f"{key}={value}" for key, value in items) 

146 return uuid.uuid5(self.NS_UUID, data) 

147 

148 

149class SerializedDatasetRef(BaseModel): 

150 """Simplified model of a `DatasetRef` suitable for serialization.""" 

151 

152 id: uuid.UUID 

153 datasetType: Optional[SerializedDatasetType] = None 

154 dataId: Optional[SerializedDataCoordinate] = None 

155 run: Optional[StrictStr] = None 

156 component: Optional[StrictStr] = None 

157 

158 @validator("dataId") 

159 def _check_dataId(cls, v: Any, values: Dict[str, Any]) -> Any: # noqa: N805 

160 if (d := "datasetType") in values and values[d] is None: 

161 raise ValueError("Can not specify 'dataId' without specifying 'datasetType'") 

162 return v 

163 

164 @validator("run") 

165 def _check_run(cls, v: Any, values: Dict[str, Any]) -> Any: # noqa: N805 

166 if v and (i := "id") in values and values[i] is None: 

167 raise ValueError("'run' cannot be provided unless 'id' is.") 

168 return v 

169 

170 @validator("component") 

171 def _check_component(cls, v: Any, values: Dict[str, Any]) -> Any: # noqa: N805 

172 # Component should not be given if datasetType is given 

173 if v and (d := "datasetType") in values and values[d] is not None: 

174 raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).") 

175 return v 

176 

177 @classmethod 

178 def direct( 

179 cls, 

180 *, 

181 id: str, 

182 run: str, 

183 datasetType: Optional[Dict[str, Any]] = None, 

184 dataId: Optional[Dict[str, Any]] = None, 

185 component: Optional[str] = None, 

186 ) -> SerializedDatasetRef: 

187 """Construct a `SerializedDatasetRef` directly without validators. 

188 

189 Notes 

190 ----- 

191 This differs from the pydantic "construct" method in that the arguments 

192 are explicitly what the model requires, and it will recurse through 

193 members, constructing them from their corresponding `direct` methods. 

194 

195 The ``id`` parameter is a string representation of dataset ID, it is 

196 converted to UUID by this method. 

197 

198 This method should only be called when the inputs are trusted. 
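
        Examples
        --------
        An illustrative sketch, not from the original source; the UUID
        string below is a made-up placeholder:

        >>> ref = SerializedDatasetRef.direct(
        ...     id="11111111-2222-3333-4444-555555555555",
        ...     run="a_run",
        ... )
        >>> ref.run
        'a_run'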

199 """ 

200 node = SerializedDatasetRef.__new__(cls) 

201 setter = object.__setattr__ 

202 setter(node, "id", uuid.UUID(id)) 

203 setter( 

204 node, 

205 "datasetType", 

206 datasetType if datasetType is None else SerializedDatasetType.direct(**datasetType), 

207 ) 

208 setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId)) 

209 setter(node, "run", run) 

210 setter(node, "component", component) 

211 setter(node, "__fields_set__", {"id", "datasetType", "dataId", "run", "component"}) 

212 return node 

213 

214 

215DatasetId = uuid.UUID 

216"""A type-annotation alias for dataset ID providing typing flexibility. 

217""" 

218 

219 

220@immutable 

221class DatasetRef: 

222 """Reference to a Dataset in a `Registry`. 

223 

224 A `DatasetRef` may point to a Dataset that currently does not yet exist 

225 (e.g., because it is a predicted input for provenance). 

226 

227 Parameters 

228 ---------- 

229 datasetType : `DatasetType` 

230 The `DatasetType` for this Dataset. 

231 dataId : `DataCoordinate` 

232 A mapping of dimensions that labels the Dataset within a Collection. 

233 run : `str` 

234 The name of the run this dataset was associated with when it was 

235 created. 

236 id : `DatasetId`, optional 

237 The unique identifier assigned when the dataset is created. If ``id`` 

238 is not specified, a new unique ID will be created. 

239 conform : `bool`, optional 

240 If `True` (default), call `DataCoordinate.standardize` to ensure that 

241 the data ID's dimensions are consistent with the dataset type's. 

242 `DatasetRef` instances for which those dimensions are not equal should 

243 not be created in new code, but are still supported for backwards 

244 compatibility. New code should only pass `False` if it can guarantee 

245 that the dimensions are already consistent. 

246 id_generation_mode : `DatasetIdGenEnum` 

247 ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random 

248 UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a 

249 deterministic UUID5-type ID based on a dataset type name and 

250 ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a 

251 deterministic UUID5-type ID based on a dataset type name, run 

252 collection name, and ``dataId``. 

253 

254 Raises 

255 ------ 

256 ValueError 

257 Raised if ``run`` is provided but ``id`` is not, or if ``id`` is 

258 provided but ``run`` is not. 

259 

260 See Also 

261 -------- 

262 :ref:`daf_butler_organizing_datasets` 
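
    Examples
    --------
    A hedged sketch, not from the original source, assuming ``datasetType``
    and ``dataId`` are a pre-existing `DatasetType` and `DataCoordinate`:

    >>> ref = DatasetRef(datasetType, dataId, run="my_run")
    >>> ref.run
    'my_run'
    >>> isinstance(ref.id, uuid.UUID)  # an ID is generated when not supplied
    True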

263 """ 

264 

265 _serializedType = SerializedDatasetRef 

266 __slots__ = ( 

267 "id", 

268 "datasetType", 

269 "dataId", 

270 "run", 

271 ) 

272 

273 def __init__( 

274 self, 

275 datasetType: DatasetType, 

276 dataId: DataCoordinate, 

277 run: str, 

278 *, 

279 id: Optional[DatasetId] = None, 

280 conform: bool = True, 

281 id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

282 ): 

283 self.datasetType = datasetType 

284 if conform: 

285 self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions) 

286 else: 

287 self.dataId = dataId 

288 self.run = run 

289 if id is not None: 

290 self.id = id 

291 else: 

292 self.id = DatasetIdFactory().makeDatasetId( 

293 self.run, self.datasetType, self.dataId, id_generation_mode 

294 ) 

295 

296 def __eq__(self, other: Any) -> bool: 

297 try: 

298 return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id) 

299 except AttributeError: 

300 return NotImplemented 

301 

302 def __hash__(self) -> int: 

303 return hash((self.datasetType, self.dataId, self.id)) 

304 

305 @property 

306 def dimensions(self) -> DimensionGraph: 

307 """Dimensions associated with the underlying `DatasetType`.""" 

308 return self.datasetType.dimensions 

309 

310 def __repr__(self) -> str: 

311 # We delegate to __str__ (i.e use "!s") for the data ID) below because 

312 # DataCoordinate's __repr__ - while adhering to the guidelines for 

313 # __repr__ - is much harder to users to read, while its __str__ just 

314 # produces a dict that can also be passed to DatasetRef's constructor. 

315 return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, run={self.run!r}, id={self.id})" 

316 

317 def __str__(self) -> str: 

318 s = ( 

319 f"{self.datasetType.name}@{self.dataId!s} [sc={self.datasetType.storageClass_name}]" 

320 f" (run={self.run} id={self.id})" 

321 ) 

322 return s 

323 

324 def __lt__(self, other: Any) -> bool: 

325 # Sort by run, DatasetType name and then by DataCoordinate 

326 # The __str__ representation is probably close enough but we 

327 # need to ensure that sorting a DatasetRef matches what you would 

328 # get if you sorted DatasetType+DataCoordinate 

329 if not isinstance(other, type(self)): 

330 return NotImplemented 

331 

332 # Group by run if defined, takes precedence over DatasetType 

333 self_run = "" if self.run is None else self.run 

334 other_run = "" if other.run is None else other.run 

335 

336 # Compare tuples in the priority order 

337 return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId) 

338 

339 def to_simple(self, minimal: bool = False) -> SerializedDatasetRef: 

340 """Convert this class to a simple python type. 

341 

342 This makes it suitable for serialization. 

343 

344 Parameters 

345 ---------- 

346 minimal : `bool`, optional 

347 Use minimal serialization. Requires Registry to convert 

348 back to a full type. 

349 

350 Returns 

351 ------- 

352 simple : `dict` or `int` 

353 The object converted to a dictionary. 
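
        Examples
        --------
        A hedged sketch, not from the original source, assuming ``ref`` is
        an existing `DatasetRef`; the minimal form keeps only the ID (and
        the component name, for component refs):

        >>> simple = ref.to_simple(minimal=True)
        >>> simple.id == ref.id
        True
        >>> simple.datasetType is None
        True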

354 """ 

355 if minimal: 

356 # The only thing needed to uniquely define a DatasetRef is its id 

357 # so that can be used directly if it is not a component DatasetRef. 

358 # Store is in a dict to allow us to easily add the planned origin 

359 # information later without having to support an int and dict in 

360 # simple form. 

361 simple: Dict[str, Any] = {"id": self.id} 

362 if self.isComponent(): 

363 # We can still be a little minimalist with a component 

364 # but we will also need to record the datasetType component 

365 simple["component"] = self.datasetType.component() 

366 return SerializedDatasetRef(**simple) 

367 

368 return SerializedDatasetRef( 

369 datasetType=self.datasetType.to_simple(minimal=minimal), 

370 dataId=self.dataId.to_simple(), 

371 run=self.run, 

372 id=self.id, 

373 ) 

374 

375 @classmethod 

376 def from_simple( 

377 cls, 

378 simple: SerializedDatasetRef, 

379 universe: Optional[DimensionUniverse] = None, 

380 registry: Optional[Registry] = None, 

381 datasetType: Optional[DatasetType] = None, 

382 ) -> DatasetRef: 

383 """Construct a new object from simplified form. 

384 

385 Generally this is data returned from the `to_simple` method. 

386 

387 Parameters 

388 ---------- 

389 simple : `dict` of [`str`, `Any`] 

390 The value returned by `to_simple()`. 

391 universe : `DimensionUniverse` 

392 The special graph of all known dimensions. 

393 Can be `None` if a registry is provided. 

394 registry : `lsst.daf.butler.Registry`, optional 

395 Registry to use to convert simple form of a DatasetRef to 

396 a full `DatasetRef`. Can be `None` if a full description of 

397 the type is provided along with a universe. 

398 datasetType : DatasetType, optional 

399 If datasetType is supplied, this will be used as the datasetType 

400 object in the resulting DatasetRef instead of being read from 

401 the `SerializedDatasetRef`. This is useful when many refs share 

402 the same type as memory can be saved. Defaults to None. 

403 

404 Returns 

405 ------- 

406 ref : `DatasetRef` 

407 Newly-constructed object. 
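
        Examples
        --------
        A hedged round-trip sketch, not from the original source, assuming
        ``ref`` is an existing `DatasetRef` and ``universe`` is its
        `DimensionUniverse`:

        >>> simple = ref.to_simple()
        >>> DatasetRef.from_simple(simple, universe=universe) == ref
        True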

408 """ 

409 # Minimalist component will just specify component and id and 

410 # require registry to reconstruct 

411 if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}): 

412 if registry is None: 

413 raise ValueError("Registry is required to construct component DatasetRef from integer id") 

414 if simple.id is None: 

415 raise ValueError("For minimal DatasetRef the ID must be defined.") 

416 ref = registry.getDataset(simple.id) 

417 if ref is None: 

418 raise RuntimeError(f"No matching dataset found in registry for id {simple.id}") 

419 if simple.component: 

420 ref = ref.makeComponentRef(simple.component) 

421 return ref 

422 

423 if universe is None and registry is None: 

424 raise ValueError("One of universe or registry must be provided.") 

425 

426 if universe is None and registry is not None: 

427 universe = registry.dimensions 

428 

429 if universe is None: 

430 # this is for mypy 

431 raise ValueError("Unable to determine a usable universe") 

432 

433 if simple.datasetType is None and datasetType is None: 

434 # mypy 

435 raise ValueError("The DatasetType must be specified to construct a DatasetRef") 

436 if datasetType is None: 

437 if simple.datasetType is None: 

438 raise ValueError("Cannot determine Dataset type of this serialized class") 

439 datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry) 

440 

441 if simple.dataId is None: 

442 # mypy 

443 raise ValueError("The DataId must be specified to construct a DatasetRef") 

444 dataId = DataCoordinate.from_simple(simple.dataId, universe=universe) 

445 

446 # Check that simple ref is resolved. 

447 if simple.run is None: 

448 dstr = "" 

449 if simple.datasetType is None: 

450 dstr = f" (datasetType={datasetType.name!r})" 

451 raise ValueError( 

452 "Run collection name is missing from serialized representation. " 

453 f"Encountered with {simple!r}{dstr}." 

454 ) 

455 

456 return cls(datasetType, dataId, id=simple.id, run=simple.run) 

457 

458 to_json = to_json_pydantic 

459 from_json: ClassVar = classmethod(from_json_pydantic) 

460 

461 @classmethod 

462 def _unpickle( 

463 cls, 

464 datasetType: DatasetType, 

465 dataId: DataCoordinate, 

466 id: DatasetId, 

467 run: str, 

468 ) -> DatasetRef: 

469 """Create new `DatasetRef`. 

470 

471 A custom factory method for use by `__reduce__` as a workaround for 

472 its lack of support for keyword arguments. 

473 """ 

474 return cls(datasetType, dataId, id=id, run=run) 

475 

476 def __reduce__(self) -> tuple: 

477 return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run)) 

478 

479 def __deepcopy__(self, memo: dict) -> DatasetRef: 

480 # DatasetRef is recursively immutable; see note in @immutable 

481 # decorator. 

482 return self 

483 

484 def expanded(self, dataId: DataCoordinate) -> DatasetRef: 

485 """Return a new `DatasetRef` with the given expanded data ID. 

486 

487 Parameters 

488 ---------- 

489 dataId : `DataCoordinate` 

490 Data ID for the new `DatasetRef`. Must compare equal to the 

491 original data ID. 

492 

493 Returns 

494 ------- 

495 ref : `DatasetRef` 

496 A new `DatasetRef` with the given data ID. 

497 """ 

498 assert dataId == self.dataId 

499 return DatasetRef( 

500 datasetType=self.datasetType, dataId=dataId, id=self.id, run=self.run, conform=False 

501 ) 

502 

503 def isComponent(self) -> bool: 

504 """Indicate whether this `DatasetRef` refers to a component. 

505 

506 Returns 

507 ------- 

508 isComponent : `bool` 

509 `True` if this `DatasetRef` is a component, `False` otherwise. 

510 """ 

511 return self.datasetType.isComponent() 

512 

513 def isComposite(self) -> bool: 

514 """Boolean indicating whether this `DatasetRef` is a composite type. 

515 

516 Returns 

517 ------- 

518 isComposite : `bool` 

519 `True` if this `DatasetRef` is a composite type, `False` 

520 otherwise. 

521 """ 

522 return self.datasetType.isComposite() 

523 

524 def _lookupNames(self) -> Tuple[LookupKey, ...]: 

525 """Name keys to use when looking up this DatasetRef in a configuration. 

526 

527 The names are returned in order of priority. 

528 

529 Returns 

530 ------- 

531 names : `tuple` of `LookupKey` 

532 Tuple of the `DatasetType` name and the `StorageClass` name. 

533 If ``instrument`` is defined in the dataId, each of those names 

534 is added to the start of the tuple with a key derived from the 

535 value of ``instrument``. 

536 """ 

537 # Special case the instrument Dimension since we allow configs 

538 # to include the instrument name in the hierarchy. 

539 names: Tuple[LookupKey, ...] = self.datasetType._lookupNames() 

540 

541 if "instrument" in self.dataId: 

542 names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) for n in names) + names 

543 

544 return names 

545 

546 @staticmethod 

547 def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

548 """Group an iterable of `DatasetRef` by `DatasetType`. 

549 

550 Parameters 

551 ---------- 

552 refs : `Iterable` [ `DatasetRef` ] 

553 `DatasetRef` instances to group. 

554 

555 Returns 

556 ------- 

557 grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ] 

558 Grouped `DatasetRef` instances. 
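
        Examples
        --------
        A hedged sketch, not from the original source, assuming ``refs`` is
        an iterable of `DatasetRef`; every ref lands under its own type:

        >>> grouped = DatasetRef.groupByType(refs)
        >>> all(ref.datasetType == datasetType
        ...     for datasetType, refsOfType in grouped.items()
        ...     for ref in refsOfType)
        True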

559 """ 

560 result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict() 

561 for ref in refs: 

562 result.setdefault(ref.datasetType, []).append(ref) 

563 return result 

564 

565 def makeCompositeRef(self) -> DatasetRef: 

566 """Create a `DatasetRef` of the composite from a component ref. 

567 

568 Requires that this `DatasetRef` is a component. 

569 

570 Returns 

571 ------- 

572 ref : `DatasetRef` 

573 A `DatasetRef` with a dataset type that corresponds to the 

574 composite parent of this component, and the same ID and run 

575 (which may be `None`, if they are `None` in ``self``). 

576 """ 

577 # Assume that the data ID does not need to be standardized 

578 # and should match whatever this ref already has. 

579 return DatasetRef( 

580 self.datasetType.makeCompositeDatasetType(), self.dataId, id=self.id, run=self.run, conform=False 

581 ) 

582 

583 def makeComponentRef(self, name: str) -> DatasetRef: 

584 """Create a `DatasetRef` that corresponds to a component. 

585 

586 Parameters 

587 ---------- 

588 name : `str` 

589 Name of the component. 

590 

591 Returns 

592 ------- 

593 ref : `DatasetRef` 

594 A `DatasetRef` with a dataset type that corresponds to the given 

595 component, and the same ID and run 

596 (which may be `None`, if they are `None` in ``self``). 
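
        Examples
        --------
        A hedged sketch, not from the original source, assuming ``ref`` is
        a composite `DatasetRef` whose storage class defines a ``wcs``
        component (the component name here is hypothetical):

        >>> wcs_ref = ref.makeComponentRef("wcs")
        >>> wcs_ref.isComponent()
        True
        >>> wcs_ref.datasetType.component()
        'wcs'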

597 """ 

598 # Assume that the data ID does not need to be standardized 

599 # and should match whatever this ref already has. 

600 return DatasetRef( 

601 self.datasetType.makeComponentDatasetType(name), 

602 self.dataId, 

603 id=self.id, 

604 run=self.run, 

605 conform=False, 

606 ) 

607 

608 def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetRef: 

609 """Create a new `DatasetRef` from this one, but with a modified 

610 `DatasetType` that has a different `StorageClass`. 

611 

612 Parameters 

613 ---------- 

614 storageClass : `str` or `StorageClass` 

615 The new storage class. 

616 

617 Returns 

618 ------- 

619 modified : `DatasetRef` 

620 A new dataset reference that is the same as the current one but 

621 with a different storage class in the `DatasetType`. 
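
        Examples
        --------
        A hedged sketch, not from the original source, assuming ``ref`` is
        an existing `DatasetRef` and ``"NewStorageClass"`` names a storage
        class known to the repository (the name is hypothetical):

        >>> modified = ref.overrideStorageClass("NewStorageClass")
        >>> modified.id == ref.id  # identity and data ID are preserved
        True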

622 """ 

623 return DatasetRef( 

624 datasetType=self.datasetType.overrideStorageClass(storageClass), 

625 dataId=self.dataId, 

626 id=self.id, 

627 run=self.run, 

628 conform=False, 

629 ) 

630 

631 datasetType: DatasetType 

632 """The definition of this dataset (`DatasetType`). 

633 

634 Cannot be changed after a `DatasetRef` is constructed. 

635 """ 

636 

637 dataId: DataCoordinate 

638 """A mapping of `Dimension` primary key values that labels the dataset 

639 within a Collection (`DataCoordinate`). 

640 

641 Cannot be changed after a `DatasetRef` is constructed. 

642 """ 

643 

644 run: str 

645 """The name of the run that produced the dataset. 

646 

647 Cannot be changed after a `DatasetRef` is constructed. 

648 """ 

649 

650 id: DatasetId 

651 """Primary key of the dataset (`DatasetId`). 

652 

653 Cannot be changed after a `DatasetRef` is constructed. 

654 """