Coverage for python/lsst/daf/butler/_dataset_ref.py: 32%

242 statements  

coverage.py v7.3.2, created at 2023-12-01 11:00 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = [ 

30 "AmbiguousDatasetError", 

31 "DatasetDatastoreRecords", 

32 "DatasetId", 

33 "DatasetIdFactory", 

34 "DatasetIdGenEnum", 

35 "DatasetRef", 

36 "SerializedDatasetRef", 

37] 

38 

39import enum 

40import sys 

41import uuid 

42from collections.abc import Iterable, Mapping 

43from typing import TYPE_CHECKING, Any, ClassVar, Literal, Protocol, TypeAlias, runtime_checkable 

44 

45import pydantic 

46from lsst.daf.butler._compat import PYDANTIC_V2, _BaseModelCompat 

47from lsst.utils.classes import immutable 

48from pydantic import StrictStr 

49 

50from ._config_support import LookupKey 

51from ._dataset_type import DatasetType, SerializedDatasetType 

52from ._named import NamedKeyDict 

53from .datastore.stored_file_info import StoredDatastoreItemInfo 

54from .dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate 

55from .json import from_json_pydantic, to_json_pydantic 

56from .persistence_context import PersistenceContextVars 

57 

58if TYPE_CHECKING: 

59 from ._storage_class import StorageClass 

60 from .registry import Registry 

61 

62# Per-dataset records grouped by opaque table name; usually there is just one

63# opaque table.

64DatasetDatastoreRecords: TypeAlias = Mapping[str, Iterable[StoredDatastoreItemInfo]] 

65 

66 

67class AmbiguousDatasetError(Exception): 

68 """Raised when a `DatasetRef` is not resolved but should be. 

69 

70 This happens when the `DatasetRef` has no ID or run but the requested 

71 operation requires one of them. 

72 """ 

73 

74 

75@runtime_checkable 

76class _DatasetRefGroupedIterable(Protocol): 

77 """A package-private interface for iterables of `DatasetRef` that know how 

78 to efficiently group their contents by `DatasetType`. 

79 

80 """ 

81 

82 def _iter_by_dataset_type(self) -> Iterable[tuple[DatasetType, Iterable[DatasetRef]]]: 

83 """Iterate over `DatasetRef` instances, one `DatasetType` at a time. 

84 

85 Returns 

86 ------- 

87 grouped : `~collections.abc.Iterable` [ `tuple` [ `DatasetType`, \ 

88 `~collections.abc.Iterable` [ `DatasetRef` ] ] ]

89 An iterable of tuples, in which the first element is a dataset type 

90 and the second is an iterable of `DatasetRef` objects with exactly 

91 that dataset type. 

92 """ 

93 ... 

94 

95 

96class DatasetIdGenEnum(enum.Enum): 

97 """Enum used to specify dataset ID generation options.""" 

98 

99 UNIQUE = 0 

100 """Unique mode generates unique ID for each inserted dataset, e.g. 

101 auto-generated by database or random UUID. 

102 """ 

103 

104 DATAID_TYPE = 1 

105 """In this mode ID is computed deterministically from a combination of 

106 dataset type and dataId. 

107 """ 

108 

109 DATAID_TYPE_RUN = 2 

110 """In this mode ID is computed deterministically from a combination of 

111 dataset type, dataId, and run collection name. 

112 """ 

113 

114 

115class DatasetIdFactory: 

116 """Factory for dataset IDs (UUIDs). 

117 

118 For now the logic is hard-coded and is controlled by the user-provided

119 value of `DatasetIdGenEnum`. In the future we may implement configurable

120 logic that can guess the `DatasetIdGenEnum` value from other parameters.

121 """ 

122 

123 NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f") 

124 """Namespace UUID used for UUID5 generation. Do not change. This was 

125 produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`. 

126 """ 

127 

128 def makeDatasetId( 

129 self, 

130 run: str, 

131 datasetType: DatasetType, 

132 dataId: DataCoordinate, 

133 idGenerationMode: DatasetIdGenEnum, 

134 ) -> uuid.UUID: 

135 """Generate dataset ID for a dataset. 

136 

137 Parameters 

138 ---------- 

139 run : `str` 

140 Name of the RUN collection for the dataset. 

141 datasetType : `DatasetType` 

142 Dataset type. 

143 dataId : `DataCoordinate` 

144 Expanded data ID for the dataset. 

145 idGenerationMode : `DatasetIdGenEnum` 

146 ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random 

147 UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a 

148 deterministic UUID5-type ID based on a dataset type name and 

149 ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a 

150 deterministic UUID5-type ID based on a dataset type name, run 

151 collection name, and ``dataId``. 

152 

153 Returns 

154 ------- 

155 datasetId : `uuid.UUID` 

156 Dataset identifier. 

157 """ 

158 if idGenerationMode is DatasetIdGenEnum.UNIQUE: 

159 return uuid.uuid4() 

160 else: 

161 # WARNING: If you modify this code make sure that the order of 

162 # items in the `items` list below never changes. 

163 items: list[tuple[str, str]] = [] 

164 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE: 

165 items = [ 

166 ("dataset_type", datasetType.name), 

167 ] 

168 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN: 

169 items = [ 

170 ("dataset_type", datasetType.name), 

171 ("run", run), 

172 ] 

173 else: 

174 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}") 

175 

176 for name, value in sorted(dataId.required.items()): 

177 items.append((name, str(value))) 

178 data = ",".join(f"{key}={value}" for key, value in items) 

179 return uuid.uuid5(self.NS_UUID, data) 

180 
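As an editorial aside (not part of the module), the sketch below reproduces, with only the standard library, the string that `makeDatasetId` hashes in `DATAID_TYPE_RUN` mode; the dataset type name, run, and data ID values are made up for illustration.

    import uuid

    # Same namespace UUID as DatasetIdFactory.NS_UUID.
    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")

    # Hypothetical inputs: dataset type "calexp", run "HSC/runs/demo", and
    # required data ID values {"instrument": "HSC", "visit": 903334}.
    items = [("dataset_type", "calexp"), ("run", "HSC/runs/demo")]
    for name, value in sorted({"instrument": "HSC", "visit": 903334}.items()):
        items.append((name, str(value)))
    data = ",".join(f"{key}={value}" for key, value in items)

    # uuid5 is deterministic: the same inputs always yield the same dataset ID.
    print(uuid.uuid5(NS_UUID, data))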

181 

182# This is constant, so don't recreate a set for each instance 

183_serializedDatasetRefFieldsSet = {"id", "datasetType", "dataId", "run", "component"} 

184 

185 

186class SerializedDatasetRef(_BaseModelCompat): 

187 """Simplified model of a `DatasetRef` suitable for serialization.""" 

188 

189 id: uuid.UUID 

190 datasetType: SerializedDatasetType | None = None 

191 dataId: SerializedDataCoordinate | None = None 

192 run: StrictStr | None = None 

193 component: StrictStr | None = None 

194 

195 if PYDANTIC_V2:  195 ↛ 198  (line 195 didn't jump to line 198 because the condition on line 195 was never true)

196 # Can not use "after" validator since in some cases the validator 

197 # seems to trigger with the datasetType field not yet set. 

198 @pydantic.model_validator(mode="before") # type: ignore[attr-defined] 

199 @classmethod 

200 def check_consistent_parameters(cls, data: dict[str, Any]) -> dict[str, Any]: 

201 has_datasetType = data.get("datasetType") is not None 

202 has_dataId = data.get("dataId") is not None 

203 if has_datasetType is not has_dataId: 

204 raise ValueError("If specifying datasetType or dataId, must specify both.") 

205 

206 if data.get("component") is not None and has_datasetType: 

207 raise ValueError("datasetType can not be set if component is given.") 

208 return data 

209 

210 else: 

211 

212 @pydantic.validator("dataId") 

213 def _check_dataId(cls, v: Any, values: dict[str, Any]) -> Any: # noqa: N805 

214 if v and (d := "datasetType") in values and values[d] is None: 

215 raise ValueError("Can not specify 'dataId' without specifying 'datasetType'") 

216 return v 

217 

218 @pydantic.validator("component") 

219 def _check_component(cls, v: Any, values: dict[str, Any]) -> Any: # noqa: N805 

220 # Component should not be given if datasetType is given 

221 if v and (d := "datasetType") in values and values[d] is not None: 

222 raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).") 

223 return v 

224 

225 @classmethod 

226 def direct( 

227 cls, 

228 *, 

229 id: str, 

230 run: str, 

231 datasetType: dict[str, Any] | None = None, 

232 dataId: dict[str, Any] | None = None, 

233 component: str | None = None, 

234 ) -> SerializedDatasetRef: 

235 """Construct a `SerializedDatasetRef` directly without validators. 

236 

237 Notes 

238 ----- 

239 This differs from the pydantic "construct" method in that the arguments 

240 are explicitly what the model requires, and it will recurse through 

241 members, constructing them from their corresponding `direct` methods. 

242 

243 The ``id`` parameter is a string representation of the dataset ID; it is

244 converted to a UUID by this method.

245 

246 This method should only be called when the inputs are trusted. 

247 """ 

248 serialized_datasetType = ( 

249 SerializedDatasetType.direct(**datasetType) if datasetType is not None else None 

250 ) 

251 serialized_dataId = SerializedDataCoordinate.direct(**dataId) if dataId is not None else None 

252 

253 node = cls.model_construct( 

254 _fields_set=_serializedDatasetRefFieldsSet, 

255 id=uuid.UUID(id), 

256 datasetType=serialized_datasetType, 

257 dataId=serialized_dataId, 

258 run=sys.intern(run), 

259 component=component, 

260 ) 

261 

262 return node 

263 
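A minimal standalone sketch of the consistency rule that both the pydantic v1 and v2 validators above enforce; the function name is hypothetical and is not part of the module.

    def check_consistent(datasetType, dataId, component) -> None:
        # datasetType and dataId must be given together or not at all.
        if (datasetType is not None) is not (dataId is not None):
            raise ValueError("If specifying datasetType or dataId, must specify both.")
        # component may only be given when datasetType is absent.
        if component is not None and datasetType is not None:
            raise ValueError("datasetType can not be set if component is given.")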

264 

265DatasetId: TypeAlias = uuid.UUID 

266"""A type-annotation alias for dataset ID providing typing flexibility. 

267""" 

268 

269 

270@immutable 

271class DatasetRef: 

272 """Reference to a Dataset in a `Registry`. 

273 

274 A `DatasetRef` may point to a Dataset that does not yet exist

275 (e.g., because it is a predicted input for provenance). 

276 

277 Parameters 

278 ---------- 

279 datasetType : `DatasetType` 

280 The `DatasetType` for this Dataset. 

281 dataId : `DataCoordinate` 

282 A mapping of dimensions that labels the Dataset within a Collection. 

283 run : `str` 

284 The name of the run this dataset was associated with when it was 

285 created. 

286 id : `DatasetId`, optional 

287 The unique identifier assigned when the dataset is created. If ``id`` 

288 is not specified, a new unique ID will be created. 

289 conform : `bool`, optional 

290 If `True` (default), call `DataCoordinate.standardize` to ensure that 

291 the data ID's dimensions are consistent with the dataset type's. 

292 `DatasetRef` instances for which those dimensions are not equal should 

293 not be created in new code, but are still supported for backwards 

294 compatibility. New code should only pass `False` if it can guarantee 

295 that the dimensions are already consistent. 

296 id_generation_mode : `DatasetIdGenEnum` 

297 ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random 

298 UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a 

299 deterministic UUID5-type ID based on a dataset type name and 

300 ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a 

301 deterministic UUID5-type ID based on a dataset type name, run 

302 collection name, and ``dataId``. 

303 

304 See Also 

305 -------- 

306 :ref:`daf_butler_organizing_datasets` 

307 """ 

308 

309 _serializedType = SerializedDatasetRef 

310 __slots__ = ( 

311 "_id", 

312 "datasetType", 

313 "dataId", 

314 "run", 

315 "_datastore_records", 

316 ) 

317 

318 def __init__( 

319 self, 

320 datasetType: DatasetType, 

321 dataId: DataCoordinate, 

322 run: str, 

323 *, 

324 id: DatasetId | None = None, 

325 conform: bool = True, 

326 id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

327 datastore_records: DatasetDatastoreRecords | None = None, 

328 ): 

329 self.datasetType = datasetType 

330 if conform: 

331 self.dataId = DataCoordinate.standardize(dataId, dimensions=datasetType.dimensions) 

332 else: 

333 self.dataId = dataId 

334 self.run = run 

335 if id is not None: 

336 self._id = id.int 

337 else: 

338 self._id = ( 

339 DatasetIdFactory() 

340 .makeDatasetId(self.run, self.datasetType, self.dataId, id_generation_mode) 

341 .int 

342 ) 

343 self._datastore_records = datastore_records 

344 

345 @property 

346 def id(self) -> DatasetId: 

347 """Primary key of the dataset (`DatasetId`). 

348 

349 Cannot be changed after a `DatasetRef` is constructed. 

350 """ 

351 return uuid.UUID(int=self._id) 

352 

353 def __eq__(self, other: Any) -> bool: 

354 try: 

355 return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id) 

356 except AttributeError: 

357 return NotImplemented 

358 

359 def __hash__(self) -> int: 

360 return hash((self.datasetType, self.dataId, self.id)) 

361 

362 @property 

363 def dimensions(self) -> DimensionGraph: 

364 """Dimensions associated with the underlying `DatasetType`.""" 

365 return self.datasetType.dimensions 

366 

367 def __repr__(self) -> str: 

368 # We delegate to __str__ (i.e. use "!s") for the data ID below because

369 # DataCoordinate's __repr__, while adhering to the guidelines for

370 # __repr__, is much harder for users to read, while its __str__ just

371 # produces a dict that can also be passed to DatasetRef's constructor.

372 return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, run={self.run!r}, id={self.id})" 

373 

374 def __str__(self) -> str: 

375 s = ( 

376 f"{self.datasetType.name}@{self.dataId!s} [sc={self.datasetType.storageClass_name}]" 

377 f" (run={self.run} id={self.id})" 

378 ) 

379 return s 

380 

381 def __lt__(self, other: Any) -> bool: 

382 # Sort by run, DatasetType name, and then by DataCoordinate.

383 # The __str__ representation is probably close enough, but we

384 # need to ensure that sorting a DatasetRef matches what you would

385 # get if you sorted by DatasetType+DataCoordinate.

386 if not isinstance(other, type(self)): 

387 return NotImplemented 

388 

389 # Group by run if defined, takes precedence over DatasetType 

390 self_run = "" if self.run is None else self.run 

391 other_run = "" if other.run is None else other.run 

392 

393 # Compare tuples in the priority order 

394 return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId) 

395 
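Because `__lt__` orders by run, then dataset type, then data ID, a collection of refs can be passed straight to `sorted`; in this hedged sketch ``refs`` is assumed to be a list of resolved `DatasetRef` instances obtained elsewhere.

    ordered = sorted(refs)  # run takes precedence, then DatasetType, then data ID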

396 def to_simple(self, minimal: bool = False) -> SerializedDatasetRef: 

397 """Convert this class to a simple python type. 

398 

399 This makes it suitable for serialization. 

400 

401 Parameters 

402 ---------- 

403 minimal : `bool`, optional 

404 Use minimal serialization. Requires Registry to convert 

405 back to a full type. 

406 

407 Returns 

408 ------- 

409 simple : `SerializedDatasetRef`

410 The object converted to simple form.

411 """ 

412 if minimal: 

413 # The only thing needed to uniquely define a DatasetRef is its id 

414 # so that can be used directly if it is not a component DatasetRef. 

415 # Store it in a dict to allow us to easily add the planned origin

416 # information later without having to support an int and dict in 

417 # simple form. 

418 simple: dict[str, Any] = {"id": self.id} 

419 if self.isComponent(): 

420 # We can still be a little minimalist with a component 

421 # but we will also need to record the datasetType component 

422 simple["component"] = self.datasetType.component() 

423 return SerializedDatasetRef(**simple) 

424 

425 return SerializedDatasetRef( 

426 datasetType=self.datasetType.to_simple(minimal=minimal), 

427 dataId=self.dataId.to_simple(), 

428 run=self.run, 

429 id=self.id, 

430 ) 

431 

432 @classmethod 

433 def from_simple( 

434 cls, 

435 simple: SerializedDatasetRef, 

436 universe: DimensionUniverse | None = None, 

437 registry: Registry | None = None, 

438 datasetType: DatasetType | None = None, 

439 ) -> DatasetRef: 

440 """Construct a new object from simplified form. 

441 

442 Generally this is data returned from the `to_simple` method. 

443 

444 Parameters 

445 ---------- 

446 simple : `SerializedDatasetRef`

447 The value returned by `to_simple()`. 

448 universe : `DimensionUniverse` 

449 The special graph of all known dimensions. 

450 Can be `None` if a registry is provided. 

451 registry : `lsst.daf.butler.Registry`, optional 

452 Registry to use to convert simple form of a DatasetRef to 

453 a full `DatasetRef`. Can be `None` if a full description of 

454 the type is provided along with a universe. 

455 datasetType : `DatasetType`, optional

456 If datasetType is supplied, this will be used as the datasetType

457 object in the resulting DatasetRef instead of being read from

458 the `SerializedDatasetRef`. This is useful when many refs share

459 the same type, as memory can be saved. Defaults to `None`.

460 

461 Returns 

462 ------- 

463 ref : `DatasetRef` 

464 Newly-constructed object. 

465 """ 

466 cache = PersistenceContextVars.datasetRefs.get() 

467 key = simple.id.int 

468 if cache is not None and (ref := cache.get(key, None)) is not None: 

469 if datasetType is not None: 

470 if (component := datasetType.component()) is not None: 

471 ref = ref.makeComponentRef(component) 

472 ref = ref.overrideStorageClass(datasetType.storageClass_name) 

473 return ref 

474 if simple.datasetType is not None: 

475 _, component = DatasetType.splitDatasetTypeName(simple.datasetType.name) 

476 if component is not None: 

477 ref = ref.makeComponentRef(component) 

478 if simple.datasetType.storageClass is not None: 

479 ref = ref.overrideStorageClass(simple.datasetType.storageClass) 

480 return ref 

481 # If dataset type is not given ignore the cache, because we can't 

482 # reliably return the right storage class. 

483 # Minimalist component will just specify component and id and 

484 # require registry to reconstruct 

485 if simple.datasetType is None and simple.dataId is None and simple.run is None: 

486 if registry is None: 

487 raise ValueError("Registry is required to construct component DatasetRef from integer id") 

488 if simple.id is None: 

489 raise ValueError("For minimal DatasetRef the ID must be defined.") 

490 ref = registry.getDataset(simple.id) 

491 if ref is None: 

492 raise RuntimeError(f"No matching dataset found in registry for id {simple.id}") 

493 if simple.component: 

494 ref = ref.makeComponentRef(simple.component) 

495 else: 

496 if universe is None: 

497 if registry is None: 

498 raise ValueError("One of universe or registry must be provided.") 

499 universe = registry.dimensions 

500 if datasetType is None: 

501 if simple.datasetType is None: 

502 raise ValueError("Cannot determine Dataset type of this serialized class") 

503 datasetType = DatasetType.from_simple( 

504 simple.datasetType, universe=universe, registry=registry 

505 ) 

506 if simple.dataId is None: 

507 # mypy 

508 raise ValueError("The DataId must be specified to construct a DatasetRef") 

509 dataId = DataCoordinate.from_simple(simple.dataId, universe=universe) 

510 # Check that simple ref is resolved. 

511 if simple.run is None: 

512 dstr = "" 

513 if simple.datasetType is None: 

514 dstr = f" (datasetType={datasetType.name!r})" 

515 raise ValueError( 

516 "Run collection name is missing from serialized representation. " 

517 f"Encountered with {simple!r}{dstr}." 

518 ) 

519 ref = cls( 

520 datasetType, 

521 dataId, 

522 id=simple.id, 

523 run=simple.run, 

524 ) 

525 if cache is not None: 

526 if ref.datasetType.component() is not None: 

527 cache[key] = ref.makeCompositeRef() 

528 else: 

529 cache[key] = ref 

530 return ref 

531 

532 to_json = to_json_pydantic 

533 from_json: ClassVar = classmethod(from_json_pydantic) 

534 
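A hedged round-trip sketch for the JSON helpers; ``ref`` is assumed to be an existing resolved `DatasetRef` and ``butler`` a configured `Butler` supplying the dimension universe.

    json_str = ref.to_json()
    restored = DatasetRef.from_json(json_str, universe=butler.dimensions)
    assert restored == ref  # same dataset type, data ID, and dataset ID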

535 @classmethod 

536 def _unpickle( 

537 cls, 

538 datasetType: DatasetType, 

539 dataId: DataCoordinate, 

540 id: DatasetId, 

541 run: str, 

542 datastore_records: DatasetDatastoreRecords | None, 

543 ) -> DatasetRef: 

544 """Create new `DatasetRef`. 

545 

546 A custom factory method for use by `__reduce__` as a workaround for 

547 its lack of support for keyword arguments. 

548 """ 

549 return cls(datasetType, dataId, id=id, run=run, datastore_records=datastore_records) 

550 

551 def __reduce__(self) -> tuple: 

552 return ( 

553 self._unpickle, 

554 (self.datasetType, self.dataId, self.id, self.run, self._datastore_records), 

555 ) 

556 

557 def __deepcopy__(self, memo: dict) -> DatasetRef: 

558 # DatasetRef is recursively immutable; see note in @immutable 

559 # decorator. 

560 return self 

561 

562 def expanded(self, dataId: DataCoordinate) -> DatasetRef: 

563 """Return a new `DatasetRef` with the given expanded data ID. 

564 

565 Parameters 

566 ---------- 

567 dataId : `DataCoordinate` 

568 Data ID for the new `DatasetRef`. Must compare equal to the 

569 original data ID. 

570 

571 Returns 

572 ------- 

573 ref : `DatasetRef` 

574 A new `DatasetRef` with the given data ID. 

575 """ 

576 assert dataId == self.dataId 

577 return DatasetRef( 

578 datasetType=self.datasetType, 

579 dataId=dataId, 

580 id=self.id, 

581 run=self.run, 

582 conform=False, 

583 datastore_records=self._datastore_records, 

584 ) 

585 

586 def isComponent(self) -> bool: 

587 """Indicate whether this `DatasetRef` refers to a component. 

588 

589 Returns 

590 ------- 

591 isComponent : `bool` 

592 `True` if this `DatasetRef` is a component, `False` otherwise. 

593 """ 

594 return self.datasetType.isComponent() 

595 

596 def isComposite(self) -> bool: 

597 """Boolean indicating whether this `DatasetRef` is a composite type. 

598 

599 Returns 

600 ------- 

601 isComposite : `bool` 

602 `True` if this `DatasetRef` is a composite type, `False` 

603 otherwise. 

604 """ 

605 return self.datasetType.isComposite() 

606 

607 def _lookupNames(self) -> tuple[LookupKey, ...]: 

608 """Name keys to use when looking up this DatasetRef in a configuration. 

609 

610 The names are returned in order of priority. 

611 

612 Returns 

613 ------- 

614 names : `tuple` of `LookupKey` 

615 Tuple of the `DatasetType` name and the `StorageClass` name. 

616 If ``instrument`` is defined in the dataId, each of those names 

617 is added to the start of the tuple with a key derived from the 

618 value of ``instrument``. 

619 """ 

620 # Special case the instrument Dimension since we allow configs 

621 # to include the instrument name in the hierarchy. 

622 names: tuple[LookupKey, ...] = self.datasetType._lookupNames() 

623 

624 if "instrument" in self.dataId: 

625 names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) for n in names) + names 

626 

627 return names 

628 

629 @staticmethod 

630 def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

631 """Group an iterable of `DatasetRef` by `DatasetType`. 

632 

633 Parameters 

634 ---------- 

635 refs : `~collections.abc.Iterable` [ `DatasetRef` ] 

636 `DatasetRef` instances to group. 

637 

638 Returns 

639 ------- 

640 grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ] 

641 Grouped `DatasetRef` instances. 

642 

643 Notes 

644 ----- 

645 When lazy item-iterables are acceptable instead of a full mapping, 

646 `iter_by_type` can in some cases be far more efficient. 

647 """ 

648 result: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict() 

649 for ref in refs: 

650 result.setdefault(ref.datasetType, []).append(ref) 

651 return result 

652 

653 @staticmethod 

654 def iter_by_type( 

655 refs: Iterable[DatasetRef], 

656 ) -> Iterable[tuple[DatasetType, Iterable[DatasetRef]]]: 

657 """Group an iterable of `DatasetRef` by `DatasetType` with special 

658 hooks for custom iterables that can do this efficiently. 

659 

660 Parameters 

661 ---------- 

662 refs : `~collections.abc.Iterable` [ `DatasetRef` ] 

663 `DatasetRef` instances to group. If this satisfies the 

664 `_DatasetRefGroupedIterable` protocol, its 

665 `~_DatasetRefGroupedIterable._iter_by_dataset_type` method will 

666 be called. 

667 

668 Returns 

669 ------- 

670 grouped : `~collections.abc.Iterable` [ `tuple` [ `DatasetType`, \ 

671 `Iterable` [ `DatasetRef` ] ]] 

672 Grouped `DatasetRef` instances. 

673 """ 

674 if isinstance(refs, _DatasetRefGroupedIterable): 

675 return refs._iter_by_dataset_type() 

676 return DatasetRef.groupByType(refs).items() 

677 
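A hedged usage sketch for the two grouping helpers; ``refs`` is assumed to be an iterable of resolved `DatasetRef` instances obtained elsewhere (for example from a registry query).

    # Full mapping keyed by DatasetType (a NamedKeyDict also allows lookup by name).
    by_type = DatasetRef.groupByType(refs)

    # Lazy grouping; can avoid building the whole mapping for iterables that
    # implement the _DatasetRefGroupedIterable protocol.
    for dataset_type, refs_of_type in DatasetRef.iter_by_type(refs):
        print(dataset_type.name, len(list(refs_of_type)))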

678 def makeCompositeRef(self) -> DatasetRef: 

679 """Create a `DatasetRef` of the composite from a component ref. 

680 

681 Requires that this `DatasetRef` is a component. 

682 

683 Returns 

684 ------- 

685 ref : `DatasetRef` 

686 A `DatasetRef` with a dataset type that corresponds to the 

687 composite parent of this component, and the same ID and run

688 as ``self``.

689 """ 

690 # Assume that the data ID does not need to be standardized 

691 # and should match whatever this ref already has. 

692 return DatasetRef( 

693 self.datasetType.makeCompositeDatasetType(), 

694 self.dataId, 

695 id=self.id, 

696 run=self.run, 

697 conform=False, 

698 datastore_records=self._datastore_records, 

699 ) 

700 

701 def makeComponentRef(self, name: str) -> DatasetRef: 

702 """Create a `DatasetRef` that corresponds to a component. 

703 

704 Parameters 

705 ---------- 

706 name : `str` 

707 Name of the component. 

708 

709 Returns 

710 ------- 

711 ref : `DatasetRef` 

712 A `DatasetRef` with a dataset type that corresponds to the given 

713 component, and the same ID and run 

714 (which may be `None`, if they are `None` in ``self``). 

715 """ 

716 # Assume that the data ID does not need to be standardized 

717 # and should match whatever this ref already has. 

718 return DatasetRef( 

719 self.datasetType.makeComponentDatasetType(name), 

720 self.dataId, 

721 id=self.id, 

722 run=self.run, 

723 conform=False, 

724 datastore_records=self._datastore_records, 

725 ) 

726 
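A hedged sketch of the component helpers; ``ref`` is assumed to be a resolved `DatasetRef` for a composite dataset type, and "wcs" is a hypothetical component name.

    component_ref = ref.makeComponentRef("wcs")
    assert component_ref.isComponent()
    # The component keeps the parent's dataset ID, run, and data ID.
    assert component_ref.id == ref.id

    # makeCompositeRef goes back to the parent dataset type name.
    parent_again = component_ref.makeCompositeRef()
    assert parent_again.datasetType.name == ref.datasetType.name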

727 def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetRef: 

728 """Create a new `DatasetRef` from this one, but with a modified 

729 `DatasetType` that has a different `StorageClass`. 

730 

731 Parameters 

732 ---------- 

733 storageClass : `str` or `StorageClass` 

734 The new storage class. 

735 

736 Returns 

737 ------- 

738 modified : `DatasetRef` 

739 A new dataset reference that is the same as the current one but 

740 with a different storage class in the `DatasetType`. 

741 """ 

742 return self.replace(storage_class=storageClass) 

743 

744 def replace( 

745 self, 

746 *, 

747 id: DatasetId | None = None, 

748 run: str | None = None, 

749 storage_class: str | StorageClass | None = None, 

750 datastore_records: DatasetDatastoreRecords | None | Literal[False] = False, 

751 ) -> DatasetRef: 

752 """Create a new `DatasetRef` from this one, but with some modified 

753 attributes. 

754 

755 Parameters 

756 ---------- 

757 id : `DatasetId` or `None`

758 If not `None` then update the dataset ID.

759 run : `str` or `None`

760 If not `None` then update the run collection name. If ``id`` is

761 `None` then this will also cause a new dataset ID to be generated.

762 storage_class : `str` or `StorageClass` or `None`

763 The new storage class. If not `None`, replaces existing storage 

764 class. 

765 datastore_records : `DatasetDatastoreRecords` or `None` 

766 New datastore records. If `None` remove all records. By default 

767 datastore records are preserved. 

768 

769 Returns 

770 ------- 

771 modified : `DatasetRef` 

772 A new dataset reference with updated attributes. 

773 """ 

774 if datastore_records is False: 

775 datastore_records = self._datastore_records 

776 if storage_class is None: 

777 datasetType = self.datasetType 

778 else: 

779 datasetType = self.datasetType.overrideStorageClass(storage_class) 

780 if run is None: 

781 run = self.run 

782 # Do not regenerate dataset ID if run is the same. 

783 if id is None: 

784 id = self.id 

785 return DatasetRef( 

786 datasetType=datasetType, 

787 dataId=self.dataId, 

788 run=run, 

789 id=id, 

790 conform=False, 

791 datastore_records=datastore_records, 

792 ) 

793 
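A hedged usage sketch for `replace` and `overrideStorageClass`; ``ref`` is assumed to be an existing resolved `DatasetRef`, and the storage class and run names below are made up.

    # Same dataset ID, but the DatasetType now carries a different storage class
    # (assumes the two storage classes are convertible).
    converted = ref.overrideStorageClass("ArrowAstropy")
    assert converted.id == ref.id

    # Changing the run without passing `id`; per the parameter notes above this
    # may result in a newly generated dataset ID.
    moved = ref.replace(run="u/someone/rerun")
    assert moved.run == "u/someone/rerun"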

794 def is_compatible_with(self, ref: DatasetRef) -> bool: 

795 """Determine if the given `DatasetRef` is compatible with this one. 

796 

797 Parameters 

798 ---------- 

799 ref : `DatasetRef`

800 Dataset ref to check. 

801 

802 Returns 

803 ------- 

804 is_compatible : `bool` 

805 Returns `True` if the other dataset ref is the same as this one, or

806 if its dataset type is compatible with this one's and the dataId,

807 run, and dataset ID match.

808 

809 Notes 

810 ----- 

811 Compatibility requires that the dataId, run, and dataset ID match and

812 that the `DatasetType` is compatible. `DatasetType` compatibility means

813 that the storage class associated with the dataset type of the other

814 ref can be converted to this ref's storage class.

815 

816 Specifically this means that if you have done: 

817 

818 .. code-block:: py 

819 

820 new_ref = ref.overrideStorageClass(sc) 

821 

822 and this is successful, then the guarantee is that: 

823 

824 .. code-block:: py 

825 

826 assert ref.is_compatible_with(new_ref) is True 

827 

828 since we know that the python type associated with the new ref can 

829 be converted to the original python type. The reverse is not guaranteed 

830 and depends on whether bidirectional converters have been registered. 

831 """ 

832 if self.id != ref.id: 

833 return False 

834 if self.dataId != ref.dataId: 

835 return False 

836 if self.run != ref.run: 

837 return False 

838 return self.datasetType.is_compatible_with(ref.datasetType) 

839 

840 datasetType: DatasetType 

841 """The definition of this dataset (`DatasetType`). 

842 

843 Cannot be changed after a `DatasetRef` is constructed. 

844 """ 

845 

846 dataId: DataCoordinate 

847 """A mapping of `Dimension` primary key values that labels the dataset 

848 within a Collection (`DataCoordinate`). 

849 

850 Cannot be changed after a `DatasetRef` is constructed. 

851 """ 

852 

853 run: str 

854 """The name of the run that produced the dataset. 

855 

856 Cannot be changed after a `DatasetRef` is constructed. 

857 """ 

858 

859 datastore_records: DatasetDatastoreRecords | None 

860 """Optional datastore records (`DatasetDatastoreRecords`). 

861 

862 Cannot be changed after a `DatasetRef` is constructed. 

863 """