Coverage for python/lsst/daf/butler/_dataset_ref.py: 34%

230 statements  

coverage.py v7.4.3, created at 2024-03-07 11:04 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = [ 

30 "AmbiguousDatasetError", 

31 "DatasetDatastoreRecords", 

32 "DatasetId", 

33 "DatasetIdFactory", 

34 "DatasetIdGenEnum", 

35 "DatasetRef", 

36 "SerializedDatasetRef", 

37] 

38 

39import enum 

40import sys 

41import uuid 

42from collections.abc import Iterable, Mapping 

43from typing import TYPE_CHECKING, Any, ClassVar, Literal, Protocol, TypeAlias, runtime_checkable 

44 

45import pydantic 

46from lsst.utils.classes import immutable 

47from pydantic import StrictStr 

48 

49from ._config_support import LookupKey 

50from ._dataset_type import DatasetType, SerializedDatasetType 

51from ._named import NamedKeyDict 

52from .datastore.stored_file_info import StoredDatastoreItemInfo 

53from .dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate 

54from .json import from_json_pydantic, to_json_pydantic 

55from .persistence_context import PersistenceContextVars 

56 

57if TYPE_CHECKING: 

58 from ._storage_class import StorageClass 

59 from .registry import Registry 

60 

61# Per-dataset records grouped by opaque table name; usually there is just one

62# opaque table. 

63DatasetDatastoreRecords: TypeAlias = Mapping[str, list[StoredDatastoreItemInfo]] 

64 

65 

66class AmbiguousDatasetError(Exception): 

67 """Raised when a `DatasetRef` is not resolved but should be. 

68 

69 This happens when the `DatasetRef` has no ID or run but the requested 

70 operation requires one of them. 

71 """ 

72 

73 

74@runtime_checkable 

75class _DatasetRefGroupedIterable(Protocol): 

76 """A package-private interface for iterables of `DatasetRef` that know how 

77 to efficiently group their contents by `DatasetType`. 

78 

79 """ 

80 

81 def _iter_by_dataset_type(self) -> Iterable[tuple[DatasetType, Iterable[DatasetRef]]]: 

82 """Iterate over `DatasetRef` instances, one `DatasetType` at a time. 

83 

84 Returns 

85 ------- 

86 grouped : `~collections.abc.Iterable` [ `tuple` [ `DatasetType`, \ 

87 `~collections.abc.Iterable` [ `DatasetRef` ] ] ]

88 An iterable of tuples, in which the first element is a dataset type 

89 and the second is an iterable of `DatasetRef` objects with exactly 

90 that dataset type. 

91 """ 

92 ... 

93 

94 

95class DatasetIdGenEnum(enum.Enum): 

96 """Enum used to specify dataset ID generation options.""" 

97 

98 UNIQUE = 0 

99 """Unique mode generates unique ID for each inserted dataset, e.g. 

100 auto-generated by database or random UUID. 

101 """ 

102 

103 DATAID_TYPE = 1 

104 """In this mode ID is computed deterministically from a combination of 

105 dataset type and dataId. 

106 """ 

107 

108 DATAID_TYPE_RUN = 2 

109 """In this mode ID is computed deterministically from a combination of 

110 dataset type, dataId, and run collection name. 

111 """ 

112 

113 

114class DatasetIdFactory: 

115 """Factory for dataset IDs (UUIDs). 

116 

117 For now the logic is hard-coded and is controlled by the user-provided 

118 value of `DatasetIdGenEnum`. In the future we may implement configurable

119 logic that can guess the `DatasetIdGenEnum` value from other parameters.

120 """ 

121 

122 NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f") 

123 """Namespace UUID used for UUID5 generation. Do not change. This was 

124 produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`. 

125 """ 

126 

127 def makeDatasetId( 

128 self, 

129 run: str, 

130 datasetType: DatasetType, 

131 dataId: DataCoordinate, 

132 idGenerationMode: DatasetIdGenEnum, 

133 ) -> uuid.UUID: 

134 """Generate dataset ID for a dataset. 

135 

136 Parameters 

137 ---------- 

138 run : `str` 

139 Name of the RUN collection for the dataset. 

140 datasetType : `DatasetType` 

141 Dataset type. 

142 dataId : `DataCoordinate` 

143 Expanded data ID for the dataset. 

144 idGenerationMode : `DatasetIdGenEnum` 

145 ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random 

146 UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a 

147 deterministic UUID5-type ID based on a dataset type name and 

148 ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a 

149 deterministic UUID5-type ID based on a dataset type name, run 

150 collection name, and ``dataId``. 

151 

152 Returns 

153 ------- 

154 datasetId : `uuid.UUID` 

155 Dataset identifier. 

156 """ 

157 if idGenerationMode is DatasetIdGenEnum.UNIQUE: 

158 return uuid.uuid4() 

159 else: 

160 # WARNING: If you modify this code make sure that the order of 

161 # items in the `items` list below never changes. 

162 items: list[tuple[str, str]] = [] 

163 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE: 

164 items = [ 

165 ("dataset_type", datasetType.name), 

166 ] 

167 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN: 

168 items = [ 

169 ("dataset_type", datasetType.name), 

170 ("run", run), 

171 ] 

172 else: 

173 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}") 

174 

175 for name, value in sorted(dataId.required.items()): 

176 items.append((name, str(value))) 

177 data = ",".join(f"{key}={value}" for key, value in items) 

178 return uuid.uuid5(self.NS_UUID, data) 

179 
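As a hedged illustration of the deterministic branch above (the key-value payload below is made up for this sketch), the same namespace and payload always yield the same UUID5, whereas UNIQUE mode returns a fresh uuid4() on every call:

    import uuid

    # Same namespace UUID as DatasetIdFactory.NS_UUID.
    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    # Illustrative payload in the "key=value,..." form built by makeDatasetId.
    data = "dataset_type=calexp,run=u/example/run,instrument=X,visit=42"
    assert uuid.uuid5(NS_UUID, data) == uuid.uuid5(NS_UUID, data)  # deterministic
    assert uuid.uuid4() != uuid.uuid4()  # UNIQUE mode is effectively random
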

180 

181# This is constant, so don't recreate a set for each instance 

182_serializedDatasetRefFieldsSet = {"id", "datasetType", "dataId", "run", "component"} 

183 

184 

185class SerializedDatasetRef(pydantic.BaseModel): 

186 """Simplified model of a `DatasetRef` suitable for serialization.""" 

187 

188 id: uuid.UUID 

189 datasetType: SerializedDatasetType | None = None 

190 dataId: SerializedDataCoordinate | None = None 

191 run: StrictStr | None = None 

192 component: StrictStr | None = None 

193 

194 # Can not use "after" validator since in some cases the validator 

195 # seems to trigger with the datasetType field not yet set. 

196 @pydantic.model_validator(mode="before") # type: ignore[attr-defined] 

197 @classmethod 

198 def check_consistent_parameters(cls, data: dict[str, Any]) -> dict[str, Any]: 

199 has_datasetType = data.get("datasetType") is not None 

200 has_dataId = data.get("dataId") is not None 

201 if has_datasetType is not has_dataId: 

202 raise ValueError("If specifying datasetType or dataId, must specify both.") 

203 

204 if data.get("component") is not None and has_datasetType: 

205 raise ValueError("datasetType can not be set if component is given.") 

206 return data 

207 
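A minimal sketch of the consistency rule enforced above, assuming SerializedDatasetRef is importable from lsst.daf.butler as its __all__ suggests: a dataId supplied without a datasetType is rejected before field validation runs.

    import uuid
    import pydantic
    from lsst.daf.butler import SerializedDatasetRef

    try:
        SerializedDatasetRef(id=uuid.uuid4(), run="u/example/run", dataId={"instrument": "X"})
    except pydantic.ValidationError as err:
        print("rejected:", err)  # "If specifying datasetType or dataId, must specify both."
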

208 @classmethod 

209 def direct( 

210 cls, 

211 *, 

212 id: str, 

213 run: str, 

214 datasetType: dict[str, Any] | None = None, 

215 dataId: dict[str, Any] | None = None, 

216 component: str | None = None, 

217 ) -> SerializedDatasetRef: 

218 """Construct a `SerializedDatasetRef` directly without validators. 

219 

220 Parameters 

221 ---------- 

222 id : `str` 

223 The UUID in string form. 

224 run : `str` 

225 The run for this dataset. 

226 datasetType : `dict` [`str`, `typing.Any`] 

227 A representation of the dataset type. 

228 dataId : `dict` [`str`, `typing.Any`] 

229 A representation of the data ID. 

230 component : `str` or `None` 

231 Any component associated with this ref. 

232 

233 Returns 

234 ------- 

235 serialized : `SerializedDatasetRef` 

236 A Pydantic model representing the given parameters. 

237 

238 Notes 

239 ----- 

240 This differs from the pydantic "construct" method in that the arguments 

241 are explicitly what the model requires, and it will recurse through 

242 members, constructing them from their corresponding `direct` methods. 

243 

244 The ``id`` parameter is a string representation of the dataset ID; it is

245 converted to a UUID by this method.

246 

247 This method should only be called when the inputs are trusted. 

248 """ 

249 serialized_datasetType = ( 

250 SerializedDatasetType.direct(**datasetType) if datasetType is not None else None 

251 ) 

252 serialized_dataId = SerializedDataCoordinate.direct(**dataId) if dataId is not None else None 

253 

254 node = cls.model_construct( 

255 _fields_set=_serializedDatasetRefFieldsSet, 

256 id=uuid.UUID(id), 

257 datasetType=serialized_datasetType, 

258 dataId=serialized_dataId, 

259 run=sys.intern(run), 

260 component=component, 

261 ) 

262 

263 return node 

264 
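For instance, a minimal (validator-free) record carrying only a trusted UUID and run might be built like this sketch, with made-up values:

    sref = SerializedDatasetRef.direct(
        id="840b31d9-05cd-5161-b2c8-00d32b280d0f",  # any valid UUID string
        run="u/example/run",
    )
    assert sref.datasetType is None and sref.dataId is None
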

265 

266DatasetId: TypeAlias = uuid.UUID 

267"""A type-annotation alias for dataset ID providing typing flexibility. 

268""" 

269 

270 

271@immutable 

272class DatasetRef: 

273 """Reference to a Dataset in a `Registry`. 

274 

275 A `DatasetRef` may point to a Dataset that does not yet exist

276 (e.g., because it is a predicted input for provenance). 

277 

278 Parameters 

279 ---------- 

280 datasetType : `DatasetType` 

281 The `DatasetType` for this Dataset. 

282 dataId : `DataCoordinate` 

283 A mapping of dimensions that labels the Dataset within a Collection. 

284 run : `str` 

285 The name of the run this dataset was associated with when it was 

286 created. 

287 id : `DatasetId`, optional 

288 The unique identifier assigned when the dataset is created. If ``id`` 

289 is not specified, a new unique ID will be created. 

290 conform : `bool`, optional 

291 If `True` (default), call `DataCoordinate.standardize` to ensure that 

292 the data ID's dimensions are consistent with the dataset type's. 

293 `DatasetRef` instances for which those dimensions are not equal should 

294 not be created in new code, but are still supported for backwards 

295 compatibility. New code should only pass `False` if it can guarantee 

296 that the dimensions are already consistent. 

297 id_generation_mode : `DatasetIdGenEnum` 

298 ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random 

299 UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a 

300 deterministic UUID5-type ID based on a dataset type name and 

301 ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a 

302 deterministic UUID5-type ID based on a dataset type name, run 

303 collection name, and ``dataId``. 

304 datastore_records : `DatasetDatastoreRecords` or `None` 

305 Datastore records to attach. 

306 

307 Notes 

308 ----- 

309 See also :ref:`daf_butler_organizing_datasets` 

310 """ 

311 

312 _serializedType = SerializedDatasetRef 

313 __slots__ = ( 

314 "_id", 

315 "datasetType", 

316 "dataId", 

317 "run", 

318 "_datastore_records", 

319 ) 

320 

321 def __init__( 

322 self, 

323 datasetType: DatasetType, 

324 dataId: DataCoordinate, 

325 run: str, 

326 *, 

327 id: DatasetId | None = None, 

328 conform: bool = True, 

329 id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

330 datastore_records: DatasetDatastoreRecords | None = None, 

331 ): 

332 self.datasetType = datasetType 

333 if conform: 

334 self.dataId = DataCoordinate.standardize(dataId, dimensions=datasetType.dimensions) 

335 else: 

336 self.dataId = dataId 

337 self.run = run 

338 if id is not None: 

339 self._id = id.int 

340 else: 

341 self._id = ( 

342 DatasetIdFactory() 

343 .makeDatasetId(self.run, self.datasetType, self.dataId, id_generation_mode) 

344 .int 

345 ) 

346 self._datastore_records = datastore_records 

347 
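A hedged construction sketch; ``dataset_type`` and ``data_id`` stand in for existing DatasetType and DataCoordinate instances:

    ref = DatasetRef(dataset_type, data_id, run="u/example/run")
    print(ref.id)  # a random UUID4, since no id or non-default generation mode was given
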

348 @property 

349 def id(self) -> DatasetId: 

350 """Primary key of the dataset (`DatasetId`). 

351 

352 Cannot be changed after a `DatasetRef` is constructed. 

353 """ 

354 return uuid.UUID(int=self._id) 

355 

356 def __eq__(self, other: Any) -> bool: 

357 try: 

358 return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id) 

359 except AttributeError: 

360 return NotImplemented 

361 

362 def __hash__(self) -> int: 

363 return hash((self.datasetType, self.dataId, self.id)) 

364 

365 @property 

366 def dimensions(self) -> DimensionGraph: 

367 """Dimensions associated with the underlying `DatasetType`.""" 

368 return self.datasetType.dimensions 

369 

370 def __repr__(self) -> str: 

371 # We delegate to __str__ (i.e. use "!s") for the data ID below because

372 # DataCoordinate's __repr__ - while adhering to the guidelines for 

373 # __repr__ - is much harder for users to read, while its __str__ just

374 # produces a dict that can also be passed to DatasetRef's constructor. 

375 return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, run={self.run!r}, id={self.id})" 

376 

377 def __str__(self) -> str: 

378 s = ( 

379 f"{self.datasetType.name}@{self.dataId!s} [sc={self.datasetType.storageClass_name}]" 

380 f" (run={self.run} id={self.id})" 

381 ) 

382 return s 

383 

384 def __lt__(self, other: Any) -> bool: 

385 # Sort by run, DatasetType name and then by DataCoordinate 

386 # The __str__ representation is probably close enough but we 

387 # need to ensure that sorting a DatasetRef matches what you would 

388 # get if you sorted DatasetType+DataCoordinate 

389 if not isinstance(other, type(self)): 

390 return NotImplemented 

391 

392 # Group by run if defined, takes precedence over DatasetType 

393 self_run = "" if self.run is None else self.run 

394 other_run = "" if other.run is None else other.run 

395 

396 # Compare tuples in the priority order 

397 return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId) 

398 
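Illustrative use of this ordering (``refs`` is assumed to be a list of resolved DatasetRef objects): sorting groups refs by run first, then orders by dataset type and finally by data ID.

    ordered = sorted(refs)
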

399 def to_simple(self, minimal: bool = False) -> SerializedDatasetRef: 

400 """Convert this class to a simple python type. 

401 

402 This makes it suitable for serialization. 

403 

404 Parameters 

405 ---------- 

406 minimal : `bool`, optional 

407 Use minimal serialization. Requires Registry to convert 

408 back to a full type. 

409 

410 Returns 

411 ------- 

412 simple : `SerializedDatasetRef`

413 The object converted to simplified form.

414 """ 

415 if minimal: 

416 # The only thing needed to uniquely define a DatasetRef is its id 

417 # so that can be used directly if it is not a component DatasetRef. 

418 # Store it in a dict to allow us to easily add the planned origin

419 # information later without having to support an int and dict in 

420 # simple form. 

421 simple: dict[str, Any] = {"id": self.id} 

422 if self.isComponent(): 

423 # We can still be a little minimalist with a component 

424 # but we will also need to record the datasetType component 

425 simple["component"] = self.datasetType.component() 

426 return SerializedDatasetRef(**simple) 

427 

428 return SerializedDatasetRef( 

429 datasetType=self.datasetType.to_simple(minimal=minimal), 

430 dataId=self.dataId.to_simple(), 

431 run=self.run, 

432 id=self.id, 

433 ) 

434 
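A hedged usage sketch (``ref`` is assumed to be an existing resolved DatasetRef):

    full = ref.to_simple()                 # self-describing: type, data ID, run, id
    minimal = ref.to_simple(minimal=True)  # just the UUID (plus component, if any)
    assert minimal.id == ref.id
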

435 @classmethod 

436 def from_simple( 

437 cls, 

438 simple: SerializedDatasetRef, 

439 universe: DimensionUniverse | None = None, 

440 registry: Registry | None = None, 

441 datasetType: DatasetType | None = None, 

442 ) -> DatasetRef: 

443 """Construct a new object from simplified form. 

444 

445 Generally this is data returned from the `to_simple` method. 

446 

447 Parameters 

448 ---------- 

449 simple : `SerializedDatasetRef`

450 The value returned by `to_simple()`. 

451 universe : `DimensionUniverse` 

452 The special graph of all known dimensions. 

453 Can be `None` if a registry is provided. 

454 registry : `lsst.daf.butler.Registry`, optional 

455 Registry to use to convert simple form of a DatasetRef to 

456 a full `DatasetRef`. Can be `None` if a full description of 

457 the type is provided along with a universe. 

458 datasetType : `DatasetType`, optional

459 If datasetType is supplied, this will be used as the datasetType 

460 object in the resulting DatasetRef instead of being read from 

461 the `SerializedDatasetRef`. This is useful when many refs share 

462 the same type, since memory can be saved. Defaults to `None`.

463 

464 Returns 

465 ------- 

466 ref : `DatasetRef` 

467 Newly-constructed object. 

468 """ 

469 cache = PersistenceContextVars.datasetRefs.get() 

470 key = simple.id.int 

471 if cache is not None and (ref := cache.get(key, None)) is not None: 

472 if datasetType is not None: 

473 if (component := datasetType.component()) is not None: 

474 ref = ref.makeComponentRef(component) 

475 ref = ref.overrideStorageClass(datasetType.storageClass_name) 

476 return ref 

477 if simple.datasetType is not None: 

478 _, component = DatasetType.splitDatasetTypeName(simple.datasetType.name) 

479 if component is not None: 

480 ref = ref.makeComponentRef(component) 

481 if simple.datasetType.storageClass is not None: 

482 ref = ref.overrideStorageClass(simple.datasetType.storageClass) 

483 return ref 

484 # If dataset type is not given ignore the cache, because we can't 

485 # reliably return the right storage class. 

486 # Minimalist component will just specify component and id and 

487 # require registry to reconstruct 

488 if simple.datasetType is None and simple.dataId is None and simple.run is None: 

489 if registry is None: 

490 raise ValueError("Registry is required to construct component DatasetRef from integer id") 

491 if simple.id is None: 

492 raise ValueError("For minimal DatasetRef the ID must be defined.") 

493 ref = registry.getDataset(simple.id) 

494 if ref is None: 

495 raise RuntimeError(f"No matching dataset found in registry for id {simple.id}") 

496 if simple.component: 

497 ref = ref.makeComponentRef(simple.component) 

498 else: 

499 if universe is None: 

500 if registry is None: 

501 raise ValueError("One of universe or registry must be provided.") 

502 universe = registry.dimensions 

503 if datasetType is None: 

504 if simple.datasetType is None: 

505 raise ValueError("Cannot determine Dataset type of this serialized class") 

506 datasetType = DatasetType.from_simple( 

507 simple.datasetType, universe=universe, registry=registry 

508 ) 

509 if simple.dataId is None: 

510 # mypy 

511 raise ValueError("The DataId must be specified to construct a DatasetRef") 

512 dataId = DataCoordinate.from_simple(simple.dataId, universe=universe) 

513 # Check that simple ref is resolved. 

514 if simple.run is None: 

515 dstr = "" 

516 if simple.datasetType is None: 

517 dstr = f" (datasetType={datasetType.name!r})" 

518 raise ValueError( 

519 "Run collection name is missing from serialized representation. " 

520 f"Encountered with {simple!r}{dstr}." 

521 ) 

522 ref = cls( 

523 datasetType, 

524 dataId, 

525 id=simple.id, 

526 run=simple.run, 

527 ) 

528 if cache is not None: 

529 if ref.datasetType.component() is not None: 

530 cache[key] = ref.makeCompositeRef() 

531 else: 

532 cache[key] = ref 

533 return ref 

534 
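Completing the round trip sketched after to_simple above (``full`` and ``minimal`` were produced there; ``butler`` stands in for an existing Butler instance):

    restored = DatasetRef.from_simple(full, universe=butler.dimensions)
    assert restored == ref
    # The minimal form needs a registry to look the dataset back up.
    restored_min = DatasetRef.from_simple(minimal, registry=butler.registry)
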

535 to_json = to_json_pydantic 

536 from_json: ClassVar = classmethod(from_json_pydantic) 

537 

538 @classmethod 

539 def _unpickle( 

540 cls, 

541 datasetType: DatasetType, 

542 dataId: DataCoordinate, 

543 id: DatasetId, 

544 run: str, 

545 datastore_records: DatasetDatastoreRecords | None, 

546 ) -> DatasetRef: 

547 """Create new `DatasetRef`. 

548 

549 A custom factory method for use by `__reduce__` as a workaround for 

550 its lack of support for keyword arguments. 

551 """ 

552 return cls(datasetType, dataId, id=id, run=run, datastore_records=datastore_records) 

553 

554 def __reduce__(self) -> tuple: 

555 return ( 

556 self._unpickle, 

557 (self.datasetType, self.dataId, self.id, self.run, self._datastore_records), 

558 ) 

559 

560 def __deepcopy__(self, memo: dict) -> DatasetRef: 

561 # DatasetRef is recursively immutable; see note in @immutable 

562 # decorator. 

563 return self 

564 

565 def expanded(self, dataId: DataCoordinate) -> DatasetRef: 

566 """Return a new `DatasetRef` with the given expanded data ID. 

567 

568 Parameters 

569 ---------- 

570 dataId : `DataCoordinate` 

571 Data ID for the new `DatasetRef`. Must compare equal to the 

572 original data ID. 

573 

574 Returns 

575 ------- 

576 ref : `DatasetRef` 

577 A new `DatasetRef` with the given data ID. 

578 """ 

579 assert dataId == self.dataId 

580 return DatasetRef( 

581 datasetType=self.datasetType, 

582 dataId=dataId, 

583 id=self.id, 

584 run=self.run, 

585 conform=False, 

586 datastore_records=self._datastore_records, 

587 ) 

588 

589 def isComponent(self) -> bool: 

590 """Indicate whether this `DatasetRef` refers to a component. 

591 

592 Returns 

593 ------- 

594 isComponent : `bool` 

595 `True` if this `DatasetRef` is a component, `False` otherwise. 

596 """ 

597 return self.datasetType.isComponent() 

598 

599 def isComposite(self) -> bool: 

600 """Boolean indicating whether this `DatasetRef` is a composite type. 

601 

602 Returns 

603 ------- 

604 isComposite : `bool` 

605 `True` if this `DatasetRef` is a composite type, `False` 

606 otherwise. 

607 """ 

608 return self.datasetType.isComposite() 

609 

610 def _lookupNames(self) -> tuple[LookupKey, ...]: 

611 """Name keys to use when looking up this DatasetRef in a configuration. 

612 

613 The names are returned in order of priority. 

614 

615 Returns 

616 ------- 

617 names : `tuple` of `LookupKey` 

618 Tuple of the `DatasetType` name and the `StorageClass` name. 

619 If ``instrument`` is defined in the dataId, each of those names 

620 is added to the start of the tuple with a key derived from the 

621 value of ``instrument``. 

622 """ 

623 # Special case the instrument Dimension since we allow configs 

624 # to include the instrument name in the hierarchy. 

625 names: tuple[LookupKey, ...] = self.datasetType._lookupNames() 

626 

627 if "instrument" in self.dataId: 

628 names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) for n in names) + names 

629 

630 return names 

631 

632 @staticmethod 

633 def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

634 """Group an iterable of `DatasetRef` by `DatasetType`. 

635 

636 Parameters 

637 ---------- 

638 refs : `~collections.abc.Iterable` [ `DatasetRef` ] 

639 `DatasetRef` instances to group. 

640 

641 Returns 

642 ------- 

643 grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ] 

644 Grouped `DatasetRef` instances. 

645 

646 Notes 

647 ----- 

648 When lazy item-iterables are acceptable instead of a full mapping, 

649 `iter_by_type` can in some cases be far more efficient. 

650 """ 

651 result: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict() 

652 for ref in refs: 

653 result.setdefault(ref.datasetType, []).append(ref) 

654 return result 

655 

656 @staticmethod 

657 def iter_by_type( 

658 refs: Iterable[DatasetRef], 

659 ) -> Iterable[tuple[DatasetType, Iterable[DatasetRef]]]: 

660 """Group an iterable of `DatasetRef` by `DatasetType` with special 

661 hooks for custom iterables that can do this efficiently. 

662 

663 Parameters 

664 ---------- 

665 refs : `~collections.abc.Iterable` [ `DatasetRef` ] 

666 `DatasetRef` instances to group. If this satisfies the 

667 `_DatasetRefGroupedIterable` protocol, its 

668 `~_DatasetRefGroupedIterable._iter_by_dataset_type` method will 

669 be called. 

670 

671 Returns 

672 ------- 

673 grouped : `~collections.abc.Iterable` [ `tuple` [ `DatasetType`, \ 

674 `Iterable` [ `DatasetRef` ] ]] 

675 Grouped `DatasetRef` instances. 

676 """ 

677 if isinstance(refs, _DatasetRefGroupedIterable): 

678 return refs._iter_by_dataset_type() 

679 return DatasetRef.groupByType(refs).items() 

680 
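For example (``refs`` again assumed to be an iterable of DatasetRef), a caller that only needs to stream through each group can avoid building the full mapping:

    for dataset_type, refs_of_type in DatasetRef.iter_by_type(refs):
        print(dataset_type.name, sum(1 for _ in refs_of_type))
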

681 def makeCompositeRef(self) -> DatasetRef: 

682 """Create a `DatasetRef` of the composite from a component ref. 

683 

684 Requires that this `DatasetRef` is a component. 

685 

686 Returns 

687 ------- 

688 ref : `DatasetRef` 

689 A `DatasetRef` with a dataset type that corresponds to the 

690 composite parent of this component, and the same ID and run 

691 as ``self``.

692 """ 

693 # Assume that the data ID does not need to be standardized 

694 # and should match whatever this ref already has. 

695 return DatasetRef( 

696 self.datasetType.makeCompositeDatasetType(), 

697 self.dataId, 

698 id=self.id, 

699 run=self.run, 

700 conform=False, 

701 datastore_records=self._datastore_records, 

702 ) 

703 

704 def makeComponentRef(self, name: str) -> DatasetRef: 

705 """Create a `DatasetRef` that corresponds to a component. 

706 

707 Parameters 

708 ---------- 

709 name : `str` 

710 Name of the component. 

711 

712 Returns 

713 ------- 

714 ref : `DatasetRef` 

715 A `DatasetRef` with a dataset type that corresponds to the given 

716 component, and the same ID and run 

717 as ``self``.

718 """ 

719 # Assume that the data ID does not need to be standardized 

720 # and should match whatever this ref already has. 

721 return DatasetRef( 

722 self.datasetType.makeComponentDatasetType(name), 

723 self.dataId, 

724 id=self.id, 

725 run=self.run, 

726 conform=False, 

727 datastore_records=self._datastore_records, 

728 ) 

729 
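Illustrative sketch (``ref`` is assumed to point at a composite dataset type that has a "wcs" component):

    wcs_ref = ref.makeComponentRef("wcs")
    assert wcs_ref.isComponent()
    assert wcs_ref.id == ref.id  # components share the parent's dataset ID
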

730 def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetRef: 

731 """Create a new `DatasetRef` from this one, but with a modified 

732 `DatasetType` that has a different `StorageClass`. 

733 

734 Parameters 

735 ---------- 

736 storageClass : `str` or `StorageClass` 

737 The new storage class. 

738 

739 Returns 

740 ------- 

741 modified : `DatasetRef` 

742 A new dataset reference that is the same as the current one but 

743 with a different storage class in the `DatasetType`. 

744 """ 

745 return self.replace(storage_class=storageClass) 

746 

747 def replace( 

748 self, 

749 *, 

750 id: DatasetId | None = None, 

751 run: str | None = None, 

752 storage_class: str | StorageClass | None = None, 

753 datastore_records: DatasetDatastoreRecords | None | Literal[False] = False, 

754 ) -> DatasetRef: 

755 """Create a new `DatasetRef` from this one, but with some modified 

756 attributes. 

757 

758 Parameters 

759 ---------- 

760 id : `DatasetId` or `None` 

761 If not `None` then update dataset ID. 

762 run : `str` or `None` 

763 If not `None` then update the run collection name. If ``id`` is

764 `None` then this will also cause a new dataset ID to be generated.

765 storage_class : `str` or `StorageClass` or `None` 

766 The new storage class. If not `None`, replaces existing storage 

767 class. 

768 datastore_records : `DatasetDatastoreRecords` or `None` 

769 New datastore records. If `None` remove all records. By default 

770 datastore records are preserved. 

771 

772 Returns 

773 ------- 

774 modified : `DatasetRef` 

775 A new dataset reference with updated attributes. 

776 """ 

777 if datastore_records is False: 

778 datastore_records = self._datastore_records 

779 if storage_class is None: 

780 datasetType = self.datasetType 

781 else: 

782 datasetType = self.datasetType.overrideStorageClass(storage_class) 

783 if run is None: 

784 run = self.run 

785 # Do not regenerate dataset ID if run is the same. 

786 if id is None: 

787 id = self.id 

788 return DatasetRef( 

789 datasetType=datasetType, 

790 dataId=self.dataId, 

791 run=run, 

792 id=id, 

793 conform=False, 

794 datastore_records=datastore_records, 

795 ) 

796 
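A couple of hedged examples of the semantics documented above (``ref`` is assumed to be an existing DatasetRef):

    moved = ref.replace(run="u/example/other_run")  # ID regenerated because run changed
    stripped = ref.replace(datastore_records=None)  # drop any attached records
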

797 def is_compatible_with(self, other: DatasetRef) -> bool: 

798 """Determine if the given `DatasetRef` is compatible with this one. 

799 

800 Parameters 

801 ---------- 

802 other : `DatasetRef` 

803 Dataset ref to check. 

804 

805 Returns 

806 ------- 

807 is_compatible : `bool` 

808 Returns `True` if the other dataset ref is either the same as this 

809 or the dataset type associated with the other is compatible with 

810 this one and the dataId and dataset ID match. 

811 

812 Notes 

813 ----- 

814 Compatibility requires that the dataId, run, and dataset ID match and the

815 `DatasetType` is compatible. Compatibility is defined as the storage 

816 class associated with the dataset type of the other ref can be 

817 converted to this storage class. 

818 

819 Specifically this means that if you have done: 

820 

821 .. code-block:: py 

822 

823 new_ref = ref.overrideStorageClass(sc) 

824 

825 and this is successful, then the guarantee is that: 

826 

827 .. code-block:: py 

828 

829 assert ref.is_compatible_with(new_ref) is True 

830 

831 since we know that the python type associated with the new ref can 

832 be converted to the original python type. The reverse is not guaranteed 

833 and depends on whether bidirectional converters have been registered. 

834 """ 

835 if self.id != other.id: 

836 return False 

837 if self.dataId != other.dataId: 

838 return False 

839 if self.run != other.run: 

840 return False 

841 return self.datasetType.is_compatible_with(other.datasetType) 

842 

843 datasetType: DatasetType 

844 """The definition of this dataset (`DatasetType`). 

845 

846 Cannot be changed after a `DatasetRef` is constructed. 

847 """ 

848 

849 dataId: DataCoordinate 

850 """A mapping of `Dimension` primary key values that labels the dataset 

851 within a Collection (`DataCoordinate`). 

852 

853 Cannot be changed after a `DatasetRef` is constructed. 

854 """ 

855 

856 run: str 

857 """The name of the run that produced the dataset. 

858 

859 Cannot be changed after a `DatasetRef` is constructed. 

860 """ 

861 

862 datastore_records: DatasetDatastoreRecords | None 

863 """Optional datastore records (`DatasetDatastoreRecords`). 

864 

865 Cannot be changed after a `DatasetRef` is constructed. 

866 """