Coverage for python/lsst/daf/butler/core/datasets/ref.py: 33%

230 statements  

coverage.py v7.3.1, created at 2023-10-02 08:00 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = [ 

30 "AmbiguousDatasetError", 

31 "DatasetId", 

32 "DatasetIdFactory", 

33 "DatasetIdGenEnum", 

34 "DatasetRef", 

35 "SerializedDatasetRef", 

36] 

37 

38import enum 

39import sys 

40import uuid 

41from collections.abc import Iterable 

42from typing import TYPE_CHECKING, Any, ClassVar, Protocol, TypeAlias, runtime_checkable 

43 

44import pydantic 

45from lsst.daf.butler._compat import PYDANTIC_V2, _BaseModelCompat 

46from lsst.utils.classes import immutable 

47from pydantic import StrictStr 

48 

49from ..configSupport import LookupKey 

50from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate 

51from ..json import from_json_pydantic, to_json_pydantic 

52from ..named import NamedKeyDict 

53from ..persistenceContext import PersistenceContextVars 

54from .type import DatasetType, SerializedDatasetType 

55 

56if TYPE_CHECKING: 

57 from ...registry import Registry 

58 from ..storageClass import StorageClass 

59 

60 

61class AmbiguousDatasetError(Exception): 

62 """Raised when a `DatasetRef` is not resolved but should be. 

63 

64 This happens when the `DatasetRef` has no ID or run but the requested 

65 operation requires one of them. 

66 """ 

67 

68 

69@runtime_checkable 

70class _DatasetRefGroupedIterable(Protocol): 

71 """A package-private interface for iterables of `DatasetRef` that know how 

72 to efficiently group their contents by `DatasetType`. 

73 

74 """ 

75 

76 def _iter_by_dataset_type(self) -> Iterable[tuple[DatasetType, Iterable[DatasetRef]]]: 

77 """Iterate over `DatasetRef` instances, one `DatasetType` at a time. 

78 

79 Returns 

80 ------- 

81 grouped : `~collections.abc.Iterable` [ `tuple` [ `DatasetType`, \ 

82 `~collections.abc.Iterable` [ `DatasetRef` ] ] ]

83 An iterable of tuples, in which the first element is a dataset type 

84 and the second is an iterable of `DatasetRef` objects with exactly 

85 that dataset type. 

86 """ 

87 ... 

88 

89 

90class DatasetIdGenEnum(enum.Enum): 

91 """Enum used to specify dataset ID generation options.""" 

92 

93 UNIQUE = 0 

94 """Unique mode generates a unique ID for each inserted dataset, e.g.

95 auto-generated by the database or a random UUID.

96 """ 

97 

98 DATAID_TYPE = 1 

99 """In this mode the ID is computed deterministically from a combination of

100 dataset type and dataId. 

101 """ 

102 

103 DATAID_TYPE_RUN = 2 

104 """In this mode the ID is computed deterministically from a combination of

105 dataset type, dataId, and run collection name. 

106 """ 

107 

108 

109class DatasetIdFactory: 

110 """Factory for dataset IDs (UUIDs). 

111 

112 For now the logic is hard-coded and is controlled by the user-provided 

113 value of `DatasetIdGenEnum`. In the future we may implement a configurable 

114 logic that can guess `DatasetIdGenEnum` value from other parameters. 

115 """ 

116 

117 NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f") 

118 """Namespace UUID used for UUID5 generation. Do not change. This was 

119 produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`. 

120 """ 

121 

122 def makeDatasetId( 

123 self, 

124 run: str, 

125 datasetType: DatasetType, 

126 dataId: DataCoordinate, 

127 idGenerationMode: DatasetIdGenEnum, 

128 ) -> uuid.UUID: 

129 """Generate dataset ID for a dataset. 

130 

131 Parameters 

132 ---------- 

133 run : `str` 

134 Name of the RUN collection for the dataset. 

135 datasetType : `DatasetType` 

136 Dataset type. 

137 dataId : `DataCoordinate` 

138 Expanded data ID for the dataset. 

139 idGenerationMode : `DatasetIdGenEnum` 

140 ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random 

141 UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a 

142 deterministic UUID5-type ID based on a dataset type name and 

143 ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a 

144 deterministic UUID5-type ID based on a dataset type name, run 

145 collection name, and ``dataId``. 

146 

147 Returns 

148 ------- 

149 datasetId : `uuid.UUID` 

150 Dataset identifier. 

151 """ 

152 if idGenerationMode is DatasetIdGenEnum.UNIQUE: 

153 return uuid.uuid4() 

154 else: 

155 # WARNING: If you modify this code make sure that the order of 

156 # items in the `items` list below never changes. 

157 items: list[tuple[str, str]] = [] 

158 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE: 

159 items = [ 

160 ("dataset_type", datasetType.name), 

161 ] 

162 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN: 

163 items = [ 

164 ("dataset_type", datasetType.name), 

165 ("run", run), 

166 ] 

167 else: 

168 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}") 

169 

170 for name, value in sorted(dataId.byName().items()): 

171 items.append((name, str(value))) 

172 data = ",".join(f"{key}={value}" for key, value in items) 

173 return uuid.uuid5(self.NS_UUID, data) 

174 
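# A minimal sketch of the deterministic modes above (the dataset type name,
# run name, and data ID values here are illustrative assumptions, not part of
# this module). For DATAID_TYPE_RUN with dataset type "raw", run "HSC/raw/all",
# and data ID {"instrument": "HSC", "exposure": 903334}, the sorted key/value
# pairs produce the string
#
#     "dataset_type=raw,run=HSC/raw/all,exposure=903334,instrument=HSC"
#
# so the resulting ID is reproducible:
#
#     dataset_id = uuid.uuid5(
#         DatasetIdFactory.NS_UUID,
#         "dataset_type=raw,run=HSC/raw/all,exposure=903334,instrument=HSC",
#     )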

175 

176# This is constant, so don't recreate a set for each instance 

177_serializedDatasetRefFieldsSet = {"id", "datasetType", "dataId", "run", "component"} 

178 

179 

180class SerializedDatasetRef(_BaseModelCompat): 

181 """Simplified model of a `DatasetRef` suitable for serialization.""" 

182 

183 id: uuid.UUID 

184 datasetType: SerializedDatasetType | None = None 

185 dataId: SerializedDataCoordinate | None = None 

186 run: StrictStr | None = None 

187 component: StrictStr | None = None 

188 

189 if PYDANTIC_V2: 189 ↛ 192 (line 189 didn't jump to line 192, because the condition on line 189 was never true)

190 # Can not use "after" validator since in some cases the validator 

191 # seems to trigger with the datasetType field not yet set. 

192 @pydantic.model_validator(mode="before") # type: ignore[attr-defined] 

193 @classmethod 

194 def check_consistent_parameters(cls, data: dict[str, Any]) -> dict[str, Any]: 

195 has_datasetType = data.get("datasetType") is not None 

196 has_dataId = data.get("dataId") is not None 

197 if has_datasetType is not has_dataId: 

198 raise ValueError("If specifying datasetType or dataId, must specify both.") 

199 

200 if data.get("component") is not None and has_datasetType: 

201 raise ValueError("datasetType can not be set if component is given.") 

202 return data 

203 

204 else: 

205 

206 @pydantic.validator("dataId") 

207 def _check_dataId(cls, v: Any, values: dict[str, Any]) -> Any: # noqa: N805 

208 if v and (d := "datasetType") in values and values[d] is None: 

209 raise ValueError("Can not specify 'dataId' without specifying 'datasetType'") 

210 return v 

211 

212 @pydantic.validator("component") 

213 def _check_component(cls, v: Any, values: dict[str, Any]) -> Any: # noqa: N805 

214 # Component should not be given if datasetType is given 

215 if v and (d := "datasetType") in values and values[d] is not None: 

216 raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).") 

217 return v 

218 

219 @classmethod 

220 def direct( 

221 cls, 

222 *, 

223 id: str, 

224 run: str, 

225 datasetType: dict[str, Any] | None = None, 

226 dataId: dict[str, Any] | None = None, 

227 component: str | None = None, 

228 ) -> SerializedDatasetRef: 

229 """Construct a `SerializedDatasetRef` directly without validators. 

230 

231 Notes 

232 ----- 

233 This differs from the pydantic "construct" method in that the arguments 

234 are explicitly what the model requires, and it will recurse through 

235 members, constructing them from their corresponding `direct` methods. 

236 

237 The ``id`` parameter is a string representation of the dataset ID; it is

238 converted to a UUID by this method.

239 

240 This method should only be called when the inputs are trusted. 

241 """ 

242 serialized_datasetType = ( 

243 SerializedDatasetType.direct(**datasetType) if datasetType is not None else None 

244 ) 

245 serialized_dataId = SerializedDataCoordinate.direct(**dataId) if dataId is not None else None 

246 

247 node = cls.model_construct( 

248 _fields_set=_serializedDatasetRefFieldsSet, 

249 id=uuid.UUID(id), 

250 datasetType=serialized_datasetType, 

251 dataId=serialized_dataId, 

252 run=sys.intern(run), 

253 component=component, 

254 ) 

255 

256 return node 

257 
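# A minimal sketch of constructing the serialized model directly (the UUIDs
# and component name below are illustrative assumptions):
#
#     minimal = SerializedDatasetRef(id=uuid.uuid4())
#     with_component = SerializedDatasetRef(id=uuid.uuid4(), component="wcs")
#
# The full (non-minimal) form additionally carries ``datasetType``, ``dataId``,
# and ``run``; ``DatasetRef.to_simple`` below produces either form.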

258 

259DatasetId: TypeAlias = uuid.UUID 

260"""A type-annotation alias for dataset ID providing typing flexibility. 

261""" 

262 

263 

264@immutable 

265class DatasetRef: 

266 """Reference to a Dataset in a `Registry`. 

267 

268 A `DatasetRef` may point to a Dataset that does not yet exist

269 (e.g., because it is a predicted input for provenance). 

270 

271 Parameters 

272 ---------- 

273 datasetType : `DatasetType` 

274 The `DatasetType` for this Dataset. 

275 dataId : `DataCoordinate` 

276 A mapping of dimensions that labels the Dataset within a Collection. 

277 run : `str` 

278 The name of the run this dataset was associated with when it was 

279 created. 

280 id : `DatasetId`, optional 

281 The unique identifier assigned when the dataset is created. If ``id`` 

282 is not specified, a new unique ID will be created. 

283 conform : `bool`, optional 

284 If `True` (default), call `DataCoordinate.standardize` to ensure that 

285 the data ID's dimensions are consistent with the dataset type's. 

286 `DatasetRef` instances for which those dimensions are not equal should 

287 not be created in new code, but are still supported for backwards 

288 compatibility. New code should only pass `False` if it can guarantee 

289 that the dimensions are already consistent. 

290 id_generation_mode : `DatasetIdGenEnum` 

291 ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random 

292 UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a 

293 deterministic UUID5-type ID based on a dataset type name and 

294 ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a 

295 deterministic UUID5-type ID based on a dataset type name, run 

296 collection name, and ``dataId``. 

297 

298 See Also 

299 -------- 

300 :ref:`daf_butler_organizing_datasets` 

301 """ 

302 

303 _serializedType = SerializedDatasetRef 

304 __slots__ = ( 

305 "_id", 

306 "datasetType", 

307 "dataId", 

308 "run", 

309 ) 

310 

311 def __init__( 

312 self, 

313 datasetType: DatasetType, 

314 dataId: DataCoordinate, 

315 run: str, 

316 *, 

317 id: DatasetId | None = None, 

318 conform: bool = True, 

319 id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

320 ): 

321 self.datasetType = datasetType 

322 if conform: 

323 self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions) 

324 else: 

325 self.dataId = dataId 

326 self.run = run 

327 if id is not None: 

328 self._id = id.int 

329 else: 

330 self._id = ( 

331 DatasetIdFactory() 

332 .makeDatasetId(self.run, self.datasetType, self.dataId, id_generation_mode) 

333 .int 

334 ) 

335 
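# A minimal sketch of constructing a resolved ref (assumes ``dataset_type`` is
# an existing `DatasetType` whose dimensions are instrument+exposure; the data
# ID values and run name are illustrative assumptions):
#
#     data_id = DataCoordinate.standardize(
#         {"instrument": "HSC", "exposure": 903334}, graph=dataset_type.dimensions
#     )
#     ref = DatasetRef(
#         dataset_type,
#         data_id,
#         run="HSC/raw/all",
#         id_generation_mode=DatasetIdGenEnum.DATAID_TYPE_RUN,
#     )
#     # Omitting ``id`` generates one according to ``id_generation_mode``.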

336 @property 

337 def id(self) -> DatasetId: 

338 """Primary key of the dataset (`DatasetId`). 

339 

340 Cannot be changed after a `DatasetRef` is constructed. 

341 """ 

342 return uuid.UUID(int=self._id) 

343 

344 def __eq__(self, other: Any) -> bool: 

345 try: 

346 return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id) 

347 except AttributeError: 

348 return NotImplemented 

349 

350 def __hash__(self) -> int: 

351 return hash((self.datasetType, self.dataId, self.id)) 

352 

353 @property 

354 def dimensions(self) -> DimensionGraph: 

355 """Dimensions associated with the underlying `DatasetType`.""" 

356 return self.datasetType.dimensions 

357 

358 def __repr__(self) -> str: 

359 # We delegate to __str__ (i.e. use "!s") for the data ID below because

360 # DataCoordinate's __repr__ - while adhering to the guidelines for

361 # __repr__ - is much harder for users to read, while its __str__ just

362 # produces a dict that can also be passed to DatasetRef's constructor.

363 return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, run={self.run!r}, id={self.id})" 

364 

365 def __str__(self) -> str: 

366 s = ( 

367 f"{self.datasetType.name}@{self.dataId!s} [sc={self.datasetType.storageClass_name}]" 

368 f" (run={self.run} id={self.id})" 

369 ) 

370 return s 

371 

372 def __lt__(self, other: Any) -> bool: 

373 # Sort by run, DatasetType name and then by DataCoordinate 

374 # The __str__ representation is probably close enough but we 

375 # need to ensure that sorting a DatasetRef matches what you would 

376 # get if you sorted DatasetType+DataCoordinate 

377 if not isinstance(other, type(self)): 

378 return NotImplemented 

379 

380 # Group by run if defined, takes precedence over DatasetType 

381 self_run = "" if self.run is None else self.run 

382 other_run = "" if other.run is None else other.run 

383 

384 # Compare tuples in the priority order 

385 return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId) 

386 

387 def to_simple(self, minimal: bool = False) -> SerializedDatasetRef: 

388 """Convert this class to a simple python type. 

389 

390 This makes it suitable for serialization. 

391 

392 Parameters 

393 ---------- 

394 minimal : `bool`, optional 

395 Use minimal serialization. Requires Registry to convert 

396 back to a full type. 

397 

398 Returns 

399 ------- 

400 simple : `SerializedDatasetRef`

401 The object converted to simplified form.

402 """ 

403 if minimal: 

404 # The only thing needed to uniquely define a DatasetRef is its id 

405 # so that can be used directly if it is not a component DatasetRef. 

406 # Store it in a dict to allow us to easily add the planned origin

407 # information later without having to support an int and dict in 

408 # simple form. 

409 simple: dict[str, Any] = {"id": self.id} 

410 if self.isComponent(): 

411 # We can still be a little minimalist with a component 

412 # but we will also need to record the datasetType component 

413 simple["component"] = self.datasetType.component() 

414 return SerializedDatasetRef(**simple) 

415 

416 return SerializedDatasetRef( 

417 datasetType=self.datasetType.to_simple(minimal=minimal), 

418 dataId=self.dataId.to_simple(), 

419 run=self.run, 

420 id=self.id, 

421 ) 

422 

423 @classmethod 

424 def from_simple( 

425 cls, 

426 simple: SerializedDatasetRef, 

427 universe: DimensionUniverse | None = None, 

428 registry: Registry | None = None, 

429 datasetType: DatasetType | None = None, 

430 ) -> DatasetRef: 

431 """Construct a new object from simplified form. 

432 

433 Generally this is data returned from the `to_simple` method. 

434 

435 Parameters 

436 ---------- 

437 simple : `SerializedDatasetRef`

438 The value returned by `to_simple()`. 

439 universe : `DimensionUniverse` 

440 The special graph of all known dimensions. 

441 Can be `None` if a registry is provided. 

442 registry : `lsst.daf.butler.Registry`, optional 

443 Registry to use to convert simple form of a DatasetRef to 

444 a full `DatasetRef`. Can be `None` if a full description of 

445 the type is provided along with a universe. 

446 datasetType : `DatasetType`, optional

447 If datasetType is supplied, this will be used as the datasetType

448 object in the resulting DatasetRef instead of being read from

449 the `SerializedDatasetRef`. This is useful when many refs share

450 the same type, since memory can be saved. Defaults to `None`.

451 

452 Returns 

453 ------- 

454 ref : `DatasetRef` 

455 Newly-constructed object. 

456 """ 

457 cache = PersistenceContextVars.datasetRefs.get() 

458 localName = sys.intern( 

459 datasetType.name 

460 if datasetType is not None 

461 else (x.name if (x := simple.datasetType) is not None else "") 

462 ) 

463 key = (simple.id.int, localName) 

464 if cache is not None and (cachedRef := cache.get(key, None)) is not None: 

465 return cachedRef 

466 # Minimalist component will just specify component and id and 

467 # require registry to reconstruct 

468 if not (simple.datasetType is not None or simple.dataId is not None or simple.run is not None): 

469 if registry is None: 

470 raise ValueError("Registry is required to construct component DatasetRef from integer id") 

471 if simple.id is None: 

472 raise ValueError("For minimal DatasetRef the ID must be defined.") 

473 ref = registry.getDataset(simple.id) 

474 if ref is None: 

475 raise RuntimeError(f"No matching dataset found in registry for id {simple.id}") 

476 if simple.component: 

477 ref = ref.makeComponentRef(simple.component) 

478 if cache is not None: 

479 cache[key] = ref 

480 return ref 

481 

482 if universe is None and registry is None: 

483 raise ValueError("One of universe or registry must be provided.") 

484 

485 if universe is None and registry is not None: 

486 universe = registry.dimensions 

487 

488 if universe is None: 

489 # this is for mypy 

490 raise ValueError("Unable to determine a usable universe") 

491 

492 if simple.datasetType is None and datasetType is None: 

493 # mypy 

494 raise ValueError("The DatasetType must be specified to construct a DatasetRef") 

495 if datasetType is None: 

496 if simple.datasetType is None: 

497 raise ValueError("Cannot determine Dataset type of this serialized class") 

498 datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry) 

499 

500 if simple.dataId is None: 

501 # mypy 

502 raise ValueError("The DataId must be specified to construct a DatasetRef") 

503 dataId = DataCoordinate.from_simple(simple.dataId, universe=universe) 

504 

505 # Check that simple ref is resolved. 

506 if simple.run is None: 

507 dstr = "" 

508 if simple.datasetType is None: 

509 dstr = f" (datasetType={datasetType.name!r})" 

510 raise ValueError( 

511 "Run collection name is missing from serialized representation. " 

512 f"Encountered with {simple!r}{dstr}." 

513 ) 

514 

515 newRef = cls(datasetType, dataId, id=simple.id, run=simple.run) 

516 if cache is not None: 

517 cache[key] = newRef 

518 return newRef 

519 
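# A minimal round-trip sketch (assumes ``ref`` is an existing resolved
# `DatasetRef` and ``universe`` is the repository's `DimensionUniverse`):
#
#     simple = ref.to_simple()
#     restored = DatasetRef.from_simple(simple, universe=universe)
#     assert restored == ref
#
#     # The JSON helpers assigned below wrap the same machinery:
#     restored = DatasetRef.from_json(ref.to_json(), universe=universe)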

520 to_json = to_json_pydantic 

521 from_json: ClassVar = classmethod(from_json_pydantic) 

522 

523 @classmethod 

524 def _unpickle( 

525 cls, 

526 datasetType: DatasetType, 

527 dataId: DataCoordinate, 

528 id: DatasetId, 

529 run: str, 

530 ) -> DatasetRef: 

531 """Create new `DatasetRef`. 

532 

533 A custom factory method for use by `__reduce__` as a workaround for 

534 its lack of support for keyword arguments. 

535 """ 

536 return cls(datasetType, dataId, id=id, run=run) 

537 

538 def __reduce__(self) -> tuple: 

539 return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run)) 

540 

541 def __deepcopy__(self, memo: dict) -> DatasetRef: 

542 # DatasetRef is recursively immutable; see note in @immutable 

543 # decorator. 

544 return self 

545 

546 def expanded(self, dataId: DataCoordinate) -> DatasetRef: 

547 """Return a new `DatasetRef` with the given expanded data ID. 

548 

549 Parameters 

550 ---------- 

551 dataId : `DataCoordinate` 

552 Data ID for the new `DatasetRef`. Must compare equal to the 

553 original data ID. 

554 

555 Returns 

556 ------- 

557 ref : `DatasetRef` 

558 A new `DatasetRef` with the given data ID. 

559 """ 

560 assert dataId == self.dataId 

561 return DatasetRef( 

562 datasetType=self.datasetType, dataId=dataId, id=self.id, run=self.run, conform=False 

563 ) 

564 

565 def isComponent(self) -> bool: 

566 """Indicate whether this `DatasetRef` refers to a component. 

567 

568 Returns 

569 ------- 

570 isComponent : `bool` 

571 `True` if this `DatasetRef` is a component, `False` otherwise. 

572 """ 

573 return self.datasetType.isComponent() 

574 

575 def isComposite(self) -> bool: 

576 """Indicate whether this `DatasetRef` refers to a composite type.

577 

578 Returns 

579 ------- 

580 isComposite : `bool` 

581 `True` if this `DatasetRef` is a composite type, `False` 

582 otherwise. 

583 """ 

584 return self.datasetType.isComposite() 

585 

586 def _lookupNames(self) -> tuple[LookupKey, ...]: 

587 """Name keys to use when looking up this DatasetRef in a configuration. 

588 

589 The names are returned in order of priority. 

590 

591 Returns 

592 ------- 

593 names : `tuple` of `LookupKey` 

594 Tuple of the `DatasetType` name and the `StorageClass` name. 

595 If ``instrument`` is defined in the dataId, each of those names 

596 is added to the start of the tuple with a key derived from the 

597 value of ``instrument``. 

598 """ 

599 # Special case the instrument Dimension since we allow configs 

600 # to include the instrument name in the hierarchy. 

601 names: tuple[LookupKey, ...] = self.datasetType._lookupNames() 

602 

603 if "instrument" in self.dataId: 

604 names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) for n in names) + names 

605 

606 return names 

607 

608 @staticmethod 

609 def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

610 """Group an iterable of `DatasetRef` by `DatasetType`. 

611 

612 Parameters 

613 ---------- 

614 refs : `~collections.abc.Iterable` [ `DatasetRef` ] 

615 `DatasetRef` instances to group. 

616 

617 Returns 

618 ------- 

619 grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ] 

620 Grouped `DatasetRef` instances. 

621 

622 Notes 

623 ----- 

624 When lazy item-iterables are acceptable instead of a full mapping, 

625 `iter_by_type` can in some cases be far more efficient. 

626 """ 

627 result: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict() 

628 for ref in refs: 

629 result.setdefault(ref.datasetType, []).append(ref) 

630 return result 

631 

632 @staticmethod 

633 def iter_by_type( 

634 refs: Iterable[DatasetRef], 

635 ) -> Iterable[tuple[DatasetType, Iterable[DatasetRef]]]: 

636 """Group an iterable of `DatasetRef` by `DatasetType` with special 

637 hooks for custom iterables that can do this efficiently. 

638 

639 Parameters 

640 ---------- 

641 refs : `~collections.abc.Iterable` [ `DatasetRef` ] 

642 `DatasetRef` instances to group. If this satisfies the 

643 `_DatasetRefGroupedIterable` protocol, its 

644 `~_DatasetRefGroupedIterable._iter_by_dataset_type` method will 

645 be called. 

646 

647 Returns 

648 ------- 

649 grouped : `~collections.abc.Iterable` [ `tuple` [ `DatasetType`, \ 

650 `Iterable` [ `DatasetRef` ] ]] 

651 Grouped `DatasetRef` instances. 

652 """ 

653 if isinstance(refs, _DatasetRefGroupedIterable): 

654 return refs._iter_by_dataset_type() 

655 return DatasetRef.groupByType(refs).items() 

656 
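# A minimal usage sketch (``refs`` is any iterable of `DatasetRef`, e.g. the
# result of a registry query; the variable names are illustrative):
#
#     by_type = DatasetRef.groupByType(refs)        # full NamedKeyDict mapping
#     for dataset_type, type_refs in DatasetRef.iter_by_type(refs):
#         ...                                       # lazy, possibly pre-grouped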

657 def makeCompositeRef(self) -> DatasetRef: 

658 """Create a `DatasetRef` of the composite from a component ref. 

659 

660 Requires that this `DatasetRef` is a component. 

661 

662 Returns 

663 ------- 

664 ref : `DatasetRef` 

665 A `DatasetRef` with a dataset type that corresponds to the 

666 composite parent of this component, and the same ID and run

667 as ``self``.

668 """ 

669 # Assume that the data ID does not need to be standardized 

670 # and should match whatever this ref already has. 

671 return DatasetRef( 

672 self.datasetType.makeCompositeDatasetType(), self.dataId, id=self.id, run=self.run, conform=False 

673 ) 

674 

675 def makeComponentRef(self, name: str) -> DatasetRef: 

676 """Create a `DatasetRef` that corresponds to a component. 

677 

678 Parameters 

679 ---------- 

680 name : `str` 

681 Name of the component. 

682 

683 Returns 

684 ------- 

685 ref : `DatasetRef` 

686 A `DatasetRef` with a dataset type that corresponds to the given 

687 component, and the same ID and run

688 as ``self``.

689 """ 

690 # Assume that the data ID does not need to be standardized 

691 # and should match whatever this ref already has. 

692 return DatasetRef( 

693 self.datasetType.makeComponentDatasetType(name), 

694 self.dataId, 

695 id=self.id, 

696 run=self.run, 

697 conform=False, 

698 ) 

699 
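# A minimal sketch (assumes ``ref`` points to a composite dataset type such as
# "calexp" and that "wcs" is one of its components):
#
#     wcs_ref = ref.makeComponentRef("wcs")     # dataset type becomes "calexp.wcs"
#     parent_ref = wcs_ref.makeCompositeRef()   # back to the composite parent
#     assert parent_ref.id == wcs_ref.id == ref.id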

700 def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetRef: 

701 """Create a new `DatasetRef` from this one, but with a modified 

702 `DatasetType` that has a different `StorageClass`. 

703 

704 Parameters 

705 ---------- 

706 storageClass : `str` or `StorageClass` 

707 The new storage class. 

708 

709 Returns 

710 ------- 

711 modified : `DatasetRef` 

712 A new dataset reference that is the same as the current one but 

713 with a different storage class in the `DatasetType`. 

714 """ 

715 return self.replace(storage_class=storageClass) 

716 

717 def replace( 

718 self, 

719 *, 

720 id: DatasetId | None = None, 

721 run: str | None = None, 

722 storage_class: str | StorageClass | None = None, 

723 ) -> DatasetRef: 

724 """Create a new `DatasetRef` from this one, but with some modified 

725 attributes. 

726 

727 Parameters 

728 ---------- 

729 id : `DatasetId` or `None` 

730 If not `None` then update dataset ID. 

731 run : `str` or `None` 

732 If not `None` then update run collection name. If ``id`` is

733 `None` then this will also cause a new dataset ID to be generated.

734 storage_class : `str` or `StorageClass` or `None`. 

735 The new storage class. If not `None`, replaces existing storage 

736 class. 

737 

738 Returns 

739 ------- 

740 modified : `DatasetRef` 

741 A new dataset reference with updated attributes. 

742 """ 

743 if storage_class is None: 

744 datasetType = self.datasetType 

745 else: 

746 datasetType = self.datasetType.overrideStorageClass(storage_class) 

747 if run is None: 

748 run = self.run 

749 # Do not regenerate dataset ID if run is the same. 

750 if id is None: 

751 id = self.id 

752 return DatasetRef( 

753 datasetType=datasetType, 

754 dataId=self.dataId, 

755 run=run, 

756 id=id, 

757 conform=False, 

758 ) 

759 
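# A minimal sketch (the storage class and run names are illustrative
# assumptions; see the parameter descriptions above for the ID behaviour):
#
#     relabeled = ref.replace(storage_class="ImageF")   # same id, dataId, and run
#     moved = ref.replace(run="u/someone/rerun")        # new run collection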

760 def is_compatible_with(self, ref: DatasetRef) -> bool: 

761 """Determine if the given `DatasetRef` is compatible with this one. 

762 

763 Parameters 

764 ---------- 

765 ref : `DatasetRef`

766 Dataset ref to check. 

767 

768 Returns 

769 ------- 

770 is_compatible : `bool` 

771 `True` if the given dataset ref is either the same as this one,

772 or its dataset type is compatible with this one's and the dataId

773 and dataset ID match.

774 

775 Notes 

776 ----- 

777 Compatibility requires that the dataId and dataset ID match and the 

778 `DatasetType` is compatible. Dataset types are compatible if the storage

779 class associated with the dataset type of the other ref can be

780 converted to this ref's storage class.

781 

782 Specifically this means that if you have done: 

783 

784 .. code-block:: py 

785 

786 new_ref = ref.overrideStorageClass(sc) 

787 

788 and this is successful, then the guarantee is that: 

789 

790 .. code-block:: py 

791 

792 assert ref.is_compatible_with(new_ref) is True 

793 

794 since we know that the python type associated with the new ref can 

795 be converted to the original python type. The reverse is not guaranteed 

796 and depends on whether bidirectional converters have been registered. 

797 """ 

798 if self.id != ref.id: 

799 return False 

800 if self.dataId != ref.dataId: 

801 return False 

802 if self.run != ref.run: 

803 return False 

804 return self.datasetType.is_compatible_with(ref.datasetType) 

805 

806 datasetType: DatasetType 

807 """The definition of this dataset (`DatasetType`). 

808 

809 Cannot be changed after a `DatasetRef` is constructed. 

810 """ 

811 

812 dataId: DataCoordinate 

813 """A mapping of `Dimension` primary key values that labels the dataset 

814 within a Collection (`DataCoordinate`). 

815 

816 Cannot be changed after a `DatasetRef` is constructed. 

817 """ 

818 

819 run: str 

820 """The name of the run that produced the dataset. 

821 

822 Cannot be changed after a `DatasetRef` is constructed. 

823 """