Coverage for python/lsst/daf/butler/registry/interfaces/_collections.py: 72%

118 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-10-27 09:44 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29from ... import ddl 

30 

31__all__ = [ 

32 "ChainedCollectionRecord", 

33 "CollectionManager", 

34 "CollectionRecord", 

35 "RunRecord", 

36] 

37 

38from abc import abstractmethod 

39from collections import defaultdict 

40from collections.abc import Iterator, Set 

41from typing import TYPE_CHECKING, Any 

42 

43from ..._timespan import Timespan 

44from ...dimensions import DimensionUniverse 

45from .._collection_type import CollectionType 

46from ..wildcards import CollectionWildcard 

47from ._versioning import VersionedExtension, VersionTuple 

48 

49if TYPE_CHECKING: 

50 from ._database import Database, StaticTablesContext 

51 from ._dimensions import DimensionRecordStorageManager 

52 

53 

class CollectionRecord:
    """Internal `Registry` representation of a single collection.

    User-facing code should refer to collections by their `str` names only;
    this struct is meant for use inside `Registry` APIs.

    Parameters
    ----------
    key
        Unique ID of the collection.  May be identical to ``name`` when the
        name itself is used for identification.  Typically an integer or a
        string, but any other database-specific type is allowed.
    name : `str`
        Name of the collection.
    type : `CollectionType`
        Enumeration value describing the type of the collection.

    Notes
    -----
    The `name`, `key`, and `type` attributes assigned by this base class take
    part in the definition of equality, and instances are hashable, so all
    users and derived classes should treat them as immutable.  Attributes
    added by subclasses may be mutable, provided they do not participate in
    any subclass equality definition.
    """

    name: str
    """Name of the collection (`str`).
    """

    key: Any
    """The primary/foreign key value for this collection.
    """

    type: CollectionType
    """Enumeration value describing the type of the collection
    (`CollectionType`).
    """

    def __init__(self, key: Any, name: str, type: CollectionType):
        self.key, self.name, self.type = key, name, type
        assert isinstance(self.type, CollectionType)

    def __eq__(self, other: Any) -> bool:
        # Compare name, then type, then key, stopping at the first mismatch.
        # An operand lacking one of the attributes that gets inspected is
        # not comparable, which is reported via `NotImplemented`.
        try:
            names_match = self.name == other.name
            if not names_match:
                return names_match
            types_match = self.type == other.type
            if not types_match:
                return types_match
            return self.key == other.key
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        # Only the name is hashed; records that compare equal always share
        # a name, so this stays consistent with ``__eq__``.
        return hash(self.name)

    def __repr__(self) -> str:
        return f"CollectionRecord(key={self.key!r}, name={self.name!r}, type={self.type!r})"

    def __str__(self) -> str:
        # The string form of a record is just the collection name.
        return self.name

112 

113 

class RunRecord(CollectionRecord):
    """A subclass of `CollectionRecord` that adds execution information and
    an interface for updating it.

    Notes
    -----
    `update`, `host`, and `timespan` are declared abstract here and raise
    `NotImplementedError`; concrete subclasses are expected to provide them.
    """

    @abstractmethod
    def update(self, host: str | None = None, timespan: Timespan | None = None) -> None:
        """Update the database record for this run with new execution
        information.

        Values not provided will be set to ``NULL`` in the database, not
        ignored.

        Parameters
        ----------
        host : `str`, optional
            Name of the host or system on which this run was produced.
            Detailed form to be set by higher-level convention; from the
            `Registry` perspective, this is an entirely opaque value.
        timespan : `Timespan`, optional
            Begin and end timestamps for the period over which the run was
            produced.  `None`/``NULL`` values are interpreted as infinite
            bounds.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def host(self) -> str | None:
        """Return the name of the host or system on which this run was
        produced (`str` or `None`).
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def timespan(self) -> Timespan:
        """Begin and end timestamps for the period over which the run was
        produced (`Timespan`).

        `None`/``NULL`` values are interpreted as infinite bounds.
        """
        raise NotImplementedError()

    def __repr__(self) -> str:
        # Unlike the base-class repr, the collection type is not included.
        return f"RunRecord(key={self.key!r}, name={self.name!r})"

158 

159 

class ChainedCollectionRecord(CollectionRecord):
    """A `CollectionRecord` specialization that also tracks the ordered child
    collections of a ``CHAINED`` collection.

    Parameters
    ----------
    key
        Unique collection ID, can be the same as ``name`` if ``name`` is used
        for identification.  Usually this is an integer or string, but can be
        other database-specific type.
    name : `str`
        Name of the collection.
    universe : `DimensionUniverse`
        Accepted for the benefit of callers and subclasses; this base-class
        constructor does not use it.
    """

    def __init__(self, key: Any, name: str, universe: DimensionUniverse):
        super().__init__(key=key, name=name, type=CollectionType.CHAINED)
        # Starts out empty; populated later by `update` or `refresh`.
        self._children: tuple[str, ...] = ()

    @property
    def children(self) -> tuple[str, ...]:
        """The ordered search path of child collections that define this chain
        (`tuple` [ `str` ]).
        """
        return self._children

    def update(self, manager: CollectionManager, children: tuple[str, ...], flatten: bool) -> None:
        """Redefine this chain to search the given child collections.

        All external code should set children through this method.  The
        database work is delegated to `_update`, which is the hook that
        subclasses should override.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this records instance and all records
            instances that may appear as its children.
        children : `tuple` [ `str` ]
            A collection search path that should be resolved to set the child
            collections of this chain.
        flatten : `bool`
            If `True`, recursively flatten out any nested
            `~CollectionType.CHAINED` collections in ``children`` first.

        Raises
        ------
        ValueError
            Raised when the child collections contain a cycle.
        """
        wildcard = CollectionWildcard.from_names(children)
        # Every CHAINED collection reachable from the requested children; if
        # this record is among them the new definition would be cyclic.
        reachable_chains = manager.resolve_wildcard(
            wildcard,
            flatten_chains=True,
            include_chains=True,
            collection_types={CollectionType.CHAINED},
        )
        if any(chain == self for chain in reachable_chains):
            raise ValueError(f"Cycle in collection chaining when defining '{self.name}'.")
        if flatten:
            flattened = manager.resolve_wildcard(wildcard, flatten_chains=True)
            children = tuple(member.name for member in flattened)
        # Let the derived class perform the database updates before any of
        # the in-memory state below is touched.
        self._update(manager, children)
        # Bring the manager's child -> parents reverse mapping up to date:
        # drop the links for the previous children, then add the new ones.
        parents_by_child = manager._parents_by_child
        for previous_child in self._children:
            parents_by_child[manager.find(previous_child).key].discard(self.key)
        for current_child in children:
            parents_by_child[manager.find(current_child).key].add(self.key)
        # Finally record the new child sequence on this instance.
        self._children = children

    def refresh(self, manager: CollectionManager) -> None:
        """Load children from the database, using the given manager to resolve
        collection primary key values into records.

        This method exists to ensure that all collections that may appear in a
        chain are known to the manager before any particular chain tries to
        retrieve their records from it.  `ChainedCollectionRecord` subclasses
        can rely on it being called sometime after their own ``__init__`` to
        finish construction.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this records instance and all records
            instances that may appear as its children.
        """
        parents_by_child = manager._parents_by_child
        # Drop the stale child -> parents links for the current children.
        for stale_child in self._children:
            parents_by_child[manager.find(stale_child).key].discard(self.key)
        self._children = self._load(manager)
        # Re-create the links for the freshly loaded children.
        for loaded_child in self._children:
            parents_by_child[manager.find(loaded_child).key].add(self.key)

    @abstractmethod
    def _update(self, manager: CollectionManager, children: tuple[str, ...]) -> None:
        """Protected implementation hook for `update`.

        Subclasses implement this to make the database reflect the children
        given.  It should never be called by anything other than `update`,
        which is what all external code should use.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this records instance and all records
            instances that may appear as its children.
        children : `tuple` [ `str` ]
            A collection search path that should be resolved to set the child
            collections of this chain.  Guaranteed not to contain cycles.
        """
        raise NotImplementedError()

    @abstractmethod
    def _load(self, manager: CollectionManager) -> tuple[str, ...]:
        """Protected implementation hook for `refresh`.

        Subclasses implement this to retrieve the chain's child collections
        from the database and return them.  It should never be called by
        anything other than `refresh`, which is what all external code should
        use.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this records instance and all records
            instances that may appear as its children.

        Returns
        -------
        children : `tuple` [ `str` ]
            The ordered sequence of collection names that defines the chained
            collection.  Guaranteed not to contain cycles.
        """
        raise NotImplementedError()

    def __repr__(self) -> str:
        return f"ChainedCollectionRecord(key={self.key!r}, name={self.name!r}, children={self.children!r})"

301 

302 

class CollectionManager(VersionedExtension):
    """An interface for managing the collections (including runs) in a
    `Registry`.

    Parameters
    ----------
    registry_schema_version : `VersionTuple` or `None`, optional
        Schema version of this extension as defined in registry; forwarded
        to the `VersionedExtension` constructor.

    Notes
    -----
    Each layer in a multi-layer `Registry` has its own record for any
    collection for which it has datasets (or quanta).  Different layers may
    use different IDs for the same collection, so any usage of the IDs
    obtained through the `CollectionManager` APIs are strictly for internal
    (to `Registry`) use.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)
        # Reverse mapping from a child collection's key to the keys of the
        # CHAINED collections that directly contain it.  Maintained by
        # `ChainedCollectionRecord.update` and
        # `ChainedCollectionRecord.refresh`.
        self._parents_by_child: defaultdict[Any, set[Any]] = defaultdict(set)

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        dimensions: DimensionRecordStorageManager,
        registry_schema_version: VersionTuple | None = None,
    ) -> CollectionManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present in a layer
            implemented with this manager.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in registry.

        Returns
        -------
        manager : `CollectionManager`
            An instance of a concrete `CollectionManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addCollectionForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        prefix: str = "collection",
        onDelete: str | None = None,
        constraint: bool = True,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the collection
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the collection
            table.  Will be modified in place.
        prefix : `str`, optional
            A name to use for the prefix of the new field; the full name may
            have a suffix (and is given in the returned `ddl.FieldSpec`).
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the collection row is deleted.  `None`
            indicates that this should be an integrity error.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the collection primary key, but do not add a foreign key
            constraint.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        fieldSpec : `ddl.FieldSpec`
            Specification for the field being added.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addRunForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        prefix: str = "run",
        onDelete: str | None = None,
        constraint: bool = True,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the run
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the run table.
            Will be modified in place.
        prefix : `str`, optional
            A name to use for the prefix of the new field; the full name may
            have a suffix (and is given in the returned `ddl.FieldSpec`).
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the collection row is deleted.  `None`
            indicates that this should be an integrity error.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the run primary key, but do not add a foreign key constraint.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        fieldSpec : `ddl.FieldSpec`
            Specification for the field being added.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getCollectionForeignKeyName(cls, prefix: str = "collection") -> str:
        """Return the name of the field added by `addCollectionForeignKey`
        if called with the same prefix.

        Parameters
        ----------
        prefix : `str`
            A name to use for the prefix of the new field; the full name may
            have a suffix.

        Returns
        -------
        name : `str`
            The field name.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getRunForeignKeyName(cls, prefix: str = "run") -> str:
        """Return the name of the field added by `addRunForeignKey`
        if called with the same prefix.

        Parameters
        ----------
        prefix : `str`
            A name to use for the prefix of the new field; the full name may
            have a suffix.

        Returns
        -------
        name : `str`
            The field name.
        """
        raise NotImplementedError()

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        collections that may have been registered by other clients since it
        was initialized or last refreshed.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(
        self, name: str, type: CollectionType, doc: str | None = None
    ) -> tuple[CollectionRecord, bool]:
        """Ensure that a collection of the given name and type are present
        in the layer this manager is associated with.

        Parameters
        ----------
        name : `str`
            Name of the collection.
        type : `CollectionType`
            Enumeration value indicating the type of collection.
        doc : `str`, optional
            Documentation string for the collection.  Ignored if the
            collection already exists.

        Returns
        -------
        record : `CollectionRecord`
            Object representing the collection, including its type and ID.
            If ``type is CollectionType.RUN``, this will be a `RunRecord`
            instance.  If ``type is CollectionType.CHAINED``, this will be a
            `ChainedCollectionRecord` instance.
        registered : `bool`
            `True` if the collection was registered, `False` if it already
            existed.

        Raises
        ------
        TransactionInterruption
            Raised if this operation is invoked within a `Database.transaction`
            context.
        DatabaseConflictError
            Raised if a collection with this name but a different type already
            exists.

        Notes
        -----
        Concurrent registrations of the same collection should be safe; nothing
        should happen if the types are consistent, and integrity errors due to
        inconsistent types should happen before any database changes are made.
        """
        raise NotImplementedError()

    @abstractmethod
    def remove(self, name: str) -> None:
        """Completely remove a collection.

        Any existing `CollectionRecord` objects that correspond to the removed
        collection are considered invalidated.

        Parameters
        ----------
        name : `str`
            Name of the collection to remove.

        Notes
        -----
        If this collection is referenced by foreign keys in tables managed by
        other objects, the ON DELETE clauses of those tables will be invoked.
        That will frequently delete many dependent rows automatically (via
        "CASCADE"), but it may also cause this operation to fail (with
        rollback) unless dependent rows that do not have an ON DELETE clause
        are removed first.
        """
        raise NotImplementedError()

    @abstractmethod
    def find(self, name: str) -> CollectionRecord:
        """Return the collection record associated with the given name.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Returns
        -------
        record : `CollectionRecord`
            Object representing the collection, including its type and ID.
            If ``record.type is CollectionType.RUN``, this will be a
            `RunRecord` instance.  If
            ``record.type is CollectionType.CHAINED``, this will be a
            `ChainedCollectionRecord` instance.

        Raises
        ------
        MissingCollectionError
            Raised if the given collection does not exist.

        Notes
        -----
        Collections registered by another client of the same layer since the
        last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def __getitem__(self, key: Any) -> CollectionRecord:
        """Return the collection record associated with the given
        primary/foreign key value.

        Parameters
        ----------
        key
            Internal primary key value for the collection.

        Returns
        -------
        record : `CollectionRecord`
            Object representing the collection, including its type and name.
            If ``record.type is CollectionType.RUN``, this will be a
            `RunRecord` instance.  If
            ``record.type is CollectionType.CHAINED``, this will be a
            `ChainedCollectionRecord` instance.

        Raises
        ------
        MissingCollectionError
            Raised if no collection with this key exists.

        Notes
        -----
        Collections registered by another client of the same layer since the
        last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        wildcard: CollectionWildcard,
        *,
        collection_types: Set[CollectionType] = CollectionType.all(),
        done: set[str] | None = None,
        flatten_chains: bool = True,
        include_chains: bool | None = None,
    ) -> list[CollectionRecord]:
        """Return the collection records that match a wildcard.

        Parameters
        ----------
        wildcard : `CollectionWildcard`
            Names and/or patterns for collections.
        collection_types : `collections.abc.Set` [ `CollectionType` ], optional
            If provided, only return collections of these types.
        done : `set` [ `str` ], optional
            A `set` of collection names that will not be returned (presumably
            because they have already been returned in some higher-level logic)
            that will also be updated with the names of the collections
            returned.
        flatten_chains : `bool`, optional
            If `True` (default) recursively return the child collections of
            `~CollectionType.CHAINED` collections.
        include_chains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flatten_chains``: either return records for CHAINED collections
            or their children, but not both.

        Returns
        -------
        records : `list` [ `CollectionRecord` ]
            Matching collection records.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDocumentation(self, key: Any) -> str | None:
        """Retrieve the documentation string for a collection.

        Parameters
        ----------
        key
            Internal primary key value for the collection.

        Returns
        -------
        docs : `str` or `None`
            Docstring for the collection with the given key.
        """
        raise NotImplementedError()

    @abstractmethod
    def setDocumentation(self, key: Any, doc: str | None) -> None:
        """Set the documentation string for a collection.

        Parameters
        ----------
        key
            Internal primary key value for the collection.
        doc : `str` or `None`
            Docstring for the collection with the given key.
        """
        raise NotImplementedError()

    def getParentChains(self, key: Any) -> Iterator[ChainedCollectionRecord]:
        """Find all CHAINED collections that directly contain the given
        collection.

        Parameters
        ----------
        key
            Internal primary key value for the collection.

        Yields
        ------
        record : `ChainedCollectionRecord`
            Record of a CHAINED collection that has the given collection as
            a direct child, according to the manager's in-memory reverse
            mapping.
        """
        # Use ``.get`` rather than indexing: indexing the defaultdict would
        # insert an empty set for every key that is merely queried.  Take a
        # snapshot so that changes to chain membership made by the consumer
        # between yields cannot break the iteration.
        parent_keys = tuple(self._parents_by_child.get(key, ()))
        for parent_key in parent_keys:
            result = self[parent_key]
            assert isinstance(result, ChainedCollectionRecord)
            yield result
686 result = self[parent_key] 

687 assert isinstance(result, ChainedCollectionRecord) 

688 yield result