Coverage for python/lsst/daf/butler/registry/interfaces/_collections.py: 72%

116 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-10-02 08:00 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = [ 

30 "ChainedCollectionRecord", 

31 "CollectionManager", 

32 "CollectionRecord", 

33 "RunRecord", 

34] 

35 

36from abc import abstractmethod 

37from collections import defaultdict 

38from collections.abc import Iterator, Set 

39from typing import TYPE_CHECKING, Any 

40 

41from ...core import DimensionUniverse, Timespan, ddl 

42from .._collectionType import CollectionType 

43from ..wildcards import CollectionWildcard 

44from ._versioning import VersionedExtension, VersionTuple 

45 

46if TYPE_CHECKING: 

47 from ._database import Database, StaticTablesContext 

48 from ._dimensions import DimensionRecordStorageManager 

49 

50 

class CollectionRecord:
    """Lightweight description of one collection, for internal `Registry`
    APIs only.

    Code that faces users should identify collections by their `str` names
    rather than by instances of this class.

    Parameters
    ----------
    key
        Unique collection ID.  It may coincide with ``name`` when the name
        itself is used for identification.  Typically an integer or a
        string, but any database-specific type is permitted.
    name : `str`
        Name of the collection.
    type : `CollectionType`
        Enumeration value describing the type of the collection.

    Notes
    -----
    Instances are hashable, and `name`, `key`, and `type` all take part in
    the equality comparison, so users and derived classes must treat those
    three attributes as immutable once the base class has set them.
    Subclasses are free to add mutable attributes provided those attributes
    are not used by any subclass definition of equality.
    """

    name: str
    """Name of the collection (`str`).
    """

    key: Any
    """The primary/foreign key value for this collection.
    """

    type: CollectionType
    """Enumeration value describing the type of the collection
    (`CollectionType`).
    """

    def __init__(self, key: Any, name: str, type: CollectionType):
        self.key, self.name, self.type = key, name, type
        assert isinstance(self.type, CollectionType)

    def __str__(self) -> str:
        return self.name

    def __repr__(self) -> str:
        key, name, kind = self.key, self.name, self.type
        return f"CollectionRecord(key={key!r}, name={name!r}, type={kind!r})"

    def __hash__(self) -> int:
        # Hashing on the name alone is consistent with __eq__, because
        # records that compare equal always share the same name.
        return hash(self.name)

    def __eq__(self, other: Any) -> bool:
        try:
            same = self.name == other.name and self.type == other.type and self.key == other.key
        except AttributeError:
            # ``other`` lacks one of the compared attributes, so let Python
            # try the reflected comparison.
            return NotImplemented
        return same

109 

110 

class RunRecord(CollectionRecord):
    """A `CollectionRecord` specialization that carries execution information
    for a run, together with an interface for changing that information.
    """

    def __repr__(self) -> str:
        key, name = self.key, self.name
        return f"RunRecord(key={key!r}, name={name!r})"

    @property
    @abstractmethod
    def host(self) -> str | None:
        """Name of the host or system on which this run was produced
        (`str` or `None`).
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def timespan(self) -> Timespan:
        """Begin and end timestamps of the period over which the run was
        produced (`Timespan`).

        `None`/``NULL`` values are interpreted as infinite bounds.
        """
        raise NotImplementedError()

    @abstractmethod
    def update(self, host: str | None = None, timespan: Timespan | None = None) -> None:
        """Write new execution information for this run to the database.

        Arguments that are not provided are not skipped: the corresponding
        values will be set to ``NULL`` in the database.

        Parameters
        ----------
        host : `str`, optional
            Name of the host or system on which this run was produced.
            Its detailed form is set by higher-level convention; `Registry`
            treats it as an entirely opaque value.
        timespan : `Timespan`, optional
            Begin and end timestamps of the period over which the run was
            produced.  `None`/``NULL`` values are interpreted as infinite
            bounds.
        """
        raise NotImplementedError()

155 

156 

class ChainedCollectionRecord(CollectionRecord):
    """A subclass of `CollectionRecord` that adds the list of child collections
    in a ``CHAINED`` collection.

    Parameters
    ----------
    key
        Unique collection ID, can be the same as ``name`` if ``name`` is used
        for identification. Usually this is an integer or string, but can be
        other database-specific type.
    name : `str`
        Name of the collection.
    universe : `DimensionUniverse`
        Dimension universe.  This base class neither uses nor stores it; it
        is part of the constructor signature only.
    """

    def __init__(self, key: Any, name: str, universe: DimensionUniverse):
        super().__init__(key=key, name=name, type=CollectionType.CHAINED)
        # A new record starts with an empty chain; `update` or `refresh`
        # populates it later.
        self._children: tuple[str, ...] = ()

    @property
    def children(self) -> tuple[str, ...]:
        """The ordered search path of child collections that define this chain
        (`tuple` [ `str` ]).
        """
        return self._children

    def update(self, manager: CollectionManager, children: tuple[str, ...], flatten: bool) -> None:
        """Redefine this chain to search the given child collections.

        This method should be used by all external code to set children.  It
        delegates to `_update`, which is what should be overridden by
        subclasses.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this record instance and all record
            instances that may appear as its children.
        children : `tuple` [ `str` ]
            A collection search path that should be resolved to set the child
            collections of this chain.
        flatten : `bool`
            If `True`, recursively flatten out any nested
            `~CollectionType.CHAINED` collections in ``children`` first.

        Raises
        ------
        ValueError
            Raised when the child collections contain a cycle.
        """
        children_as_wildcard = CollectionWildcard.from_names(children)
        # Cycle check: look at every CHAINED collection reachable from the
        # requested children (at any depth) and reject the update if this
        # chain is one of them.
        for record in manager.resolve_wildcard(
            children_as_wildcard,
            flatten_chains=True,
            include_chains=True,
            collection_types={CollectionType.CHAINED},
        ):
            if record == self:
                raise ValueError(f"Cycle in collection chaining when defining '{self.name}'.")
        if flatten:
            children = tuple(
                record.name for record in manager.resolve_wildcard(children_as_wildcard, flatten_chains=True)
            )
        # Delegate to derived classes to do the database updates.
        self._update(manager, children)
        # Only after the database accepted the change, bring the in-memory
        # state (this record and the manager's reverse mapping) up to date.
        self._replace_children(manager, children)

    def refresh(self, manager: CollectionManager) -> None:
        """Load children from the database, using the given manager to resolve
        collection primary key values into records.

        This method exists to ensure that all collections that may appear in a
        chain are known to the manager before any particular chain tries to
        retrieve their records from it.  `ChainedCollectionRecord` subclasses
        can rely on it being called sometime after their own ``__init__`` to
        finish construction.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this record instance and all record
            instances that may appear as its children.
        """
        # Load first and mutate afterwards: if `_load` raises, neither this
        # record's children nor the manager's reverse mapping has been
        # touched, so the two cannot get out of sync.
        children = self._load(manager)
        self._replace_children(manager, children)

    def _replace_children(self, manager: CollectionManager, children: tuple[str, ...]) -> None:
        """Install ``children`` as this chain's children and keep the
        manager's reverse (child to parents) mapping in sync.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this record instance and all record
            instances that may appear as its children.
        children : `tuple` [ `str` ]
            Names of the new child collections, in search order.
        """
        # Resolve every name to its key before changing anything, so that a
        # failed lookup cannot leave the reverse mapping half-updated.
        old_keys = [manager.find(child).key for child in self._children]
        new_keys = [manager.find(child).key for child in children]
        parents_by_child = manager._parents_by_child
        # Remove the old relationships and add back in the new ones.
        for child_key in old_keys:
            parents_by_child[child_key].discard(self.key)
        for child_key in new_keys:
            parents_by_child[child_key].add(self.key)
        # Actually set this instance's sequence of children.
        self._children = children

    @abstractmethod
    def _update(self, manager: CollectionManager, children: tuple[str, ...]) -> None:
        """Protected implementation hook for `update`.

        This method should be implemented by subclasses to update the database
        to reflect the children given.  It should never be called by anything
        other than `update`, which should be used by all external code.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this record instance and all record
            instances that may appear as its children.
        children : `tuple` [ `str` ]
            A collection search path that should be resolved to set the child
            collections of this chain.  Guaranteed not to contain cycles.
        """
        raise NotImplementedError()

    @abstractmethod
    def _load(self, manager: CollectionManager) -> tuple[str, ...]:
        """Protected implementation hook for `refresh`.

        This method should be implemented by subclasses to retrieve the chain's
        child collections from the database and return them.  It should never
        be called by anything other than `refresh`, which should be used by all
        external code.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this record instance and all record
            instances that may appear as its children.

        Returns
        -------
        children : `tuple` [ `str` ]
            The ordered sequence of collection names that defines the chained
            collection.  Guaranteed not to contain cycles.
        """
        raise NotImplementedError()

    def __repr__(self) -> str:
        return f"ChainedCollectionRecord(key={self.key!r}, name={self.name!r}, children={self.children!r})"

298 

299 

class CollectionManager(VersionedExtension):
    """An interface for managing the collections (including runs) in a
    `Registry`.

    Parameters
    ----------
    registry_schema_version : `VersionTuple` or `None`, optional
        Schema version of this extension as defined in registry; forwarded
        to the base class constructor.

    Notes
    -----
    Each layer in a multi-layer `Registry` has its own record for any
    collection for which it has datasets (or quanta).  Different layers may
    use different IDs for the same collection, so any usage of the IDs
    obtained through the `CollectionManager` APIs is strictly for internal
    (to `Registry`) use.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)
        # Reverse mapping from a child collection's key to the keys of the
        # CHAINED collections that directly contain it; maintained by
        # `ChainedCollectionRecord.update` and `ChainedCollectionRecord.refresh`.
        self._parents_by_child: defaultdict[Any, set[Any]] = defaultdict(set)

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        dimensions: DimensionRecordStorageManager,
        registry_schema_version: VersionTuple | None = None,
    ) -> CollectionManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present in a layer
            implemented with this manager.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in registry.

        Returns
        -------
        manager : `CollectionManager`
            An instance of a concrete `CollectionManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addCollectionForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        prefix: str = "collection",
        onDelete: str | None = None,
        constraint: bool = True,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the collection
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the collection
            table.  Will be modified in place.
        prefix : `str`, optional
            A name to use for the prefix of the new field; the full name may
            have a suffix (and is given in the returned `ddl.FieldSpec`).
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the collection row is deleted.  `None`
            indicates that this should be an integrity error.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the collection primary key, but do not add a foreign key
            constraint.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        fieldSpec : `ddl.FieldSpec`
            Specification for the field being added.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addRunForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        prefix: str = "run",
        onDelete: str | None = None,
        constraint: bool = True,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the run
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the run table.
            Will be modified in place.
        prefix : `str`, optional
            A name to use for the prefix of the new field; the full name may
            have a suffix (and is given in the returned `ddl.FieldSpec`).
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the collection row is deleted.  `None`
            indicates that this should be an integrity error.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the run primary key, but do not add a foreign key constraint.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        fieldSpec : `ddl.FieldSpec`
            Specification for the field being added.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getCollectionForeignKeyName(cls, prefix: str = "collection") -> str:
        """Return the name of the field added by `addCollectionForeignKey`
        if called with the same prefix.

        Parameters
        ----------
        prefix : `str`
            A name to use for the prefix of the new field; the full name may
            have a suffix.

        Returns
        -------
        name : `str`
            The field name.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getRunForeignKeyName(cls, prefix: str = "run") -> str:
        """Return the name of the field added by `addRunForeignKey`
        if called with the same prefix.

        Parameters
        ----------
        prefix : `str`
            A name to use for the prefix of the new field; the full name may
            have a suffix.

        Returns
        -------
        name : `str`
            The field name.
        """
        raise NotImplementedError()

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        collections that may have been registered by other clients since it
        was initialized or last refreshed.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(
        self, name: str, type: CollectionType, doc: str | None = None
    ) -> tuple[CollectionRecord, bool]:
        """Ensure that a collection of the given name and type is present
        in the layer this manager is associated with.

        Parameters
        ----------
        name : `str`
            Name of the collection.
        type : `CollectionType`
            Enumeration value indicating the type of collection.
        doc : `str`, optional
            Documentation string for the collection.  Ignored if the collection
            already exists.

        Returns
        -------
        record : `CollectionRecord`
            Object representing the collection, including its type and ID.
            If ``type is CollectionType.RUN``, this will be a `RunRecord`
            instance.  If ``type is CollectionType.CHAINED``, this will be a
            `ChainedCollectionRecord` instance.
        registered : `bool`
            `True` if the collection was registered, `False` if it already
            existed.

        Raises
        ------
        TransactionInterruption
            Raised if this operation is invoked within a `Database.transaction`
            context.
        DatabaseConflictError
            Raised if a collection with this name but a different type already
            exists.

        Notes
        -----
        Concurrent registrations of the same collection should be safe; nothing
        should happen if the types are consistent, and integrity errors due to
        inconsistent types should happen before any database changes are made.
        """
        raise NotImplementedError()

    @abstractmethod
    def remove(self, name: str) -> None:
        """Completely remove a collection.

        Any existing `CollectionRecord` objects that correspond to the removed
        collection are considered invalidated.

        Parameters
        ----------
        name : `str`
            Name of the collection to remove.

        Notes
        -----
        If this collection is referenced by foreign keys in tables managed by
        other objects, the ON DELETE clauses of those tables will be invoked.
        That will frequently delete many dependent rows automatically (via
        "CASCADE"), but it may also cause this operation to fail (with
        rollback) unless dependent rows that do not have an ON DELETE clause
        are removed first.
        """
        raise NotImplementedError()

    @abstractmethod
    def find(self, name: str) -> CollectionRecord:
        """Return the collection record associated with the given name.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Returns
        -------
        record : `CollectionRecord`
            Object representing the collection, including its type and ID.
            If ``record.type is CollectionType.RUN``, this will be a
            `RunRecord` instance.  If
            ``record.type is CollectionType.CHAINED``, this will be a
            `ChainedCollectionRecord` instance.

        Raises
        ------
        MissingCollectionError
            Raised if the given collection does not exist.

        Notes
        -----
        Collections registered by another client of the same layer since the
        last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def __getitem__(self, key: Any) -> CollectionRecord:
        """Return the collection record associated with the given
        primary/foreign key value.

        Parameters
        ----------
        key
            Internal primary key value for the collection.

        Returns
        -------
        record : `CollectionRecord`
            Object representing the collection, including its type and name.
            If ``record.type is CollectionType.RUN``, this will be a
            `RunRecord` instance.  If
            ``record.type is CollectionType.CHAINED``, this will be a
            `ChainedCollectionRecord` instance.

        Raises
        ------
        MissingCollectionError
            Raised if no collection with this key exists.

        Notes
        -----
        Collections registered by another client of the same layer since the
        last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        wildcard: CollectionWildcard,
        *,
        collection_types: Set[CollectionType] = CollectionType.all(),
        done: set[str] | None = None,
        flatten_chains: bool = True,
        include_chains: bool | None = None,
    ) -> list[CollectionRecord]:
        """Return the collection records that match a wildcard.

        Parameters
        ----------
        wildcard : `CollectionWildcard`
            Names and/or patterns for collections.
        collection_types : `collections.abc.Set` [ `CollectionType` ], optional
            If provided, only return collections of these types.
        done : `set` [ `str` ], optional
            A `set` of collection names that will not be returned (presumably
            because they have already been returned in some higher-level logic)
            that will also be updated with the names of the collections
            returned.
        flatten_chains : `bool`, optional
            If `True` (default) recursively return the child collections of
            `~CollectionType.CHAINED` collections.
        include_chains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flatten_chains``: either return records for CHAINED collections
            or their children, but not both.

        Returns
        -------
        records : `list` [ `CollectionRecord` ]
            Matching collection records.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDocumentation(self, key: Any) -> str | None:
        """Retrieve the documentation string for a collection.

        Parameters
        ----------
        key
            Internal primary key value for the collection.

        Returns
        -------
        docs : `str` or `None`
            Docstring for the collection with the given key.
        """
        raise NotImplementedError()

    @abstractmethod
    def setDocumentation(self, key: Any, doc: str | None) -> None:
        """Set the documentation string for a collection.

        Parameters
        ----------
        key
            Internal primary key value for the collection.
        doc : `str` or `None`
            Docstring for the collection with the given key.
        """
        raise NotImplementedError()

    def getParentChains(self, key: Any) -> Iterator[ChainedCollectionRecord]:
        """Find all CHAINED collections that directly contain the given
        collection.

        Parameters
        ----------
        key
            Internal primary key value for the collection.

        Yields
        ------
        record : `ChainedCollectionRecord`
            Record of a CHAINED collection that has the given collection as
            a direct child.
        """
        # Use ``get`` instead of indexing: indexing a `defaultdict` inserts an
        # empty entry for every key that is merely queried.  Iterate over a
        # snapshot so that a consumer changing chain membership between
        # yields does not trigger "set changed size during iteration".
        for parent_key in tuple(self._parents_by_child.get(key, ())):
            result = self[parent_key]
            # Narrow the type for static checkers; only chain records are
            # ever stored as parents.
            assert isinstance(result, ChainedCollectionRecord)
            yield result