Coverage for python/lsst/daf/butler/registry/interfaces/_collections.py: 57%

116 statements  

« prev     ^ index     » next       coverage.py v7.2.5, created at 2023-05-17 09:33 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = [ 

24 "ChainedCollectionRecord", 

25 "CollectionManager", 

26 "CollectionRecord", 

27 "RunRecord", 

28] 

29 

30from abc import abstractmethod 

31from collections import defaultdict 

32from collections.abc import Iterator, Set 

33from typing import TYPE_CHECKING, Any 

34 

35from ...core import DimensionUniverse, Timespan, ddl 

36from .._collectionType import CollectionType 

37from ..wildcards import CollectionWildcard 

38from ._versioning import VersionedExtension, VersionTuple 

39 

40if TYPE_CHECKING: 

41 from ._database import Database, StaticTablesContext 

42 from ._dimensions import DimensionRecordStorageManager 

43 

44 

class CollectionRecord:
    """Internal `Registry` representation of a single collection.

    Code outside of `Registry` is expected to refer to collections by their
    `str` names only.

    Parameters
    ----------
    key
        Unique identifier of the collection.  It may be identical to ``name``
        when names are used for identification; it is usually an integer or
        a string, but other database-specific types are allowed.
    name : `str`
        Name of the collection.
    type : `CollectionType`
        Enumeration value describing the type of the collection.

    Notes
    -----
    Equality compares ``name``, ``type`` and ``key``, and hashing uses
    ``name``, so these three attributes must be treated as immutable by all
    users and derived classes.  Attributes added by subclasses may be mutable
    provided they do not take part in a subclass definition of equality.
    """

    name: str
    """Name of the collection (`str`).
    """

    key: Any
    """The primary/foreign key value for this collection.
    """

    type: CollectionType
    """Enumeration value describing the type of the collection
    (`CollectionType`).
    """

    def __init__(self, key: Any, name: str, type: CollectionType):
        self.key, self.name, self.type = key, name, type
        # Internal sanity check only; it runs after the attributes are set.
        assert isinstance(self.type, CollectionType)

    def __eq__(self, other: Any) -> bool:
        # Attributes of ``other`` are read lazily, in name/type/key order, so
        # an object lacking a later attribute still compares unequal (rather
        # than yielding NotImplemented) when an earlier attribute differs.
        try:
            if not (self.name == other.name):
                return False
            if not (self.type == other.type):
                return False
            return self.key == other.key
        except AttributeError:
            # ``other`` does not look like a record; let Python try the
            # reflected comparison.
            return NotImplemented

    def __hash__(self) -> int:
        # Records that compare equal share a name, so hashing the name alone
        # is consistent with ``__eq__``.
        return hash(self.name)

    def __repr__(self) -> str:
        return f"CollectionRecord(key={self.key!r}, name={self.name!r}, type={self.type!r})"

    def __str__(self) -> str:
        return self.name

103 

104 

class RunRecord(CollectionRecord):
    """A `CollectionRecord` specialization that carries execution
    information and declares the interface used to update it.

    All execution-related members are hooks that concrete subclasses must
    implement; the versions defined here raise `NotImplementedError`.
    """

    @abstractmethod
    def update(self, host: str | None = None, timespan: Timespan | None = None) -> None:
        """Write new execution information for this run to the database.

        Any value that is not provided will be set to ``NULL`` in the
        database; omitted values are not ignored.

        Parameters
        ----------
        host : `str`, optional
            Name of the host or system on which this run was produced.  Its
            detailed form is left to higher-level convention; from the
            `Registry` perspective it is an entirely opaque value.
        timespan : `Timespan`, optional
            Begin and end timestamps for the period over which the run was
            produced.  `None`/``NULL`` values are interpreted as infinite
            bounds.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def host(self) -> str | None:
        """Name of the host or system on which this run was produced
        (`str` or `None`).
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def timespan(self) -> Timespan:
        """Begin and end timestamps for the period over which the run was
        produced (`Timespan`).  `None`/``NULL`` values are interpreted as
        infinite bounds.
        """
        raise NotImplementedError()

    def __repr__(self) -> str:
        # The collection type is deliberately omitted from the output.
        return f"RunRecord(key={self.key!r}, name={self.name!r})"

149 

150 

class ChainedCollectionRecord(CollectionRecord):
    """A `CollectionRecord` specialization that tracks the child collections
    of a ``CHAINED`` collection.

    Parameters
    ----------
    key
        Unique identifier of the collection.  It may be identical to ``name``
        when names are used for identification; it is usually an integer or
        a string, but other database-specific types are allowed.
    name : `str`
        Name of the collection.
    universe : `DimensionUniverse`
        Accepted by the constructor but not used by this base class.
    """

    def __init__(self, key: Any, name: str, universe: DimensionUniverse):
        super().__init__(key=key, name=name, type=CollectionType.CHAINED)
        # A new record starts with no children; `update` or `refresh`
        # populates them.
        self._children: tuple[str, ...] = ()

    @property
    def children(self) -> tuple[str, ...]:
        """The ordered search path of child collections that define this chain
        (`tuple` [ `str` ]).
        """
        return self._children

    def update(self, manager: CollectionManager, children: tuple[str, ...], flatten: bool) -> None:
        """Redefine this chain so that it searches the given children.

        External code should always set children through this method.  The
        database work is delegated to `_update`, which is the hook that
        subclasses override.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this record and every record that may
            appear as one of its children.
        children : `tuple` [ `str` ]
            A collection search path that should be resolved to set the child
            collections of this chain.
        flatten : `bool`
            If `True`, recursively flatten out any nested
            `~CollectionType.CHAINED` collections in ``children`` first.

        Raises
        ------
        ValueError
            Raised when the child collections contain a cycle.
        """
        wildcard = CollectionWildcard.from_names(children)
        # Every CHAINED collection reachable from the requested children; if
        # this record is among them the new definition would be cyclic.
        reachable_chains = manager.resolve_wildcard(
            wildcard,
            flatten_chains=True,
            include_chains=True,
            collection_types={CollectionType.CHAINED},
        )
        if any(chain == self for chain in reachable_chains):
            raise ValueError(f"Cycle in collection chaining when defining '{self.name}'.")
        if flatten:
            flattened = manager.resolve_wildcard(wildcard, flatten_chains=True)
            children = tuple(member.name for member in flattened)
        # Database changes are the responsibility of the derived class.
        self._update(manager, children)
        # Keep the manager's child -> parents reverse mapping in step: drop
        # the links for the previous children, then add links for the new.
        parents_by_child = manager._parents_by_child
        for previous in self._children:
            parents_by_child[manager.find(previous).key].discard(self.key)
        for current in children:
            parents_by_child[manager.find(current).key].add(self.key)
        # Finally record the new sequence of children on this instance.
        self._children = children

    def refresh(self, manager: CollectionManager) -> None:
        """Reload the children of this chain from the database.

        The given manager is used to resolve collection names into records.
        This method exists so that every collection that may appear in a
        chain is known to the manager before any particular chain tries to
        retrieve those records from it.  `ChainedCollectionRecord` subclasses
        can rely on it being called sometime after their own ``__init__`` to
        finish construction.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this record and every record that may
            appear as one of its children.
        """
        parents_by_child = manager._parents_by_child
        # Remove this chain from the reverse mapping of its old children.
        for previous in self._children:
            parents_by_child[manager.find(previous).key].discard(self.key)
        self._children = self._load(manager)
        # Register this chain as a parent of each freshly loaded child.
        for current in self._children:
            parents_by_child[manager.find(current).key].add(self.key)

    @abstractmethod
    def _update(self, manager: CollectionManager, children: tuple[str, ...]) -> None:
        """Protected implementation hook for `update`.

        Subclasses implement this to make the database reflect the given
        children.  Only `update` should ever call it; all external code
        should go through `update`.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this record and every record that may
            appear as one of its children.
        children : `tuple` [ `str` ]
            A collection search path that should be resolved to set the child
            collections of this chain.  Guaranteed not to contain cycles.
        """
        raise NotImplementedError()

    @abstractmethod
    def _load(self, manager: CollectionManager) -> tuple[str, ...]:
        """Protected implementation hook for `refresh`.

        Subclasses implement this to fetch the chain's child collections
        from the database and return them.  Only `refresh` should ever call
        it; all external code should go through `refresh`.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this record and every record that may
            appear as one of its children.

        Returns
        -------
        children : `tuple` [ `str` ]
            The ordered sequence of collection names that defines the chained
            collection.  Guaranteed not to contain cycles.
        """
        raise NotImplementedError()

    def __repr__(self) -> str:
        return f"ChainedCollectionRecord(key={self.key!r}, name={self.name!r}, children={self.children!r})"

292 

293 

class CollectionManager(VersionedExtension):
    """An interface for managing the collections (including runs) in a
    `Registry`.

    Parameters
    ----------
    registry_schema_version : `VersionTuple` or `None`, optional
        Schema version of this extension as defined in registry; forwarded
        to the `VersionedExtension` constructor.

    Notes
    -----
    Each layer in a multi-layer `Registry` has its own record for any
    collection for which it has datasets (or quanta).  Different layers may
    use different IDs for the same collection, so any usage of the IDs
    obtained through the `CollectionManager` APIs is strictly for internal
    (to `Registry`) use.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)
        # Reverse mapping from the key of a child collection to the keys of
        # the CHAINED collections that directly contain it.  It is maintained
        # by `ChainedCollectionRecord.update` and
        # `ChainedCollectionRecord.refresh`.
        self._parents_by_child: defaultdict[Any, set[Any]] = defaultdict(set)

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        dimensions: DimensionRecordStorageManager,
        registry_schema_version: VersionTuple | None = None,
    ) -> CollectionManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present in a layer
            implemented with this manager.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in registry.

        Returns
        -------
        manager : `CollectionManager`
            An instance of a concrete `CollectionManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addCollectionForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        prefix: str = "collection",
        onDelete: str | None = None,
        constraint: bool = True,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the collection
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the collection
            table.  Will be modified in place.
        prefix : `str`, optional
            A name to use for the prefix of the new field; the full name may
            have a suffix (and is given in the returned `ddl.FieldSpec`).
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the collection row is deleted.  `None`
            indicates that this should be an integrity error.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the collection primary key, but do not add a foreign key
            constraint.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        fieldSpec : `ddl.FieldSpec`
            Specification for the field being added.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addRunForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        prefix: str = "run",
        onDelete: str | None = None,
        constraint: bool = True,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the run
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the run table.
            Will be modified in place.
        prefix : `str`, optional
            A name to use for the prefix of the new field; the full name may
            have a suffix (and is given in the returned `ddl.FieldSpec`).
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the collection row is deleted.  `None`
            indicates that this should be an integrity error.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the run primary key, but do not add a foreign key constraint.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        fieldSpec : `ddl.FieldSpec`
            Specification for the field being added.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getCollectionForeignKeyName(cls, prefix: str = "collection") -> str:
        """Return the name of the field added by `addCollectionForeignKey`
        if called with the same prefix.

        Parameters
        ----------
        prefix : `str`
            A name to use for the prefix of the new field; the full name may
            have a suffix.

        Returns
        -------
        name : `str`
            The field name.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getRunForeignKeyName(cls, prefix: str = "run") -> str:
        """Return the name of the field added by `addRunForeignKey`
        if called with the same prefix.

        Parameters
        ----------
        prefix : `str`
            A name to use for the prefix of the new field; the full name may
            have a suffix.

        Returns
        -------
        name : `str`
            The field name.
        """
        raise NotImplementedError()

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        collections that may have been registered by other clients since it
        was initialized or last refreshed.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(
        self, name: str, type: CollectionType, doc: str | None = None
    ) -> tuple[CollectionRecord, bool]:
        """Ensure that a collection of the given name and type is present
        in the layer this manager is associated with.

        Parameters
        ----------
        name : `str`
            Name of the collection.
        type : `CollectionType`
            Enumeration value indicating the type of collection.
        doc : `str`, optional
            Documentation string for the collection.  Ignored if the
            collection already exists.

        Returns
        -------
        record : `CollectionRecord`
            Object representing the collection, including its type and ID.
            If ``type is CollectionType.RUN``, this will be a `RunRecord`
            instance.  If ``type is CollectionType.CHAINED``, this will be a
            `ChainedCollectionRecord` instance.
        registered : `bool`
            `True` if the collection was registered, `False` if it already
            existed.

        Raises
        ------
        TransactionInterruption
            Raised if this operation is invoked within a `Database.transaction`
            context.
        DatabaseConflictError
            Raised if a collection with this name but a different type already
            exists.

        Notes
        -----
        Concurrent registrations of the same collection should be safe; nothing
        should happen if the types are consistent, and integrity errors due to
        inconsistent types should happen before any database changes are made.
        """
        raise NotImplementedError()

    @abstractmethod
    def remove(self, name: str) -> None:
        """Completely remove a collection.

        Any existing `CollectionRecord` objects that correspond to the removed
        collection are considered invalidated.

        Parameters
        ----------
        name : `str`
            Name of the collection to remove.

        Notes
        -----
        If this collection is referenced by foreign keys in tables managed by
        other objects, the ON DELETE clauses of those tables will be invoked.
        That will frequently delete many dependent rows automatically (via
        "CASCADE"), but it may also cause this operation to fail (with
        rollback) unless dependent rows that do not have an ON DELETE clause
        are removed first.
        """
        raise NotImplementedError()

    @abstractmethod
    def find(self, name: str) -> CollectionRecord:
        """Return the collection record associated with the given name.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Returns
        -------
        record : `CollectionRecord`
            Object representing the collection, including its type and ID.
            If ``record.type is CollectionType.RUN``, this will be a
            `RunRecord` instance.  If
            ``record.type is CollectionType.CHAINED``, this will be a
            `ChainedCollectionRecord` instance.

        Raises
        ------
        MissingCollectionError
            Raised if the given collection does not exist.

        Notes
        -----
        Collections registered by another client of the same layer since the
        last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def __getitem__(self, key: Any) -> CollectionRecord:
        """Return the collection record associated with the given
        primary/foreign key value.

        Parameters
        ----------
        key
            Internal primary key value for the collection.

        Returns
        -------
        record : `CollectionRecord`
            Object representing the collection, including its type and name.
            If ``record.type is CollectionType.RUN``, this will be a
            `RunRecord` instance.  If
            ``record.type is CollectionType.CHAINED``, this will be a
            `ChainedCollectionRecord` instance.

        Raises
        ------
        MissingCollectionError
            Raised if no collection with this key exists.

        Notes
        -----
        Collections registered by another client of the same layer since the
        last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        wildcard: CollectionWildcard,
        *,
        collection_types: Set[CollectionType] = CollectionType.all(),
        done: set[str] | None = None,
        flatten_chains: bool = True,
        include_chains: bool | None = None,
    ) -> list[CollectionRecord]:
        """Return the collection records that match a wildcard.

        Parameters
        ----------
        wildcard : `CollectionWildcard`
            Names and/or patterns for collections.
        collection_types : `collections.abc.Set` [ `CollectionType` ], optional
            If provided, only return collections of these types.
        done : `set` [ `str` ], optional
            A `set` of collection names that will not be returned (presumably
            because they have already been returned in some higher-level logic)
            that will also be updated with the names of the collections
            returned.
        flatten_chains : `bool`, optional
            If `True` (default) recursively return the child collections of
            `~CollectionType.CHAINED` collections.
        include_chains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flatten_chains``: either return records for CHAINED collections
            or their children, but not both.

        Returns
        -------
        records : `list` [ `CollectionRecord` ]
            Matching collection records.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDocumentation(self, key: Any) -> str | None:
        """Retrieve the documentation string for a collection.

        Parameters
        ----------
        key
            Internal primary key value for the collection.

        Returns
        -------
        docs : `str` or `None`
            Docstring for the collection with the given key.
        """
        raise NotImplementedError()

    @abstractmethod
    def setDocumentation(self, key: Any, doc: str | None) -> None:
        """Set the documentation string for a collection.

        Parameters
        ----------
        key
            Internal primary key value for the collection.
        doc : `str` or `None`
            Docstring for the collection with the given key.
        """
        raise NotImplementedError()

    def getParentChains(self, key: Any) -> Iterator[ChainedCollectionRecord]:
        """Find all CHAINED collections that directly contain the given
        collection.

        Parameters
        ----------
        key
            Internal primary key value for the collection.

        Yields
        ------
        record : `ChainedCollectionRecord`
            A chain recorded as a direct parent of the collection.  Nothing
            is yielded when no parents are recorded for ``key``.

        Notes
        -----
        The parent keys are snapshotted when iteration starts, so chains may
        be updated or refreshed by the caller while it consumes the iterator.
        """
        # ``_parents_by_child`` is a defaultdict: indexing it would insert an
        # empty set for every unknown key, so a read-only query uses ``get``.
        # The tuple copy protects against `ChainedCollectionRecord.update` or
        # `ChainedCollectionRecord.refresh` mutating the set mid-iteration.
        for parent_key in tuple(self._parents_by_child.get(key, ())):
            result = self[parent_key]
            assert isinstance(result, ChainedCollectionRecord)
            yield result