Coverage for python/lsst/daf/butler/registry/interfaces/_collections.py: 72%
118 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-27 09:44 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-27 09:44 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29from ... import ddl
31__all__ = [
32 "ChainedCollectionRecord",
33 "CollectionManager",
34 "CollectionRecord",
35 "RunRecord",
36]
38from abc import abstractmethod
39from collections import defaultdict
40from collections.abc import Iterator, Set
41from typing import TYPE_CHECKING, Any
43from ..._timespan import Timespan
44from ...dimensions import DimensionUniverse
45from .._collection_type import CollectionType
46from ..wildcards import CollectionWildcard
47from ._versioning import VersionedExtension, VersionTuple
49if TYPE_CHECKING:
50 from ._database import Database, StaticTablesContext
51 from ._dimensions import DimensionRecordStorageManager
class CollectionRecord:
    """Lightweight record describing one collection inside `Registry`.

    This type is for internal `Registry` APIs only; user-facing code should
    always refer to a collection by its `str` name.

    Parameters
    ----------
    key
        Unique collection ID.  It may be identical to ``name`` when the name
        itself is used for identification.  Usually an integer or a string,
        but can be another database-specific type.
    name : `str`
        Name of the collection.
    type : `CollectionType`
        Enumeration value describing the type of the collection.

    Notes
    -----
    The ``name``, ``key`` and ``type`` attributes set here must be treated as
    immutable by all users and derived classes: they define equality, and
    instances are hashable.  Attributes added by subclasses may be mutable
    provided they do not take part in any subclass definition of equality.
    """

    name: str
    """Name of the collection (`str`).
    """

    key: Any
    """The primary/foreign key value for this collection.
    """

    type: CollectionType
    """Enumeration value describing the type of the collection
    (`CollectionType`).
    """

    def __init__(self, key: Any, name: str, type: CollectionType):
        self.key, self.name, self.type = key, name, type
        assert isinstance(self.type, CollectionType)

    def __eq__(self, other: Any) -> bool:
        # Duck-typed comparison: an operand lacking one of the compared
        # attributes makes the comparison "not implemented" rather than False.
        try:
            same = self.name == other.name and self.type == other.type and self.key == other.key
        except AttributeError:
            return NotImplemented
        return same

    def __hash__(self) -> int:
        # Only the name is hashed; equal records always share a name, so this
        # stays consistent with ``__eq__``.
        return hash(self.name)

    def __repr__(self) -> str:
        return f"CollectionRecord(key={self.key!r}, name={self.name!r}, type={self.type!r})"

    def __str__(self) -> str:
        return self.name
class RunRecord(CollectionRecord):
    """`CollectionRecord` specialization that carries execution information
    for a run and declares the interface used to update it.
    """

    @abstractmethod
    def update(self, host: str | None = None, timespan: Timespan | None = None) -> None:
        """Write new execution information for this run to the database.

        Arguments that are not provided are not ignored: the corresponding
        values will be set to ``NULL`` in the database.

        Parameters
        ----------
        host : `str`, optional
            Name of the host or system on which this run was produced.
            The detailed form is left to higher-level convention; from the
            `Registry` perspective this is an entirely opaque value.
        timespan : `Timespan`, optional
            Begin and end timestamps for the period over which the run was
            produced.  `None`/``NULL`` values are interpreted as infinite
            bounds.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def host(self) -> str | None:
        """Name of the host or system on which this run was produced
        (`str` or `None`).
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def timespan(self) -> Timespan:
        """Begin and end timestamps for the period over which the run was
        produced (`Timespan`).  `None`/``NULL`` values are interpreted as
        infinite bounds.
        """
        raise NotImplementedError()

    def __repr__(self) -> str:
        return f"RunRecord(key={self.key!r}, name={self.name!r})"
class ChainedCollectionRecord(CollectionRecord):
    """`CollectionRecord` specialization for ``CHAINED`` collections, which
    additionally tracks the ordered list of child collections.

    Parameters
    ----------
    key
        Unique collection ID.  It may be identical to ``name`` when the name
        itself is used for identification.  Usually an integer or a string,
        but can be another database-specific type.
    name : `str`
        Name of the collection.
    universe : `DimensionUniverse`
        Accepted as part of the constructor signature; this base-class
        constructor does not use it.
    """

    def __init__(self, key: Any, name: str, universe: DimensionUniverse):
        super().__init__(key=key, name=name, type=CollectionType.CHAINED)
        # Starts empty; populated later by `update` or `refresh`.
        self._children: tuple[str, ...] = ()

    @property
    def children(self) -> tuple[str, ...]:
        """The ordered search path of child collections that define this chain
        (`tuple` [ `str` ]).
        """
        return self._children

    def update(self, manager: CollectionManager, children: tuple[str, ...], flatten: bool) -> None:
        """Redefine this chain to search the given child collections.

        All external code should set children through this method.  The
        database work is delegated to `_update`, which is the hook subclasses
        are expected to override.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this record instance and all record
            instances that may appear as its children.
        children : `tuple` [ `str` ]
            A collection search path that should be resolved to set the child
            collections of this chain.
        flatten : `bool`
            If `True`, recursively flatten out any nested
            `~CollectionType.CHAINED` collections in ``children`` first.

        Raises
        ------
        ValueError
            Raised when the child collections contain a cycle.
        """
        wildcard = CollectionWildcard.from_names(children)
        # Look at every CHAINED collection reachable from the requested
        # children; finding this record among them means the chain would
        # contain itself.
        reachable_chains = manager.resolve_wildcard(
            wildcard,
            flatten_chains=True,
            include_chains=True,
            collection_types={CollectionType.CHAINED},
        )
        if any(chain == self for chain in reachable_chains):
            raise ValueError(f"Cycle in collection chaining when defining '{self.name}'.")
        if flatten:
            flattened = manager.resolve_wildcard(wildcard, flatten_chains=True)
            children = tuple(member.name for member in flattened)
        # Subclasses perform the actual database update.
        self._update(manager, children)
        # Keep the manager's child -> parents mapping in step: drop the links
        # for the previous children first, then register the new ones.
        for previous in self._children:
            manager._parents_by_child[manager.find(previous).key].discard(self.key)
        for current in children:
            manager._parents_by_child[manager.find(current).key].add(self.key)
        # Finally record the new sequence of children on this instance.
        self._children = children

    def refresh(self, manager: CollectionManager) -> None:
        """Load children from the database, using the given manager to resolve
        collection primary key values into records.

        This method exists so that every collection that may appear in a
        chain is known to the manager before any particular chain tries to
        retrieve those records from it.  `ChainedCollectionRecord` subclasses
        can rely on it being called sometime after their own ``__init__`` to
        finish construction.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this record instance and all record
            instances that may appear as its children.
        """
        # Remove this chain from the parent sets of its current children.
        for stale in self._children:
            manager._parents_by_child[manager.find(stale).key].discard(self.key)
        self._children = self._load(manager)
        # Register this chain as a parent of each freshly loaded child.
        for loaded in self._children:
            manager._parents_by_child[manager.find(loaded).key].add(self.key)

    @abstractmethod
    def _update(self, manager: CollectionManager, children: tuple[str, ...]) -> None:
        """Protected implementation hook for `update`.

        Subclasses implement this to make the database reflect the given
        children.  It should never be called by anything other than `update`,
        which is what all external code should use.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this record instance and all record
            instances that may appear as its children.
        children : `tuple` [ `str` ]
            A collection search path that should be resolved to set the child
            collections of this chain.  Guaranteed not to contain cycles.
        """
        raise NotImplementedError()

    @abstractmethod
    def _load(self, manager: CollectionManager) -> tuple[str, ...]:
        """Protected implementation hook for `refresh`.

        Subclasses implement this to retrieve the chain's child collections
        from the database and return them.  It should never be called by
        anything other than `refresh`, which is what all external code should
        use.

        Parameters
        ----------
        manager : `CollectionManager`
            The object that manages this record instance and all record
            instances that may appear as its children.

        Returns
        -------
        children : `tuple` [ `str` ]
            The ordered sequence of collection names that defines the chained
            collection.  Guaranteed not to contain cycles.
        """
        raise NotImplementedError()

    def __repr__(self) -> str:
        return f"ChainedCollectionRecord(key={self.key!r}, name={self.name!r}, children={self.children!r})"
class CollectionManager(VersionedExtension):
    """An interface for managing the collections (including runs) in a
    `Registry`.

    Parameters
    ----------
    registry_schema_version : `VersionTuple` or `None`, optional
        Schema version of this extension as defined in registry; forwarded
        to the `VersionedExtension` constructor.

    Notes
    -----
    Each layer in a multi-layer `Registry` has its own record for any
    collection for which it has datasets (or quanta).  Different layers may
    use different IDs for the same collection, so any usage of the IDs
    obtained through the `CollectionManager` APIs are strictly for internal
    (to `Registry`) use.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)
        # Reverse mapping from a child collection's key to the keys of the
        # CHAINED collections that directly contain it.  Maintained by
        # `ChainedCollectionRecord.update` and `ChainedCollectionRecord.refresh`.
        self._parents_by_child: defaultdict[Any, set[Any]] = defaultdict(set)

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        dimensions: DimensionRecordStorageManager,
        registry_schema_version: VersionTuple | None = None,
    ) -> CollectionManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present in a layer
            implemented with this manager.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in registry.

        Returns
        -------
        manager : `CollectionManager`
            An instance of a concrete `CollectionManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addCollectionForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        prefix: str = "collection",
        onDelete: str | None = None,
        constraint: bool = True,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the collection
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the collection
            table.  Will be modified in place.
        prefix : `str`, optional
            A name to use for the prefix of the new field; the full name may
            have a suffix (and is given in the returned `ddl.FieldSpec`).
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the collection row is deleted.  `None`
            indicates that this should be an integrity error.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the collection primary key, but do not add a foreign key
            constraint.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        fieldSpec : `ddl.FieldSpec`
            Specification for the field being added.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addRunForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        prefix: str = "run",
        onDelete: str | None = None,
        constraint: bool = True,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the run
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the run table.
            Will be modified in place.
        prefix : `str`, optional
            A name to use for the prefix of the new field; the full name may
            have a suffix (and is given in the returned `ddl.FieldSpec`).
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the collection row is deleted.  `None`
            indicates that this should be an integrity error.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the run primary key, but do not add a foreign key constraint.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        fieldSpec : `ddl.FieldSpec`
            Specification for the field being added.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getCollectionForeignKeyName(cls, prefix: str = "collection") -> str:
        """Return the name of the field added by `addCollectionForeignKey`
        if called with the same prefix.

        Parameters
        ----------
        prefix : `str`
            A name to use for the prefix of the new field; the full name may
            have a suffix.

        Returns
        -------
        name : `str`
            The field name.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getRunForeignKeyName(cls, prefix: str = "run") -> str:
        """Return the name of the field added by `addRunForeignKey`
        if called with the same prefix.

        Parameters
        ----------
        prefix : `str`
            A name to use for the prefix of the new field; the full name may
            have a suffix.

        Returns
        -------
        name : `str`
            The field name.
        """
        raise NotImplementedError()

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        collections that may have been registered by other clients since it
        was initialized or last refreshed.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(
        self, name: str, type: CollectionType, doc: str | None = None
    ) -> tuple[CollectionRecord, bool]:
        """Ensure that a collection of the given name and type are present
        in the layer this manager is associated with.

        Parameters
        ----------
        name : `str`
            Name of the collection.
        type : `CollectionType`
            Enumeration value indicating the type of collection.
        doc : `str`, optional
            Documentation string for the collection.  Ignored if the collection
            already exists.

        Returns
        -------
        record : `CollectionRecord`
            Object representing the collection, including its type and ID.
            If ``type is CollectionType.RUN``, this will be a `RunRecord`
            instance.  If ``type is CollectionType.CHAINED``, this will be a
            `ChainedCollectionRecord` instance.
        registered : `bool`
            `True` if the collection was registered, `False` if it already
            existed.

        Raises
        ------
        TransactionInterruption
            Raised if this operation is invoked within a `Database.transaction`
            context.
        DatabaseConflictError
            Raised if a collection with this name but a different type already
            exists.

        Notes
        -----
        Concurrent registrations of the same collection should be safe; nothing
        should happen if the types are consistent, and integrity errors due to
        inconsistent types should happen before any database changes are made.
        """
        raise NotImplementedError()

    @abstractmethod
    def remove(self, name: str) -> None:
        """Completely remove a collection.

        Any existing `CollectionRecord` objects that correspond to the removed
        collection are considered invalidated.

        Parameters
        ----------
        name : `str`
            Name of the collection to remove.

        Notes
        -----
        If this collection is referenced by foreign keys in tables managed by
        other objects, the ON DELETE clauses of those tables will be invoked.
        That will frequently delete many dependent rows automatically (via
        "CASCADE"), but it may also cause this operation to fail (with
        rollback) unless dependent rows that do not have an ON DELETE clause
        are removed first.
        """
        raise NotImplementedError()

    @abstractmethod
    def find(self, name: str) -> CollectionRecord:
        """Return the collection record associated with the given name.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Returns
        -------
        record : `CollectionRecord`
            Object representing the collection, including its type and ID.
            If ``record.type is CollectionType.RUN``, this will be a
            `RunRecord` instance.  If
            ``record.type is CollectionType.CHAINED``, this will be a
            `ChainedCollectionRecord` instance.

        Raises
        ------
        MissingCollectionError
            Raised if the given collection does not exist.

        Notes
        -----
        Collections registered by another client of the same layer since the
        last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def __getitem__(self, key: Any) -> CollectionRecord:
        """Return the collection record associated with the given
        primary/foreign key value.

        Parameters
        ----------
        key
            Internal primary key value for the collection.

        Returns
        -------
        record : `CollectionRecord`
            Object representing the collection, including its type and name.
            If ``record.type is CollectionType.RUN``, this will be a
            `RunRecord` instance.  If
            ``record.type is CollectionType.CHAINED``, this will be a
            `ChainedCollectionRecord` instance.

        Raises
        ------
        MissingCollectionError
            Raised if no collection with this key exists.

        Notes
        -----
        Collections registered by another client of the same layer since the
        last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        wildcard: CollectionWildcard,
        *,
        collection_types: Set[CollectionType] = CollectionType.all(),
        done: set[str] | None = None,
        flatten_chains: bool = True,
        include_chains: bool | None = None,
    ) -> list[CollectionRecord]:
        """Return the collection records that match a wildcard.

        Parameters
        ----------
        wildcard : `CollectionWildcard`
            Names and/or patterns for collections.
        collection_types : `collections.abc.Set` [ `CollectionType` ], optional
            If provided, only return collections of these types.
        done : `set` [ `str` ], optional
            A `set` of collection names that will not be returned (presumably
            because they have already been returned in some higher-level logic)
            that will also be updated with the names of the collections
            returned.
        flatten_chains : `bool`, optional
            If `True` (default) recursively return the child collections of
            `~CollectionType.CHAINED` collections.
        include_chains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flatten_chains``: either return records for CHAINED collections
            or their children, but not both.

        Returns
        -------
        records : `list` [ `CollectionRecord` ]
            Matching collection records.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDocumentation(self, key: Any) -> str | None:
        """Retrieve the documentation string for a collection.

        Parameters
        ----------
        key
            Internal primary key value for the collection.

        Returns
        -------
        docs : `str` or `None`
            Docstring for the collection with the given key.
        """
        raise NotImplementedError()

    @abstractmethod
    def setDocumentation(self, key: Any, doc: str | None) -> None:
        """Set the documentation string for a collection.

        Parameters
        ----------
        key
            Internal primary key value for the collection.
        doc : `str` or `None`
            Docstring for the collection with the given key.
        """
        raise NotImplementedError()

    def getParentChains(self, key: Any) -> Iterator[ChainedCollectionRecord]:
        """Find all CHAINED collections that directly contain the given
        collection.

        Parameters
        ----------
        key
            Internal primary key value for the collection.

        Yields
        ------
        parent : `ChainedCollectionRecord`
            Record of a CHAINED collection that has the given collection as
            a direct child, looked up through ``self[parent_key]``.
        """
        # ``_parents_by_child`` is a defaultdict: indexing it with a key that
        # has no parents would insert a new empty set as a side effect of what
        # is meant to be a read-only query.  ``get`` leaves the mapping alone.
        for parent_key in self._parents_by_child.get(key, ()):
            result = self[parent_key]
            assert isinstance(result, ChainedCollectionRecord)
            yield result