Coverage for python / lsst / daf / butler / _butler_collections.py: 61%
78 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-26 08:49 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-26 08:49 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = ("ButlerCollections", "CollectionInfo")
32from abc import ABC, abstractmethod
33from collections import defaultdict
34from collections.abc import Iterable, Mapping, Sequence, Set
35from typing import TYPE_CHECKING, Any, overload
37from deprecated.sphinx import deprecated
38from pydantic import BaseModel
40from ._collection_type import CollectionType
42if TYPE_CHECKING:
43 from ._dataset_type import DatasetType
class CollectionInfo(BaseModel):
    """Description of one Butler collection."""

    # Instances of this class are serialized for the server API, so every
    # property added here must come with a default value in order to stay
    # backwards compatible.

    name: str
    """The collection's name."""
    type: CollectionType
    """The collection's type."""
    doc: str = ""
    """Documentation string attached to this collection."""
    children: tuple[str, ...] = ()
    """Child collections (only populated for CHAINED collections)."""
    parents: frozenset[str] | None = None
    """Parents of this collection, if any.

    `None` when parent information was not requested.
    """
    dataset_types: frozenset[str] | None = None
    """Names of the dataset types of datasets held in this collection.

    `None` when dataset type information was not requested.
    """

    def __lt__(self, other: Any) -> bool:
        """Order instances by their collection name."""
        if isinstance(other, type(self)):
            return self.name < other.name
        # Let Python try the reflected operation for unrelated types.
        return NotImplemented
class ButlerCollections(ABC, Sequence):
    """Methods for working with collections stored in the Butler.

    Notes
    -----
    The `~collections.abc.Sequence` interface (``__getitem__`` and
    ``__len__``) is deprecated; it simply forwards to `defaults`.
    """

    @overload
    def __getitem__(self, index: int) -> str: ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[str]: ...

    @deprecated(
        "‘Butler.collections’ should no longer be used to get the list of default collections."
        " Use ‘Butler.collections.defaults’ instead. Will be removed after v28.",
        version="v28",
        category=FutureWarning,
    )
    def __getitem__(self, index: int | slice) -> str | Sequence[str]:
        """Return the default collection name(s) selected by ``index``.

        Parameters
        ----------
        index : `int` or `slice`
            Position or slice applied to `defaults`.

        Returns
        -------
        result : `str` or `~collections.abc.Sequence` [ `str` ]
            Whatever ``self.defaults[index]`` evaluates to.
        """
        return self.defaults[index]

    @deprecated(
        "‘Butler.collections’ should no longer be used to get the list of default collections."
        " Use ‘Butler.collections.defaults’ instead. Will be removed after v28.",
        version="v28",
        category=FutureWarning,
    )
    def __len__(self) -> int:
        """Return the number of entries in `defaults`."""
        return len(self.defaults)

    @property
    @abstractmethod
    def defaults(self) -> Sequence[str]:
        """Collection defaults associated with this butler."""
        raise NotImplementedError("Defaults must be implemented by a subclass")

    def __str__(self) -> str:
        """Return string representation."""
        return f"{self.__class__.__name__}(defaults={self.defaults})"

    @abstractmethod
    def extend_chain(self, parent_collection_name: str, child_collection_names: str | Iterable[str]) -> None:
        """Add children to the end of a CHAINED collection.

        If any of the children already existed in the chain, they will be moved
        to the new position at the end of the chain.

        Parameters
        ----------
        parent_collection_name : `str`
            The name of a CHAINED collection to which we will add new children.
        child_collection_names : `~collections.abc.Iterable` [ `str` ] | `str`
            A child collection name or list of child collection names to be
            added to the parent.

        Raises
        ------
        MissingCollectionError
            If any of the specified collections do not exist.
        CollectionTypeError
            If the parent collection is not a CHAINED collection.
        CollectionCycleError
            If this operation would create a collection cycle.

        Notes
        -----
        If this function is called within a call to ``Butler.transaction``, it
        will hold a lock that prevents other processes from modifying the
        parent collection until the end of the transaction.  Keep these
        transactions short.
        """
        raise NotImplementedError()

    @abstractmethod
    def prepend_chain(self, parent_collection_name: str, child_collection_names: str | Iterable[str]) -> None:
        """Add children to the beginning of a CHAINED collection.

        If any of the children already existed in the chain, they will be moved
        to the new position at the beginning of the chain.

        Parameters
        ----------
        parent_collection_name : `str`
            The name of a CHAINED collection to which we will add new children.
        child_collection_names : `~collections.abc.Iterable` [ `str` ] | `str`
            A child collection name or list of child collection names to be
            added to the parent.

        Raises
        ------
        MissingCollectionError
            If any of the specified collections do not exist.
        CollectionTypeError
            If the parent collection is not a CHAINED collection.
        CollectionCycleError
            If this operation would create a collection cycle.

        Notes
        -----
        If this function is called within a call to ``Butler.transaction``, it
        will hold a lock that prevents other processes from modifying the
        parent collection until the end of the transaction.  Keep these
        transactions short.
        """
        raise NotImplementedError()

    @abstractmethod
    def redefine_chain(
        self, parent_collection_name: str, child_collection_names: str | Iterable[str]
    ) -> None:
        """Replace the contents of a CHAINED collection with new children.

        Parameters
        ----------
        parent_collection_name : `str`
            The name of a CHAINED collection to which we will assign new
            children.
        child_collection_names : `~collections.abc.Iterable` [ `str` ] | `str`
            A child collection name or list of child collection names to be
            added to the parent.

        Raises
        ------
        MissingCollectionError
            If any of the specified collections do not exist.
        CollectionTypeError
            If the parent collection is not a CHAINED collection.
        CollectionCycleError
            If this operation would create a collection cycle.

        Notes
        -----
        If this function is called within a call to ``Butler.transaction``, it
        will hold a lock that prevents other processes from modifying the
        parent collection until the end of the transaction.  Keep these
        transactions short.
        """
        raise NotImplementedError()

    @abstractmethod
    def remove_from_chain(
        self, parent_collection_name: str, child_collection_names: str | Iterable[str]
    ) -> None:
        """Remove children from a CHAINED collection.

        Parameters
        ----------
        parent_collection_name : `str`
            The name of a CHAINED collection from which we will remove
            children.
        child_collection_names : `~collections.abc.Iterable` [ `str` ] | `str`
            A child collection name or list of child collection names to be
            removed from the parent.

        Raises
        ------
        MissingCollectionError
            If any of the specified collections do not exist.
        CollectionTypeError
            If the parent collection is not a CHAINED collection.

        Notes
        -----
        If this function is called within a call to ``Butler.transaction``, it
        will hold a lock that prevents other processes from modifying the
        parent collection until the end of the transaction.  Keep these
        transactions short.
        """
        raise NotImplementedError()

    def query(
        self,
        expression: str | Iterable[str],
        collection_types: Set[CollectionType] | CollectionType | None = None,
        flatten_chains: bool = False,
        include_chains: bool | None = None,
    ) -> Sequence[str]:
        """Query the butler for collections matching an expression.

        Parameters
        ----------
        expression : `str` or `~collections.abc.Iterable` [ `str` ]
            One or more collection names or globs to include in the search.
        collection_types : `set` [`CollectionType`], `CollectionType` or `None`
            Restrict the types of collections to be searched. If `None` all
            collection types are searched.
        flatten_chains : `bool`, optional
            If `True` (`False` is default), recursively yield the child
            collections of matching `~CollectionType.CHAINED` collections.
        include_chains : `bool` or `None`, optional
            If `True`, yield records for matching `~CollectionType.CHAINED`
            collections. Default is the opposite of ``flatten_chains``:
            include either CHAINED collections or their children, but not both.

        Returns
        -------
        collections : `~collections.abc.Sequence` [ `str` ]
            The names of collections that match ``expression``.

        Notes
        -----
        The order in which collections are returned is unspecified, except that
        the children of a `~CollectionType.CHAINED` collection are guaranteed
        to be in the order in which they are searched.  When multiple parent
        `~CollectionType.CHAINED` collections match the same criteria, the
        order in which the two lists appear is unspecified, and the lists of
        children may be incomplete if a child has multiple parents.

        The default implementation is a wrapper around `query_info`.
        """
        collections_info = self.query_info(
            expression,
            collection_types=collection_types,
            flatten_chains=flatten_chains,
            include_chains=include_chains,
        )
        # Only the names are of interest here; the remaining fields of each
        # CollectionInfo are discarded.
        return [info.name for info in collections_info]

    @abstractmethod
    def query_info(
        self,
        expression: str | Iterable[str],
        collection_types: Set[CollectionType] | CollectionType | None = None,
        flatten_chains: bool = False,
        include_chains: bool | None = None,
        include_parents: bool = False,
        include_summary: bool = False,
        include_doc: bool = False,
        summary_datasets: Iterable[DatasetType] | Iterable[str] | None = None,
    ) -> Sequence[CollectionInfo]:
        """Query the butler for collections matching an expression and
        return detailed information about those collections.

        Parameters
        ----------
        expression : `str` or `~collections.abc.Iterable` [ `str` ]
            One or more collection names or globs to include in the search.
        collection_types : `set` [`CollectionType`], `CollectionType` or `None`
            Restrict the types of collections to be searched. If `None` all
            collection types are searched.
        flatten_chains : `bool`, optional
            If `True` (`False` is default), recursively yield the child
            collections of matching `~CollectionType.CHAINED` collections.
        include_chains : `bool` or `None`, optional
            If `True`, yield records for matching `~CollectionType.CHAINED`
            collections. Default is the opposite of ``flatten_chains``:
            include either CHAINED collections or their children, but not both.
        include_parents : `bool`, optional
            Whether the returned information includes parents.
        include_summary : `bool`, optional
            Whether the returned information includes dataset type and
            governor information for the collections.
        include_doc : `bool`, optional
            Whether the returned information includes collection documentation
            string.
        summary_datasets : `~collections.abc.Iterable` [ `DatasetType` ] or \
                `~collections.abc.Iterable` [ `str` ], optional
            Dataset types to include in returned summaries. Only used if
            ``include_summary`` is `True`. If not specified then all dataset
            types will be included.

        Returns
        -------
        collections : `~collections.abc.Sequence` [ `CollectionInfo` ]
            Information about the collections that match ``expression``.

        Notes
        -----
        The order in which collections are returned is unspecified, except that
        the children of a `~CollectionType.CHAINED` collection are guaranteed
        to be in the order in which they are searched.  When multiple parent
        `~CollectionType.CHAINED` collections match the same criteria, the
        order in which the two lists appear is unspecified, and the lists of
        children may be incomplete if a child has multiple parents.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_info(
        self, name: str, include_parents: bool = False, include_summary: bool = False
    ) -> CollectionInfo:
        """Obtain information for a specific collection.

        Parameters
        ----------
        name : `str`
            The name of the collection of interest.
        include_parents : `bool`, optional
            If `True` any parents of this collection will be included.
        include_summary : `bool`, optional
            If `True` dataset type names and governor dimensions of datasets
            stored in this collection will be included in the result.

        Returns
        -------
        info : `CollectionInfo`
            Information on the requested collection.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool:
        """Add a new collection if one with the given name does not exist.

        Parameters
        ----------
        name : `str`
            The name of the collection to create.
        type : `CollectionType`, optional
            Enum value indicating the type of collection to create. Default
            is to create a RUN collection.
        doc : `str`, optional
            Documentation string for the collection.

        Returns
        -------
        registered : `bool`
            Boolean indicating whether the collection was already registered
            or was created by this call.

        Notes
        -----
        Avoid calling this method multiple times within a `Butler.transaction`.
        If concurrent processes register the same collection names, they may
        block each other until the end of the transaction and in some cases the
        database will be required to abort one of the transactions to prevent
        deadlock.
        """
        raise NotImplementedError()

    @abstractmethod
    def x_remove(self, name: str) -> None:
        """Remove the given collection from the registry.

        **This is an experimental interface that can change at any time.**

        Parameters
        ----------
        name : `str`
            The name of the collection to remove.

        Raises
        ------
        lsst.daf.butler.registry.MissingCollectionError
            Raised if no collection with the given name exists.
        lsst.daf.butler.registry.OrphanedRecordError
            Raised if the database rows associated with the collection are
            still referenced by some other table, such as a dataset in a
            datastore (for `~CollectionType.RUN` collections only) or a
            `~CollectionType.CHAINED` collection of which this collection is
            a child.

        Notes
        -----
        If this is a `~CollectionType.RUN` collection, all datasets and quanta
        in it will be removed from the `Registry` database.  This requires
        that those datasets be removed (or at least trashed) from any
        datastores that hold them first.

        A collection may not be deleted as long as it is referenced by a
        `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
        be deleted or redefined first.
        """
        raise NotImplementedError()

    def _filter_dataset_types(
        self, dataset_types: Iterable[str], collections: Iterable[CollectionInfo]
    ) -> Iterable[str]:
        """Keep only the dataset type names that appear in the summary of at
        least one of the given collections.

        Parameters
        ----------
        dataset_types : `~collections.abc.Iterable` [`str`]
            Candidate dataset type names.
        collections : `~collections.abc.Iterable` [`CollectionInfo`]
            Collection infos, must contain dataset type summary.

        Returns
        -------
        filtered : `~collections.abc.Iterable` [`str`]
            The candidate names that are present in ``dataset_types`` of at
            least one collection info. Currently a `set`, so the order is
            unspecified.

        Raises
        ------
        RuntimeError
            Raised if any collection info has ``dataset_types`` set to `None`.
        """
        dataset_types_set = set(dataset_types)
        # Union of the dataset type names known to any of the collections.
        collection_dataset_types: set[str] = set()
        for info in collections:
            if info.dataset_types is None:
                raise RuntimeError("Can only filter by collections if include_summary was True")
            collection_dataset_types.update(info.dataset_types)
        return dataset_types_set.intersection(collection_dataset_types)

    def _group_by_dataset_type(
        self, dataset_types: Set[str], collection_infos: Iterable[CollectionInfo]
    ) -> Mapping[str, list[str]]:
        """Filter dataset types and collections names based on summary in
        collection infos.

        Parameters
        ----------
        dataset_types : `~collections.abc.Set` [`str`]
            Set of dataset type names to extract.
        collection_infos : `~collections.abc.Iterable` [`CollectionInfo`]
            Collection infos, must contain dataset type summary.

        Returns
        -------
        filtered : `~collections.abc.Mapping` [`str`, `list` [`str`]]
            Mapping of the dataset type name to its corresponding list of
            collection names. Collection names appear in the order in which
            ``collection_infos`` was iterated.

        Raises
        ------
        RuntimeError
            Raised if any collection info has ``dataset_types`` set to `None`.
        """
        # Although this is marked as private, it is called from outside this
        # class by other functions internal to daf_butler.
        dataset_type_collections: dict[str, list[str]] = defaultdict(list)
        for info in collection_infos:
            if info.dataset_types is None:
                raise RuntimeError("Can only filter by collections if include_summary was True")
            # Only requested dataset types that this collection actually
            # contains contribute an entry.
            for dataset_type in info.dataset_types & dataset_types:
                dataset_type_collections[dataset_type].append(info.name)
        return dataset_type_collections