Coverage for python / lsst / daf / butler / _butler_collections.py: 61%

78 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-26 08:49 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30__all__ = ("ButlerCollections", "CollectionInfo") 

31 

32from abc import ABC, abstractmethod 

33from collections import defaultdict 

34from collections.abc import Iterable, Mapping, Sequence, Set 

35from typing import TYPE_CHECKING, Any, overload 

36 

37from deprecated.sphinx import deprecated 

38from pydantic import BaseModel 

39 

40from ._collection_type import CollectionType 

41 

42if TYPE_CHECKING: 

43 from ._dataset_type import DatasetType 

44 

45 

class CollectionInfo(BaseModel):
    """Description of one Butler collection."""

    # Instances are serialized as part of the server API, so every property
    # added here in the future needs a default value; otherwise backwards
    # compatibility of the serialized form is broken.

    name: str
    """The collection's name."""
    type: CollectionType
    """The collection's type."""
    doc: str = ""
    """Documentation string attached to the collection."""
    children: tuple[str, ...] = ()
    """Child collections of this collection (only if CHAINED)."""
    parents: frozenset[str] | None = None
    """Parents of this collection, if any.

    `None` when parent information was not requested.
    """
    dataset_types: frozenset[str] | None = None
    """Names of the dataset types associated with datasets in this
    collection.

    `None` when dataset type information was not requested.
    """

    def __lt__(self, other: Any) -> bool:
        """Order instances by their collection name."""
        if isinstance(other, type(self)):
            return self.name < other.name
        # Let Python try the reflected comparison for unrelated types.
        return NotImplemented

76 

77 

class ButlerCollections(ABC, Sequence):
    """Methods for working with collections stored in the Butler.

    Notes
    -----
    The `~collections.abc.Sequence` interface of this class (indexing and
    ``len``) is deprecated. It forwards to `defaults` and is decorated to
    emit a `FutureWarning`; use `defaults` directly instead.
    """

    @overload
    def __getitem__(self, index: int) -> str: ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[str]: ...

    @deprecated(
        "‘Butler.collections’ should no longer be used to get the list of default collections."
        " Use ‘Butler.collections.defaults’ instead. Will be removed after v28.",
        version="v28",
        category=FutureWarning,
    )
    def __getitem__(self, index: int | slice) -> str | Sequence[str]:
        """Return the entry or slice of `defaults` selected by ``index``."""
        return self.defaults[index]

    @deprecated(
        "‘Butler.collections’ should no longer be used to get the list of default collections."
        " Use ‘Butler.collections.defaults’ instead. Will be removed after v28.",
        version="v28",
        category=FutureWarning,
    )
    def __len__(self) -> int:
        """Return the number of entries in `defaults`."""
        return len(self.defaults)

    @property
    @abstractmethod
    def defaults(self) -> Sequence[str]:
        """Collection defaults associated with this butler."""
        raise NotImplementedError("Defaults must be implemented by a subclass")

    def __str__(self) -> str:
        """Return string representation."""
        return f"{self.__class__.__name__}(defaults={self.defaults})"

    @abstractmethod
    def extend_chain(self, parent_collection_name: str, child_collection_names: str | Iterable[str]) -> None:
        """Add children to the end of a CHAINED collection.

        If any of the children already existed in the chain, they will be moved
        to the new position at the end of the chain.

        Parameters
        ----------
        parent_collection_name : `str`
            The name of a CHAINED collection to which we will add new children.
        child_collection_names : `~collections.abc.Iterable` [ `str` ] | `str`
            A child collection name or list of child collection names to be
            added to the parent.

        Raises
        ------
        MissingCollectionError
            If any of the specified collections do not exist.
        CollectionTypeError
            If the parent collection is not a CHAINED collection.
        CollectionCycleError
            If this operation would create a collection cycle.

        Notes
        -----
        If this function is called within a call to ``Butler.transaction``, it
        will hold a lock that prevents other processes from modifying the
        parent collection until the end of the transaction.  Keep these
        transactions short.
        """
        raise NotImplementedError()

    @abstractmethod
    def prepend_chain(self, parent_collection_name: str, child_collection_names: str | Iterable[str]) -> None:
        """Add children to the beginning of a CHAINED collection.

        If any of the children already existed in the chain, they will be moved
        to the new position at the beginning of the chain.

        Parameters
        ----------
        parent_collection_name : `str`
            The name of a CHAINED collection to which we will add new children.
        child_collection_names : `~collections.abc.Iterable` [ `str` ] | `str`
            A child collection name or list of child collection names to be
            added to the parent.

        Raises
        ------
        MissingCollectionError
            If any of the specified collections do not exist.
        CollectionTypeError
            If the parent collection is not a CHAINED collection.
        CollectionCycleError
            If this operation would create a collection cycle.

        Notes
        -----
        If this function is called within a call to ``Butler.transaction``, it
        will hold a lock that prevents other processes from modifying the
        parent collection until the end of the transaction.  Keep these
        transactions short.
        """
        raise NotImplementedError()

    @abstractmethod
    def redefine_chain(
        self, parent_collection_name: str, child_collection_names: str | Iterable[str]
    ) -> None:
        """Replace the contents of a CHAINED collection with new children.

        Parameters
        ----------
        parent_collection_name : `str`
            The name of a CHAINED collection to which we will assign new
            children.
        child_collection_names : `~collections.abc.Iterable` [ `str` ] | `str`
            A child collection name or list of child collection names to be
            assigned to the parent.

        Raises
        ------
        MissingCollectionError
            If any of the specified collections do not exist.
        CollectionTypeError
            If the parent collection is not a CHAINED collection.
        CollectionCycleError
            If this operation would create a collection cycle.

        Notes
        -----
        If this function is called within a call to ``Butler.transaction``, it
        will hold a lock that prevents other processes from modifying the
        parent collection until the end of the transaction.  Keep these
        transactions short.
        """
        raise NotImplementedError()

    @abstractmethod
    def remove_from_chain(
        self, parent_collection_name: str, child_collection_names: str | Iterable[str]
    ) -> None:
        """Remove children from a CHAINED collection.

        Parameters
        ----------
        parent_collection_name : `str`
            The name of a CHAINED collection from which we will remove
            children.
        child_collection_names : `~collections.abc.Iterable` [ `str` ] | `str`
            A child collection name or list of child collection names to be
            removed from the parent.

        Raises
        ------
        MissingCollectionError
            If any of the specified collections do not exist.
        CollectionTypeError
            If the parent collection is not a CHAINED collection.

        Notes
        -----
        If this function is called within a call to ``Butler.transaction``, it
        will hold a lock that prevents other processes from modifying the
        parent collection until the end of the transaction.  Keep these
        transactions short.
        """
        raise NotImplementedError()

    def query(
        self,
        expression: str | Iterable[str],
        collection_types: Set[CollectionType] | CollectionType | None = None,
        flatten_chains: bool = False,
        include_chains: bool | None = None,
    ) -> Sequence[str]:
        """Query the butler for collections matching an expression.

        Parameters
        ----------
        expression : `str` or `~collections.abc.Iterable` [ `str` ]
            One or more collection names or globs to include in the search.
        collection_types : `set` [`CollectionType`], `CollectionType` or `None`
            Restrict the types of collections to be searched. If `None` all
            collection types are searched.
        flatten_chains : `bool`, optional
            If `True` (`False` is default), recursively yield the child
            collections of matching `~CollectionType.CHAINED` collections.
        include_chains : `bool` or `None`, optional
            If `True`, yield records for matching `~CollectionType.CHAINED`
            collections. Default is the opposite of ``flatten_chains``:
            include either CHAINED collections or their children, but not both.

        Returns
        -------
        collections : `~collections.abc.Sequence` [ `str` ]
            The names of collections that match ``expression``.

        Notes
        -----
        The order in which collections are returned is unspecified, except that
        the children of a `~CollectionType.CHAINED` collection are guaranteed
        to be in the order in which they are searched.  When multiple parent
        `~CollectionType.CHAINED` collections match the same criteria, the
        order in which the two lists appear is unspecified, and the lists of
        children may be incomplete if a child has multiple parents.

        The default implementation is a wrapper around `query_info` that
        forwards the arguments above, leaves the remaining `query_info`
        options at their defaults, and keeps only the collection names.
        """
        collections_info = self.query_info(
            expression,
            collection_types=collection_types,
            flatten_chains=flatten_chains,
            include_chains=include_chains,
        )
        return [info.name for info in collections_info]

    @abstractmethod
    def query_info(
        self,
        expression: str | Iterable[str],
        collection_types: Set[CollectionType] | CollectionType | None = None,
        flatten_chains: bool = False,
        include_chains: bool | None = None,
        include_parents: bool = False,
        include_summary: bool = False,
        include_doc: bool = False,
        summary_datasets: Iterable[DatasetType] | Iterable[str] | None = None,
    ) -> Sequence[CollectionInfo]:
        """Query the butler for collections matching an expression and
        return detailed information about those collections.

        Parameters
        ----------
        expression : `str` or `~collections.abc.Iterable` [ `str` ]
            One or more collection names or globs to include in the search.
        collection_types : `set` [`CollectionType`], `CollectionType` or `None`
            Restrict the types of collections to be searched. If `None` all
            collection types are searched.
        flatten_chains : `bool`, optional
            If `True` (`False` is default), recursively yield the child
            collections of matching `~CollectionType.CHAINED` collections.
        include_chains : `bool` or `None`, optional
            If `True`, yield records for matching `~CollectionType.CHAINED`
            collections. Default is the opposite of ``flatten_chains``:
            include either CHAINED collections or their children, but not both.
        include_parents : `bool`, optional
            Whether the returned information includes parents.
        include_summary : `bool`, optional
            Whether the returned information includes dataset type and
            governor information for the collections.
        include_doc : `bool`, optional
            Whether the returned information includes collection documentation
            string.
        summary_datasets : `~collections.abc.Iterable` [ `DatasetType` ] or \
            `~collections.abc.Iterable` [ `str` ], optional
            Dataset types to include in returned summaries. Only used if
            ``include_summary`` is `True`. If not specified then all dataset
            types will be included.

        Returns
        -------
        collections : `~collections.abc.Sequence` [ `CollectionInfo` ]
            Information about the collections that match ``expression``.

        Notes
        -----
        The order in which collections are returned is unspecified, except that
        the children of a `~CollectionType.CHAINED` collection are guaranteed
        to be in the order in which they are searched.  When multiple parent
        `~CollectionType.CHAINED` collections match the same criteria, the
        order in which the two lists appear is unspecified, and the lists of
        children may be incomplete if a child has multiple parents.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_info(
        self, name: str, include_parents: bool = False, include_summary: bool = False
    ) -> CollectionInfo:
        """Obtain information for a specific collection.

        Parameters
        ----------
        name : `str`
            The name of the collection of interest.
        include_parents : `bool`, optional
            If `True` any parents of this collection will be included.
        include_summary : `bool`, optional
            If `True` dataset type names and governor dimensions of datasets
            stored in this collection will be included in the result.

        Returns
        -------
        info : `CollectionInfo`
            Information on the requested collection.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool:
        """Add a new collection if one with the given name does not exist.

        Parameters
        ----------
        name : `str`
            The name of the collection to create.
        type : `CollectionType`, optional
            Enum value indicating the type of collection to create. Default
            is to create a RUN collection.
        doc : `str`, optional
            Documentation string for the collection.

        Returns
        -------
        registered : `bool`
            Boolean indicating whether the collection was already registered
            or was created by this call.

        Notes
        -----
        Avoid calling this method multiple times within a `Butler.transaction`.
        If concurrent processes register the same collection names, they may
        block each other until the end of the transaction and in some cases the
        database will be required to abort one of the transactions to prevent
        deadlock.
        """
        raise NotImplementedError()

    @abstractmethod
    def x_remove(self, name: str) -> None:
        """Remove the given collection from the registry.

        **This is an experimental interface that can change at any time.**

        Parameters
        ----------
        name : `str`
            The name of the collection to remove.

        Raises
        ------
        lsst.daf.butler.registry.MissingCollectionError
            Raised if no collection with the given name exists.
        lsst.daf.butler.registry.OrphanedRecordError
            Raised if the database rows associated with the collection are
            still referenced by some other table, such as a dataset in a
            datastore (for `~CollectionType.RUN` collections only) or a
            `~CollectionType.CHAINED` collection of which this collection is
            a child.

        Notes
        -----
        If this is a `~CollectionType.RUN` collection, all datasets and quanta
        in it will be removed from the `Registry` database.  This requires
        that those datasets be removed (or at least trashed) from any
        datastores that hold them first.

        A collection may not be deleted as long as it is referenced by a
        `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
        be deleted or redefined first.
        """
        raise NotImplementedError()

    def _filter_dataset_types(
        self, dataset_types: Iterable[str], collections: Iterable[CollectionInfo]
    ) -> Iterable[str]:
        """Keep only the dataset types that appear in the given collections.

        Parameters
        ----------
        dataset_types : `~collections.abc.Iterable` [`str`]
            Candidate dataset type names.
        collections : `~collections.abc.Iterable` [`CollectionInfo`]
            Collection infos, must contain dataset type summary.

        Returns
        -------
        filtered : `~collections.abc.Iterable` [`str`]
            The candidate names that are listed in the ``dataset_types``
            summary of at least one of the given collections. The result is
            a `set`, so the input order is not preserved.

        Raises
        ------
        RuntimeError
            Raised if any collection info has no dataset type summary.
        """
        dataset_types_set = set(dataset_types)
        # Build the union over every collection first; intersecting inside
        # the loop would drop types that only occur in later collections.
        collection_dataset_types: set[str] = set()
        for info in collections:
            if info.dataset_types is None:
                raise RuntimeError("Can only filter by collections if include_summary was True")
            collection_dataset_types.update(info.dataset_types)
        return dataset_types_set & collection_dataset_types

    def _group_by_dataset_type(
        self, dataset_types: Set[str], collection_infos: Iterable[CollectionInfo]
    ) -> Mapping[str, list[str]]:
        """Filter dataset types and collections names based on summary in
        collection infos.

        Parameters
        ----------
        dataset_types : `~collections.abc.Set` [`str`]
            Set of dataset type names to extract.
        collection_infos : `~collections.abc.Iterable` [`CollectionInfo`]
            Collection infos, must contain dataset type summary.

        Returns
        -------
        filtered : `~collections.abc.Mapping` [`str`, `list` [`str`]]
            Mapping of the dataset type name to its corresponding list of
            collection names. Collection names appear in the order in which
            ``collection_infos`` was iterated. The returned object is a
            `~collections.defaultdict`, so indexing it with a missing key
            inserts and returns an empty list.

        Raises
        ------
        RuntimeError
            Raised if any collection info has no dataset type summary.
        """
        # Although this is marked as private, it is called from outside this
        # class by other functions internal to daf_butler.
        dataset_type_collections: dict[str, list[str]] = defaultdict(list)
        for info in collection_infos:
            if info.dataset_types is None:
                raise RuntimeError("Can only filter by collections if include_summary was True")
            # Only the requested dataset types present in this collection.
            for dataset_type in info.dataset_types & dataset_types:
                dataset_type_collections[dataset_type].append(info.name)
        return dataset_type_collections