Coverage for python/lsst/daf/butler/registry/wildcards.py: 17%

194 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2022-09-27 02:00 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ( 

24 "CategorizedWildcard", 

25 "CollectionQuery", 

26 "CollectionSearch", 

27) 

28 

29import re 

30from collections.abc import Callable, Iterator, Sequence, Set 

31from dataclasses import dataclass 

32from typing import TYPE_CHECKING, Any 

33 

34from lsst.utils.ellipsis import Ellipsis, EllipsisType 

35from lsst.utils.iteration import ensure_iterable 

36from pydantic import BaseModel 

37 

38from ..core import DatasetType 

39from ..core.utils import globToRegex 

40from ._collectionType import CollectionType 

41 

42if TYPE_CHECKING: 42 ↛ 43line 42 didn't jump to line 43, because the condition on line 42 was never true

43 from .interfaces import CollectionManager, CollectionRecord 

44 

45 

@dataclass
class CategorizedWildcard:
    """The results of preprocessing a wildcard expression to separate match
    patterns from strings.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).
    """

    @classmethod
    def fromExpression(
        cls,
        expression: Any,
        *,
        allowAny: bool = True,
        allowPatterns: bool = True,
        coerceUnrecognized: Callable[[Any], tuple[str, Any] | str] | None = None,
        coerceItemValue: Callable[[Any], Any] | None = None,
        defaultItemValue: Any | None = None,
    ) -> CategorizedWildcard | EllipsisType:
        """Categorize a wildcard expression.

        Parameters
        ----------
        expression
            The expression to categorize.  Must not be `None`.  May be any of:

            - `str` (including glob patterns if ``allowPatterns`` is `True`);
            - `re.Pattern` (only if ``allowPatterns`` is `True`);
            - objects recognized by ``coerceUnrecognized`` (if provided);
            - two-element tuples of (`str`, value) where value is recognized
              by ``coerceItemValue`` (if provided);
            - a non-`str`, non-mapping iterable containing any of the above;
            - the special value `...` (only if ``allowAny`` is `True`), which
              matches anything;
            - a mapping from `str` to a value that is recognized by
              ``coerceItemValue`` (if provided);
            - a `CategorizedWildcard` instance (passed through unchanged if
              it meets the requirements specified by keyword arguments).
        allowAny : `bool`, optional
            If `False` (`True` is default) raise `TypeError` if `...` is
            encountered.
        allowPatterns : `bool`, optional
            If `False` (`True` is default) raise `TypeError` if a `re.Pattern`
            is encountered, or if ``expression`` is a `CategorizedWildcard`
            with `patterns` not empty.
        coerceUnrecognized : `Callable`, optional
            A callback that takes a single argument of arbitrary type and
            returns either a `str` - handled like any other string in the
            expression - or a `tuple` of (`str`, `Any`) to be appended to
            `items`.  This will be called on objects of unrecognized type.
            Exceptions will be reraised as `TypeError` (and chained).
        coerceItemValue : `Callable`, optional
            If provided, ``expression`` may be a mapping from `str` to any
            type that can be passed to this function; the result of that call
            will be stored instead as the value in ``self.items``.
        defaultItemValue : `Any`, optional
            If provided, combine this value with any string values encountered
            (including any returned by ``coerceUnrecognized``) to form a
            `tuple` and add it to `items`, guaranteeing that `strings` will be
            empty.  Patterns are never added to `items`.

        Returns
        -------
        categorized : `CategorizedWildcard` or ``...``.
            The struct describing the wildcard.  ``...`` is passed through
            unchanged.

        Raises
        ------
        TypeError
            Raised if an unsupported type is found in the expression, or if
            the expression is unconstrained (``...``) while ``allowAny`` is
            `False`.
        """
        assert expression is not None
        # See if we were given ...; just return that if we were.
        if expression is Ellipsis:
            if not allowAny:
                raise TypeError("This expression may not be unconstrained.")
            return Ellipsis
        if isinstance(expression, cls):
            # This is already a CategorizedWildcard.  Make sure it meets the
            # reqs. implied by the kwargs we got.
            if not allowPatterns and expression.patterns:
                raise TypeError(
                    f"Regular expression(s) {expression.patterns} are not allowed in this context."
                )
            if defaultItemValue is not None and expression.strings:
                if expression.items:
                    raise TypeError(
                        "Incompatible preprocessed expression: an ordered sequence of str is "
                        "needed, but the original order was lost in the preprocessing."
                    )
                return cls(
                    strings=[],
                    patterns=expression.patterns,
                    items=[(k, defaultItemValue) for k in expression.strings],
                )
            elif defaultItemValue is None and expression.items:
                if expression.strings:
                    raise TypeError(
                        "Incompatible preprocessed expression: an ordered sequence of items is "
                        "needed, but the original order was lost in the preprocessing."
                    )
                return cls(strings=[k for k, _ in expression.items], patterns=expression.patterns, items=[])
            else:
                # Original expression was created with keyword arguments that
                # were at least as restrictive as what we just got; pass it
                # through.
                return expression

        # If we get here, we know we'll be creating a new instance.
        # Initialize an empty one now.
        self = cls(strings=[], patterns=[], items=[])

        # If mappings are allowed, see if we were given a single mapping by
        # trying to get items.
        if coerceItemValue is not None:
            rawItems = None
            try:
                rawItems = expression.items()
            except AttributeError:
                pass
            if rawItems is not None:
                for k, v in rawItems:
                    try:
                        self.items.append((k, coerceItemValue(v)))
                    except Exception as err:
                        raise TypeError(f"Could not coerce mapping value '{v}' for key '{k}'.") from err
                return self

        # Not ..., a CategorizedWildcard instance, or a mapping.  Just
        # process scalars or an iterable.  We put the body of the loop inside
        # a local function so we can recurse after coercion.

        def process(element: Any, alreadyCoerced: bool = False) -> EllipsisType | None:
            # Returns Ellipsis if the element matches everything, and None
            # after the element has been recorded on ``self``.
            if isinstance(element, str):
                if defaultItemValue is not None:
                    self.items.append((element, defaultItemValue))
                    return None
                else:
                    # This returns a list but we know we only passed in
                    # single value.
                    converted = globToRegex(element)
                    if converted is Ellipsis:
                        return Ellipsis
                    element = converted[0]
                    # Let regex and ... go through to the next check
                    if isinstance(element, str):
                        self.strings.append(element)
                        return None
            if allowPatterns and isinstance(element, re.Pattern):
                self.patterns.append(element)
                return None
            if coerceItemValue is not None:
                try:
                    k, v = element
                except (TypeError, ValueError):
                    # TypeError: not iterable.  ValueError: iterable, but not
                    # exactly two elements.  Either way this is not an item
                    # tuple; fall through so it is reported (or coerced) like
                    # any other unrecognized object rather than leaking a
                    # bare unpacking error.
                    pass
                else:
                    if not alreadyCoerced:
                        if not isinstance(k, str):
                            raise TypeError(f"Item key '{k}' is not a string.")
                        try:
                            v = coerceItemValue(v)
                        except Exception as err:
                            raise TypeError(
                                f"Could not coerce tuple item value '{v}' for key '{k}'."
                            ) from err
                    self.items.append((k, v))
                    return None
            if alreadyCoerced:
                raise TypeError(f"Object '{element!r}' returned by coercion function is still unrecognized.")
            if coerceUnrecognized is not None:
                try:
                    # This should be safe but flake8 cant tell that the
                    # function will be re-declared next function call.
                    # The result is returned so that a coerced string that
                    # matches everything is reported to the caller instead of
                    # being silently dropped.
                    return process(coerceUnrecognized(element), alreadyCoerced=True)  # noqa: F821
                except Exception as err:
                    raise TypeError(f"Could not coerce expression element '{element!r}'.") from err
            else:
                extra = "."
                if isinstance(element, re.Pattern):
                    extra = " and patterns are not allowed."
                raise TypeError(f"Unsupported object in wildcard expression: '{element!r}'{extra}")

        for element in ensure_iterable(expression):
            retval = process(element)
            if retval is Ellipsis:
                # One of the globs matched everything
                if not allowAny:
                    raise TypeError("This expression may not be unconstrained.")
                return Ellipsis
        # ``process`` refers to itself through its closure; unbind it once the
        # loop is finished.
        del process
        return self

    strings: list[str]
    """Explicit string values found in the wildcard (`list` [ `str` ]).
    """

    patterns: list[re.Pattern]
    """Regular expression patterns found in the wildcard
    (`list` [ `re.Pattern` ]).
    """

    items: list[tuple[str, Any]]
    """Two-item tuples that relate string values to other objects
    (`list` [ `tuple` [ `str`, `Any` ] ]).
    """

256 

257 

def _yieldCollectionRecords(
    manager: CollectionManager,
    record: CollectionRecord,
    collectionTypes: Set[CollectionType] = CollectionType.all(),
    done: set[str] | None = None,
    flattenChains: bool = True,
    includeChains: bool | None = None,
) -> Iterator[CollectionRecord]:
    """A helper function containing common logic for `CollectionSearch.iter`
    and `CollectionQuery.iter`: recursively yield `CollectionRecord` only if
    they match the criteria given in other arguments.

    Parameters
    ----------
    manager : `CollectionManager`
        Object responsible for managing the collection tables in a `Registry`.
    record : `CollectionRecord`
        Record to conditionally yield.
    collectionTypes : `AbstractSet` [ `CollectionType` ], optional
        If provided, only yield collections of these types.
    done : `set` [ `str` ], optional
        A `set` of already-processed collection names.  ``record.name`` is
        added to it when ``record`` has one of the requested types or is a
        `~CollectionType.CHAINED` collection being flattened, and the same
        set is handed to the search over a chain's children.  This function
        does not itself test whether ``record`` is already in ``done``; a
        caller that passes ``done`` should make that check before calling.
        If not provided, a new empty `set` is used.
    flattenChains : `bool`, optional
        If `True` (default) recursively yield the child collections of
        `~CollectionType.CHAINED` collections.
    includeChains : `bool`, optional
        If `True`, yield records for `~CollectionType.CHAINED` collections
        themselves.  The default is the opposite of ``flattenChains``: either
        return records for CHAINED collections or their children, but not both.

    Yields
    ------
    record : `CollectionRecord`
        Matching collection records.
    """
    if done is None:
        done = set()
    if includeChains is None:
        includeChains = not flattenChains
    isChain = record.type is CollectionType.CHAINED
    if record.type in collectionTypes:
        # Record the name before yielding so that it is already in ``done``
        # when the consumer resumes this generator.
        done.add(record.name)
        if includeChains or not isChain:
            yield record
    if flattenChains and isChain:
        # The chain may not have matched ``collectionTypes`` above, so make
        # sure it is marked before descending into its children.
        done.add(record.name)
        # We know this is a ChainedCollectionRecord because of the enum value,
        # but MyPy doesn't.
        yield from record.children.iter(  # type: ignore
            manager,
            collectionTypes=collectionTypes,
            done=done,
            flattenChains=flattenChains,
            includeChains=includeChains,
        )

313 

314 

class CollectionSearch(BaseModel, Sequence[str]):
    """An ordered search path of collections.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    __root__ : `tuple` [ `str` ]
        Tuple of collection names, ordered from the first searched to the last
        searched.

    Notes
    -----
    A `CollectionSearch` is used to find a single dataset (or set of datasets
    with different dataset types or data IDs) according to its dataset type and
    data ID, giving preference to collections in the order in which they are
    specified.  A `CollectionQuery` can be constructed from a broader range of
    expressions but does not order the collections to be searched.

    `CollectionSearch` is an immutable sequence of `str` collection names.

    A `CollectionSearch` instance constructed properly (e.g. via
    `fromExpression`) is a unique representation of a particular search path;
    it is exactly the same internally and compares as equal to any
    `CollectionSearch` constructed from an equivalent expression, regardless of
    how different the original expressions appear.
    """

    __root__: tuple[str, ...]

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionSearch:
        """Process a general expression to construct a `CollectionSearch`
        instance.

        Parameters
        ----------
        expression
            May be:

            - a `str` collection name;
            - an iterable of `str` collection names;
            - another `CollectionSearch` instance (passed through
              unchanged).

            Duplicate entries will be removed (preserving the first appearance
            of each collection name).

        Returns
        -------
        collections : `CollectionSearch`
            A `CollectionSearch` instance.
        """
        # First see if this is already a CollectionSearch; just pass that
        # through unchanged.  This lets us standardize expressions (and turn
        # single-pass iterators into multi-pass iterables) in advance and pass
        # them down to other routines that accept arbitrary expressions.
        if isinstance(expression, cls):
            return expression
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=False,
            allowPatterns=False,
        )
        assert wildcard is not Ellipsis
        assert not wildcard.patterns
        assert not wildcard.items
        # dict preserves insertion order, so this keeps the first appearance
        # of each name without a linear scan per element.
        return cls(__root__=tuple(dict.fromkeys(wildcard.strings)))

    def iter(
        self,
        manager: CollectionManager,
        *,
        datasetType: DatasetType | None = None,
        collectionTypes: Set[CollectionType] = CollectionType.all(),
        done: set[str] | None = None,
        flattenChains: bool = True,
        includeChains: bool | None = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.findDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        datasetType : `DatasetType`, optional
            Accepted but not used by this method.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        done : `set`, optional
            A `set` containing the names of all collections already yielded;
            any collections whose names are already present in this set will
            not be yielded again, and those yielded will be added to it while
            iterating.  If not provided, an empty `set` will be created and
            used internally to avoid duplicates.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, yield records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        if done is None:
            done = set()
        for name in self:
            if name not in done:
                yield from _yieldCollectionRecords(
                    manager,
                    manager.find(name),
                    collectionTypes=collectionTypes,
                    done=done,
                    flattenChains=flattenChains,
                    includeChains=includeChains,
                )

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly."""
        yield from self.__root__

    def __iter__(self) -> Iterator[str]:  # type: ignore
        yield from self.__root__

    def __len__(self) -> int:
        return len(self.__root__)

    def __getitem__(self, index: Any) -> str:
        return self.__root__[index]

    def __eq__(self, other: Any) -> bool:
        # Only another CollectionSearch with the same ordered names is equal.
        if isinstance(other, CollectionSearch):
            return self.__root__ == other.__root__
        return False

    def __str__(self) -> str:
        return "[{}]".format(", ".join(self))

    def __repr__(self) -> str:
        return f"CollectionSearch({self.__root__!r})"

468 

469 

class CollectionQuery:
    """An unordered query for collections and dataset type restrictions.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    search : `CollectionSearch` or `...`
        An object representing an ordered search for explicitly-named
        collections (to be interpreted here as unordered), or the special
        value `...` indicating all collections.  When ``search`` is `...`,
        ``patterns`` is not consulted.
    patterns : `tuple` of `re.Pattern`
        Regular expression patterns to match against collection names.
        Defaults to an empty `tuple`.

    Notes
    -----
    A `CollectionQuery` is used to find all matching datasets in any number
    of collections, or to find collections themselves.

    `CollectionQuery` is expected to be rarely used outside of `Registry`
    (which uses it to back several of its "query" methods that take general
    expressions for collections), but it may occasionally be useful outside
    `Registry` as a way to preprocess expressions that contain single-pass
    iterators into a form that can be used to call those `Registry` methods
    multiple times.
    """

    __slots__ = ("_search", "_patterns")

    def __init__(
        self,
        search: CollectionSearch | EllipsisType = Ellipsis,
        patterns: tuple[re.Pattern, ...] = (),
    ):
        self._search = search
        self._patterns = patterns

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionQuery:
        """Process a general expression to construct a `CollectionQuery`
        instance.

        Parameters
        ----------
        expression
            May be:

            - a `str` collection name;
            - an `re.Pattern` instance to match (with `re.Pattern.fullmatch`)
              against collection names;
            - any iterable containing any of the above;
            - the special value `...`, which matches all collections;
            - a `CollectionSearch` instance;
            - another `CollectionQuery` instance (passed through unchanged).

            Duplicate collection names will be removed (preserving the first
            appearance of each collection name).

        Returns
        -------
        collections : `CollectionQuery`
            A `CollectionQuery` instance.
        """
        if isinstance(expression, cls):
            return expression
        if expression is Ellipsis:
            return cls()
        if isinstance(expression, CollectionSearch):
            return cls(search=expression, patterns=())
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=True,
            allowPatterns=True,
        )
        if wildcard is Ellipsis:
            # A glob in the expression matched everything.
            return cls()
        assert (
            not wildcard.items
        ), "We should no longer be transforming to (str, DatasetTypeRestriction) tuples."
        return cls(
            search=CollectionSearch.fromExpression(wildcard.strings),
            patterns=tuple(wildcard.patterns),
        )

    def iter(
        self,
        manager: CollectionManager,
        *,
        collectionTypes: Set[CollectionType] = CollectionType.all(),
        flattenChains: bool = True,
        includeChains: bool | None = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in an arbitrary order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.queryDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, yield records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        # One set of processed names is shared by every branch below so that
        # a collection reached both directly and as the child of a flattened
        # chain is only yielded once.
        done: set[str] = set()
        if self._search is Ellipsis:
            for record in manager:
                if record.name not in done:
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )
        else:
            # Explicitly-named collections first, then anything else known to
            # the manager whose name fully matches one of the patterns.
            yield from self._search.iter(
                manager,
                collectionTypes=collectionTypes,
                done=done,
                flattenChains=flattenChains,
                includeChains=includeChains,
            )
            for record in manager:
                if record.name not in done and any(p.fullmatch(record.name) for p in self._patterns):
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly."""
        if isinstance(self._search, CollectionSearch):
            yield from self._search.explicitNames()

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionQuery):
            return self._search == other._search and self._patterns == other._patterns
        else:
            return False

    def __str__(self) -> str:
        if self._search is Ellipsis:
            return "..."
        else:
            terms = list(self._search)
            terms.extend(str(p) for p in self._patterns)
            return "[{}]".format(", ".join(terms))

    def __repr__(self) -> str:
        return f"CollectionQuery({self._search!r}, {self._patterns!r})"