Coverage for python/lsst/daf/butler/registry/wildcards.py: 25%

220 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-10-27 09:44 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

# Public names of this module: the ones exported by a star-import.
__all__ = (
    "CategorizedWildcard",
    "CollectionWildcard",
    "CollectionSearch",
    "DatasetTypeWildcard",
)

35 

36import contextlib 

37import dataclasses 

38import re 

39from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence 

40from types import EllipsisType 

41from typing import Any 

42 

43from deprecated.sphinx import deprecated 

44from lsst.daf.butler._compat import PYDANTIC_V2 

45from lsst.utils.iteration import ensure_iterable 

46 

47from .._dataset_type import DatasetType 

48from ..utils import globToRegex 

49from ._exceptions import CollectionExpressionError, DatasetTypeExpressionError 

50 

51 

52@dataclasses.dataclass 

53class CategorizedWildcard: 

54 """The results of preprocessing a wildcard expression to separate match 

55 patterns from strings. 

56 

57 The `fromExpression` method should almost always be used to construct 

58 instances, as the regular constructor performs no checking of inputs (and 

59 that can lead to confusing error messages downstream). 

60 """ 

61 

62 @classmethod 

63 def fromExpression( 

64 cls, 

65 expression: Any, 

66 *, 

67 allowAny: bool = True, 

68 allowPatterns: bool = True, 

69 coerceUnrecognized: Callable[[Any], tuple[str, Any] | str] | None = None, 

70 coerceItemValue: Callable[[Any], Any] | None = None, 

71 defaultItemValue: Any | None = None, 

72 ) -> CategorizedWildcard | EllipsisType: 

73 """Categorize a wildcard expression. 

74 

75 Parameters 

76 ---------- 

77 expression 

78 The expression to categorize. May be any of: 

79 - `str` (including glob patterns if ``allowPatterns`` is `True`); 

80 - `re.Pattern` (only if ``allowPatterns`` is `True`); 

81 - objects recognized by ``coerceUnrecognized`` (if provided); 

82 - two-element tuples of (`str`, value) where value is recognized 

83 by ``coerceItemValue`` (if provided); 

84 - a non-`str`, non-mapping iterable containing any of the above; 

85 - the special value `...` (only if ``allowAny`` is `True`), which 

86 matches anything; 

87 - a mapping from `str` to a value are recognized by 

88 ``coerceItemValue`` (if provided); 

89 - a `CategorizedWildcard` instance (passed through unchanged if 

90 it meets the requirements specified by keyword arguments). 

91 allowAny: `bool`, optional 

92 If `False` (`True` is default) raise `TypeError` if `...` is 

93 encountered. 

94 allowPatterns: `bool`, optional 

95 If `False` (`True` is default) raise `TypeError` if a `re.Pattern` 

96 is encountered, or if ``expression`` is a `CategorizedWildcard` 

97 with `patterns` not empty. 

98 coerceUnrecognized: `~collections.abc.Callable`, optional 

99 A callback that takes a single argument of arbitrary type and 

100 returns either a `str` - appended to `strings` - or a `tuple` of 

101 (`str`, `Any`) to be appended to `items`. This will be called on 

102 objects of unrecognized type. Exceptions will be reraised as 

103 `TypeError` (and chained). 

104 coerceItemValue: `~collections.abc.Callable`, optional 

105 If provided, ``expression`` may be a mapping from `str` to any 

106 type that can be passed to this function; the result of that call 

107 will be stored instead as the value in ``self.items``. 

108 defaultItemValue: `Any`, optional 

109 If provided, combine this value with any string values encountered 

110 (including any returned by ``coerceUnrecognized``) to form a 

111 `tuple` and add it to `items`, guaranteeing that `strings` will be 

112 empty. Patterns are never added to `items`. 

113 

114 Returns 

115 ------- 

116 categorized : `CategorizedWildcard` or ``...``. 

117 The struct describing the wildcard. ``...`` is passed through 

118 unchanged. 

119 

120 Raises 

121 ------ 

122 TypeError 

123 Raised if an unsupported type is found in the expression. 

124 """ 

125 assert expression is not None 

126 # See if we were given ...; just return that if we were. 

127 if expression is ...: 

128 if not allowAny: 

129 raise TypeError("This expression may not be unconstrained.") 

130 return ... 

131 if isinstance(expression, cls): 

132 # This is already a CategorizedWildcard. Make sure it meets the 

133 # reqs. implied by the kwargs we got. 

134 if not allowPatterns and expression.patterns: 

135 raise TypeError( 

136 f"Regular expression(s) {expression.patterns} are not allowed in this context." 

137 ) 

138 if defaultItemValue is not None and expression.strings: 

139 if expression.items: 

140 raise TypeError( 

141 "Incompatible preprocessed expression: an ordered sequence of str is " 

142 "needed, but the original order was lost in the preprocessing." 

143 ) 

144 return cls( 

145 strings=[], 

146 patterns=expression.patterns, 

147 items=[(k, defaultItemValue) for k in expression.strings], 

148 ) 

149 elif defaultItemValue is None and expression.items: 

150 if expression.strings: 

151 raise TypeError( 

152 "Incompatible preprocessed expression: an ordered sequence of items is " 

153 "needed, but the original order was lost in the preprocessing." 

154 ) 

155 return cls(strings=[k for k, _ in expression.items], patterns=expression.patterns, items=[]) 

156 else: 

157 # Original expression was created with keyword arguments that 

158 # were at least as restrictive as what we just got; pass it 

159 # through. 

160 return expression 

161 

162 # If we get here, we know we'll be creating a new instance. 

163 # Initialize an empty one now. 

164 self = cls(strings=[], patterns=[], items=[]) 

165 

166 # If mappings are allowed, see if we were given a single mapping by 

167 # trying to get items. 

168 if coerceItemValue is not None: 

169 rawItems = None 

170 with contextlib.suppress(AttributeError): 

171 rawItems = expression.items() 

172 

173 if rawItems is not None: 

174 for k, v in rawItems: 

175 try: 

176 self.items.append((k, coerceItemValue(v))) 

177 except Exception as err: 

178 raise TypeError(f"Could not coerce mapping value '{v}' for key '{k}'.") from err 

179 return self 

180 

181 # Not ..., a CategorizedWildcard instance, or a mapping. Just 

182 # process scalars or an iterable. We put the body of the loop inside 

183 # a local function so we can recurse after coercion. 

184 

185 def process(element: Any, alreadyCoerced: bool = False) -> EllipsisType | None: 

186 if isinstance(element, str): 

187 if defaultItemValue is not None: 

188 self.items.append((element, defaultItemValue)) 

189 return None 

190 else: 

191 # This returns a list but we know we only passed in 

192 # single value. 

193 converted = globToRegex(element) 

194 if converted is ...: 

195 return ... 

196 element = converted[0] 

197 # Let regex and ... go through to the next check 

198 if isinstance(element, str): 

199 self.strings.append(element) 

200 return None 

201 if allowPatterns and isinstance(element, re.Pattern): 

202 self.patterns.append(element) 

203 return None 

204 if alreadyCoerced: 

205 try: 

206 k, v = element 

207 except TypeError: 

208 raise TypeError( 

209 f"Object '{element!r}' returned by coercion function must be `str` or `tuple`." 

210 ) from None 

211 else: 

212 self.items.append((k, v)) 

213 return None 

214 if coerceItemValue is not None: 

215 try: 

216 k, v = element 

217 except TypeError: 

218 pass 

219 else: 

220 if not isinstance(k, str): 

221 raise TypeError(f"Item key '{k}' is not a string.") 

222 try: 

223 v = coerceItemValue(v) 

224 except Exception as err: 

225 raise TypeError(f"Could not coerce tuple item value '{v}' for key '{k}'.") from err 

226 self.items.append((k, v)) 

227 return None 

228 if coerceUnrecognized is not None: 

229 try: 

230 # This should be safe but flake8 cant tell that the 

231 # function will be re-declared next function call 

232 process(coerceUnrecognized(element), alreadyCoerced=True) # noqa: F821 

233 except Exception as err: 

234 raise TypeError(f"Could not coerce expression element '{element!r}'.") from err 

235 else: 

236 extra = "." 

237 if isinstance(element, re.Pattern): 

238 extra = " and patterns are not allowed." 

239 raise TypeError(f"Unsupported object in wildcard expression: '{element!r}'{extra}") 

240 return None 

241 

242 for element in ensure_iterable(expression): 

243 retval = process(element) 

244 if retval is ...: 

245 # One of the globs matched everything 

246 if not allowAny: 

247 raise TypeError("This expression may not be unconstrained.") 

248 return ... 

249 del process 

250 return self 

251 

252 strings: list[str] 

253 """Explicit string values found in the wildcard (`list` [ `str` ]). 

254 """ 

255 

256 patterns: list[re.Pattern] 

257 """Regular expression patterns found in the wildcard 

258 (`list` [ `re.Pattern` ]). 

259 """ 

260 

261 items: list[tuple[str, Any]] 

262 """Two-item tuples that relate string values to other objects 

263 (`list` [ `tuple` [ `str`, `Any` ] ]). 

264 """ 

265 

266 

# Define the private base model for `CollectionSearch` in whichever way the
# installed pydantic major version supports.  Both variants expose the
# underlying tuple of names through a ``root`` attribute.
if PYDANTIC_V2:
    from pydantic import RootModel  # type: ignore

    class _CollectionSearch(RootModel):
        # With RootModel the wrapped value is stored directly as ``root``.
        root: tuple[str, ...]

else:
    from pydantic import BaseModel

    class _CollectionSearch(BaseModel, Sequence[str]):  # type: ignore
        # The wrapped value is stored in the ``__root__`` field here.
        __root__: tuple[str, ...]

        @property
        def root(self) -> tuple[str, ...]:
            # Alias so callers can use ``.root`` in both branches.
            return self.__root__

282 

283 

@deprecated(
    reason="Tuples of string collection names are now preferred. Will be removed after v26.",
    version="v25.0",
    category=FutureWarning,
)
class CollectionSearch(_CollectionSearch):
    """An ordered search path of collections.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    collections : `tuple` [ `str` ]
        Tuple of collection names, ordered from the first searched to the last
        searched.

    Notes
    -----
    A `CollectionSearch` is used to find a single dataset (or set of datasets
    with different dataset types or data IDs) according to its dataset type and
    data ID, giving preference to collections in the order in which they are
    specified.  A `CollectionWildcard` can be constructed from a broader range
    of expressions but does not order the collections to be searched.

    `CollectionSearch` is an immutable sequence of `str` collection names.

    A `CollectionSearch` instance constructed properly (e.g. via
    `fromExpression`) is a unique representation of a particular search path;
    it is exactly the same internally and compares as equal to any
    `CollectionSearch` constructed from an equivalent expression, regardless of
    how different the original expressions appear.
    """

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionSearch:
        """Build a `CollectionSearch` from a general expression.

        Parameters
        ----------
        expression
            One of:
             - a single `str` collection name;
             - an iterable of `str` collection names;
             - an existing `CollectionSearch`, which is returned as-is.

            Repeated names are dropped, keeping only the first occurrence of
            each.

        Returns
        -------
        collections : `CollectionSearch`
            A `CollectionSearch` instance.

        Raises
        ------
        CollectionExpressionError
            Raised if `CategorizedWildcard.fromExpression` rejects the
            expression with `TypeError` (``...`` and patterns are disallowed
            here).
        """
        # An existing instance is already standardized; hand it back so
        # callers can pre-process an expression once and reuse the result.
        if isinstance(expression, cls):
            return expression
        try:
            categorized = CategorizedWildcard.fromExpression(
                expression,
                allowAny=False,
                allowPatterns=False,
            )
        except TypeError as err:
            raise CollectionExpressionError(str(err)) from None
        assert categorized is not ...
        assert not categorized.patterns
        assert not categorized.items
        # dict keys keep insertion order, so this retains the first
        # appearance of every name.
        unique_names = tuple(dict.fromkeys(categorized.strings))
        if PYDANTIC_V2:
            return cls(unique_names)  # type: ignore
        return cls(__root__=unique_names)  # type: ignore

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly."""
        for name in self.root:
            yield name

    def __iter__(self) -> Iterator[str]:  # type: ignore
        for name in self.root:
            yield name

    def __len__(self) -> int:
        return len(self.root)

    def __getitem__(self, index: Any) -> str:
        return self.root[index]

    def __eq__(self, other: Any) -> bool:
        # Only another CollectionSearch with the same names, in the same
        # order, compares equal.
        return isinstance(other, CollectionSearch) and self.root == other.root

    def __str__(self) -> str:
        joined = ", ".join(self)
        return "[{}]".format(joined)

    def __repr__(self) -> str:
        return f"CollectionSearch({self.root!r})"

391 

392 

393@dataclasses.dataclass(frozen=True) 

394class CollectionWildcard: 

395 """A validated wildcard for collection names. 

396 

397 The `from_expression` method should almost always be used to construct 

398 instances, as the regular constructor performs no checking of inputs (and 

399 that can lead to confusing error messages downstream). 

400 

401 Notes 

402 ----- 

403 `CollectionWildcard` is expected to be rarely used outside of `Registry` 

404 (which uses it to back several of its "query" methods that take general 

405 expressions for collections), but it may occasionally be useful outside 

406 `Registry` as a way to preprocess expressions that contain single-pass 

407 iterators into a form that can be used to call those `Registry` methods 

408 multiple times. 

409 """ 

410 

411 strings: tuple[str, ...] = () 

412 """An an ordered list of explicitly-named collections. (`tuple` [ `str` ]). 

413 """ 

414 

415 patterns: tuple[re.Pattern, ...] | EllipsisType = ... 

416 """Regular expression patterns to match against collection names, or the 

417 special value ``...`` indicating all collections. 

418 

419 `...` must be accompanied by ``strings=()``. 

420 """ 

421 

422 def __post_init__(self) -> None: 

423 if self.patterns is ... and self.strings: 

424 raise ValueError( 

425 f"Collection wildcard matches any string, but still has explicit strings {self.strings}." 

426 ) 

427 

428 @classmethod 

429 def from_expression(cls, expression: Any, require_ordered: bool = False) -> CollectionWildcard: 

430 """Process a general expression to construct a `CollectionWildcard` 

431 instance. 

432 

433 Parameters 

434 ---------- 

435 expression 

436 May be: 

437 - a `str` collection name; 

438 - an `re.Pattern` instance to match (with `re.Pattern.fullmatch`) 

439 against collection names; 

440 - any iterable containing any of the above; 

441 - another `CollectionWildcard` instance (passed through 

442 unchanged). 

443 

444 Duplicate collection names will be removed (preserving the first 

445 appearance of each collection name). 

446 require_ordered : `bool`, optional 

447 If `True` (`False` is default) require the expression to be 

448 ordered, and raise `CollectionExpressionError` if it is not. 

449 

450 Returns 

451 ------- 

452 wildcard : `CollectionWildcard` 

453 A `CollectionWildcard` instance. 

454 

455 Raises 

456 ------ 

457 CollectionExpressionError 

458 Raised if the patterns has regular expression, glob patterns, or 

459 the ``...`` wildcard, and ``require_ordered=True``. 

460 """ 

461 if isinstance(expression, cls): 

462 return expression 

463 if expression is ...: 

464 return cls() 

465 wildcard = CategorizedWildcard.fromExpression( 

466 expression, 

467 allowAny=True, 

468 allowPatterns=True, 

469 ) 

470 if wildcard is ...: 

471 return cls() 

472 result = cls( 

473 strings=tuple(wildcard.strings), 

474 patterns=tuple(wildcard.patterns), 

475 ) 

476 if require_ordered: 

477 result.require_ordered() 

478 return result 

479 

480 @classmethod 

481 def from_names(cls, names: Iterable[str]) -> CollectionWildcard: 

482 """Construct from an iterable of explicit collection names. 

483 

484 Parameters 

485 ---------- 

486 names : `~collections.abc.Iterable` [ `str` ] 

487 Iterable of collection names. 

488 

489 Returns 

490 ------- 

491 wildcard : ~CollectionWildcard` 

492 A `CollectionWildcard` instance. `require_ordered` is guaranteed 

493 to succeed and return the given names in order. 

494 """ 

495 return cls(strings=tuple(names), patterns=()) 

496 

497 def require_ordered(self) -> tuple[str, ...]: 

498 """Require that this wildcard contains no patterns, and return the 

499 ordered tuple of names that it does hold. 

500 

501 Returns 

502 ------- 

503 names : `tuple` [ `str` ] 

504 Ordered tuple of collection names. 

505 

506 Raises 

507 ------ 

508 CollectionExpressionError 

509 Raised if the patterns has regular expression, glob patterns, or 

510 the ``...`` wildcard. 

511 """ 

512 if self.patterns: 

513 raise CollectionExpressionError( 

514 f"An ordered collection expression is required; got patterns {self.patterns}." 

515 ) 

516 return self.strings 

517 

518 def empty(self) -> bool: 

519 """Return true if both ``strings`` and ``patterns`` are empty.""" 

520 # bool(Ellipsis) is True 

521 return not self.strings and not self.patterns 

522 

523 def __str__(self) -> str: 

524 if self.patterns is ...: 

525 return "..." 

526 else: 

527 terms = list(self.strings) 

528 terms.extend(str(p) for p in self.patterns) 

529 return "[{}]".format(", ".join(terms)) 

530 

531 

532@dataclasses.dataclass 

533class DatasetTypeWildcard: 

534 """A validated expression that resolves to one or more dataset types. 

535 

536 The `from_expression` method should almost always be used to construct 

537 instances, as the regular constructor performs no checking of inputs (and 

538 that can lead to confusing error messages downstream). 

539 """ 

540 

541 values: Mapping[str, DatasetType | None] = dataclasses.field(default_factory=dict) 

542 """A mapping with `str` dataset type name keys and optional `DatasetType` 

543 instances. 

544 """ 

545 

546 patterns: tuple[re.Pattern, ...] | EllipsisType = ... 

547 """Regular expressions to be matched against dataset type names, or the 

548 special value ``...`` indicating all dataset types. 

549 

550 Any pattern matching a dataset type is considered an overall match for 

551 the expression. 

552 """ 

553 

554 @classmethod 

555 def from_expression(cls, expression: Any) -> DatasetTypeWildcard: 

556 """Construct an instance by analyzing the given expression. 

557 

558 Parameters 

559 ---------- 

560 expression 

561 Expression to analyze. May be any of the following: 

562 

563 - a `str` dataset type name; 

564 - a `DatasetType` instance; 

565 - a `re.Pattern` to match against dataset type names; 

566 - an iterable whose elements may be any of the above (any dataset 

567 type matching any element in the list is an overall match); 

568 - an existing `DatasetTypeWildcard` instance; 

569 - the special ``...`` ellipsis object, which matches any dataset 

570 type. 

571 

572 Returns 

573 ------- 

574 query : `DatasetTypeWildcard` 

575 An instance of this class (new unless an existing instance was 

576 passed in). 

577 

578 Raises 

579 ------ 

580 DatasetTypeExpressionError 

581 Raised if the given expression does not have one of the allowed 

582 types. 

583 """ 

584 if isinstance(expression, cls): 

585 return expression 

586 try: 

587 wildcard = CategorizedWildcard.fromExpression( 

588 expression, coerceUnrecognized=lambda d: (d.name, d) 

589 ) 

590 except TypeError as err: 

591 raise DatasetTypeExpressionError(f"Invalid dataset type expression: {expression!r}.") from err 

592 if wildcard is ...: 

593 return cls() 

594 values: dict[str, DatasetType | None] = {} 

595 for name in wildcard.strings: 

596 values[name] = None 

597 for name, item in wildcard.items: 

598 if not isinstance(item, DatasetType): 

599 raise DatasetTypeExpressionError( 

600 f"Invalid value '{item}' of type {type(item)} in dataset type expression; " 

601 "expected str, re.Pattern, DatasetType objects, iterables thereof, or '...'." 

602 ) 

603 values[name] = item 

604 return cls(values, patterns=tuple(wildcard.patterns)) 

605 

606 def __str__(self) -> str: 

607 if self.patterns is ...: 

608 return "..." 

609 else: 

610 terms = list(self.values.keys()) 

611 terms.extend(str(p) for p in self.patterns) 

612 return "[{}]".format(", ".join(terms))