Coverage for python/lsst/daf/butler/registry/wildcards.py: 21%

207 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-03-25 02:06 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ( 

24 "CategorizedWildcard", 

25 "CollectionWildcard", 

26 "CollectionSearch", 

27 "DatasetTypeWildcard", 

28) 

29 

30import dataclasses 

31import re 

32from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence 

33from typing import Any 

34 

35from deprecated.sphinx import deprecated 

36from lsst.utils.ellipsis import Ellipsis, EllipsisType 

37from lsst.utils.iteration import ensure_iterable 

38from pydantic import BaseModel 

39 

40from ..core import DatasetType 

41from ..core.utils import globToRegex 

42from ._exceptions import CollectionExpressionError, DatasetTypeExpressionError 

43 

44 

45@dataclasses.dataclass 

46class CategorizedWildcard: 

47 """The results of preprocessing a wildcard expression to separate match 

48 patterns from strings. 

49 

50 The `fromExpression` method should almost always be used to construct 

51 instances, as the regular constructor performs no checking of inputs (and 

52 that can lead to confusing error messages downstream). 

53 """ 

54 

55 @classmethod 

56 def fromExpression( 

57 cls, 

58 expression: Any, 

59 *, 

60 allowAny: bool = True, 

61 allowPatterns: bool = True, 

62 coerceUnrecognized: Callable[[Any], tuple[str, Any] | str] | None = None, 

63 coerceItemValue: Callable[[Any], Any] | None = None, 

64 defaultItemValue: Any | None = None, 

65 ) -> CategorizedWildcard | EllipsisType: 

66 """Categorize a wildcard expression. 

67 

68 Parameters 

69 ---------- 

70 expression 

71 The expression to categorize. May be any of: 

72 - `str` (including glob patterns if ``allowPatterns`` is `True`); 

73 - `re.Pattern` (only if ``allowPatterns`` is `True`); 

74 - objects recognized by ``coerceUnrecognized`` (if provided); 

75 - two-element tuples of (`str`, value) where value is recognized 

76 by ``coerceItemValue`` (if provided); 

77 - a non-`str`, non-mapping iterable containing any of the above; 

78 - the special value `...` (only if ``allowAny`` is `True`), which 

79 matches anything; 

80 - a mapping from `str` to a value are recognized by 

81 ``coerceItemValue`` (if provided); 

82 - a `CategorizedWildcard` instance (passed through unchanged if 

83 it meets the requirements specified by keyword arguments). 

84 allowAny: `bool`, optional 

85 If `False` (`True` is default) raise `TypeError` if `...` is 

86 encountered. 

87 allowPatterns: `bool`, optional 

88 If `False` (`True` is default) raise `TypeError` if a `re.Pattern` 

89 is encountered, or if ``expression`` is a `CategorizedWildcard` 

90 with `patterns` not empty. 

91 coerceUnrecognized: `Callable`, optional 

92 A callback that takes a single argument of arbitrary type and 

93 returns either a `str` - appended to `strings` - or a `tuple` of 

94 (`str`, `Any`) to be appended to `items`. This will be called on 

95 objects of unrecognized type. Exceptions will be reraised as 

96 `TypeError` (and chained). 

97 coerceItemValue: `Callable`, optional 

98 If provided, ``expression`` may be a mapping from `str` to any 

99 type that can be passed to this function; the result of that call 

100 will be stored instead as the value in ``self.items``. 

101 defaultItemValue: `Any`, optional 

102 If provided, combine this value with any string values encountered 

103 (including any returned by ``coerceUnrecognized``) to form a 

104 `tuple` and add it to `items`, guaranteeing that `strings` will be 

105 empty. Patterns are never added to `items`. 

106 

107 Returns 

108 ------- 

109 categorized : `CategorizedWildcard` or ``...``. 

110 The struct describing the wildcard. ``...`` is passed through 

111 unchanged. 

112 

113 Raises 

114 ------ 

115 TypeError 

116 Raised if an unsupported type is found in the expression. 

117 """ 

118 assert expression is not None 

119 # See if we were given ...; just return that if we were. 

120 if expression is Ellipsis: 

121 if not allowAny: 

122 raise TypeError("This expression may not be unconstrained.") 

123 return Ellipsis 

124 if isinstance(expression, cls): 

125 # This is already a CategorizedWildcard. Make sure it meets the 

126 # reqs. implied by the kwargs we got. 

127 if not allowPatterns and expression.patterns: 

128 raise TypeError( 

129 f"Regular expression(s) {expression.patterns} are not allowed in this context." 

130 ) 

131 if defaultItemValue is not None and expression.strings: 

132 if expression.items: 

133 raise TypeError( 

134 "Incompatible preprocessed expression: an ordered sequence of str is " 

135 "needed, but the original order was lost in the preprocessing." 

136 ) 

137 return cls( 

138 strings=[], 

139 patterns=expression.patterns, 

140 items=[(k, defaultItemValue) for k in expression.strings], 

141 ) 

142 elif defaultItemValue is None and expression.items: 

143 if expression.strings: 

144 raise TypeError( 

145 "Incompatible preprocessed expression: an ordered sequence of items is " 

146 "needed, but the original order was lost in the preprocessing." 

147 ) 

148 return cls(strings=[k for k, _ in expression.items], patterns=expression.patterns, items=[]) 

149 else: 

150 # Original expression was created with keyword arguments that 

151 # were at least as restrictive as what we just got; pass it 

152 # through. 

153 return expression 

154 

155 # If we get here, we know we'll be creating a new instance. 

156 # Initialize an empty one now. 

157 self = cls(strings=[], patterns=[], items=[]) 

158 

159 # If mappings are allowed, see if we were given a single mapping by 

160 # trying to get items. 

161 if coerceItemValue is not None: 

162 rawItems = None 

163 try: 

164 rawItems = expression.items() 

165 except AttributeError: 

166 pass 

167 if rawItems is not None: 

168 for k, v in rawItems: 

169 try: 

170 self.items.append((k, coerceItemValue(v))) 

171 except Exception as err: 

172 raise TypeError(f"Could not coerce mapping value '{v}' for key '{k}'.") from err 

173 return self 

174 

175 # Not ..., a CategorizedWildcard instance, or a mapping. Just 

176 # process scalars or an iterable. We put the body of the loop inside 

177 # a local function so we can recurse after coercion. 

178 

179 def process(element: Any, alreadyCoerced: bool = False) -> EllipsisType | None: 

180 if isinstance(element, str): 

181 if defaultItemValue is not None: 

182 self.items.append((element, defaultItemValue)) 

183 return None 

184 else: 

185 # This returns a list but we know we only passed in 

186 # single value. 

187 converted = globToRegex(element) 

188 if converted is Ellipsis: 

189 return Ellipsis 

190 element = converted[0] 

191 # Let regex and ... go through to the next check 

192 if isinstance(element, str): 

193 self.strings.append(element) 

194 return None 

195 if allowPatterns and isinstance(element, re.Pattern): 

196 self.patterns.append(element) 

197 return None 

198 if alreadyCoerced: 

199 try: 

200 k, v = element 

201 except TypeError: 

202 raise TypeError( 

203 f"Object '{element!r}' returned by coercion function must be `str` or `tuple`." 

204 ) from None 

205 else: 

206 self.items.append((k, v)) 

207 return None 

208 if coerceItemValue is not None: 

209 try: 

210 k, v = element 

211 except TypeError: 

212 pass 

213 else: 

214 if not isinstance(k, str): 

215 raise TypeError(f"Item key '{k}' is not a string.") 

216 try: 

217 v = coerceItemValue(v) 

218 except Exception as err: 

219 raise TypeError(f"Could not coerce tuple item value '{v}' for key '{k}'.") from err 

220 self.items.append((k, v)) 

221 return None 

222 if coerceUnrecognized is not None: 

223 try: 

224 # This should be safe but flake8 cant tell that the 

225 # function will be re-declared next function call 

226 process(coerceUnrecognized(element), alreadyCoerced=True) # noqa: F821 

227 except Exception as err: 

228 raise TypeError(f"Could not coerce expression element '{element!r}'.") from err 

229 else: 

230 extra = "." 

231 if isinstance(element, re.Pattern): 

232 extra = " and patterns are not allowed." 

233 raise TypeError(f"Unsupported object in wildcard expression: '{element!r}'{extra}") 

234 return None 

235 

236 for element in ensure_iterable(expression): 

237 retval = process(element) 

238 if retval is Ellipsis: 

239 # One of the globs matched everything 

240 if not allowAny: 

241 raise TypeError("This expression may not be unconstrained.") 

242 return Ellipsis 

243 del process 

244 return self 

245 

246 strings: list[str] 

247 """Explicit string values found in the wildcard (`list` [ `str` ]). 

248 """ 

249 

250 patterns: list[re.Pattern] 

251 """Regular expression patterns found in the wildcard 

252 (`list` [ `re.Pattern` ]). 

253 """ 

254 

255 items: list[tuple[str, Any]] 

256 """Two-item tuples that relate string values to other objects 

257 (`list` [ `tuple` [ `str`, `Any` ] ]). 

258 """ 

259 

260 

@deprecated(
    reason="Tuples of string collection names are now preferred. Will be removed after v26.",
    version="v25.0",
    category=FutureWarning,
)
class CollectionSearch(BaseModel, Sequence[str]):
    """An ordered search path of collections.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    collections : `tuple` [ `str` ]
        Tuple of collection names, ordered from the first searched to the last
        searched.

    Notes
    -----
    A `CollectionSearch` is used to find a single dataset (or set of datasets
    with different dataset types or data IDs) according to its dataset type and
    data ID, giving preference to collections in the order in which they are
    specified.  A `CollectionWildcard` can be constructed from a broader range
    of expressions but does not order the collections to be searched.

    `CollectionSearch` is an immutable sequence of `str` collection names.

    A `CollectionSearch` instance constructed properly (e.g. via
    `fromExpression`) is a unique representation of a particular search path;
    it is exactly the same internally and compares as equal to any
    `CollectionSearch` constructed from an equivalent expression, regardless of
    how different the original expressions appear.
    """

    __root__: tuple[str, ...]

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionSearch:
        """Process a general expression to construct a `CollectionSearch`
        instance.

        Parameters
        ----------
        expression
            May be:

            - a `str` collection name;
            - an iterable of `str` collection names;
            - another `CollectionSearch` instance (passed through
              unchanged).

            Duplicate entries will be removed (preserving the first appearance
            of each collection name).

        Returns
        -------
        collections : `CollectionSearch`
            A `CollectionSearch` instance.

        Raises
        ------
        CollectionExpressionError
            Raised if `CategorizedWildcard.fromExpression` rejects the
            expression with `TypeError` (patterns and ``...`` are not allowed
            here).
        """
        # First see if this is already a CollectionSearch; just pass that
        # through unchanged.  This lets us standardize expressions (and turn
        # single-pass iterators into multi-pass iterables) in advance and pass
        # them down to other routines that accept arbitrary expressions.
        if isinstance(expression, cls):
            return expression
        try:
            wildcard = CategorizedWildcard.fromExpression(
                expression,
                allowAny=False,
                allowPatterns=False,
            )
        except TypeError as err:
            raise CollectionExpressionError(str(err)) from None
        assert wildcard is not Ellipsis
        assert not wildcard.patterns
        assert not wildcard.items
        # dict preserves insertion order, so this drops duplicates while
        # keeping the first appearance of each name, in linear time.
        return cls(__root__=tuple(dict.fromkeys(wildcard.strings)))

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly."""
        yield from self.__root__

    def __iter__(self) -> Iterator[str]:  # type: ignore
        yield from self.__root__

    def __len__(self) -> int:
        return len(self.__root__)

    def __getitem__(self, index: Any) -> str:
        return self.__root__[index]

    def __eq__(self, other: Any) -> bool:
        # Only another CollectionSearch with the same names in the same order
        # compares equal.
        if isinstance(other, CollectionSearch):
            return self.__root__ == other.__root__
        return False

    def __str__(self) -> str:
        return "[{}]".format(", ".join(self))

    def __repr__(self) -> str:
        return f"CollectionSearch({self.__root__!r})"

366 

367 

368@dataclasses.dataclass(frozen=True) 

369class CollectionWildcard: 

370 """A validated wildcard for collection names 

371 

372 The `from_expression` method should almost always be used to construct 

373 instances, as the regular constructor performs no checking of inputs (and 

374 that can lead to confusing error messages downstream). 

375 

376 Notes 

377 ----- 

378 `CollectionWildcard` is expected to be rarely used outside of `Registry` 

379 (which uses it to back several of its "query" methods that take general 

380 expressions for collections), but it may occasionally be useful outside 

381 `Registry` as a way to preprocess expressions that contain single-pass 

382 iterators into a form that can be used to call those `Registry` methods 

383 multiple times. 

384 """ 

385 

386 strings: tuple[str, ...] = () 

387 """An an ordered list of explicitly-named collections. (`tuple` [ `str` ]). 

388 """ 

389 

390 patterns: tuple[re.Pattern, ...] | EllipsisType = Ellipsis 

391 """Regular expression patterns to match against collection names, or the 

392 special value ``...`` indicating all collections. 

393 

394 `...` must be accompanied by ``strings=()``. 

395 """ 

396 

397 def __post_init__(self) -> None: 

398 if self.patterns is Ellipsis and self.strings: 

399 raise ValueError( 

400 f"Collection wildcard matches any string, but still has explicit strings {self.strings}." 

401 ) 

402 

403 @classmethod 

404 def from_expression(cls, expression: Any, require_ordered: bool = False) -> CollectionWildcard: 

405 """Process a general expression to construct a `CollectionWildcard` 

406 instance. 

407 

408 Parameters 

409 ---------- 

410 expression 

411 May be: 

412 - a `str` collection name; 

413 - an `re.Pattern` instance to match (with `re.Pattern.fullmatch`) 

414 against collection names; 

415 - any iterable containing any of the above; 

416 - another `CollectionWildcard` instance (passed through 

417 unchanged). 

418 

419 Duplicate collection names will be removed (preserving the first 

420 appearance of each collection name). 

421 require_ordered : `bool`, optional 

422 If `True` (`False` is default) require the expression to be 

423 ordered, and raise `CollectionExpressionError` if it is not. 

424 

425 Returns 

426 ------- 

427 wildcard : `CollectionWildcard` 

428 A `CollectionWildcard` instance. 

429 

430 Raises 

431 ------ 

432 CollectionExpressionError 

433 Raised if the patterns has regular expression, glob patterns, or 

434 the ``...`` wildcard, and ``require_ordered=True``. 

435 """ 

436 if isinstance(expression, cls): 

437 return expression 

438 if expression is Ellipsis: 

439 return cls() 

440 wildcard = CategorizedWildcard.fromExpression( 

441 expression, 

442 allowAny=True, 

443 allowPatterns=True, 

444 ) 

445 if wildcard is Ellipsis: 

446 return cls() 

447 result = cls( 

448 strings=tuple(wildcard.strings), 

449 patterns=tuple(wildcard.patterns), 

450 ) 

451 if require_ordered: 

452 result.require_ordered() 

453 return result 

454 

455 @classmethod 

456 def from_names(cls, names: Iterable[str]) -> CollectionWildcard: 

457 """Construct from an iterable of explicit collection names. 

458 

459 Parameters 

460 ---------- 

461 names : `Iterable` [ `str` ] 

462 Iterable of collection names. 

463 

464 Returns 

465 ------- 

466 wildcard : ~CollectionWildcard` 

467 A `CollectionWildcard` instance. `require_ordered` is guaranteed 

468 to succeed and return the given names in order. 

469 """ 

470 return cls(strings=tuple(names), patterns=()) 

471 

472 def require_ordered(self) -> tuple[str, ...]: 

473 """Require that this wildcard contains no patterns, and return the 

474 ordered tuple of names that it does hold. 

475 

476 Returns 

477 ------- 

478 names : `tuple` [ `str` ] 

479 Ordered tuple of collection names. 

480 

481 Raises 

482 ------ 

483 CollectionExpressionError 

484 Raised if the patterns has regular expression, glob patterns, or 

485 the ``...`` wildcard. 

486 """ 

487 if self.patterns: 

488 raise CollectionExpressionError( 

489 f"An ordered collection expression is required; got patterns {self.patterns}." 

490 ) 

491 return self.strings 

492 

493 def __str__(self) -> str: 

494 if self.patterns is Ellipsis: 

495 return "..." 

496 else: 

497 terms = list(self.strings) 

498 terms.extend(str(p) for p in self.patterns) 

499 return "[{}]".format(", ".join(terms)) 

500 

501 

502@dataclasses.dataclass 

503class DatasetTypeWildcard: 

504 """A validated expression that resolves to one or more dataset types. 

505 

506 The `from_expression` method should almost always be used to construct 

507 instances, as the regular constructor performs no checking of inputs (and 

508 that can lead to confusing error messages downstream). 

509 """ 

510 

511 values: Mapping[str, DatasetType | None] = dataclasses.field(default_factory=dict) 

512 """A mapping with `str` dataset type name keys and optional `DatasetType` 

513 instances. 

514 """ 

515 

516 patterns: tuple[re.Pattern, ...] | EllipsisType = Ellipsis 

517 """Regular expressions to be matched against dataset type names, or the 

518 special value ``...`` indicating all dataset types. 

519 

520 Any pattern matching a dataset type is considered an overall match for 

521 the expression. 

522 """ 

523 

524 @classmethod 

525 def from_expression(cls, expression: Any) -> DatasetTypeWildcard: 

526 """Construct an instance by analyzing the given expression. 

527 

528 Parameters 

529 ---------- 

530 expression 

531 Expression to analyze. May be any of the following: 

532 

533 - a `str` dataset type name; 

534 - a `DatasetType` instance; 

535 - a `re.Pattern` to match against dataset type names; 

536 - an iterable whose elements may be any of the above (any dataset 

537 type matching any element in the list is an overall match); 

538 - an existing `DatasetTypeWildcard` instance; 

539 - the special ``...`` ellipsis object, which matches any dataset 

540 type. 

541 

542 Returns 

543 ------- 

544 query : `DatasetTypeWildcard` 

545 An instance of this class (new unless an existing instance was 

546 passed in). 

547 

548 Raises 

549 ------ 

550 DatasetTypeExpressionError 

551 Raised if the given expression does not have one of the allowed 

552 types. 

553 """ 

554 if isinstance(expression, cls): 

555 return expression 

556 try: 

557 wildcard = CategorizedWildcard.fromExpression( 

558 expression, coerceUnrecognized=lambda d: (d.name, d) 

559 ) 

560 except TypeError as err: 

561 raise DatasetTypeExpressionError(f"Invalid dataset type expression: {expression!r}.") from err 

562 if wildcard is Ellipsis: 

563 return cls() 

564 values: dict[str, DatasetType | None] = {} 

565 for name in wildcard.strings: 

566 values[name] = None 

567 for name, item in wildcard.items: 

568 if not isinstance(item, DatasetType): 

569 raise DatasetTypeExpressionError( 

570 f"Invalid value '{item}' of type {type(item)} in dataset type expression; " 

571 "expected str, re.Pattern, DatasetType objects, iterables thereof, or '...'." 

572 ) 

573 values[name] = item 

574 return cls(values, patterns=tuple(wildcard.patterns)) 

575 

576 def __str__(self) -> str: 

577 if self.patterns is Ellipsis: 

578 return "..." 

579 else: 

580 terms = list(self.values.keys()) 

581 terms.extend(str(p) for p in self.patterns) 

582 return "[{}]".format(", ".join(terms))