Coverage for python/lsst/daf/butler/registry/wildcards.py: 22%

212 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-12 10:56 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ( 

24 "CategorizedWildcard", 

25 "CollectionWildcard", 

26 "CollectionSearch", 

27 "DatasetTypeWildcard", 

28) 

29 

30import dataclasses 

31import re 

32from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence 

33from types import EllipsisType 

34from typing import Any 

35 

36from deprecated.sphinx import deprecated 

37from lsst.utils.iteration import ensure_iterable 

38 

39try: 

40 from pydantic.v1 import BaseModel 

41except ModuleNotFoundError: 

42 from pydantic import BaseModel # type: ignore 

43 

44from ..core import DatasetType 

45from ..core.utils import globToRegex 

46from ._exceptions import CollectionExpressionError, DatasetTypeExpressionError 

47 

48 

49@dataclasses.dataclass 

50class CategorizedWildcard: 

51 """The results of preprocessing a wildcard expression to separate match 

52 patterns from strings. 

53 

54 The `fromExpression` method should almost always be used to construct 

55 instances, as the regular constructor performs no checking of inputs (and 

56 that can lead to confusing error messages downstream). 

57 """ 

58 

59 @classmethod 

60 def fromExpression( 

61 cls, 

62 expression: Any, 

63 *, 

64 allowAny: bool = True, 

65 allowPatterns: bool = True, 

66 coerceUnrecognized: Callable[[Any], tuple[str, Any] | str] | None = None, 

67 coerceItemValue: Callable[[Any], Any] | None = None, 

68 defaultItemValue: Any | None = None, 

69 ) -> CategorizedWildcard | EllipsisType: 

70 """Categorize a wildcard expression. 

71 

72 Parameters 

73 ---------- 

74 expression 

75 The expression to categorize. May be any of: 

76 - `str` (including glob patterns if ``allowPatterns`` is `True`); 

77 - `re.Pattern` (only if ``allowPatterns`` is `True`); 

78 - objects recognized by ``coerceUnrecognized`` (if provided); 

79 - two-element tuples of (`str`, value) where value is recognized 

80 by ``coerceItemValue`` (if provided); 

81 - a non-`str`, non-mapping iterable containing any of the above; 

82 - the special value `...` (only if ``allowAny`` is `True`), which 

83 matches anything; 

84 - a mapping from `str` to a value are recognized by 

85 ``coerceItemValue`` (if provided); 

86 - a `CategorizedWildcard` instance (passed through unchanged if 

87 it meets the requirements specified by keyword arguments). 

88 allowAny: `bool`, optional 

89 If `False` (`True` is default) raise `TypeError` if `...` is 

90 encountered. 

91 allowPatterns: `bool`, optional 

92 If `False` (`True` is default) raise `TypeError` if a `re.Pattern` 

93 is encountered, or if ``expression`` is a `CategorizedWildcard` 

94 with `patterns` not empty. 

95 coerceUnrecognized: `~collections.abc.Callable`, optional 

96 A callback that takes a single argument of arbitrary type and 

97 returns either a `str` - appended to `strings` - or a `tuple` of 

98 (`str`, `Any`) to be appended to `items`. This will be called on 

99 objects of unrecognized type. Exceptions will be reraised as 

100 `TypeError` (and chained). 

101 coerceItemValue: `~collections.abc.Callable`, optional 

102 If provided, ``expression`` may be a mapping from `str` to any 

103 type that can be passed to this function; the result of that call 

104 will be stored instead as the value in ``self.items``. 

105 defaultItemValue: `Any`, optional 

106 If provided, combine this value with any string values encountered 

107 (including any returned by ``coerceUnrecognized``) to form a 

108 `tuple` and add it to `items`, guaranteeing that `strings` will be 

109 empty. Patterns are never added to `items`. 

110 

111 Returns 

112 ------- 

113 categorized : `CategorizedWildcard` or ``...``. 

114 The struct describing the wildcard. ``...`` is passed through 

115 unchanged. 

116 

117 Raises 

118 ------ 

119 TypeError 

120 Raised if an unsupported type is found in the expression. 

121 """ 

122 assert expression is not None 

123 # See if we were given ...; just return that if we were. 

124 if expression is ...: 

125 if not allowAny: 

126 raise TypeError("This expression may not be unconstrained.") 

127 return ... 

128 if isinstance(expression, cls): 

129 # This is already a CategorizedWildcard. Make sure it meets the 

130 # reqs. implied by the kwargs we got. 

131 if not allowPatterns and expression.patterns: 

132 raise TypeError( 

133 f"Regular expression(s) {expression.patterns} are not allowed in this context." 

134 ) 

135 if defaultItemValue is not None and expression.strings: 

136 if expression.items: 

137 raise TypeError( 

138 "Incompatible preprocessed expression: an ordered sequence of str is " 

139 "needed, but the original order was lost in the preprocessing." 

140 ) 

141 return cls( 

142 strings=[], 

143 patterns=expression.patterns, 

144 items=[(k, defaultItemValue) for k in expression.strings], 

145 ) 

146 elif defaultItemValue is None and expression.items: 

147 if expression.strings: 

148 raise TypeError( 

149 "Incompatible preprocessed expression: an ordered sequence of items is " 

150 "needed, but the original order was lost in the preprocessing." 

151 ) 

152 return cls(strings=[k for k, _ in expression.items], patterns=expression.patterns, items=[]) 

153 else: 

154 # Original expression was created with keyword arguments that 

155 # were at least as restrictive as what we just got; pass it 

156 # through. 

157 return expression 

158 

159 # If we get here, we know we'll be creating a new instance. 

160 # Initialize an empty one now. 

161 self = cls(strings=[], patterns=[], items=[]) 

162 

163 # If mappings are allowed, see if we were given a single mapping by 

164 # trying to get items. 

165 if coerceItemValue is not None: 

166 rawItems = None 

167 try: 

168 rawItems = expression.items() 

169 except AttributeError: 

170 pass 

171 if rawItems is not None: 

172 for k, v in rawItems: 

173 try: 

174 self.items.append((k, coerceItemValue(v))) 

175 except Exception as err: 

176 raise TypeError(f"Could not coerce mapping value '{v}' for key '{k}'.") from err 

177 return self 

178 

179 # Not ..., a CategorizedWildcard instance, or a mapping. Just 

180 # process scalars or an iterable. We put the body of the loop inside 

181 # a local function so we can recurse after coercion. 

182 

183 def process(element: Any, alreadyCoerced: bool = False) -> EllipsisType | None: 

184 if isinstance(element, str): 

185 if defaultItemValue is not None: 

186 self.items.append((element, defaultItemValue)) 

187 return None 

188 else: 

189 # This returns a list but we know we only passed in 

190 # single value. 

191 converted = globToRegex(element) 

192 if converted is ...: 

193 return ... 

194 element = converted[0] 

195 # Let regex and ... go through to the next check 

196 if isinstance(element, str): 

197 self.strings.append(element) 

198 return None 

199 if allowPatterns and isinstance(element, re.Pattern): 

200 self.patterns.append(element) 

201 return None 

202 if alreadyCoerced: 

203 try: 

204 k, v = element 

205 except TypeError: 

206 raise TypeError( 

207 f"Object '{element!r}' returned by coercion function must be `str` or `tuple`." 

208 ) from None 

209 else: 

210 self.items.append((k, v)) 

211 return None 

212 if coerceItemValue is not None: 

213 try: 

214 k, v = element 

215 except TypeError: 

216 pass 

217 else: 

218 if not isinstance(k, str): 

219 raise TypeError(f"Item key '{k}' is not a string.") 

220 try: 

221 v = coerceItemValue(v) 

222 except Exception as err: 

223 raise TypeError(f"Could not coerce tuple item value '{v}' for key '{k}'.") from err 

224 self.items.append((k, v)) 

225 return None 

226 if coerceUnrecognized is not None: 

227 try: 

228 # This should be safe but flake8 cant tell that the 

229 # function will be re-declared next function call 

230 process(coerceUnrecognized(element), alreadyCoerced=True) # noqa: F821 

231 except Exception as err: 

232 raise TypeError(f"Could not coerce expression element '{element!r}'.") from err 

233 else: 

234 extra = "." 

235 if isinstance(element, re.Pattern): 

236 extra = " and patterns are not allowed." 

237 raise TypeError(f"Unsupported object in wildcard expression: '{element!r}'{extra}") 

238 return None 

239 

240 for element in ensure_iterable(expression): 

241 retval = process(element) 

242 if retval is ...: 

243 # One of the globs matched everything 

244 if not allowAny: 

245 raise TypeError("This expression may not be unconstrained.") 

246 return ... 

247 del process 

248 return self 

249 

250 strings: list[str] 

251 """Explicit string values found in the wildcard (`list` [ `str` ]). 

252 """ 

253 

254 patterns: list[re.Pattern] 

255 """Regular expression patterns found in the wildcard 

256 (`list` [ `re.Pattern` ]). 

257 """ 

258 

259 items: list[tuple[str, Any]] 

260 """Two-item tuples that relate string values to other objects 

261 (`list` [ `tuple` [ `str`, `Any` ] ]). 

262 """ 

263 

264 

@deprecated(
    reason="Tuples of string collection names are now preferred. Will be removed after v26.",
    version="v25.0",
    category=FutureWarning,
)
class CollectionSearch(BaseModel, Sequence[str]):
    """An ordered search path of collections.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    collections : `tuple` [ `str` ]
        Tuple of collection names, ordered from the first searched to the last
        searched.

    Notes
    -----
    A `CollectionSearch` is used to find a single dataset (or set of datasets
    with different dataset types or data IDs) according to its dataset type and
    data ID, giving preference to collections in the order in which they are
    specified.  A `CollectionWildcard` can be constructed from a broader range
    of expressions but does not order the collections to be searched.

    `CollectionSearch` is an immutable sequence of `str` collection names.

    A `CollectionSearch` instance constructed properly (e.g. via
    `fromExpression`) is a unique representation of a particular search path;
    it is exactly the same internally and compares as equal to any
    `CollectionSearch` constructed from an equivalent expression, regardless of
    how different the original expressions appear.
    """

    __root__: tuple[str, ...]

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionSearch:
        """Process a general expression to construct a `CollectionSearch`
        instance.

        Parameters
        ----------
        expression
            May be:
             - a `str` collection name;
             - an iterable of `str` collection names;
             - another `CollectionSearch` instance (passed through
               unchanged).

            Duplicate entries will be removed (preserving the first appearance
            of each collection name).

        Returns
        -------
        collections : `CollectionSearch`
            A `CollectionSearch` instance.

        Raises
        ------
        CollectionExpressionError
            Raised if `CategorizedWildcard.fromExpression` rejects the
            expression with `TypeError` (patterns and ``...`` are not
            allowed here).
        """
        # First see if this is already a CollectionSearch; just pass that
        # through unchanged.  This lets us standardize expressions (and turn
        # single-pass iterators into multi-pass iterables) in advance and pass
        # them down to other routines that accept arbitrary expressions.
        if isinstance(expression, cls):
            return expression
        try:
            wildcard = CategorizedWildcard.fromExpression(
                expression,
                allowAny=False,
                allowPatterns=False,
            )
        except TypeError as err:
            raise CollectionExpressionError(str(err)) from None
        assert wildcard is not ...
        assert not wildcard.patterns
        assert not wildcard.items
        # dict preserves insertion order, so this keeps the first appearance
        # of each name while avoiding a linear membership scan per element.
        return cls(__root__=tuple(dict.fromkeys(wildcard.strings)))

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly."""
        yield from self.__root__

    def __iter__(self) -> Iterator[str]:  # type: ignore
        yield from self.__root__

    def __len__(self) -> int:
        return len(self.__root__)

    def __getitem__(self, index: Any) -> str:
        return self.__root__[index]

    def __eq__(self, other: Any) -> bool:
        # Only another CollectionSearch with the same ordered names is equal.
        if isinstance(other, CollectionSearch):
            return self.__root__ == other.__root__
        return False

    def __str__(self) -> str:
        return "[{}]".format(", ".join(self))

    def __repr__(self) -> str:
        return f"CollectionSearch({self.__root__!r})"

370 

371 

372@dataclasses.dataclass(frozen=True) 

373class CollectionWildcard: 

374 """A validated wildcard for collection names. 

375 

376 The `from_expression` method should almost always be used to construct 

377 instances, as the regular constructor performs no checking of inputs (and 

378 that can lead to confusing error messages downstream). 

379 

380 Notes 

381 ----- 

382 `CollectionWildcard` is expected to be rarely used outside of `Registry` 

383 (which uses it to back several of its "query" methods that take general 

384 expressions for collections), but it may occasionally be useful outside 

385 `Registry` as a way to preprocess expressions that contain single-pass 

386 iterators into a form that can be used to call those `Registry` methods 

387 multiple times. 

388 """ 

389 

390 strings: tuple[str, ...] = () 

391 """An an ordered list of explicitly-named collections. (`tuple` [ `str` ]). 

392 """ 

393 

394 patterns: tuple[re.Pattern, ...] | EllipsisType = ... 

395 """Regular expression patterns to match against collection names, or the 

396 special value ``...`` indicating all collections. 

397 

398 `...` must be accompanied by ``strings=()``. 

399 """ 

400 

401 def __post_init__(self) -> None: 

402 if self.patterns is ... and self.strings: 

403 raise ValueError( 

404 f"Collection wildcard matches any string, but still has explicit strings {self.strings}." 

405 ) 

406 

407 @classmethod 

408 def from_expression(cls, expression: Any, require_ordered: bool = False) -> CollectionWildcard: 

409 """Process a general expression to construct a `CollectionWildcard` 

410 instance. 

411 

412 Parameters 

413 ---------- 

414 expression 

415 May be: 

416 - a `str` collection name; 

417 - an `re.Pattern` instance to match (with `re.Pattern.fullmatch`) 

418 against collection names; 

419 - any iterable containing any of the above; 

420 - another `CollectionWildcard` instance (passed through 

421 unchanged). 

422 

423 Duplicate collection names will be removed (preserving the first 

424 appearance of each collection name). 

425 require_ordered : `bool`, optional 

426 If `True` (`False` is default) require the expression to be 

427 ordered, and raise `CollectionExpressionError` if it is not. 

428 

429 Returns 

430 ------- 

431 wildcard : `CollectionWildcard` 

432 A `CollectionWildcard` instance. 

433 

434 Raises 

435 ------ 

436 CollectionExpressionError 

437 Raised if the patterns has regular expression, glob patterns, or 

438 the ``...`` wildcard, and ``require_ordered=True``. 

439 """ 

440 if isinstance(expression, cls): 

441 return expression 

442 if expression is ...: 

443 return cls() 

444 wildcard = CategorizedWildcard.fromExpression( 

445 expression, 

446 allowAny=True, 

447 allowPatterns=True, 

448 ) 

449 if wildcard is ...: 

450 return cls() 

451 result = cls( 

452 strings=tuple(wildcard.strings), 

453 patterns=tuple(wildcard.patterns), 

454 ) 

455 if require_ordered: 

456 result.require_ordered() 

457 return result 

458 

459 @classmethod 

460 def from_names(cls, names: Iterable[str]) -> CollectionWildcard: 

461 """Construct from an iterable of explicit collection names. 

462 

463 Parameters 

464 ---------- 

465 names : `~collections.abc.Iterable` [ `str` ] 

466 Iterable of collection names. 

467 

468 Returns 

469 ------- 

470 wildcard : ~CollectionWildcard` 

471 A `CollectionWildcard` instance. `require_ordered` is guaranteed 

472 to succeed and return the given names in order. 

473 """ 

474 return cls(strings=tuple(names), patterns=()) 

475 

476 def require_ordered(self) -> tuple[str, ...]: 

477 """Require that this wildcard contains no patterns, and return the 

478 ordered tuple of names that it does hold. 

479 

480 Returns 

481 ------- 

482 names : `tuple` [ `str` ] 

483 Ordered tuple of collection names. 

484 

485 Raises 

486 ------ 

487 CollectionExpressionError 

488 Raised if the patterns has regular expression, glob patterns, or 

489 the ``...`` wildcard. 

490 """ 

491 if self.patterns: 

492 raise CollectionExpressionError( 

493 f"An ordered collection expression is required; got patterns {self.patterns}." 

494 ) 

495 return self.strings 

496 

497 def empty(self) -> bool: 

498 """Return true if both ``strings`` and ``patterns`` are empty.""" 

499 # bool(Ellipsis) is True 

500 return not self.strings and not self.patterns 

501 

502 def __str__(self) -> str: 

503 if self.patterns is ...: 

504 return "..." 

505 else: 

506 terms = list(self.strings) 

507 terms.extend(str(p) for p in self.patterns) 

508 return "[{}]".format(", ".join(terms)) 

509 

510 

511@dataclasses.dataclass 

512class DatasetTypeWildcard: 

513 """A validated expression that resolves to one or more dataset types. 

514 

515 The `from_expression` method should almost always be used to construct 

516 instances, as the regular constructor performs no checking of inputs (and 

517 that can lead to confusing error messages downstream). 

518 """ 

519 

520 values: Mapping[str, DatasetType | None] = dataclasses.field(default_factory=dict) 

521 """A mapping with `str` dataset type name keys and optional `DatasetType` 

522 instances. 

523 """ 

524 

525 patterns: tuple[re.Pattern, ...] | EllipsisType = ... 

526 """Regular expressions to be matched against dataset type names, or the 

527 special value ``...`` indicating all dataset types. 

528 

529 Any pattern matching a dataset type is considered an overall match for 

530 the expression. 

531 """ 

532 

533 @classmethod 

534 def from_expression(cls, expression: Any) -> DatasetTypeWildcard: 

535 """Construct an instance by analyzing the given expression. 

536 

537 Parameters 

538 ---------- 

539 expression 

540 Expression to analyze. May be any of the following: 

541 

542 - a `str` dataset type name; 

543 - a `DatasetType` instance; 

544 - a `re.Pattern` to match against dataset type names; 

545 - an iterable whose elements may be any of the above (any dataset 

546 type matching any element in the list is an overall match); 

547 - an existing `DatasetTypeWildcard` instance; 

548 - the special ``...`` ellipsis object, which matches any dataset 

549 type. 

550 

551 Returns 

552 ------- 

553 query : `DatasetTypeWildcard` 

554 An instance of this class (new unless an existing instance was 

555 passed in). 

556 

557 Raises 

558 ------ 

559 DatasetTypeExpressionError 

560 Raised if the given expression does not have one of the allowed 

561 types. 

562 """ 

563 if isinstance(expression, cls): 

564 return expression 

565 try: 

566 wildcard = CategorizedWildcard.fromExpression( 

567 expression, coerceUnrecognized=lambda d: (d.name, d) 

568 ) 

569 except TypeError as err: 

570 raise DatasetTypeExpressionError(f"Invalid dataset type expression: {expression!r}.") from err 

571 if wildcard is ...: 

572 return cls() 

573 values: dict[str, DatasetType | None] = {} 

574 for name in wildcard.strings: 

575 values[name] = None 

576 for name, item in wildcard.items: 

577 if not isinstance(item, DatasetType): 

578 raise DatasetTypeExpressionError( 

579 f"Invalid value '{item}' of type {type(item)} in dataset type expression; " 

580 "expected str, re.Pattern, DatasetType objects, iterables thereof, or '...'." 

581 ) 

582 values[name] = item 

583 return cls(values, patterns=tuple(wildcard.patterns)) 

584 

585 def __str__(self) -> str: 

586 if self.patterns is ...: 

587 return "..." 

588 else: 

589 terms = list(self.values.keys()) 

590 terms.extend(str(p) for p in self.patterns) 

591 return "[{}]".format(", ".join(terms))