Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ( 

24 "CategorizedWildcard", 

25 "CollectionQuery", 

26 "CollectionSearch", 

27) 

28 

29from dataclasses import dataclass 

30import re 

31from typing import ( 

32 AbstractSet, 

33 Any, 

34 Callable, 

35 Iterator, 

36 List, 

37 Optional, 

38 Sequence, 

39 Set, 

40 Tuple, 

41 TYPE_CHECKING, 

42 Union, 

43) 

44 

45import sqlalchemy 

46 

47from ..core import DatasetType 

48from ..core.utils import iterable 

49from ._collectionType import CollectionType 

50 

if TYPE_CHECKING:
    from .interfaces import CollectionManager, CollectionRecord

    # Python does not expose a type for `...`, so for static analysis we
    # substitute a single-member enum; the trick is borrowed from
    # https://github.com/python/typing/issues/684#issuecomment-548203158
    # For it to work we must spell the sentinel value `Ellipsis` (not `...`)
    # internally, and tell MyPy to ignore conversions from `...` to
    # `Ellipsis` at the public-interface boundary.
    #
    # Related code that needs `Ellipsis` and `EllipsisType` should import
    # them directly from this module; hopefully that stays confined to
    # `lsst.daf.butler.registry`.  Putting these in __all__ is bad for
    # Sphinx, and probably more confusing than helpful overall.
    from enum import Enum

    class EllipsisType(Enum):
        Ellipsis = "..."

    Ellipsis = EllipsisType.Ellipsis

else:
    # At runtime these are simply module-level aliases for the real thing.
    EllipsisType = type(...)
    Ellipsis = ...

74 

75 

@dataclass
class CategorizedWildcard:
    """The results of preprocessing a wildcard expression to separate match
    patterns from strings.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).
    """

    strings: List[str]
    """Explicit string values found in the wildcard (`list` [ `str` ]).
    """

    patterns: List[re.Pattern]
    """Regular expression patterns found in the wildcard
    (`list` [ `re.Pattern` ]).
    """

    items: List[Tuple[str, Any]]
    """Two-item tuples that relate string values to other objects
    (`list` [ `tuple` [ `str`, `Any` ] ]).
    """

    @classmethod
    def fromExpression(cls, expression: Any, *,
                       allowAny: bool = True,
                       allowPatterns: bool = True,
                       coerceUnrecognized: Optional[Callable[[Any], Union[Tuple[str, Any], str]]] = None,
                       coerceItemValue: Optional[Callable[[Any], Any]] = None,
                       defaultItemValue: Optional[Any] = None,
                       ) -> Union[CategorizedWildcard, EllipsisType]:
        """Categorize a wildcard expression.

        Parameters
        ----------
        expression
            The expression to categorize.  Must not be `None`.  May be any of:
             - `str`;
             - `re.Pattern` (only if ``allowPatterns`` is `True`);
             - objects recognized by ``coerceUnrecognized`` (if provided);
             - two-element tuples of (`str`, value) where value is recognized
               by ``coerceItemValue`` (if provided);
             - a non-`str`, non-mapping iterable containing any of the above;
             - the special value `...` (only if ``allowAny`` is `True`), which
               matches anything;
             - a mapping from `str` to a value recognized by
               ``coerceItemValue`` (if provided);
             - a `CategorizedWildcard` instance (passed through unchanged if
               it meets the requirements specified by keyword arguments).
        allowAny: `bool`, optional
            If `False` (`True` is default) raise `TypeError` if `...` is
            encountered.
        allowPatterns: `bool`, optional
            If `False` (`True` is default) raise `TypeError` if a `re.Pattern`
            is encountered, or if ``expression`` is a `CategorizedWildcard`
            with `patterns` not empty.
        coerceUnrecognized: `Callable`, optional
            A callback that takes a single argument of arbitrary type and
            returns either a `str` - appended to `strings` (or to `items` if
            ``defaultItemValue`` is provided) - or a `tuple` of
            (`str`, `Any`) to be appended to `items`.  This will be called on
            objects of unrecognized type.  A `tuple` result is only accepted
            when ``coerceItemValue`` is also provided, and its value is
            stored as returned (it is not passed through
            ``coerceItemValue``).  Exceptions will be reraised as `TypeError`
            (and chained).
        coerceItemValue: `Callable`, optional
            If provided, ``expression`` may be a mapping from `str` to any
            type that can be passed to this function; the result of that call
            will be stored instead as the value in ``self.items``.
        defaultItemValue: `Any`, optional
            If provided, combine this value with any string values encountered
            (including any returned by ``coerceUnrecognized``) to form a
            `tuple` and add it to `items`, guaranteeing that `strings` will be
            empty.  Patterns are never added to `items`.

        Returns
        -------
        categorized : `CategorizedWildcard` or ``...``.
            The struct describing the wildcard.  ``...`` is passed through
            unchanged.

        Raises
        ------
        TypeError
            Raised if an unsupported type is found in the expression, if a
            mapping or item key is not a `str`, or if a coercion callback
            fails.
        """
        assert expression is not None
        # See if we were given ...; just return that if we were.
        if expression is Ellipsis:
            if not allowAny:
                raise TypeError("This expression may not be unconstrained.")
            return Ellipsis
        if isinstance(expression, cls):
            # This is already a CategorizedWildcard.  Make sure it meets the
            # reqs. implied by the kwargs we got.
            if not allowPatterns and expression.patterns:
                raise TypeError(f"Regular expression(s) {expression.patterns} "
                                f"are not allowed in this context.")
            if defaultItemValue is not None and expression.strings:
                if expression.items:
                    raise TypeError("Incompatible preprocessed expression: an ordered sequence of str is "
                                    "needed, but the original order was lost in the preprocessing.")
                return cls(strings=[], patterns=expression.patterns,
                           items=[(k, defaultItemValue) for k in expression.strings])
            elif defaultItemValue is None and expression.items:
                if expression.strings:
                    raise TypeError("Incompatible preprocessed expression: an ordered sequence of items is "
                                    "needed, but the original order was lost in the preprocessing.")
                return cls(strings=[k for k, _ in expression.items], patterns=expression.patterns, items=[])
            else:
                # Original expression was created with keyword arguments that
                # were at least as restrictive as what we just got; pass it
                # through.
                return expression

        # If we get here, we know we'll be creating a new instance.
        # Initialize an empty one now.
        self = cls(strings=[], patterns=[], items=[])

        # If mappings are allowed, see if we were given a single mapping by
        # trying to get items.
        if coerceItemValue is not None:
            try:
                rawItems = expression.items()
            except AttributeError:
                rawItems = None
            if rawItems is not None:
                for k, v in rawItems:
                    # Hold mapping keys to the same standard as the keys of
                    # (str, value) tuples handled in `process` below.
                    if not isinstance(k, str):
                        raise TypeError(f"Mapping key '{k}' is not a string.")
                    try:
                        self.items.append((k, coerceItemValue(v)))
                    except Exception as err:
                        raise TypeError(f"Could not coerce mapping value '{v}' for key '{k}'.") from err
                return self

        # Not ..., a CategorizedWildcard instance, or a mapping.  Just
        # process scalars or an iterable.  We put the body of the loop inside
        # a local function so we can recurse after coercion.

        def process(element: Any, alreadyCoerced: bool = False) -> None:
            if isinstance(element, str):
                if defaultItemValue is not None:
                    self.items.append((element, defaultItemValue))
                else:
                    self.strings.append(element)
                return
            if allowPatterns and isinstance(element, re.Pattern):
                self.patterns.append(element)
                return
            if coerceItemValue is not None:
                try:
                    k, v = element
                except (TypeError, ValueError):
                    # TypeError: not iterable at all.  ValueError: iterable,
                    # but not exactly two elements.  Either way this is not
                    # a (key, value) pair, so fall through to the generic
                    # handling below instead of leaking the raw exception.
                    pass
                else:
                    if not alreadyCoerced:
                        if not isinstance(k, str):
                            raise TypeError(f"Item key '{k}' is not a string.")
                        try:
                            v = coerceItemValue(v)
                        except Exception as err:
                            raise TypeError(f"Could not coerce tuple item value '{v}' for key '{k}'."
                                            ) from err
                    self.items.append((k, v))
                    return
            if alreadyCoerced:
                raise TypeError(f"Object '{element}' returned by coercion function is still unrecognized.")
            if coerceUnrecognized is not None:
                try:
                    process(coerceUnrecognized(element), alreadyCoerced=True)
                except Exception as err:
                    raise TypeError(f"Could not coerce expression element '{element}'.") from err
            else:
                raise TypeError(f"Unsupported object in wildcard expression: '{element}'.")

        for element in iterable(expression):
            process(element)
        return self

    def makeWhereExpression(self, column: sqlalchemy.sql.ColumnElement
                            ) -> Optional[sqlalchemy.sql.ColumnElement]:
        """Transform the wildcard into a SQLAlchemy boolean expression suitable
        for use in a WHERE clause.

        Parameters
        ----------
        column : `sqlalchemy.sql.ColumnElement`
            A string column in a table or query that should be compared to the
            wildcard expression.

        Returns
        -------
        where : `sqlalchemy.sql.ColumnElement` or `None`
            A boolean SQL expression that evaluates to true if and only if
            the value of ``column`` matches the wildcard.  `None` is returned
            if `strings` is empty, and hence no match is possible.

        Raises
        ------
        NotImplementedError
            Raised if `items` or `patterns` is not empty; neither can
            currently be translated into a query.
        """
        if self.items:
            raise NotImplementedError("Expressions that are processed into items cannot be transformed "
                                      "automatically into queries.")
        if self.patterns:
            raise NotImplementedError("Regular expression patterns are not yet supported here.")
        terms = []
        if len(self.strings) == 1:
            # A single value compares more simply with equality than IN.
            terms.append(column == self.strings[0])
        elif len(self.strings) > 1:
            terms.append(column.in_(self.strings))
        # TODO: append terms for regular expressions
        if not terms:
            return None
        return sqlalchemy.sql.or_(*terms)

287 

288 

def _yieldCollectionRecords(
    manager: CollectionManager,
    record: CollectionRecord,
    collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
    done: Optional[Set[str]] = None,
    flattenChains: bool = True,
    includeChains: Optional[bool] = None,
) -> Iterator[CollectionRecord]:
    """Conditionally yield a single `CollectionRecord` and, for chains, the
    records of its children.

    This holds the logic shared by `CollectionSearch.iter` and
    `CollectionQuery.iter`.

    Parameters
    ----------
    manager : `CollectionManager`
        Object responsible for managing the collection tables in a `Registry`.
    record : `CollectionRecord`
        Record to conditionally yield.
    collectionTypes : `AbstractSet` [ `CollectionType` ], optional
        If provided, only yield collections of these types.
    done : `set` [ `str` ], optional
        A `set` of already-visited collection names.  The name of ``record``
        is added to it when the record matches ``collectionTypes`` or is a
        chain being flattened, and the set is forwarded to the child
        iteration so children are not repeated.  This function does not
        itself check whether ``record`` is already in ``done``; callers do
        that before calling.  If not provided, a new empty `set` is used.
    flattenChains : `bool`, optional
        If `True` (default) recursively yield the child collections of
        `~CollectionType.CHAINED` collections.
    includeChains : `bool`, optional
        If `True`, yield records for `~CollectionType.CHAINED` collections
        themselves.  The default is the opposite of ``flattenChains``: either
        return records for CHAINED collections or their children, but not both.

    Yields
    ------
    record : `CollectionRecord`
        Matching collection records.
    """
    if done is None:
        done = set()
    if includeChains is None:
        includeChains = not flattenChains
    isChain = record.type is CollectionType.CHAINED
    if record.type in collectionTypes:
        done.add(record.name)
        if includeChains or not isChain:
            yield record
    if isChain and flattenChains:
        # Mark the chain as visited even when its own type was filtered out,
        # so it is not expanded a second time.
        done.add(record.name)
        # The enum value guarantees this is a ChainedCollectionRecord, but
        # MyPy cannot see that.
        yield from record.children.iter(  # type: ignore
            manager,
            collectionTypes=collectionTypes,
            done=done,
            flattenChains=flattenChains,
            includeChains=includeChains,
        )

344 

345 

class CollectionSearch(Sequence[str]):
    """An ordered search path of collections.

    Prefer `fromExpression` over the regular constructor: the constructor
    stores its argument without checking it, which can lead to confusing
    error messages downstream.

    Parameters
    ----------
    collections : `tuple` [ `str` ]
        Tuple of collection names, ordered from the first searched to the last
        searched.

    Notes
    -----
    A `CollectionSearch` is used to find a single dataset (or set of datasets
    with different dataset types or data IDs) according to its dataset type and
    data ID, giving preference to collections in the order in which they are
    specified.  A `CollectionQuery` can be constructed from a broader range of
    expressions but does not order the collections to be searched.

    `CollectionSearch` is an immutable sequence of `str` collection names.

    A `CollectionSearch` instance constructed properly (e.g. via
    `fromExpression`) is a unique representation of a particular search path;
    it is exactly the same internally and compares as equal to any
    `CollectionSearch` constructed from an equivalent expression, regardless of
    how different the original expressions appear.
    """

    __slots__ = ("_collections",)

    def __init__(self, collections: Tuple[str, ...]):
        self._collections = collections

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionSearch:
        """Build a `CollectionSearch` from a general expression.

        Parameters
        ----------
        expression
            May be:
             - a `str` collection name;
             - an iterable of `str` collection names;
             - another `CollectionSearch` instance (passed through
               unchanged).

            Duplicate entries will be removed (preserving the first appearance
            of each collection name).

        Returns
        -------
        collections : `CollectionSearch`
            A `CollectionSearch` instance.
        """
        # An existing CollectionSearch is returned as-is.  This lets callers
        # standardize expressions up front (turning single-pass iterators
        # into multi-pass iterables) and then hand them to other routines
        # that accept arbitrary expressions.
        if isinstance(expression, cls):
            return expression
        wildcard = CategorizedWildcard.fromExpression(expression, allowAny=False, allowPatterns=False)
        assert wildcard is not Ellipsis
        assert not wildcard.patterns
        assert not wildcard.items
        # dict keys keep first-insertion order, which drops later duplicates.
        uniqueNames = dict.fromkeys(wildcard.strings)
        return cls(tuple(uniqueNames))

    def iter(
        self, manager: CollectionManager, *,
        datasetType: Optional[DatasetType] = None,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        done: Optional[Set[str]] = None,
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate, in order, over the collection records that match this
        instance and the given criteria.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.findDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        datasetType : `DatasetType`, optional
            Currently ignored by this method.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        done : `set`, optional
            A `set` containing the names of all collections already yielded;
            any collections whose names are already present in this set will
            not be yielded again, and those yielded will be added to it while
            iterating.  If not provided, an empty `set` will be created and
            used internally to avoid duplicates.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, yield records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        if done is None:
            done = set()
        for name in self:
            if name in done:
                continue
            yield from _yieldCollectionRecords(
                manager,
                manager.find(name),
                collectionTypes=collectionTypes,
                done=done,
                flattenChains=flattenChains,
                includeChains=includeChains,
            )

    def __iter__(self) -> Iterator[str]:
        return iter(self._collections)

    def __len__(self) -> int:
        return len(self._collections)

    def __getitem__(self, index: Any) -> str:
        return self._collections[index]

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, CollectionSearch):
            return False
        return self._collections == other._collections

    def __str__(self) -> str:
        joined = ", ".join(self)
        return f"[{joined}]"

    def __repr__(self) -> str:
        return f"CollectionSearch({self._collections!r})"

495 

496 

class CollectionQuery:
    """An unordered query for collections and dataset type restrictions.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    search : `CollectionSearch` or `...`
        An object representing an ordered search for explicitly-named
        collections (to be interpreted here as unordered), or the special
        value `...` indicating all collections.  When ``search`` is `...`,
        ``patterns`` is not consulted.
    patterns : `tuple` of `re.Pattern`
        Regular expression patterns to match against collection names.

    Notes
    -----
    A `CollectionQuery` is used to find all matching datasets in any number
    of collections, or to find collections themselves.

    `CollectionQuery` is expected to be rarely used outside of `Registry`
    (which uses it to back several of its "query" methods that take general
    expressions for collections), but it may occasionally be useful outside
    `Registry` as a way to preprocess expressions that contain single-pass
    iterators into a form that can be used to call those `Registry` methods
    multiple times.
    """

    __slots__ = ("_search", "_patterns")

    def __init__(
        self,
        search: Union[CollectionSearch, EllipsisType] = Ellipsis,
        patterns: Tuple[re.Pattern, ...] = (),
    ):
        self._search = search
        self._patterns = patterns

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionQuery:
        """Process a general expression to construct a `CollectionQuery`
        instance.

        Parameters
        ----------
        expression
            May be:
             - a `str` collection name;
             - an `re.Pattern` instance to match (with `re.Pattern.fullmatch`)
               against collection names;
             - any iterable containing any of the above;
             - the special value `...`, which matches all collections;
             - a `CollectionSearch` instance;
             - another `CollectionQuery` instance (passed through unchanged).

            Duplicate collection names will be removed (preserving the first
            appearance of each collection name).

        Returns
        -------
        collections : `CollectionQuery`
            A `CollectionQuery` instance.
        """
        if isinstance(expression, cls):
            return expression
        if expression is Ellipsis:
            return cls()
        if isinstance(expression, CollectionSearch):
            return cls(search=expression, patterns=())
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=True,
            allowPatterns=True,
        )
        if wildcard is Ellipsis:
            return cls()
        assert not wildcard.items, \
            "We should no longer be transforming to (str, DatasetTypeRestriction) tuples."
        return cls(
            search=CollectionSearch.fromExpression(wildcard.strings),
            patterns=tuple(wildcard.patterns),
        )

    def iter(
        self, manager: CollectionManager, *,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in an arbitrary order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.queryDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, yield records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        # One set of visited names is shared by every branch below so that a
        # collection reached more than once (e.g. directly and as the child
        # of a flattened chain) is only yielded the first time.
        done: Set[str] = set()
        if self._search is Ellipsis:
            for record in manager:
                if record.name not in done:
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )
        else:
            # Explicitly named collections first ...
            yield from self._search.iter(
                manager,
                collectionTypes=collectionTypes,
                done=done,
                flattenChains=flattenChains,
                includeChains=includeChains,
            )
            # ... then anything else whose full name matches a pattern.
            for record in manager:
                if record.name not in done and any(p.fullmatch(record.name) for p in self._patterns):
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionQuery):
            return self._search == other._search and self._patterns == other._patterns
        else:
            return False