Coverage for python/lsst/daf/butler/registry/wildcards.py: 17%
207 statements
« prev ^ index » next coverage.py v6.4.4, created at 2022-09-15 09:41 +0000
« prev ^ index » next coverage.py v6.4.4, created at 2022-09-15 09:41 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "CategorizedWildcard",
25 "CollectionQuery",
26 "CollectionSearch",
27)
29import re
30from dataclasses import dataclass
31from typing import (
32 TYPE_CHECKING,
33 AbstractSet,
34 Any,
35 Callable,
36 Iterator,
37 List,
38 Optional,
39 Sequence,
40 Set,
41 Tuple,
42 Union,
43)
45import sqlalchemy
46from lsst.utils.ellipsis import Ellipsis, EllipsisType
47from lsst.utils.iteration import ensure_iterable
48from pydantic import BaseModel
50from ..core import DatasetType
51from ..core.utils import globToRegex
52from ._collectionType import CollectionType
54if TYPE_CHECKING:  # coverage: line 54 didn't jump to line 55, because the condition on line 54 was never true
55 from .interfaces import CollectionManager, CollectionRecord
@dataclass
class CategorizedWildcard:
    """The results of preprocessing a wildcard expression to separate match
    patterns from strings.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).
    """

    @classmethod
    def fromExpression(
        cls,
        expression: Any,
        *,
        allowAny: bool = True,
        allowPatterns: bool = True,
        coerceUnrecognized: Optional[Callable[[Any], Union[Tuple[str, Any], str]]] = None,
        coerceItemValue: Optional[Callable[[Any], Any]] = None,
        defaultItemValue: Optional[Any] = None,
    ) -> Union[CategorizedWildcard, EllipsisType]:
        """Categorize a wildcard expression.

        Parameters
        ----------
        expression
            The expression to categorize.  May be any of:

            - `str` (including glob patterns if ``allowPatterns`` is `True`);
            - `re.Pattern` (only if ``allowPatterns`` is `True`);
            - objects recognized by ``coerceUnrecognized`` (if provided);
            - two-element tuples of (`str`, value) where value is recognized
              by ``coerceItemValue`` (if provided);
            - a non-`str`, non-mapping iterable containing any of the above;
            - the special value `...` (only if ``allowAny`` is `True`), which
              matches anything;
            - a mapping from `str` to a value that is recognized by
              ``coerceItemValue`` (if provided);
            - a `CategorizedWildcard` instance (passed through unchanged if
              it meets the requirements specified by keyword arguments).
        allowAny : `bool`, optional
            If `False` (`True` is default) raise `TypeError` if `...` is
            encountered.
        allowPatterns : `bool`, optional
            If `False` (`True` is default) raise `TypeError` if a `re.Pattern`
            is encountered, or if ``expression`` is a `CategorizedWildcard`
            with `patterns` not empty.
        coerceUnrecognized : `Callable`, optional
            A callback that takes a single argument of arbitrary type and
            returns either a `str` - appended to `strings` - or a `tuple` of
            (`str`, `Any`) to be appended to `items`.  This will be called on
            objects of unrecognized type, and its return value is processed
            as if it had appeared in the expression directly.  Exceptions
            will be reraised as `TypeError` (and chained).
        coerceItemValue : `Callable`, optional
            If provided, ``expression`` may be a mapping from `str` to any
            type that can be passed to this function; the result of that call
            will be stored instead as the value in ``self.items``.
        defaultItemValue : `Any`, optional
            If provided, combine this value with any string values encountered
            (including any returned by ``coerceUnrecognized``) to form a
            `tuple` and add it to `items`, guaranteeing that `strings` will be
            empty.  Patterns are never added to `items`.

        Returns
        -------
        categorized : `CategorizedWildcard` or ``...``.
            The struct describing the wildcard.  ``...`` is passed through
            unchanged.

        Raises
        ------
        TypeError
            Raised if an unsupported type is found in the expression.
        """
        assert expression is not None
        # See if we were given ...; just return that if we were.
        if expression is Ellipsis:
            if not allowAny:
                raise TypeError("This expression may not be unconstrained.")
            return Ellipsis
        if isinstance(expression, cls):
            # This is already a CategorizedWildcard.  Make sure it meets the
            # reqs. implied by the kwargs we got.
            if not allowPatterns and expression.patterns:
                raise TypeError(
                    f"Regular expression(s) {expression.patterns} are not allowed in this context."
                )
            if defaultItemValue is not None and expression.strings:
                if expression.items:
                    raise TypeError(
                        "Incompatible preprocessed expression: an ordered sequence of str is "
                        "needed, but the original order was lost in the preprocessing."
                    )
                return cls(
                    strings=[],
                    patterns=expression.patterns,
                    items=[(k, defaultItemValue) for k in expression.strings],
                )
            elif defaultItemValue is None and expression.items:
                if expression.strings:
                    raise TypeError(
                        "Incompatible preprocessed expression: an ordered sequence of items is "
                        "needed, but the original order was lost in the preprocessing."
                    )
                return cls(strings=[k for k, _ in expression.items], patterns=expression.patterns, items=[])
            else:
                # Original expression was created with keyword arguments that
                # were at least as restrictive as what we just got; pass it
                # through.
                return expression

        # If we get here, we know we'll be creating a new instance.
        # Initialize an empty one now.
        self = cls(strings=[], patterns=[], items=[])

        # If mappings are allowed, see if we were given a single mapping by
        # trying to get items.
        if coerceItemValue is not None:
            rawItems = None
            try:
                rawItems = expression.items()
            except AttributeError:
                pass
            if rawItems is not None:
                for k, v in rawItems:
                    try:
                        self.items.append((k, coerceItemValue(v)))
                    except Exception as err:
                        raise TypeError(f"Could not coerce mapping value '{v}' for key '{k}'.") from err
                return self

        # Not ..., a CategorizedWildcard instance, or a mapping.  Just
        # process scalars or an iterable.  We put the body of the loop inside
        # a local function so we can recurse after coercion.

        def process(element: Any, alreadyCoerced: bool = False) -> Union[EllipsisType, None]:
            if isinstance(element, str):
                if defaultItemValue is not None:
                    self.items.append((element, defaultItemValue))
                    return None
                # This returns a list but we know we only passed in a
                # single value.
                converted = globToRegex(element)
                if converted is Ellipsis:
                    return Ellipsis
                element = converted[0]
                # A regex falls through to the checks below.
                if isinstance(element, str):
                    self.strings.append(element)
                    return None
            if allowPatterns and isinstance(element, re.Pattern):
                self.patterns.append(element)
                return None
            if coerceItemValue is not None:
                try:
                    k, v = element
                except (TypeError, ValueError):
                    # Not iterable, or not exactly two elements: this is not
                    # a (key, value) item, so fall through to the
                    # unrecognized-object handling below.
                    pass
                else:
                    if not alreadyCoerced:
                        if not isinstance(k, str):
                            raise TypeError(f"Item key '{k}' is not a string.")
                        try:
                            v = coerceItemValue(v)
                        except Exception as err:
                            raise TypeError(
                                f"Could not coerce tuple item value '{v}' for key '{k}'."
                            ) from err
                    self.items.append((k, v))
                    return None
            if alreadyCoerced:
                raise TypeError(f"Object '{element!r}' returned by coercion function is still unrecognized.")
            if coerceUnrecognized is None:
                extra = "."
                if isinstance(element, re.Pattern):
                    extra = " and patterns are not allowed."
                raise TypeError(f"Unsupported object in wildcard expression: '{element!r}'{extra}")
            try:
                # Return the recursive result so that a coerced value that
                # globs to "match everything" is reported to the caller
                # instead of being silently dropped.
                # This should be safe but flake8 cant tell that the
                # function will be re-declared next function call
                return process(coerceUnrecognized(element), alreadyCoerced=True)  # noqa: F821
            except Exception as err:
                raise TypeError(f"Could not coerce expression element '{element!r}'.") from err

        for element in ensure_iterable(expression):
            retval = process(element)
            if retval is Ellipsis:
                # One of the globs matched everything
                if not allowAny:
                    raise TypeError("This expression may not be unconstrained.")
                return Ellipsis
        del process
        return self

    def makeWhereExpression(
        self, column: sqlalchemy.sql.ColumnElement
    ) -> Optional[sqlalchemy.sql.ColumnElement]:
        """Transform the wildcard into a SQLAlchemy boolean expression suitable
        for use in a WHERE clause.

        Parameters
        ----------
        column : `sqlalchemy.sql.ColumnElement`
            A string column in a table or query that should be compared to the
            wildcard expression.

        Returns
        -------
        where : `sqlalchemy.sql.ColumnElement` or `None`
            A boolean SQL expression that evaluates to true if and only if
            the value of ``column`` matches the wildcard.  `None` is returned
            if both `strings` and `patterns` are empty, and hence no match is
            possible.

        Raises
        ------
        NotImplementedError
            Raised if `items` or `patterns` is not empty.
        """
        if self.items:
            raise NotImplementedError(
                "Expressions that are processed into items cannot be transformed "
                "automatically into queries."
            )
        if self.patterns:
            raise NotImplementedError("Regular expression patterns are not yet supported here.")
        terms = []
        if len(self.strings) == 1:
            terms.append(column == self.strings[0])
        elif len(self.strings) > 1:
            terms.append(column.in_(self.strings))
        # TODO: append terms for regular expressions
        if not terms:
            return None
        return sqlalchemy.sql.or_(*terms)

    strings: List[str]
    """Explicit string values found in the wildcard (`list` [ `str` ]).
    """

    patterns: List[re.Pattern]
    """Regular expression patterns found in the wildcard
    (`list` [ `re.Pattern` ]).
    """

    items: List[Tuple[str, Any]]
    """Two-item tuples that relate string values to other objects
    (`list` [ `tuple` [ `str`, `Any` ] ]).
    """
def _yieldCollectionRecords(
    manager: CollectionManager,
    record: CollectionRecord,
    collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
    done: Optional[Set[str]] = None,
    flattenChains: bool = True,
    includeChains: Optional[bool] = None,
) -> Iterator[CollectionRecord]:
    """A helper function containing common logic for `CollectionSearch.iter`
    and `CollectionQuery.iter`: recursively yield `CollectionRecord` only if
    they match the criteria given in other arguments.

    Parameters
    ----------
    manager : `CollectionManager`
        Object responsible for managing the collection tables in a `Registry`.
    record : `CollectionRecord`
        Record to conditionally yield.
    collectionTypes : `AbstractSet` [ `CollectionType` ], optional
        If provided, only yield collections of these types.
    done : `set` [ `str` ], optional
        A `set` of already-yielded collection names; if provided, ``record``
        will only be yielded if it is not already in ``done``, and ``done``
        will be updated to include it on return.
    flattenChains : `bool`, optional
        If `True` (default) recursively yield the child collections of
        `~CollectionType.CHAINED` collections.
    includeChains : `bool`, optional
        If `True`, return records for `~CollectionType.CHAINED` collections
        themselves.  The default is the opposite of ``flattenChains``: either
        return records for CHAINED collections or their children, but not both.

    Yields
    ------
    record : `CollectionRecord`
        Matching collection records.
    """
    if done is None:
        done = set()
    elif record.name in done:
        # Honor the documented contract: a record that has already been
        # processed is never yielded (or expanded) a second time.
        return
    if includeChains is None:
        includeChains = not flattenChains
    isChain = record.type is CollectionType.CHAINED
    if record.type in collectionTypes:
        done.add(record.name)
        if includeChains or not isChain:
            yield record
    if flattenChains and isChain:
        # Mark the chain as visited even when its own type was filtered out.
        done.add(record.name)
        # We know this is a ChainedCollectionRecord because of the enum value,
        # but MyPy doesn't.
        yield from record.children.iter(  # type: ignore
            manager,
            collectionTypes=collectionTypes,
            done=done,
            flattenChains=flattenChains,
            includeChains=includeChains,
        )
class CollectionSearch(BaseModel, Sequence[str]):
    """An ordered search path of collections.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    collections : `tuple` [ `str` ]
        Tuple of collection names, ordered from the first searched to the last
        searched.

    Notes
    -----
    A `CollectionSearch` is used to find a single dataset (or set of datasets
    with different dataset types or data IDs) according to its dataset type and
    data ID, giving preference to collections in the order in which they are
    specified.  A `CollectionQuery` can be constructed from a broader range of
    expressions but does not order the collections to be searched.

    `CollectionSearch` is an immutable sequence of `str` collection names.

    A `CollectionSearch` instance constructed properly (e.g. via
    `fromExpression`) is a unique representation of a particular search path;
    it is exactly the same internally and compares as equal to any
    `CollectionSearch` constructed from an equivalent expression, regardless of
    how different the original expressions appear.
    """

    __root__: Tuple[str, ...]

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionSearch:
        """Process a general expression to construct a `CollectionSearch`
        instance.

        Parameters
        ----------
        expression
            May be:

            - a `str` collection name;
            - an iterable of `str` collection names;
            - another `CollectionSearch` instance (passed through
              unchanged).

            Duplicate entries will be removed (preserving the first appearance
            of each collection name).

        Returns
        -------
        collections : `CollectionSearch`
            A `CollectionSearch` instance.
        """
        # First see if this is already a CollectionSearch; just pass that
        # through unchanged.  This lets us standardize expressions (and turn
        # single-pass iterators into multi-pass iterables) in advance and pass
        # them down to other routines that accept arbitrary expressions.
        if isinstance(expression, cls):
            return expression
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=False,
            allowPatterns=False,
        )
        assert wildcard is not Ellipsis
        assert not wildcard.patterns
        assert not wildcard.items
        # dict keys keep insertion order, so this drops repeated names while
        # preserving the first appearance of each, in linear time.
        return cls(__root__=tuple(dict.fromkeys(wildcard.strings)))

    def iter(
        self,
        manager: CollectionManager,
        *,
        datasetType: Optional[DatasetType] = None,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        done: Optional[Set[str]] = None,
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.findDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        datasetType : `DatasetType`, optional
            Accepted but not used by this method.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        done : `set`, optional
            A `set` containing the names of all collections already yielded;
            any collections whose names are already present in this set will
            not be yielded again, and those yielded will be added to it while
            iterating.  If not provided, an empty `set` will be created and
            used internally to avoid duplicates.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        if done is None:
            done = set()
        for name in self:
            if name not in done:
                yield from _yieldCollectionRecords(
                    manager,
                    manager.find(name),
                    collectionTypes=collectionTypes,
                    done=done,
                    flattenChains=flattenChains,
                    includeChains=includeChains,
                )

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly."""
        yield from self.__root__

    def __iter__(self) -> Iterator[str]:  # type: ignore
        yield from self.__root__

    def __len__(self) -> int:
        return len(self.__root__)

    def __getitem__(self, index: Any) -> str:
        return self.__root__[index]

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionSearch):
            return self.__root__ == other.__root__
        return False

    def __str__(self) -> str:
        return "[{}]".format(", ".join(self))

    def __repr__(self) -> str:
        return f"CollectionSearch({self.__root__!r})"
class CollectionQuery:
    """An unordered query for collections and dataset type restrictions.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    search : `CollectionSearch` or `...`
        An object representing an ordered search for explicitly-named
        collections (to be interpreted here as unordered), or the special
        value `...` indicating all collections.  ``patterns`` is not
        consulted when ``search`` is `...` and should be left empty.
    patterns : `tuple` of `re.Pattern`
        Regular expression patterns to match against collection names.

    Notes
    -----
    A `CollectionQuery` is used to find all matching datasets in any number
    of collections, or to find collections themselves.

    `CollectionQuery` is expected to be rarely used outside of `Registry`
    (which uses it to back several of its "query" methods that take general
    expressions for collections), but it may occasionally be useful outside
    `Registry` as a way to preprocess expressions that contain single-pass
    iterators into a form that can be used to call those `Registry` methods
    multiple times.
    """

    def __init__(
        self,
        search: Union[CollectionSearch, EllipsisType] = Ellipsis,
        patterns: Tuple[re.Pattern, ...] = (),
    ):
        self._search = search
        self._patterns = patterns

    __slots__ = ("_search", "_patterns")

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionQuery:
        """Process a general expression to construct a `CollectionQuery`
        instance.

        Parameters
        ----------
        expression
            May be:

            - a `str` collection name;
            - an `re.Pattern` instance to match (with `re.Pattern.fullmatch`)
              against collection names;
            - any iterable containing any of the above;
            - the special value `...`, which matches all collections;
            - a `CollectionSearch` instance;
            - another `CollectionQuery` instance (passed through unchanged).

            Duplicate collection names will be removed (preserving the first
            appearance of each collection name).

        Returns
        -------
        collections : `CollectionQuery`
            A `CollectionQuery` instance.
        """
        if isinstance(expression, cls):
            return expression
        if expression is Ellipsis:
            return cls()
        if isinstance(expression, CollectionSearch):
            return cls(search=expression, patterns=())
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=True,
            allowPatterns=True,
        )
        if wildcard is Ellipsis:
            return cls()
        assert (
            not wildcard.items
        ), "We should no longer be transforming to (str, DatasetTypeRestriction) tuples."
        return cls(
            search=CollectionSearch.fromExpression(wildcard.strings),
            patterns=tuple(wildcard.patterns),
        )

    def iter(
        self,
        manager: CollectionManager,
        *,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in an arbitrary order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.queryDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        # A single set of visited names is shared across the whole iteration
        # so that a collection reachable both directly and as the child of a
        # flattened chain is only yielded once.
        done: Set[str] = set()
        if self._search is Ellipsis:
            for record in manager:
                if record.name not in done:
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )
            return
        # Explicitly named collections first, then anything that matches one
        # of the patterns and has not been seen yet.
        yield from self._search.iter(
            manager,
            collectionTypes=collectionTypes,
            done=done,
            flattenChains=flattenChains,
            includeChains=includeChains,
        )
        for record in manager:
            if record.name not in done and any(p.fullmatch(record.name) for p in self._patterns):
                yield from _yieldCollectionRecords(
                    manager,
                    record,
                    collectionTypes=collectionTypes,
                    done=done,
                    flattenChains=flattenChains,
                    includeChains=includeChains,
                )

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly."""
        if isinstance(self._search, CollectionSearch):
            yield from self._search.explicitNames()

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionQuery):
            return self._search == other._search and self._patterns == other._patterns
        else:
            return False

    def __str__(self) -> str:
        if self._search is Ellipsis:
            return "..."
        else:
            terms = list(self._search)
            terms.extend(str(p) for p in self._patterns)
            return "[{}]".format(", ".join(terms))

    def __repr__(self) -> str:
        return f"CollectionQuery({self._search!r}, {self._patterns!r})"