Coverage for python/lsst/daf/butler/registry/wildcards.py: 16%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "CategorizedWildcard",
25 "CollectionQuery",
26 "CollectionSearch",
27)
29from pydantic import BaseModel
30from dataclasses import dataclass
31import re
32from typing import (
33 AbstractSet,
34 Any,
35 Callable,
36 Iterator,
37 List,
38 Optional,
39 Sequence,
40 Set,
41 Tuple,
42 TYPE_CHECKING,
43 Union,
44)
46import sqlalchemy
48from lsst.utils.iteration import ensure_iterable
49from ..core import DatasetType
50from ..core.utils import globToRegex
51from ._collectionType import CollectionType
if TYPE_CHECKING:
    from .interfaces import CollectionManager, CollectionRecord

    # Workaround for `...` not having an exposed type in Python, borrowed from
    # https://github.com/python/typing/issues/684#issuecomment-548203158
    # Along with that, we need to use `Ellipsis` instead of `...` for the
    # actual sentinel value internally, and tell MyPy to ignore conversions
    # from `...` to `Ellipsis` at the public-interface boundary.
    #
    # `Ellipsis` and `EllipsisType` should be directly imported from this
    # module by related code that needs them; hopefully that will stay confined
    # to `lsst.daf.butler.registry`.  Putting these in __all__ is bad for
    # Sphinx, and probably more confusing than helpful overall.
    from enum import Enum

    class EllipsisType(Enum):
        Ellipsis = "..."

    Ellipsis = EllipsisType.Ellipsis

else:
    # At runtime the names are simply the builtin singleton and its type.
    EllipsisType = type(Ellipsis)
    Ellipsis = Ellipsis
@dataclass
class CategorizedWildcard:
    """The results of preprocessing a wildcard expression to separate match
    patterns from strings.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).
    """

    @classmethod
    def fromExpression(cls, expression: Any, *,
                       allowAny: bool = True,
                       allowPatterns: bool = True,
                       coerceUnrecognized: Optional[Callable[[Any], Union[Tuple[str, Any], str]]] = None,
                       coerceItemValue: Optional[Callable[[Any], Any]] = None,
                       defaultItemValue: Optional[Any] = None,
                       ) -> Union[CategorizedWildcard, EllipsisType]:
        """Categorize a wildcard expression.

        Parameters
        ----------
        expression
            The expression to categorize.  May be any of:

            - `str` (including glob patterns if ``allowPatterns`` is `True`);
            - `re.Pattern` (only if ``allowPatterns`` is `True`);
            - objects recognized by ``coerceUnrecognized`` (if provided);
            - two-element tuples of (`str`, value) where value is recognized
              by ``coerceItemValue`` (if provided);
            - a non-`str`, non-mapping iterable containing any of the above;
            - the special value `...` (only if ``allowAny`` is `True`), which
              matches anything;
            - a mapping from `str` to values that are recognized by
              ``coerceItemValue`` (if provided);
            - a `CategorizedWildcard` instance (passed through unchanged if
              it meets the requirements specified by keyword arguments).
        allowAny : `bool`, optional
            If `False` (`True` is default) raise `TypeError` if `...` is
            encountered.
        allowPatterns : `bool`, optional
            If `False` (`True` is default) raise `TypeError` if a `re.Pattern`
            is encountered, or if ``expression`` is a `CategorizedWildcard`
            with `patterns` not empty.
        coerceUnrecognized : `Callable`, optional
            A callback that takes a single argument of arbitrary type and
            returns either a `str` - appended to `strings` - or a `tuple` of
            (`str`, `Any`) to be appended to `items`.  This will be called on
            objects of unrecognized type.  Exceptions will be reraised as
            `TypeError` (and chained).
        coerceItemValue : `Callable`, optional
            If provided, ``expression`` may be a mapping from `str` to any
            type that can be passed to this function; the result of that call
            will be stored instead as the value in ``self.items``.
        defaultItemValue : `Any`, optional
            If provided, combine this value with any string values encountered
            (including any returned by ``coerceUnrecognized``) to form a
            `tuple` and add it to `items`, guaranteeing that `strings` will be
            empty.  Patterns are never added to `items`.

        Returns
        -------
        categorized : `CategorizedWildcard` or ``...``.
            The struct describing the wildcard.  ``...`` is passed through
            unchanged.

        Raises
        ------
        TypeError
            Raised if an unsupported type is found in the expression.
        """
        assert expression is not None
        # See if we were given ...; just return that if we were.
        if expression is Ellipsis:
            if not allowAny:
                raise TypeError("This expression may not be unconstrained.")
            return Ellipsis
        if isinstance(expression, cls):
            # This is already a CategorizedWildcard.  Make sure it meets the
            # reqs. implied by the kwargs we got.
            if not allowPatterns and expression.patterns:
                raise TypeError(f"Regular expression(s) {expression.patterns} "
                                f"are not allowed in this context.")
            if defaultItemValue is not None and expression.strings:
                if expression.items:
                    raise TypeError("Incompatible preprocessed expression: an ordered sequence of str is "
                                    "needed, but the original order was lost in the preprocessing.")
                return cls(strings=[], patterns=expression.patterns,
                           items=[(k, defaultItemValue) for k in expression.strings])
            elif defaultItemValue is None and expression.items:
                if expression.strings:
                    raise TypeError("Incompatible preprocessed expression: an ordered sequence of items is "
                                    "needed, but the original order was lost in the preprocessing.")
                return cls(strings=[k for k, _ in expression.items], patterns=expression.patterns, items=[])
            else:
                # Original expression was created with keyword arguments that
                # were at least as restrictive as what we just got; pass it
                # through.
                return expression

        # If we get here, we know we'll be creating a new instance.
        # Initialize an empty one now.
        self = cls(strings=[], patterns=[], items=[])

        # If mappings are allowed, see if we were given a single mapping by
        # trying to get items.
        if coerceItemValue is not None:
            rawItems = None
            try:
                rawItems = expression.items()
            except AttributeError:
                pass
            if rawItems is not None:
                for k, v in rawItems:
                    try:
                        self.items.append((k, coerceItemValue(v)))
                    except Exception as err:
                        raise TypeError(f"Could not coerce mapping value '{v}' for key '{k}'.") from err
                return self

        # Not ..., a CategorizedWildcard instance, or a mapping.  Just
        # process scalars or an iterable.  We put the body of the loop inside
        # a local function so we can recurse after coercion.

        def process(element: Any, alreadyCoerced: bool = False) -> Union[EllipsisType, None]:
            if isinstance(element, str):
                if defaultItemValue is not None:
                    self.items.append((element, defaultItemValue))
                    return None
                else:
                    # This returns a list but we know we only passed in
                    # single value.
                    converted = globToRegex(element)
                    if converted is Ellipsis:
                        return Ellipsis
                    element = converted[0]
                    # Let regex and ... go through to the next check
                    if isinstance(element, str):
                        self.strings.append(element)
                        return None
            if allowPatterns and isinstance(element, re.Pattern):
                self.patterns.append(element)
                return None
            if coerceItemValue is not None:
                try:
                    k, v = element
                except (TypeError, ValueError):
                    # TypeError: not iterable at all.  ValueError: iterable
                    # but not of length two.  Either way this is not a
                    # (key, value) pair, so fall through to the remaining
                    # checks, which report unsupported objects as TypeError.
                    pass
                else:
                    if not alreadyCoerced:
                        if not isinstance(k, str):
                            raise TypeError(f"Item key '{k}' is not a string.")
                        try:
                            v = coerceItemValue(v)
                        except Exception as err:
                            raise TypeError(f"Could not coerce tuple item value '{v}' for key '{k}'."
                                            ) from err
                    self.items.append((k, v))
                    return None
            if alreadyCoerced:
                raise TypeError(f"Object '{element!r}' returned by coercion function is still unrecognized.")
            if coerceUnrecognized is not None:
                try:
                    # This should be safe but flake8 can't tell that the
                    # function will be re-declared next function call
                    process(coerceUnrecognized(element), alreadyCoerced=True)  # noqa: F821
                except Exception as err:
                    raise TypeError(f"Could not coerce expression element '{element!r}'.") from err
            else:
                extra = "."
                if isinstance(element, re.Pattern):
                    extra = " and patterns are not allowed."
                raise TypeError(f"Unsupported object in wildcard expression: '{element!r}'{extra}")
            return None

        for element in ensure_iterable(expression):
            retval = process(element)
            if retval is Ellipsis:
                # One of the globs matched everything
                if not allowAny:
                    raise TypeError("This expression may not be unconstrained.")
                return Ellipsis
        del process
        return self

    def makeWhereExpression(self, column: sqlalchemy.sql.ColumnElement
                            ) -> Optional[sqlalchemy.sql.ColumnElement]:
        """Transform the wildcard into a SQLAlchemy boolean expression suitable
        for use in a WHERE clause.

        Parameters
        ----------
        column : `sqlalchemy.sql.ColumnElement`
            A string column in a table or query that should be compared to the
            wildcard expression.

        Returns
        -------
        where : `sqlalchemy.sql.ColumnElement` or `None`
            A boolean SQL expression that evaluates to true if and only if
            the value of ``column`` matches the wildcard.  `None` is returned
            if both `strings` and `patterns` are empty, and hence no match is
            possible.

        Raises
        ------
        NotImplementedError
            Raised if `items` or `patterns` is not empty.
        """
        if self.items:
            raise NotImplementedError("Expressions that are processed into items cannot be transformed "
                                      "automatically into queries.")
        if self.patterns:
            raise NotImplementedError("Regular expression patterns are not yet supported here.")
        terms = []
        if len(self.strings) == 1:
            terms.append(column == self.strings[0])
        elif len(self.strings) > 1:
            terms.append(column.in_(self.strings))
        # TODO: append terms for regular expressions
        if not terms:
            return None
        return sqlalchemy.sql.or_(*terms)

    strings: List[str]
    """Explicit string values found in the wildcard (`list` [ `str` ]).
    """

    patterns: List[re.Pattern]
    """Regular expression patterns found in the wildcard
    (`list` [ `re.Pattern` ]).
    """

    items: List[Tuple[str, Any]]
    """Two-item tuples that relate string values to other objects
    (`list` [ `tuple` [ `str`, `Any` ] ]).
    """
def _yieldCollectionRecords(
    manager: CollectionManager,
    record: CollectionRecord,
    collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
    done: Optional[Set[str]] = None,
    flattenChains: bool = True,
    includeChains: Optional[bool] = None,
) -> Iterator[CollectionRecord]:
    """A helper function containing common logic for `CollectionSearch.iter`
    and `CollectionQuery.iter`: recursively yield `CollectionRecord` only if
    they match the criteria given in other arguments.

    Parameters
    ----------
    manager : `CollectionManager`
        Object responsible for managing the collection tables in a `Registry`.
    record : `CollectionRecord`
        Record to conditionally yield.
    collectionTypes : `AbstractSet` [ `CollectionType` ], optional
        If provided, only yield collections of these types.
    done : `set` [ `str` ], optional
        A `set` of already-yielded collection names; if provided, ``record``
        will only be yielded if it is not already in ``done``, and ``done``
        will be updated to include it on return.
    flattenChains : `bool`, optional
        If `True` (default) recursively yield the child collections of
        `~CollectionType.CHAINED` collections.
    includeChains : `bool`, optional
        If `True`, return records for `~CollectionType.CHAINED` collections
        themselves.  The default is the opposite of ``flattenChains``: either
        return records for CHAINED collections or their children, but not both.

    Yields
    ------
    record : `CollectionRecord`
        Matching collection records.
    """
    if done is None:
        done = set()
    elif record.name in done:
        # Already yielded (or, for a chain, already expanded); honor the
        # documented contract even if the caller did not pre-filter.
        return
    includeChains = includeChains if includeChains is not None else not flattenChains
    if record.type in collectionTypes:
        done.add(record.name)
        if record.type is not CollectionType.CHAINED or includeChains:
            yield record
    if flattenChains and record.type is CollectionType.CHAINED:
        # Mark the chain itself as handled even when its type was filtered
        # out above, so it is not expanded a second time.
        done.add(record.name)
        # We know this is a ChainedCollectionRecord because of the enum value,
        # but MyPy doesn't.
        yield from record.children.iter(  # type: ignore
            manager,
            collectionTypes=collectionTypes,
            done=done,
            flattenChains=flattenChains,
            includeChains=includeChains,
        )
class CollectionSearch(BaseModel, Sequence[str]):
    """An ordered search path of collections.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    __root__ : `tuple` [ `str` ]
        Tuple of collection names, ordered from the first searched to the last
        searched.

    Notes
    -----
    A `CollectionSearch` is used to find a single dataset (or set of datasets
    with different dataset types or data IDs) according to its dataset type and
    data ID, giving preference to collections in the order in which they are
    specified.  A `CollectionQuery` can be constructed from a broader range of
    expressions but does not order the collections to be searched.

    `CollectionSearch` is an immutable sequence of `str` collection names.

    A `CollectionSearch` instance constructed properly (e.g. via
    `fromExpression`) is a unique representation of a particular search path;
    it is exactly the same internally and compares as equal to any
    `CollectionSearch` constructed from an equivalent expression, regardless of
    how different the original expressions appear.
    """
    __root__: Tuple[str, ...]

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionSearch:
        """Process a general expression to construct a `CollectionSearch`
        instance.

        Parameters
        ----------
        expression
            May be:

            - a `str` collection name;
            - an iterable of `str` collection names;
            - another `CollectionSearch` instance (passed through
              unchanged).

            Duplicate entries will be removed (preserving the first appearance
            of each collection name).

        Returns
        -------
        collections : `CollectionSearch`
            A `CollectionSearch` instance.
        """
        # First see if this is already a CollectionSearch; just pass that
        # through unchanged.  This lets us standardize expressions (and turn
        # single-pass iterators into multi-pass iterables) in advance and pass
        # them down to other routines that accept arbitrary expressions.
        if isinstance(expression, cls):
            return expression
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=False,
            allowPatterns=False,
        )
        assert wildcard is not Ellipsis
        assert not wildcard.patterns
        assert not wildcard.items
        # dict preserves insertion order, so this keeps the first appearance
        # of each name while de-duplicating in linear time.
        return cls(__root__=tuple(dict.fromkeys(wildcard.strings)))

    def iter(
        self, manager: CollectionManager, *,
        datasetType: Optional[DatasetType] = None,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        done: Optional[Set[str]] = None,
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.findDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        datasetType : `DatasetType`, optional
            Ignored by this implementation.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        done : `set`, optional
            A `set` containing the names of all collections already yielded;
            any collections whose names are already present in this set will
            not be yielded again, and those yielded will be added to it while
            iterating.  If not provided, an empty `set` will be created and
            used internally to avoid duplicates.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        if done is None:
            done = set()
        for name in self:
            if name not in done:
                yield from _yieldCollectionRecords(
                    manager,
                    manager.find(name),
                    collectionTypes=collectionTypes,
                    done=done,
                    flattenChains=flattenChains,
                    includeChains=includeChains,
                )

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly.
        """
        yield from self.__root__

    def __iter__(self) -> Iterator[str]:  # type: ignore
        yield from self.__root__

    def __len__(self) -> int:
        return len(self.__root__)

    def __getitem__(self, index: Any) -> str:
        return self.__root__[index]

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionSearch):
            return self.__root__ == other.__root__
        return False

    def __str__(self) -> str:
        return "[{}]".format(", ".join(self))

    def __repr__(self) -> str:
        return f"CollectionSearch({self.__root__!r})"
class CollectionQuery:
    """An unordered query for collections and dataset type restrictions.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    search : `CollectionSearch` or `...`
        An object representing an ordered search for explicitly-named
        collections (to be interpreted here as unordered), or the special
        value `...` indicating all collections.  `...` must be accompanied
        by an empty ``patterns``.
    patterns : `tuple` of `re.Pattern`
        Regular expression patterns to match against collection names.

    Notes
    -----
    A `CollectionQuery` is used to find all matching datasets in any number
    of collections, or to find collections themselves.

    `CollectionQuery` is expected to be rarely used outside of `Registry`
    (which uses it to back several of its "query" methods that take general
    expressions for collections), but it may occasionally be useful outside
    `Registry` as a way to preprocess expressions that contain single-pass
    iterators into a form that can be used to call those `Registry` methods
    multiple times.
    """
    def __init__(
        self,
        search: Union[CollectionSearch, EllipsisType] = Ellipsis,
        patterns: Tuple[re.Pattern, ...] = (),
    ):
        self._search = search
        self._patterns = patterns

    __slots__ = ("_search", "_patterns")

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionQuery:
        """Process a general expression to construct a `CollectionQuery`
        instance.

        Parameters
        ----------
        expression
            May be:

            - a `str` collection name;
            - an `re.Pattern` instance to match (with `re.Pattern.fullmatch`)
              against collection names;
            - any iterable containing any of the above;
            - the special value `...`, which matches all collections;
            - a `CollectionSearch` instance;
            - another `CollectionQuery` instance (passed through unchanged).

            Duplicate collection names will be removed (preserving the first
            appearance of each collection name).

        Returns
        -------
        collections : `CollectionQuery`
            A `CollectionQuery` instance.
        """
        if isinstance(expression, cls):
            return expression
        if expression is Ellipsis:
            return cls()
        if isinstance(expression, CollectionSearch):
            return cls(search=expression, patterns=())
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=True,
            allowPatterns=True,
        )
        if wildcard is Ellipsis:
            return cls()
        assert not wildcard.items, \
            "We should no longer be transforming to (str, DatasetTypeRestriction) tuples."
        return cls(
            search=CollectionSearch.fromExpression(wildcard.strings),
            patterns=tuple(wildcard.patterns),
        )

    def iter(
        self, manager: CollectionManager, *,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in an arbitrary order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.queryDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        # A single set of already-handled names is shared by every branch so
        # that a collection reachable both directly and as the child of a
        # CHAINED collection is yielded only once.
        done: Set[str] = set()
        if self._search is Ellipsis:
            for record in manager:
                if record.name not in done:
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )
        else:
            yield from self._search.iter(
                manager,
                collectionTypes=collectionTypes,
                done=done,
                flattenChains=flattenChains,
                includeChains=includeChains,
            )
            for record in manager:
                if record.name not in done and any(p.fullmatch(record.name) for p in self._patterns):
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly.
        """
        if isinstance(self._search, CollectionSearch):
            yield from self._search.explicitNames()

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionQuery):
            return self._search == other._search and self._patterns == other._patterns
        else:
            return False

    def __str__(self) -> str:
        if self._search is Ellipsis:
            return "..."
        else:
            terms = list(self._search)
            terms.extend(str(p) for p in self._patterns)
            return "[{}]".format(", ".join(terms))

    def __repr__(self) -> str:
        return f"CollectionQuery({self._search!r}, {self._patterns!r})"