Coverage for python/lsst/daf/butler/registry/wildcards.py: 16%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "CategorizedWildcard",
25 "CollectionQuery",
26 "CollectionSearch",
27)
29from pydantic import BaseModel
30from dataclasses import dataclass
31import re
32from typing import (
33 AbstractSet,
34 Any,
35 Callable,
36 Iterator,
37 List,
38 Optional,
39 Sequence,
40 Set,
41 Tuple,
42 TYPE_CHECKING,
43 Union,
44)
46import sqlalchemy
48from ..core import DatasetType
49from ..core.utils import iterable, globToRegex
50from ._collectionType import CollectionType
if TYPE_CHECKING:
    from .interfaces import CollectionManager, CollectionRecord

    # Python does not expose a type for the `...` singleton that static type
    # checkers can use, so for their benefit only we model it as a one-member
    # enum.  The trick is borrowed from
    # https://github.com/python/typing/issues/684#issuecomment-548203158
    #
    # As a consequence, code in this module must spell the sentinel value as
    # `Ellipsis` rather than `...` internally, and MyPy has to be told to
    # ignore conversions from `...` to `Ellipsis` at the public-interface
    # boundary.
    #
    # Related code that needs `Ellipsis` or `EllipsisType` should import them
    # directly from this module; hopefully that stays confined to
    # `lsst.daf.butler.registry`.  They are deliberately left out of
    # `__all__`: listing them there is bad for Sphinx and probably more
    # confusing than helpful overall.
    from enum import Enum

    class EllipsisType(Enum):
        Ellipsis = "..."

    Ellipsis = EllipsisType.Ellipsis

else:
    # At runtime these are just the builtin singleton and its type, re-bound
    # as module-level names so they can be imported from here.
    EllipsisType = type(...)
    Ellipsis = ...
@dataclass
class CategorizedWildcard:
    """The results of preprocessing a wildcard expression to separate match
    patterns from strings.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).
    """

    @classmethod
    def fromExpression(cls, expression: Any, *,
                       allowAny: bool = True,
                       allowPatterns: bool = True,
                       coerceUnrecognized: Optional[Callable[[Any], Union[Tuple[str, Any], str]]] = None,
                       coerceItemValue: Optional[Callable[[Any], Any]] = None,
                       defaultItemValue: Optional[Any] = None,
                       ) -> Union[CategorizedWildcard, EllipsisType]:
        """Categorize a wildcard expression.

        Parameters
        ----------
        expression
            The expression to categorize.  May be any of:
             - `str` (including glob patterns if ``allowPatterns`` is `True`);
             - `re.Pattern` (only if ``allowPatterns`` is `True`);
             - objects recognized by ``coerceUnrecognized`` (if provided);
             - two-element tuples of (`str`, value) where value is recognized
               by ``coerceItemValue`` (if provided);
             - a non-`str`, non-mapping iterable containing any of the above;
             - the special value `...` (only if ``allowAny`` is `True`), which
               matches anything;
             - a mapping from `str` to a value that is recognized by
               ``coerceItemValue`` (if provided);
             - a `CategorizedWildcard` instance (passed through unchanged if
               it meets the requirements specified by keyword arguments).
        allowAny: `bool`, optional
            If `False` (`True` is default) raise `TypeError` if `...` is
            encountered.
        allowPatterns: `bool`, optional
            If `False` (`True` is default) raise `TypeError` if a `re.Pattern`
            is encountered, or if ``expression`` is a `CategorizedWildcard`
            with `patterns` not empty.
        coerceUnrecognized: `Callable`, optional
            A callback that takes a single argument of arbitrary type and
            returns either a `str` (handled like any other string element) or
            a `tuple` of (`str`, `Any`) to be appended to `items` as-is.  It
            is called on elements of unrecognized type.  Tuples are only
            accepted when ``coerceItemValue`` is also provided.  Exceptions
            will be reraised as `TypeError` (and chained).
        coerceItemValue: `Callable`, optional
            If provided, ``expression`` may be a mapping from `str` to any
            type that can be passed to this function (or contain two-element
            tuples of that form); the result of that call will be stored
            instead as the value in ``self.items``.
        defaultItemValue: `Any`, optional
            If provided, combine this value with any string values encountered
            (including any returned by ``coerceUnrecognized``) to form a
            `tuple` and add it to `items`, guaranteeing that `strings` will be
            empty.  Patterns are never added to `items`.

        Returns
        -------
        categorized : `CategorizedWildcard` or ``...``.
            The struct describing the wildcard.  ``...`` is passed through
            unchanged.

        Raises
        ------
        TypeError
            Raised if an unsupported type is found in the expression.
        """
        assert expression is not None
        # See if we were given ...; just return that if we were.
        if expression is Ellipsis:
            if not allowAny:
                raise TypeError("This expression may not be unconstrained.")
            return Ellipsis
        if isinstance(expression, cls):
            # This is already a CategorizedWildcard.  Make sure it meets the
            # reqs. implied by the kwargs we got.
            if not allowPatterns and expression.patterns:
                raise TypeError(f"Regular expression(s) {expression.patterns} "
                                f"are not allowed in this context.")
            if defaultItemValue is not None and expression.strings:
                if expression.items:
                    raise TypeError("Incompatible preprocessed expression: an ordered sequence of str is "
                                    "needed, but the original order was lost in the preprocessing.")
                return cls(strings=[], patterns=expression.patterns,
                           items=[(k, defaultItemValue) for k in expression.strings])
            elif defaultItemValue is None and expression.items:
                if expression.strings:
                    raise TypeError("Incompatible preprocessed expression: an ordered sequence of items is "
                                    "needed, but the original order was lost in the preprocessing.")
                return cls(strings=[k for k, _ in expression.items], patterns=expression.patterns, items=[])
            else:
                # Original expression was created with keyword arguments that
                # were at least as restrictive as what we just got; pass it
                # through.
                return expression

        # If we get here, we know we'll be creating a new instance.
        # Initialize an empty one now.
        self = cls(strings=[], patterns=[], items=[])

        # If mappings are allowed, see if we were given a single mapping by
        # trying to get items.
        if coerceItemValue is not None:
            rawItems = None
            try:
                rawItems = expression.items()
            except AttributeError:
                pass
            if rawItems is not None:
                for k, v in rawItems:
                    try:
                        self.items.append((k, coerceItemValue(v)))
                    except Exception as err:
                        raise TypeError(f"Could not coerce mapping value '{v}' for key '{k}'.") from err
                return self

        # Not ..., a CategorizedWildcard instance, or a mapping.  Just
        # process scalars or an iterable.  We put the body of the loop inside
        # a local function so we can recurse after coercion.

        def process(element: Any, alreadyCoerced: bool = False) -> Union[EllipsisType, None]:
            if isinstance(element, str):
                if defaultItemValue is not None:
                    self.items.append((element, defaultItemValue))
                    return None
                else:
                    # This returns a list but we know we only passed in
                    # single value.
                    converted = globToRegex(element)
                    if converted is Ellipsis:
                        return Ellipsis
                    element = converted[0]
                    # Let regex and ... go through to the next check
                    if isinstance(element, str):
                        self.strings.append(element)
                        return None
            if allowPatterns and isinstance(element, re.Pattern):
                self.patterns.append(element)
                return None
            if coerceItemValue is not None:
                try:
                    k, v = element
                except (TypeError, ValueError):
                    # TypeError: element cannot be unpacked at all.
                    # ValueError: element is iterable but does not have
                    # exactly two entries.  Either way it is not a
                    # (key, value) pair, so fall through to the generic
                    # handling below, which reports problems as TypeError.
                    pass
                else:
                    if not alreadyCoerced:
                        if not isinstance(k, str):
                            raise TypeError(f"Item key '{k}' is not a string.")
                        try:
                            v = coerceItemValue(v)
                        except Exception as err:
                            raise TypeError(f"Could not coerce tuple item value '{v}' for key '{k}'."
                                            ) from err
                    self.items.append((k, v))
                    return None
            if alreadyCoerced:
                raise TypeError(f"Object '{element!r}' returned by coercion function is still unrecognized.")
            if coerceUnrecognized is not None:
                try:
                    # This should be safe but flake8 cant tell that the
                    # function will be re-declared next function call
                    process(coerceUnrecognized(element), alreadyCoerced=True)  # noqa: F821
                except Exception as err:
                    raise TypeError(f"Could not coerce expression element '{element!r}'.") from err
            else:
                extra = "."
                if isinstance(element, re.Pattern):
                    extra = " and patterns are not allowed."
                raise TypeError(f"Unsupported object in wildcard expression: '{element!r}'{extra}")
            return None

        for element in iterable(expression):
            retval = process(element)
            if retval is Ellipsis:
                # One of the globs matched everything
                if not allowAny:
                    raise TypeError("This expression may not be unconstrained.")
                return Ellipsis
        del process
        return self

    def makeWhereExpression(self, column: sqlalchemy.sql.ColumnElement
                            ) -> Optional[sqlalchemy.sql.ColumnElement]:
        """Transform the wildcard into a SQLAlchemy boolean expression suitable
        for use in a WHERE clause.

        Parameters
        ----------
        column : `sqlalchemy.sql.ColumnElement`
            A string column in a table or query that should be compared to the
            wildcard expression.

        Returns
        -------
        where : `sqlalchemy.sql.ColumnElement` or `None`
            A boolean SQL expression that evaluates to true if and only if
            the value of ``column`` matches the wildcard.  `None` is returned
            if both `strings` and `patterns` are empty, and hence no match is
            possible.

        Raises
        ------
        NotImplementedError
            Raised if `items` or `patterns` is not empty; neither is
            supported here yet.
        """
        if self.items:
            raise NotImplementedError("Expressions that are processed into items cannot be transformed "
                                      "automatically into queries.")
        if self.patterns:
            raise NotImplementedError("Regular expression patterns are not yet supported here.")
        terms = []
        if len(self.strings) == 1:
            terms.append(column == self.strings[0])
        elif len(self.strings) > 1:
            terms.append(column.in_(self.strings))
        # TODO: append terms for regular expressions
        if not terms:
            return None
        return sqlalchemy.sql.or_(*terms)

    strings: List[str]
    """Explicit string values found in the wildcard (`list` [ `str` ]).
    """

    patterns: List[re.Pattern]
    """Regular expression patterns found in the wildcard
    (`list` [ `re.Pattern` ]).
    """

    items: List[Tuple[str, Any]]
    """Two-item tuples that relate string values to other objects
    (`list` [ `tuple` [ `str`, `Any` ] ]).
    """
def _yieldCollectionRecords(
    manager: CollectionManager,
    record: CollectionRecord,
    collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
    done: Optional[Set[str]] = None,
    flattenChains: bool = True,
    includeChains: Optional[bool] = None,
) -> Iterator[CollectionRecord]:
    """Yield a collection record, and optionally the children of a chain,
    subject to the given filters.

    This holds the logic shared by `CollectionSearch.iter` and
    `CollectionQuery.iter`.

    Parameters
    ----------
    manager : `CollectionManager`
        Object responsible for managing the collection tables in a `Registry`.
    record : `CollectionRecord`
        Record to conditionally yield.
    collectionTypes : `AbstractSet` [ `CollectionType` ], optional
        If provided, only yield collections of these types.
    done : `set` [ `str` ], optional
        A `set` of collection names that have already been handled.  The
        name of ``record`` is added to it when the record passes the type
        filter or is a chain being flattened, and the set is forwarded to the
        recursive search over a chain's children.  This function does not
        itself test whether ``record`` is already in ``done``; callers that
        pass a set are expected to do that before calling.  If not provided,
        a new empty set is used.
    flattenChains : `bool`, optional
        If `True` (default) recursively yield the child collections of
        `~CollectionType.CHAINED` collections.
    includeChains : `bool`, optional
        If `True`, yield records for `~CollectionType.CHAINED` collections
        themselves.  The default is the opposite of ``flattenChains``: either
        return records for CHAINED collections or their children, but not
        both.

    Yields
    ------
    record : `CollectionRecord`
        Matching collection records.
    """
    seen = set() if done is None else done
    if includeChains is None:
        includeChains = not flattenChains
    isChain = record.type is CollectionType.CHAINED

    if record.type in collectionTypes:
        seen.add(record.name)
        if includeChains or not isChain:
            yield record

    if isChain and flattenChains:
        seen.add(record.name)
        # The enum value tells us this is a ChainedCollectionRecord (and so
        # has ``children``), but MyPy cannot work that out.
        yield from record.children.iter(  # type: ignore
            manager,
            collectionTypes=collectionTypes,
            done=seen,
            flattenChains=flattenChains,
            includeChains=includeChains,
        )
class CollectionSearch(BaseModel, Sequence[str]):
    """An ordered search path of collections.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    collections : `tuple` [ `str` ]
        Tuple of collection names, ordered from the first searched to the last
        searched.

    Notes
    -----
    A `CollectionSearch` is used to find a single dataset (or set of datasets
    with different dataset types or data IDs) according to its dataset type and
    data ID, giving preference to collections in the order in which they are
    specified.  A `CollectionQuery` can be constructed from a broader range of
    expressions but does not order the collections to be searched.

    `CollectionSearch` is an immutable sequence of `str` collection names.

    A `CollectionSearch` instance constructed properly (e.g. via
    `fromExpression`) is a unique representation of a particular search path;
    it is exactly the same internally and compares as equal to any
    `CollectionSearch` constructed from an equivalent expression, regardless of
    how different the original expressions appear.
    """
    __root__: Tuple[str, ...]

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionSearch:
        """Process a general expression to construct a `CollectionSearch`
        instance.

        Parameters
        ----------
        expression
            May be:
             - a `str` collection name;
             - an iterable of `str` collection names;
             - another `CollectionSearch` instance (passed through
               unchanged).

            Duplicate entries will be removed (preserving the first appearance
            of each collection name).

        Returns
        -------
        collections : `CollectionSearch`
            A `CollectionSearch` instance.
        """
        # First see if this is already a CollectionSearch; just pass that
        # through unchanged.  This lets us standardize expressions (and turn
        # single-pass iterators into multi-pass iterables) in advance and pass
        # them down to other routines that accept arbitrary expressions.
        if isinstance(expression, cls):
            return expression
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=False,
            allowPatterns=False,
        )
        assert wildcard is not Ellipsis
        assert not wildcard.patterns
        assert not wildcard.items
        # dict keys keep insertion order, so this drops repeats while
        # preserving the first appearance of each name, in linear time.
        return cls(__root__=tuple(dict.fromkeys(wildcard.strings)))

    def iter(
        self, manager: CollectionManager, *,
        datasetType: Optional[DatasetType] = None,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        done: Optional[Set[str]] = None,
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.findDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        datasetType : `DatasetType`, optional
            Accepted for interface compatibility; not used by this method.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        done : `set`, optional
            A `set` containing the names of all collections already yielded;
            any collections whose names are already present in this set will
            not be yielded again, and those yielded will be added to it while
            iterating.  If not provided, an empty `set` will be created and
            used internally to avoid duplicates.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, yield records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        if done is None:
            done = set()
        for name in self:
            if name not in done:
                yield from _yieldCollectionRecords(
                    manager,
                    manager.find(name),
                    collectionTypes=collectionTypes,
                    done=done,
                    flattenChains=flattenChains,
                    includeChains=includeChains,
                )

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly.
        """
        yield from self.__root__

    def __iter__(self) -> Iterator[str]:  # type: ignore
        yield from self.__root__

    def __len__(self) -> int:
        return len(self.__root__)

    def __getitem__(self, index: Any) -> str:
        return self.__root__[index]

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionSearch):
            return self.__root__ == other.__root__
        return False

    def __str__(self) -> str:
        return "[{}]".format(", ".join(self))

    def __repr__(self) -> str:
        return f"CollectionSearch({self.__root__!r})"
class CollectionQuery:
    """An unordered query for collections and dataset type restrictions.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    search : `CollectionSearch` or `...`
        An object representing an ordered search for explicitly-named
        collections (to be interpreted here as unordered), or the special
        value `...` indicating all collections.  When this is `...`,
        ``patterns`` is not consulted by `iter` and should be left empty.
    patterns : `tuple` of `re.Pattern`
        Regular expression patterns to match against collection names.

    Notes
    -----
    A `CollectionQuery` is used to find all matching datasets in any number
    of collections, or to find collections themselves.

    `CollectionQuery` is expected to be rarely used outside of `Registry`
    (which uses it to back several of its "query" methods that take general
    expressions for collections), but it may occasionally be useful outside
    `Registry` as a way to preprocess expressions that contain single-pass
    iterators into a form that can be used to call those `Registry` methods
    multiple times.
    """
    def __init__(
        self,
        search: Union[CollectionSearch, EllipsisType] = Ellipsis,
        patterns: Tuple[re.Pattern, ...] = (),
    ):
        self._search = search
        self._patterns = patterns

    __slots__ = ("_search", "_patterns")

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionQuery:
        """Process a general expression to construct a `CollectionQuery`
        instance.

        Parameters
        ----------
        expression
            May be:
             - a `str` collection name;
             - an `re.Pattern` instance to match (with `re.Pattern.fullmatch`)
               against collection names;
             - any iterable containing any of the above;
             - the special value `...`, matching all collections;
             - a `CollectionSearch` instance;
             - another `CollectionQuery` instance (passed through unchanged).

            Duplicate collection names will be removed (preserving the first
            appearance of each collection name).

        Returns
        -------
        collections : `CollectionQuery`
            A `CollectionQuery` instance.
        """
        if isinstance(expression, cls):
            return expression
        if expression is Ellipsis:
            return cls()
        if isinstance(expression, CollectionSearch):
            return cls(search=expression, patterns=())
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=True,
            allowPatterns=True,
        )
        if wildcard is Ellipsis:
            return cls()
        assert not wildcard.items, \
            "We should no longer be transforming to (str, DatasetTypeRestriction) tuples."
        return cls(
            search=CollectionSearch.fromExpression(wildcard.strings),
            patterns=tuple(wildcard.patterns),
        )

    def iter(
        self, manager: CollectionManager, *,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in an arbitrary order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.queryDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, yield records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        # Names handled so far.  Shared by every branch so that a collection
        # reached both directly and as the child of a flattened chain is only
        # processed once.
        done: Set[str] = set()
        if self._search is Ellipsis:
            for record in manager:
                if record.name not in done:
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )
        else:
            # Explicitly named collections first, then anything else in the
            # manager whose name fully matches one of the patterns.
            yield from self._search.iter(
                manager,
                collectionTypes=collectionTypes,
                done=done,
                flattenChains=flattenChains,
                includeChains=includeChains,
            )
            for record in manager:
                if record.name not in done and any(p.fullmatch(record.name) for p in self._patterns):
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly.

        Nothing is yielded when this query matches all collections (``...``).
        """
        if isinstance(self._search, CollectionSearch):
            yield from self._search.explicitNames()

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionQuery):
            return self._search == other._search and self._patterns == other._patterns
        else:
            return False

    def __str__(self) -> str:
        if self._search is Ellipsis:
            return "..."
        else:
            terms = list(self._search)
            terms.extend(str(p) for p in self._patterns)
            return "[{}]".format(", ".join(terms))

    def __repr__(self) -> str:
        return f"CollectionQuery({self._search!r}, {self._patterns!r})"