Coverage for python/lsst/daf/butler/registry/wildcards.py: 17%
212 statements
« prev ^ index » next coverage.py v6.4.2, created at 2022-08-04 02:20 -0700
« prev ^ index » next coverage.py v6.4.2, created at 2022-08-04 02:20 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "CategorizedWildcard",
25 "CollectionQuery",
26 "CollectionSearch",
27)
29import re
30from dataclasses import dataclass
31from typing import (
32 TYPE_CHECKING,
33 AbstractSet,
34 Any,
35 Callable,
36 Iterator,
37 List,
38 Optional,
39 Sequence,
40 Set,
41 Tuple,
42 Union,
43)
45import sqlalchemy
46from lsst.utils.iteration import ensure_iterable
47from pydantic import BaseModel
49from ..core import DatasetType
50from ..core.utils import globToRegex
51from ._collectionType import CollectionType
if TYPE_CHECKING:
    # Workaround for `...` not having an exposed type in Python, borrowed from
    # https://github.com/python/typing/issues/684#issuecomment-548203158
    # Along with that, we need to use `Ellipsis` instead of `...` for the
    # actual sentinel value internally, and tell MyPy to ignore conversions
    # from `...` to `Ellipsis` at the public-interface boundary.
    #
    # `Ellipsis` and `EllipsisType` should be directly imported from this
    # module by related code that needs them; hopefully that will stay confined
    # to `lsst.daf.butler.registry`. Putting these in __all__ is bad for
    # Sphinx, and probably more confusing than helpful overall.
    from enum import Enum

    from .interfaces import CollectionManager, CollectionRecord

    # Seen by static type checkers only: a single-member enum stands in for
    # the type of `...`, and its sole member stands in for the sentinel.
    class EllipsisType(Enum):
        Ellipsis = "..."

    Ellipsis = EllipsisType.Ellipsis

else:
    # At runtime, use the real builtin singleton and its type.  The seemingly
    # redundant assignment binds the builtin as a module-level name so that it
    # can be imported from this module.
    EllipsisType = type(Ellipsis)
    Ellipsis = Ellipsis
@dataclass
class CategorizedWildcard:
    """The results of preprocessing a wildcard expression to separate match
    patterns from strings.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).
    """

    @classmethod
    def fromExpression(
        cls,
        expression: Any,
        *,
        allowAny: bool = True,
        allowPatterns: bool = True,
        coerceUnrecognized: Optional[Callable[[Any], Union[Tuple[str, Any], str]]] = None,
        coerceItemValue: Optional[Callable[[Any], Any]] = None,
        defaultItemValue: Optional[Any] = None,
    ) -> Union[CategorizedWildcard, EllipsisType]:
        """Categorize a wildcard expression.

        Parameters
        ----------
        expression
            The expression to categorize.  Must not be `None`.  May be any of:

             - `str` (including glob patterns if ``allowPatterns`` is `True`);
             - `re.Pattern` (only if ``allowPatterns`` is `True`);
             - objects recognized by ``coerceUnrecognized`` (if provided);
             - two-element tuples of (`str`, value) where value is recognized
               by ``coerceItemValue`` (if provided);
             - a non-`str`, non-mapping iterable containing any of the above;
             - the special value `...` (only if ``allowAny`` is `True`), which
               matches anything;
             - a mapping from `str` to values recognized by
               ``coerceItemValue`` (only if ``coerceItemValue`` is provided);
             - a `CategorizedWildcard` instance (passed through unchanged if
               it meets the requirements specified by keyword arguments).
        allowAny : `bool`, optional
            If `False` (`True` is default) raise `TypeError` if `...` is
            encountered, either directly or as the result of expanding a
            glob string.
        allowPatterns : `bool`, optional
            If `False` (`True` is default) raise `TypeError` if a `re.Pattern`
            is encountered, or if ``expression`` is a `CategorizedWildcard`
            with `patterns` not empty.
        coerceUnrecognized : `Callable`, optional
            A callback that takes a single argument of arbitrary type and
            returns either a `str` or a `tuple` of (`str`, `Any`).  It is
            called on objects of unrecognized type, and its result is then
            categorized exactly like any other element (a tuple result is
            only accepted when ``coerceItemValue`` is also provided, but its
            value is not passed through ``coerceItemValue`` again).
            Exceptions will be reraised as `TypeError` (and chained).
        coerceItemValue : `Callable`, optional
            If provided, ``expression`` may be a mapping from `str` to any
            type that can be passed to this function; the result of that call
            will be stored instead as the value in ``self.items``.
        defaultItemValue : `Any`, optional
            If provided, combine this value with any string values encountered
            (including any returned by ``coerceUnrecognized``) to form a
            `tuple` and add it to `items`, guaranteeing that `strings` will be
            empty.  Patterns are never added to `items`.

        Returns
        -------
        categorized : `CategorizedWildcard` or ``...``.
            The struct describing the wildcard.  ``...`` is passed through
            unchanged.

        Raises
        ------
        TypeError
            Raised if an unsupported type is found in the expression, if a
            coercion callback fails, or if the expression is unconstrained
            and ``allowAny`` is `False`.
        """
        assert expression is not None
        # See if we were given ...; just return that if we were.
        if expression is Ellipsis:
            if not allowAny:
                raise TypeError("This expression may not be unconstrained.")
            return Ellipsis
        if isinstance(expression, cls):
            # This is already a CategorizedWildcard.  Make sure it meets the
            # reqs. implied by the kwargs we got.
            if not allowPatterns and expression.patterns:
                raise TypeError(
                    f"Regular expression(s) {expression.patterns} are not allowed in this context."
                )
            if defaultItemValue is not None and expression.strings:
                if expression.items:
                    raise TypeError(
                        "Incompatible preprocessed expression: an ordered sequence of str is "
                        "needed, but the original order was lost in the preprocessing."
                    )
                return cls(
                    strings=[],
                    patterns=expression.patterns,
                    items=[(k, defaultItemValue) for k in expression.strings],
                )
            elif defaultItemValue is None and expression.items:
                if expression.strings:
                    raise TypeError(
                        "Incompatible preprocessed expression: an ordered sequence of items is "
                        "needed, but the original order was lost in the preprocessing."
                    )
                return cls(strings=[k for k, _ in expression.items], patterns=expression.patterns, items=[])
            else:
                # Original expression was created with keyword arguments that
                # were at least as restrictive as what we just got; pass it
                # through.
                return expression

        # If we get here, we know we'll be creating a new instance.
        # Initialize an empty one now.
        self = cls(strings=[], patterns=[], items=[])

        # If mappings are allowed, see if we were given a single mapping by
        # trying to get items.
        if coerceItemValue is not None:
            rawItems = None
            try:
                rawItems = expression.items()
            except AttributeError:
                # Not a mapping; fall through to scalar/iterable handling.
                pass
            if rawItems is not None:
                for k, v in rawItems:
                    try:
                        self.items.append((k, coerceItemValue(v)))
                    except Exception as err:
                        raise TypeError(f"Could not coerce mapping value '{v}' for key '{k}'.") from err
                return self

        # Not ..., a CategorizedWildcard instance, or a mapping.  Just
        # process scalars or an iterable.  We put the body of the loop inside
        # a local function so we can recurse after coercion.

        def process(element: Any, alreadyCoerced: bool = False) -> Union[EllipsisType, None]:
            # Returns Ellipsis if the element matches everything, and None if
            # the element was recorded in ``self`` instead.
            if isinstance(element, str):
                if defaultItemValue is not None:
                    self.items.append((element, defaultItemValue))
                    return None
                else:
                    # This returns a list but we know we only passed in
                    # single value.
                    converted = globToRegex(element)
                    if converted is Ellipsis:
                        return Ellipsis
                    element = converted[0]
                    # Let regex and ... go through to the next check
                    if isinstance(element, str):
                        self.strings.append(element)
                        return None
            if allowPatterns and isinstance(element, re.Pattern):
                self.patterns.append(element)
                return None
            if coerceItemValue is not None:
                try:
                    k, v = element
                except TypeError:
                    # Not a two-element iterable; try the remaining options.
                    pass
                else:
                    if not alreadyCoerced:
                        if not isinstance(k, str):
                            raise TypeError(f"Item key '{k}' is not a string.")
                        try:
                            v = coerceItemValue(v)
                        except Exception as err:
                            raise TypeError(
                                f"Could not coerce tuple item value '{v}' for key '{k}'."
                            ) from err
                    self.items.append((k, v))
                    return None
            if alreadyCoerced:
                raise TypeError(f"Object '{element!r}' returned by coercion function is still unrecognized.")
            if coerceUnrecognized is not None:
                try:
                    # Propagate the result of the recursive call so that a
                    # coerced value that matches everything is reported to the
                    # caller instead of being silently dropped.
                    #
                    # This should be safe but flake8 cant tell that the
                    # function will be re-declared next function call
                    return process(coerceUnrecognized(element), alreadyCoerced=True)  # noqa: F821
                except Exception as err:
                    raise TypeError(f"Could not coerce expression element '{element!r}'.") from err
            else:
                extra = "."
                if isinstance(element, re.Pattern):
                    extra = " and patterns are not allowed."
                raise TypeError(f"Unsupported object in wildcard expression: '{element!r}'{extra}")
            return None

        for element in ensure_iterable(expression):
            retval = process(element)
            if retval is Ellipsis:
                # One of the globs matched everything
                if not allowAny:
                    raise TypeError("This expression may not be unconstrained.")
                return Ellipsis
        del process
        return self

    def makeWhereExpression(
        self, column: sqlalchemy.sql.ColumnElement
    ) -> Optional[sqlalchemy.sql.ColumnElement]:
        """Transform the wildcard into a SQLAlchemy boolean expression suitable
        for use in a WHERE clause.

        Parameters
        ----------
        column : `sqlalchemy.sql.ColumnElement`
            A string column in a table or query that should be compared to the
            wildcard expression.

        Returns
        -------
        where : `sqlalchemy.sql.ColumnElement` or `None`
            A boolean SQL expression that evaluates to true if and only if
            the value of ``column`` matches the wildcard.  `None` is returned
            if both `strings` and `patterns` are empty, and hence no match is
            possible.

        Raises
        ------
        NotImplementedError
            Raised if `items` or `patterns` is not empty; neither can be
            translated into a query by this method.
        """
        if self.items:
            raise NotImplementedError(
                "Expressions that are processed into items cannot be transformed "
                "automatically into queries."
            )
        if self.patterns:
            raise NotImplementedError("Regular expression patterns are not yet supported here.")
        terms = []
        if len(self.strings) == 1:
            terms.append(column == self.strings[0])
        elif len(self.strings) > 1:
            terms.append(column.in_(self.strings))
        # TODO: append terms for regular expressions
        if not terms:
            return None
        return sqlalchemy.sql.or_(*terms)

    strings: List[str]
    """Explicit string values found in the wildcard (`list` [ `str` ]).
    """

    patterns: List[re.Pattern]
    """Regular expression patterns found in the wildcard
    (`list` [ `re.Pattern` ]).
    """

    items: List[Tuple[str, Any]]
    """Two-item tuples that relate string values to other objects
    (`list` [ `tuple` [ `str`, `Any` ] ]).
    """
def _yieldCollectionRecords(
    manager: CollectionManager,
    record: CollectionRecord,
    collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
    done: Optional[Set[str]] = None,
    flattenChains: bool = True,
    includeChains: Optional[bool] = None,
) -> Iterator[CollectionRecord]:
    """A helper function containing common logic for `CollectionSearch.iter`
    and `CollectionQuery.iter`: recursively yield `CollectionRecord` only if
    they match the criteria given in other arguments.

    Parameters
    ----------
    manager : `CollectionManager`
        Object responsible for managing the collection tables in a `Registry`.
    record : `CollectionRecord`
        Record to conditionally yield.
    collectionTypes : `AbstractSet` [ `CollectionType` ], optional
        If provided, only yield collections of these types.
    done : `set` [ `str` ], optional
        A `set` of already-processed collection names, updated in place to
        include ``record`` when its type matches ``collectionTypes`` or when
        it is a `~CollectionType.CHAINED` collection that is flattened.  This
        function does not itself check whether ``record`` is already in
        ``done``; callers are expected to do that before calling.  The set is
        also passed on when iterating over the children of a chain.  If not
        provided, a new empty `set` is used.
    flattenChains : `bool`, optional
        If `True` (default) recursively yield the child collections of
        `~CollectionType.CHAINED` collections.
    includeChains : `bool`, optional
        If `True`, return records for `~CollectionType.CHAINED` collections
        themselves.  The default is the opposite of ``flattenChains``: either
        return records for CHAINED collections or their children, but not both.

    Yields
    ------
    record : `CollectionRecord`
        Matching collection records.
    """
    if done is None:
        done = set()
    if includeChains is None:
        includeChains = not flattenChains
    isChain = record.type is CollectionType.CHAINED
    if record.type in collectionTypes:
        done.add(record.name)
        if includeChains or not isChain:
            yield record
    if flattenChains and isChain:
        # Mark the chain itself as handled even if its own type was filtered
        # out above, so it is not expanded again via another chain.
        done.add(record.name)
        # We know this is a ChainedCollectionRecord because of the enum value,
        # but MyPy doesn't.
        yield from record.children.iter(  # type: ignore
            manager,
            collectionTypes=collectionTypes,
            done=done,
            flattenChains=flattenChains,
            includeChains=includeChains,
        )
class CollectionSearch(BaseModel, Sequence[str]):
    """An ordered search path of collections.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    collections : `tuple` [ `str` ]
        Tuple of collection names, ordered from the first searched to the last
        searched.

    Notes
    -----
    A `CollectionSearch` is used to find a single dataset (or set of datasets
    with different dataset types or data IDs) according to its dataset type and
    data ID, giving preference to collections in the order in which they are
    specified.  A `CollectionQuery` can be constructed from a broader range of
    expressions but does not order the collections to be searched.

    `CollectionSearch` is an immutable sequence of `str` collection names.

    A `CollectionSearch` instance constructed properly (e.g. via
    `fromExpression`) is a unique representation of a particular search path;
    it is exactly the same internally and compares as equal to any
    `CollectionSearch` constructed from an equivalent expression, regardless of
    how different the original expressions appear.
    """

    __root__: Tuple[str, ...]

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionSearch:
        """Process a general expression to construct a `CollectionSearch`
        instance.

        Parameters
        ----------
        expression
            May be:

             - a `str` collection name;
             - an iterable of `str` collection names;
             - another `CollectionSearch` instance (passed through
               unchanged).

            Duplicate entries will be removed (preserving the first appearance
            of each collection name).

        Returns
        -------
        collections : `CollectionSearch`
            A `CollectionSearch` instance.

        Raises
        ------
        TypeError
            Raised by `CategorizedWildcard.fromExpression` if the expression
            is unconstrained (``...``) or contains patterns or unsupported
            objects.
        """
        # First see if this is already a CollectionSearch; just pass that
        # through unchanged.  This lets us standardize expressions (and turn
        # single-pass iterators into multi-pass iterables) in advance and pass
        # them down to other routines that accept arbitrary expressions.
        if isinstance(expression, cls):
            return expression
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=False,
            allowPatterns=False,
        )
        assert wildcard is not Ellipsis
        assert not wildcard.patterns
        assert not wildcard.items
        # dict preserves insertion order, so this drops later duplicates while
        # keeping the first appearance of each name, in linear time.
        return cls(__root__=tuple(dict.fromkeys(wildcard.strings)))

    def iter(
        self,
        manager: CollectionManager,
        *,
        datasetType: Optional[DatasetType] = None,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        done: Optional[Set[str]] = None,
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.findDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        datasetType : `DatasetType`, optional
            Accepted for interface compatibility; not used by this method.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        done : `set`, optional
            A `set` containing the names of all collections already yielded;
            any collections whose names are already present in this set will
            not be yielded again, and those yielded will be added to it while
            iterating.  If not provided, an empty `set` will be created and
            used internally to avoid duplicates.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        if done is None:
            done = set()
        for name in self:
            if name not in done:
                yield from _yieldCollectionRecords(
                    manager,
                    manager.find(name),
                    collectionTypes=collectionTypes,
                    done=done,
                    flattenChains=flattenChains,
                    includeChains=includeChains,
                )

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly."""
        yield from self.__root__

    def __iter__(self) -> Iterator[str]:  # type: ignore
        yield from self.__root__

    def __len__(self) -> int:
        return len(self.__root__)

    def __getitem__(self, index: Any) -> str:
        return self.__root__[index]

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionSearch):
            return self.__root__ == other.__root__
        return False

    def __str__(self) -> str:
        return "[{}]".format(", ".join(self))

    def __repr__(self) -> str:
        return f"CollectionSearch({self.__root__!r})"
class CollectionQuery:
    """An unordered query for collections and dataset type restrictions.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    search : `CollectionSearch` or `...`
        An object representing an ordered search for explicitly-named
        collections (to be interpreted here as unordered), or the special
        value `...` indicating all collections.  When `...` is given,
        ``patterns`` is not consulted by `iter`.
    patterns : `tuple` of `re.Pattern`
        Regular expression patterns to match against collection names.

    Notes
    -----
    A `CollectionQuery` is used to find all matching datasets in any number
    of collections, or to find collections themselves.

    `CollectionQuery` is expected to be rarely used outside of `Registry`
    (which uses it to back several of its "query" methods that take general
    expressions for collections), but it may occasionally be useful outside
    `Registry` as a way to preprocess expressions that contain single-pass
    iterators into a form that can be used to call those `Registry` methods
    multiple times.
    """

    def __init__(
        self,
        search: Union[CollectionSearch, EllipsisType] = Ellipsis,
        patterns: Tuple[re.Pattern, ...] = (),
    ):
        self._search = search
        self._patterns = patterns

    __slots__ = ("_search", "_patterns")

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionQuery:
        """Process a general expression to construct a `CollectionQuery`
        instance.

        Parameters
        ----------
        expression
            May be:

             - a `str` collection name;
             - an `re.Pattern` instance to match (with `re.Pattern.fullmatch`)
               against collection names;
             - any iterable containing any of the above;
             - the special value `...`, matching all collections;
             - a `CollectionSearch` instance;
             - another `CollectionQuery` instance (passed through unchanged).

            Duplicate collection names will be removed (preserving the first
            appearance of each collection name).

        Returns
        -------
        collections : `CollectionQuery`
            A `CollectionQuery` instance.
        """
        if isinstance(expression, cls):
            return expression
        if expression is Ellipsis:
            return cls()
        if isinstance(expression, CollectionSearch):
            return cls(search=expression, patterns=())
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=True,
            allowPatterns=True,
        )
        if wildcard is Ellipsis:
            return cls()
        assert (
            not wildcard.items
        ), "We should no longer be transforming to (str, DatasetTypeRestriction) tuples."
        return cls(
            search=CollectionSearch.fromExpression(wildcard.strings),
            patterns=tuple(wildcard.patterns),
        )

    def iter(
        self,
        manager: CollectionManager,
        *,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in an arbitrary order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.queryDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        # A single set of already-handled names is shared by every branch, so
        # a collection reached both directly and as the child of a flattened
        # chain is only yielded once.
        done: Set[str] = set()
        if self._search is Ellipsis:
            for record in manager:
                if record.name not in done:
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )
        else:
            # Explicitly-named collections first, then anything else in the
            # manager whose name fully matches one of the patterns.
            yield from self._search.iter(
                manager,
                collectionTypes=collectionTypes,
                done=done,
                flattenChains=flattenChains,
                includeChains=includeChains,
            )
            for record in manager:
                if record.name not in done and any(p.fullmatch(record.name) for p in self._patterns):
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly."""
        if isinstance(self._search, CollectionSearch):
            yield from self._search.explicitNames()

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionQuery):
            return self._search == other._search and self._patterns == other._patterns
        else:
            return False

    def __str__(self) -> str:
        if self._search is Ellipsis:
            return "..."
        else:
            terms = list(self._search)
            terms.extend(str(p) for p in self._patterns)
            return "[{}]".format(", ".join(terms))

    def __repr__(self) -> str:
        return f"CollectionQuery({self._search!r}, {self._patterns!r})"