Coverage for python/lsst/daf/butler/registry/wildcards.py : 14%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "CategorizedWildcard",
25 "CollectionQuery",
26 "CollectionSearch",
27)
29from pydantic import BaseModel
30from dataclasses import dataclass
31import re
32from typing import (
33 AbstractSet,
34 Any,
35 Callable,
36 Iterator,
37 List,
38 Optional,
39 Sequence,
40 Set,
41 Tuple,
42 TYPE_CHECKING,
43 Union,
44)
46import sqlalchemy
48from ..core import DatasetType
49from ..core.utils import iterable, globToRegex
50from ._collectionType import CollectionType
if TYPE_CHECKING:
    from .interfaces import CollectionManager, CollectionRecord

    # Workaround for `...` not having an exposed type in Python, borrowed from
    # https://github.com/python/typing/issues/684#issuecomment-548203158
    # Along with that, we need to use `Ellipsis` instead of `...` for the
    # actual sentinel value internally, and tell MyPy to ignore conversions
    # from `...` to `Ellipsis` at the public-interface boundary.
    #
    # `Ellipsis` and `EllipsisType` should be directly imported from this
    # module by related code that needs them; hopefully that will stay confined
    # to `lsst.daf.butler.registry`.  Putting these in __all__ is bad for
    # Sphinx, and probably more confusing than helpful overall.
    from enum import Enum

    # Static-analysis-only stand-in: a single-member enum gives the type
    # checker a nameable type for the sentinel.
    class EllipsisType(Enum):
        Ellipsis = "..."

    Ellipsis = EllipsisType.Ellipsis

else:
    # At runtime the names simply alias the built-in ellipsis object and its
    # type, so `x is Ellipsis` and `x is ...` are equivalent checks.
    EllipsisType = type(Ellipsis)
    Ellipsis = Ellipsis
@dataclass
class CategorizedWildcard:
    """The results of preprocessing a wildcard expression to separate match
    patterns from strings.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).
    """

    @classmethod
    def fromExpression(cls, expression: Any, *,
                       allowAny: bool = True,
                       allowPatterns: bool = True,
                       coerceUnrecognized: Optional[Callable[[Any], Union[Tuple[str, Any], str]]] = None,
                       coerceItemValue: Optional[Callable[[Any], Any]] = None,
                       defaultItemValue: Optional[Any] = None,
                       ) -> Union[CategorizedWildcard, EllipsisType]:
        """Categorize a wildcard expression.

        Parameters
        ----------
        expression
            The expression to categorize.  May be any of:

            - `str` (including glob patterns if ``allowPatterns`` is `True`);
            - `re.Pattern` (only if ``allowPatterns`` is `True`);
            - objects recognized by ``coerceUnrecognized`` (if provided);
            - two-element tuples of (`str`, value) where value is recognized
              by ``coerceItemValue`` (if provided);
            - a non-`str`, non-mapping iterable containing any of the above;
            - the special value `...` (only if ``allowAny`` is `True`), which
              matches anything;
            - a mapping from `str` to values that are recognized by
              ``coerceItemValue`` (if provided);
            - a `CategorizedWildcard` instance (passed through unchanged if
              it meets the requirements specified by keyword arguments).
        allowAny: `bool`, optional
            If `False` (`True` is default) raise `TypeError` if `...` is
            encountered.
        allowPatterns: `bool`, optional
            If `False` (`True` is default) raise `TypeError` if a `re.Pattern`
            is encountered, or if ``expression`` is a `CategorizedWildcard`
            with `patterns` not empty.
        coerceUnrecognized: `Callable`, optional
            A callback that takes a single argument of arbitrary type and
            returns either a `str` - appended to `strings` - or a `tuple` of
            (`str`, `Any`) to be appended to `items`.  This will be called on
            objects of unrecognized type.  Exceptions will be reraised as
            `TypeError` (and chained).
        coerceItemValue: `Callable`, optional
            If provided, ``expression`` may be a mapping from `str` to any
            type that can be passed to this function; the result of that call
            will be stored instead as the value in ``self.items``.
        defaultItemValue: `Any`, optional
            If provided, combine this value with any string values encountered
            (including any returned by ``coerceUnrecognized``) to form a
            `tuple` and add it to `items`, guaranteeing that `strings` will be
            empty.  Patterns are never added to `items`.

        Returns
        -------
        categorized : `CategorizedWildcard` or ``...``.
            The struct describing the wildcard.  ``...`` is passed through
            unchanged.

        Raises
        ------
        TypeError
            Raised if an unsupported type is found in the expression, or if
            ``expression`` is `None`.
        """
        # Explicit check rather than ``assert`` so the validation survives
        # running under ``python -O``.
        if expression is None:
            raise TypeError("Wildcard expression may not be None.")
        # See if we were given ...; just return that if we were.
        if expression is Ellipsis:
            if not allowAny:
                raise TypeError("This expression may not be unconstrained.")
            return Ellipsis
        if isinstance(expression, cls):
            # This is already a CategorizedWildcard.  Make sure it meets the
            # reqs. implied by the kwargs we got.
            if not allowPatterns and expression.patterns:
                raise TypeError(f"Regular expression(s) {expression.patterns} "
                                f"are not allowed in this context.")
            if defaultItemValue is not None and expression.strings:
                if expression.items:
                    raise TypeError("Incompatible preprocessed expression: an ordered sequence of str is "
                                    "needed, but the original order was lost in the preprocessing.")
                return cls(strings=[], patterns=expression.patterns,
                           items=[(k, defaultItemValue) for k in expression.strings])
            elif defaultItemValue is None and expression.items:
                if expression.strings:
                    raise TypeError("Incompatible preprocessed expression: an ordered sequence of items is "
                                    "needed, but the original order was lost in the preprocessing.")
                return cls(strings=[k for k, _ in expression.items], patterns=expression.patterns, items=[])
            else:
                # Original expression was created with keyword arguments that
                # were at least as restrictive as what we just got; pass it
                # through.
                return expression

        # If we get here, we know we'll be creating a new instance.
        # Initialize an empty one now.
        self = cls(strings=[], patterns=[], items=[])

        # If mappings are allowed, see if we were given a single mapping by
        # trying to get items.
        if coerceItemValue is not None:
            rawItems = None
            try:
                rawItems = expression.items()
            except AttributeError:
                pass
            if rawItems is not None:
                for k, v in rawItems:
                    try:
                        self.items.append((k, coerceItemValue(v)))
                    except Exception as err:
                        raise TypeError(f"Could not coerce mapping value '{v}' for key '{k}'.") from err
                return self

        # Not ..., a CategorizedWildcard instance, or a mapping.  Just
        # process scalars or an iterable.  We put the body of the loop inside
        # a local function so we can recurse after coercion.

        def process(element: Any, alreadyCoerced: bool = False) -> Union[EllipsisType, None]:
            if isinstance(element, str):
                if defaultItemValue is not None:
                    self.items.append((element, defaultItemValue))
                    return None
                else:
                    # This returns a list but we know we only passed in
                    # single value.
                    converted = globToRegex(element)
                    if converted is Ellipsis:
                        return Ellipsis
                    element = converted[0]
                    # Let regex and ... go through to the next check
                    if isinstance(element, str):
                        self.strings.append(element)
                        return None
            if allowPatterns and isinstance(element, re.Pattern):
                self.patterns.append(element)
                return None
            if coerceItemValue is not None:
                try:
                    k, v = element
                except (TypeError, ValueError):
                    # Not iterable (TypeError) or not exactly two elements
                    # (ValueError): not an item tuple, so fall through to the
                    # remaining checks, which report problems as TypeError.
                    pass
                else:
                    if not alreadyCoerced:
                        if not isinstance(k, str):
                            raise TypeError(f"Item key '{k}' is not a string.")
                        try:
                            v = coerceItemValue(v)
                        except Exception as err:
                            raise TypeError(f"Could not coerce tuple item value '{v}' for key '{k}'."
                                            ) from err
                    self.items.append((k, v))
                    return None
            if alreadyCoerced:
                raise TypeError(f"Object '{element!r}' returned by coercion function is still unrecognized.")
            if coerceUnrecognized is not None:
                try:
                    # Propagate the result so that a coerced value that turns
                    # out to be a match-everything glob is reported as `...`
                    # instead of being silently dropped.
                    return process(coerceUnrecognized(element), alreadyCoerced=True)
                except Exception as err:
                    raise TypeError(f"Could not coerce expression element '{element!r}'.") from err
            else:
                extra = "."
                if isinstance(element, re.Pattern):
                    extra = " and patterns are not allowed."
                raise TypeError(f"Unsupported object in wildcard expression: '{element!r}'{extra}")

        for element in iterable(expression):
            retval = process(element)
            if retval is Ellipsis:
                # One of the globs matched everything
                if not allowAny:
                    raise TypeError("This expression may not be unconstrained.")
                return Ellipsis
        return self

    def makeWhereExpression(self, column: sqlalchemy.sql.ColumnElement
                            ) -> Optional[sqlalchemy.sql.ColumnElement]:
        """Transform the wildcard into a SQLAlchemy boolean expression suitable
        for use in a WHERE clause.

        Parameters
        ----------
        column : `sqlalchemy.sql.ColumnElement`
            A string column in a table or query that should be compared to the
            wildcard expression.

        Returns
        -------
        where : `sqlalchemy.sql.ColumnElement` or `None`
            A boolean SQL expression that evaluates to true if and only if
            the value of ``column`` matches the wildcard.  `None` is returned
            if both `strings` and `patterns` are empty, and hence no match is
            possible.

        Raises
        ------
        NotImplementedError
            Raised if `items` or `patterns` is not empty.
        """
        if self.items:
            raise NotImplementedError("Expressions that are processed into items cannot be transformed "
                                      "automatically into queries.")
        if self.patterns:
            raise NotImplementedError("Regular expression patterns are not yet supported here.")
        terms = []
        if len(self.strings) == 1:
            terms.append(column == self.strings[0])
        elif len(self.strings) > 1:
            terms.append(column.in_(self.strings))
        # TODO: append terms for regular expressions
        if not terms:
            return None
        return sqlalchemy.sql.or_(*terms)

    strings: List[str]
    """Explicit string values found in the wildcard (`list` [ `str` ]).
    """

    patterns: List[re.Pattern]
    """Regular expression patterns found in the wildcard
    (`list` [ `re.Pattern` ]).
    """

    items: List[Tuple[str, Any]]
    """Two-item tuples that relate string values to other objects
    (`list` [ `tuple` [ `str`, `Any` ] ]).
    """
def _yieldCollectionRecords(
    manager: CollectionManager,
    record: CollectionRecord,
    collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
    done: Optional[Set[str]] = None,
    flattenChains: bool = True,
    includeChains: Optional[bool] = None,
) -> Iterator[CollectionRecord]:
    """A helper function containing common logic for `CollectionSearch.iter`
    and `CollectionQuery.iter`: recursively yield `CollectionRecord` only if
    they match the criteria given in other arguments.

    Parameters
    ----------
    manager : `CollectionManager`
        Object responsible for managing the collection tables in a `Registry`.
    record : `CollectionRecord`
        Record to conditionally yield.
    collectionTypes : `AbstractSet` [ `CollectionType` ], optional
        If provided, only yield collections of these types.
    done : `set` [ `str` ], optional
        A `set` of already-yielded collection names; if provided, ``record``
        will only be yielded if it is not already in ``done``, and ``done``
        will be updated to include it on return.
    flattenChains : `bool`, optional
        If `True` (default) recursively yield the child collections of
        `~CollectionType.CHAINED` collections.
    includeChains : `bool`, optional
        If `True`, return records for `~CollectionType.CHAINED` collections
        themselves.  The default is the opposite of ``flattenChains``: either
        return records for CHAINED collections or their children, but not both.

    Yields
    ------
    record : `CollectionRecord`
        Matching collection records.
    """
    if done is None:
        done = set()
    elif record.name in done:
        # Honor the documented contract: a record that has already been
        # handled is never yielded (or expanded) a second time.
        return
    includeChains = includeChains if includeChains is not None else not flattenChains
    isChain = record.type is CollectionType.CHAINED
    if record.type in collectionTypes:
        done.add(record.name)
        if not isChain or includeChains:
            yield record
    if flattenChains and isChain:
        # Mark the chain itself as handled even when its type was filtered
        # out above, so it is not expanded again later.
        done.add(record.name)
        # We know this is a ChainedCollectionRecord because of the enum value,
        # but MyPy doesn't.
        yield from record.children.iter(  # type: ignore
            manager,
            collectionTypes=collectionTypes,
            done=done,
            flattenChains=flattenChains,
            includeChains=includeChains,
        )
class CollectionSearch(BaseModel, Sequence[str]):
    """An ordered search path of collections.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    collections : `tuple` [ `str` ]
        Tuple of collection names, ordered from the first searched to the last
        searched.

    Notes
    -----
    A `CollectionSearch` is used to find a single dataset (or set of datasets
    with different dataset types or data IDs) according to its dataset type and
    data ID, giving preference to collections in the order in which they are
    specified.  A `CollectionQuery` can be constructed from a broader range of
    expressions but does not order the collections to be searched.

    `CollectionSearch` is an immutable sequence of `str` collection names.

    A `CollectionSearch` instance constructed properly (e.g. via
    `fromExpression`) is a unique representation of a particular search path;
    it is exactly the same internally and compares as equal to any
    `CollectionSearch` constructed from an equivalent expression, regardless of
    how different the original expressions appear.
    """
    __root__: Tuple[str, ...]

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionSearch:
        """Process a general expression to construct a `CollectionSearch`
        instance.

        Parameters
        ----------
        expression
            May be:

            - a `str` collection name;
            - an iterable of `str` collection names;
            - another `CollectionSearch` instance (passed through
              unchanged).

            Duplicate entries will be removed (preserving the first appearance
            of each collection name).

        Returns
        -------
        collections : `CollectionSearch`
            A `CollectionSearch` instance.
        """
        # First see if this is already a CollectionSearch; just pass that
        # through unchanged.  This lets us standardize expressions (and turn
        # single-pass iterators into multi-pass iterables) in advance and pass
        # them down to other routines that accept arbitrary expressions.
        if isinstance(expression, cls):
            return expression
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=False,
            allowPatterns=False,
        )
        # Internal invariants implied by the keyword arguments above (these
        # also narrow the type for MyPy).
        assert wildcard is not Ellipsis
        assert not wildcard.patterns
        assert not wildcard.items
        # dict preserves insertion order, so this drops repeated names while
        # keeping the first appearance of each, in linear time.
        deduplicated = tuple(dict.fromkeys(wildcard.strings))
        return cls(__root__=deduplicated)

    def iter(
        self, manager: CollectionManager, *,
        datasetType: Optional[DatasetType] = None,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        done: Optional[Set[str]] = None,
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.findDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        datasetType : `DatasetType`, optional
            Accepted as part of the signature but not used by this
            implementation.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        done : `set`, optional
            A `set` containing the names of all collections already yielded;
            any collections whose names are already present in this set will
            not be yielded again, and those yielded will be added to it while
            iterating.  If not provided, an empty `set` will be created and
            used internally to avoid duplicates.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        if done is None:
            done = set()
        for name in self:
            if name not in done:
                yield from _yieldCollectionRecords(
                    manager,
                    manager.find(name),
                    collectionTypes=collectionTypes,
                    done=done,
                    flattenChains=flattenChains,
                    includeChains=includeChains,
                )

    def __iter__(self) -> Iterator[str]:  # type: ignore
        yield from self.__root__

    def __len__(self) -> int:
        return len(self.__root__)

    def __getitem__(self, index: Any) -> str:
        return self.__root__[index]

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionSearch):
            return self.__root__ == other.__root__
        return False

    def __str__(self) -> str:
        return "[{}]".format(", ".join(self))

    def __repr__(self) -> str:
        return f"CollectionSearch({self.__root__!r})"
class CollectionQuery:
    """An unordered query for collections and dataset type restrictions.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    search : `CollectionSearch` or `...`
        An object representing an ordered search for explicitly-named
        collections (to be interpreted here as unordered), or the special
        value `...` indicating all collections.  When ``search`` is `...`,
        ``patterns`` is ignored by `iter`.
    patterns : `tuple` of `re.Pattern`
        Regular expression patterns to match against collection names.

    Notes
    -----
    A `CollectionQuery` is used to find all matching datasets in any number
    of collections, or to find collections themselves.

    `CollectionQuery` is expected to be rarely used outside of `Registry`
    (which uses it to back several of its "query" methods that take general
    expressions for collections), but it may occasionally be useful outside
    `Registry` as a way to preprocess expressions that contain single-pass
    iterators into a form that can be used to call those `Registry` methods
    multiple times.
    """
    def __init__(
        self,
        search: Union[CollectionSearch, EllipsisType] = Ellipsis,
        patterns: Tuple[re.Pattern, ...] = (),
    ):
        self._search = search
        self._patterns = patterns

    __slots__ = ("_search", "_patterns")

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionQuery:
        """Process a general expression to construct a `CollectionQuery`
        instance.

        Parameters
        ----------
        expression
            May be:

            - a `str` collection name;
            - an `re.Pattern` instance to match (with `re.Pattern.fullmatch`)
              against collection names;
            - any iterable containing any of the above;
            - the special value `...`, matching all collections;
            - a `CollectionSearch` instance;
            - another `CollectionQuery` instance (passed through unchanged).

            Duplicate collection names will be removed (preserving the first
            appearance of each collection name).

        Returns
        -------
        collections : `CollectionQuery`
            A `CollectionQuery` instance.
        """
        if isinstance(expression, cls):
            return expression
        if expression is Ellipsis:
            return cls()
        if isinstance(expression, CollectionSearch):
            return cls(search=expression, patterns=())
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=True,
            allowPatterns=True,
        )
        if wildcard is Ellipsis:
            return cls()
        assert not wildcard.items, \
            "We should no longer be transforming to (str, DatasetTypeRestriction) tuples."
        return cls(
            search=CollectionSearch.fromExpression(wildcard.strings),
            patterns=tuple(wildcard.patterns),
        )

    def iter(
        self, manager: CollectionManager, *,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in an arbitrary order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.queryDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        # A single set shared by every branch below, so that a collection
        # reachable both directly and as the child of a CHAINED collection is
        # only yielded once.
        done: Set[str] = set()
        if self._search is Ellipsis:
            for record in manager:
                if record.name not in done:
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )
        else:
            # Explicitly named collections first, then anything else that
            # matches one of the patterns and has not been handled yet.
            yield from self._search.iter(
                manager,
                collectionTypes=collectionTypes,
                done=done,
                flattenChains=flattenChains,
                includeChains=includeChains,
            )
            for record in manager:
                if record.name not in done and any(p.fullmatch(record.name) for p in self._patterns):
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionQuery):
            return self._search == other._search and self._patterns == other._patterns
        else:
            return False