Coverage for python/lsst/daf/butler/registry/wildcards.py : 16%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "CategorizedWildcard",
25 "CollectionQuery",
26 "CollectionSearch",
27)
29from dataclasses import dataclass
30import re
31from typing import (
32 AbstractSet,
33 Any,
34 Callable,
35 Iterator,
36 List,
37 Optional,
38 Sequence,
39 Set,
40 Tuple,
41 TYPE_CHECKING,
42 Union,
43)
45import sqlalchemy
47from ..core import DatasetType
48from ..core.utils import iterable
49from ._collectionType import CollectionType
if TYPE_CHECKING:
    from .interfaces import CollectionManager, CollectionRecord

    # Workaround for `...` not having an exposed type in Python, borrowed from
    # https://github.com/python/typing/issues/684#issuecomment-548203158
    # Along with that, we need to use `Ellipsis` instead of `...` for the
    # actual sentinel value internally, and tell MyPy to ignore conversions
    # from `...` to `Ellipsis` at the public-interface boundary.
    #
    # `Ellipsis` and `EllipsisType` should be directly imported from this
    # module by related code that needs them; hopefully that will stay confined
    # to `lsst.daf.butler.registry`.  Putting these in __all__ is bad for
    # Sphinx, and probably more confusing than helpful overall.
    from enum import Enum

    # Single-member enum that stands in for the type of `...` during static
    # analysis only; this branch never executes at runtime.
    class EllipsisType(Enum):
        Ellipsis = "..."

    Ellipsis = EllipsisType.Ellipsis

else:
    # At runtime use the real builtin singleton and its type, re-bound as
    # module-level names so they can be imported from this module.
    EllipsisType = type(Ellipsis)
    Ellipsis = Ellipsis
@dataclass
class CategorizedWildcard:
    """The results of preprocessing a wildcard expression to separate match
    patterns from strings.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).
    """

    @classmethod
    def fromExpression(cls, expression: Any, *,
                       allowAny: bool = True,
                       allowPatterns: bool = True,
                       coerceUnrecognized: Optional[Callable[[Any], Union[Tuple[str, Any], str]]] = None,
                       coerceItemValue: Optional[Callable[[Any], Any]] = None,
                       defaultItemValue: Optional[Any] = None,
                       ) -> Union[CategorizedWildcard, EllipsisType]:
        """Categorize a wildcard expression.

        Parameters
        ----------
        expression
            The expression to categorize.  May be any of:
             - `str`;
             - `re.Pattern` (only if ``allowPatterns`` is `True`);
             - objects recognized by ``coerceUnrecognized`` (if provided);
             - two-element tuples of (`str`, value) where value is recognized
               by ``coerceItemValue`` (if provided);
             - a non-`str`, non-mapping iterable containing any of the above;
             - the special value `...` (only if ``allowAny`` is `True`), which
               matches anything;
             - a mapping from `str` to a value that is recognized by
               ``coerceItemValue`` (if provided);
             - a `CategorizedWildcard` instance (passed through unchanged if
               it meets the requirements specified by keyword arguments).
        allowAny: `bool`, optional
            If `False` (`True` is default) raise `TypeError` if `...` is
            encountered.
        allowPatterns: `bool`, optional
            If `False` (`True` is default) raise `TypeError` if a `re.Pattern`
            is encountered, or if ``expression`` is a `CategorizedWildcard`
            with `patterns` not empty.
        coerceUnrecognized: `Callable`, optional
            A callback that takes a single argument of arbitrary type and
            returns either a `str` (handled like any other string in the
            expression) or a `tuple` of (`str`, `Any`) to be appended to
            `items`.  It is called on objects of otherwise unrecognized type.
            A returned tuple is only recognized when ``coerceItemValue`` is
            also provided, and its value is stored as-is (it is not passed
            through ``coerceItemValue`` again).  Exceptions will be reraised
            as `TypeError` (and chained).
        coerceItemValue: `Callable`, optional
            If provided, ``expression`` may be a mapping from `str` to any
            type that can be passed to this function (or contain two-element
            (`str`, value) iterables); the result of that call will be stored
            instead as the value in ``self.items``.
        defaultItemValue: `Any`, optional
            If provided, combine this value with any string values encountered
            (including any returned by ``coerceUnrecognized``) to form a
            `tuple` and add it to `items`, guaranteeing that `strings` will be
            empty.  Patterns are never added to `items`.

        Returns
        -------
        categorized : `CategorizedWildcard` or ``...``.
            The struct describing the wildcard.  ``...`` is passed through
            unchanged.

        Raises
        ------
        TypeError
            Raised if an unsupported type is found in the expression.
        """
        assert expression is not None
        # See if we were given ...; just return that if we were.
        if expression is Ellipsis:
            if not allowAny:
                raise TypeError("This expression may not be unconstrained.")
            return Ellipsis
        if isinstance(expression, cls):
            # This is already a CategorizedWildcard.  Make sure it meets the
            # reqs. implied by the kwargs we got.
            if not allowPatterns and expression.patterns:
                raise TypeError(f"Regular expression(s) {expression.patterns} "
                                f"are not allowed in this context.")
            if defaultItemValue is not None and expression.strings:
                # Caller wants items only; strings can be promoted to items,
                # but only if there are no pre-existing items whose position
                # relative to the strings is unknown.
                if expression.items:
                    raise TypeError("Incompatible preprocessed expression: an ordered sequence of str is "
                                    "needed, but the original order was lost in the preprocessing.")
                return cls(strings=[], patterns=expression.patterns,
                           items=[(k, defaultItemValue) for k in expression.strings])
            elif defaultItemValue is None and expression.items:
                # Caller wants strings only; item keys can be demoted to
                # strings, subject to the same ordering caveat.
                if expression.strings:
                    raise TypeError("Incompatible preprocessed expression: an ordered sequence of items is "
                                    "needed, but the original order was lost in the preprocessing.")
                return cls(strings=[k for k, _ in expression.items], patterns=expression.patterns, items=[])
            else:
                # Original expression was created with keyword arguments that
                # were at least as restrictive as what we just got; pass it
                # through.
                return expression

        # If we get here, we know we'll be creating a new instance.
        # Initialize an empty one now.
        self = cls(strings=[], patterns=[], items=[])

        # If mappings are allowed, see if we were given a single mapping by
        # trying to get items.
        if coerceItemValue is not None:
            rawItems = None
            try:
                rawItems = expression.items()
            except AttributeError:
                pass
            if rawItems is not None:
                for k, v in rawItems:
                    try:
                        self.items.append((k, coerceItemValue(v)))
                    except Exception as err:
                        raise TypeError(f"Could not coerce mapping value '{v}' for key '{k}'.") from err
                return self

        # Not ..., a CategorizedWildcard instance, or a mapping.  Just
        # process scalars or an iterable.  We put the body of the loop inside
        # a local function so we can recurse after coercion.

        def process(element: Any, alreadyCoerced: bool = False) -> None:
            if isinstance(element, str):
                if defaultItemValue is not None:
                    self.items.append((element, defaultItemValue))
                else:
                    self.strings.append(element)
                return
            if allowPatterns and isinstance(element, re.Pattern):
                self.patterns.append(element)
                return
            if coerceItemValue is not None:
                try:
                    k, v = element
                except (TypeError, ValueError):
                    # TypeError: element is not iterable.  ValueError: it is
                    # iterable but does not have exactly two elements.  Either
                    # way it is not a (key, value) item, so fall through to
                    # the generic handling below instead of letting a
                    # ValueError escape the documented TypeError contract.
                    pass
                else:
                    if not alreadyCoerced:
                        if not isinstance(k, str):
                            raise TypeError(f"Item key '{k}' is not a string.")
                        try:
                            v = coerceItemValue(v)
                        except Exception as err:
                            raise TypeError(f"Could not coerce tuple item value '{v}' for key '{k}'."
                                            ) from err
                    self.items.append((k, v))
                    return
            if alreadyCoerced:
                raise TypeError(f"Object '{element}' returned by coercion function is still unrecognized.")
            if coerceUnrecognized is not None:
                try:
                    process(coerceUnrecognized(element), alreadyCoerced=True)
                except Exception as err:
                    raise TypeError(f"Could not coerce expression element '{element}'.") from err
            else:
                raise TypeError(f"Unsupported object in wildcard expression: '{element}'.")

        for element in iterable(expression):
            process(element)
        return self

    def makeWhereExpression(self, column: sqlalchemy.sql.ColumnElement
                            ) -> Optional[sqlalchemy.sql.ColumnElement]:
        """Transform the wildcard into a SQLAlchemy boolean expression suitable
        for use in a WHERE clause.

        Parameters
        ----------
        column : `sqlalchemy.sql.ColumnElement`
            A string column in a table or query that should be compared to the
            wildcard expression.

        Returns
        -------
        where : `sqlalchemy.sql.ColumnElement` or `None`
            A boolean SQL expression that evaluates to true if and only if
            the value of ``column`` matches the wildcard.  `None` is returned
            if both `strings` and `patterns` are empty, and hence no match is
            possible.

        Raises
        ------
        NotImplementedError
            Raised if `items` or `patterns` is not empty; neither can be
            translated into SQL here yet.
        """
        if self.items:
            raise NotImplementedError("Expressions that are processed into items cannot be transformed "
                                      "automatically into queries.")
        if self.patterns:
            raise NotImplementedError("Regular expression patterns are not yet supported here.")
        terms = []
        if len(self.strings) == 1:
            # A single value reads better (and is simpler SQL) as equality.
            terms.append(column == self.strings[0])
        elif len(self.strings) > 1:
            terms.append(column.in_(self.strings))
        # TODO: append terms for regular expressions
        if not terms:
            return None
        return sqlalchemy.sql.or_(*terms)

    strings: List[str]
    """Explicit string values found in the wildcard (`list` [ `str` ]).
    """

    patterns: List[re.Pattern]
    """Regular expression patterns found in the wildcard
    (`list` [ `re.Pattern` ]).
    """

    items: List[Tuple[str, Any]]
    """Two-item tuples that relate string values to other objects
    (`list` [ `tuple` [ `str`, `Any` ] ]).
    """
def _yieldCollectionRecords(
    manager: CollectionManager,
    record: CollectionRecord,
    collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
    done: Optional[Set[str]] = None,
    flattenChains: bool = True,
    includeChains: Optional[bool] = None,
) -> Iterator[CollectionRecord]:
    """A helper function containing common logic for `CollectionSearch.iter`
    and `CollectionQuery.iter`: recursively yield `CollectionRecord` only if
    they match the criteria given in other arguments.

    Parameters
    ----------
    manager : `CollectionManager`
        Object responsible for managing the collection tables in a `Registry`.
    record : `CollectionRecord`
        Record to conditionally yield.
    collectionTypes : `AbstractSet` [ `CollectionType` ], optional
        If provided, only yield collections of these types.
    done : `set` [ `str` ], optional
        A `set` of already-yielded collection names; if provided, ``record``
        will only be yielded if it is not already in ``done``, and ``done``
        will be updated to include it on return.
    flattenChains : `bool`, optional
        If `True` (default) recursively yield the child collections of
        `~CollectionType.CHAINED` collections.
    includeChains : `bool`, optional
        If `True`, return records for `~CollectionType.CHAINED` collections
        themselves.  The default is the opposite of ``flattenChains``: either
        return records for CHAINED collections or their children, but not both.

    Yields
    ------
    record : `CollectionRecord`
        Matching collection records.
    """
    if done is None:
        done = set()
    elif record.name in done:
        # Honor the documented contract: never re-yield (or re-expand) a
        # collection that the caller has already seen.
        return
    if includeChains is None:
        includeChains = not flattenChains
    isChain = record.type is CollectionType.CHAINED
    if record.type in collectionTypes:
        done.add(record.name)
        if includeChains or not isChain:
            yield record
    if flattenChains and isChain:
        # Mark the chain itself as visited even when its own type was filtered
        # out, so it is not expanded a second time.
        done.add(record.name)
        # We know this is a ChainedCollectionRecord because of the enum value,
        # but MyPy doesn't.
        yield from record.children.iter(  # type: ignore
            manager,
            collectionTypes=collectionTypes,
            done=done,
            flattenChains=flattenChains,
            includeChains=includeChains,
        )
class CollectionSearch(Sequence[str]):
    """An ordered search path of collections.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    collections : `tuple` [ `str` ]
        Tuple of collection names, ordered from the first searched to the last
        searched.

    Notes
    -----
    A `CollectionSearch` is used to find a single dataset (or set of datasets
    with different dataset types or data IDs) according to its dataset type and
    data ID, giving preference to collections in the order in which they are
    specified.  A `CollectionQuery` can be constructed from a broader range of
    expressions but does not order the collections to be searched.

    `CollectionSearch` is an immutable sequence of `str` collection names.

    A `CollectionSearch` instance constructed properly (e.g. via
    `fromExpression`) is a unique representation of a particular search path;
    it is exactly the same internally and compares as equal to any
    `CollectionSearch` constructed from an equivalent expression, regardless of
    how different the original expressions appear.
    """
    def __init__(self, collections: Tuple[str, ...]):
        self._collections = collections

    __slots__ = ("_collections",)

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionSearch:
        """Process a general expression to construct a `CollectionSearch`
        instance.

        Parameters
        ----------
        expression
            May be:
             - a `str` collection name;
             - an iterable of `str` collection names;
             - another `CollectionSearch` instance (passed through
               unchanged).

            Duplicate entries will be removed (preserving the first appearance
            of each collection name).

        Returns
        -------
        collections : `CollectionSearch`
            A `CollectionSearch` instance.
        """
        # First see if this is already a CollectionSearch; just pass that
        # through unchanged.  This lets us standardize expressions (and turn
        # single-pass iterators into multi-pass iterables) in advance and pass
        # them down to other routines that accept arbitrary expressions.
        if isinstance(expression, cls):
            return expression
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=False,
            allowPatterns=False,
        )
        # These are guaranteed by the keyword arguments passed above; the
        # assertions mostly serve to narrow the type for MyPy.
        assert wildcard is not Ellipsis
        assert not wildcard.patterns
        assert not wildcard.items
        # dict preserves insertion order, so this drops duplicates in linear
        # time while keeping the first appearance of each name.
        return cls(tuple(dict.fromkeys(wildcard.strings)))

    def iter(
        self, manager: CollectionManager, *,
        datasetType: Optional[DatasetType] = None,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        done: Optional[Set[str]] = None,
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.findDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        datasetType : `DatasetType`, optional
            Accepted for interface compatibility; this method does not
            currently use it.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        done : `set`, optional
            A `set` containing the names of all collections already yielded;
            any collections whose names are already present in this set will
            not be yielded again, and those yielded will be added to it while
            iterating.  If not provided, an empty `set` will be created and
            used internally to avoid duplicates.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        if done is None:
            done = set()
        for name in self:
            # `done` is updated by the helper as records are yielded, so this
            # check must be made on every iteration, not precomputed.
            if name not in done:
                yield from _yieldCollectionRecords(
                    manager,
                    manager.find(name),
                    collectionTypes=collectionTypes,
                    done=done,
                    flattenChains=flattenChains,
                    includeChains=includeChains,
                )

    def __iter__(self) -> Iterator[str]:
        return iter(self._collections)

    def __len__(self) -> int:
        return len(self._collections)

    def __getitem__(self, index: Any) -> str:
        return self._collections[index]

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionSearch):
            return self._collections == other._collections
        return False

    def __hash__(self) -> int:
        # Defining __eq__ alone would make this immutable sequence unhashable;
        # hash on the same state that __eq__ compares.
        return hash(self._collections)

    def __str__(self) -> str:
        return "[{}]".format(", ".join(self))

    def __repr__(self) -> str:
        return f"CollectionSearch({self._collections!r})"
class CollectionQuery:
    """An unordered query for collections and dataset type restrictions.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    search : `CollectionSearch` or `...`
        An object representing an ordered search for explicitly-named
        collections (to be interpreted here as unordered), or the special
        value `...` indicating all collections.  When this is `...`,
        ``patterns`` is not consulted and should be left empty.
    patterns : `tuple` of `re.Pattern`
        Regular expression patterns to match against collection names.

    Notes
    -----
    A `CollectionQuery` is used to find all matching datasets in any number
    of collections, or to find collections themselves.

    `CollectionQuery` is expected to be rarely used outside of `Registry`
    (which uses it to back several of its "query" methods that take general
    expressions for collections), but it may occasionally be useful outside
    `Registry` as a way to preprocess expressions that contain single-pass
    iterators into a form that can be used to call those `Registry` methods
    multiple times.
    """
    def __init__(
        self,
        search: Union[CollectionSearch, EllipsisType] = Ellipsis,
        patterns: Tuple[re.Pattern, ...] = (),
    ):
        self._search = search
        self._patterns = patterns

    __slots__ = ("_search", "_patterns")

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionQuery:
        """Process a general expression to construct a `CollectionQuery`
        instance.

        Parameters
        ----------
        expression
            May be:
             - a `str` collection name;
             - an `re.Pattern` instance to match (with `re.Pattern.fullmatch`)
               against collection names;
             - any iterable containing any of the above;
             - the special value `...`, which matches all collections;
             - a `CollectionSearch` instance;
             - another `CollectionQuery` instance (passed through unchanged).

            Duplicate collection names will be removed (preserving the first
            appearance of each collection name).

        Returns
        -------
        collections : `CollectionQuery`
            A `CollectionQuery` instance.
        """
        if isinstance(expression, cls):
            return expression
        if expression is Ellipsis:
            return cls()
        if isinstance(expression, CollectionSearch):
            return cls(search=expression, patterns=())
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=True,
            allowPatterns=True,
        )
        if wildcard is Ellipsis:
            return cls()
        assert not wildcard.items, \
            "We should no longer be transforming to (str, DatasetTypeRestriction) tuples."
        return cls(
            search=CollectionSearch.fromExpression(wildcard.strings),
            patterns=tuple(wildcard.patterns),
        )

    def iter(
        self, manager: CollectionManager, *,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in an arbitrary order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.queryDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        # Names of every collection yielded (or expanded) so far, shared by
        # all branches so that no collection is reported twice.
        done: Set[str] = set()
        if self._search is Ellipsis:
            for record in manager:
                # A collection may already have been yielded as the child of
                # a CHAINED collection seen earlier in this loop.
                if record.name not in done:
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )
        else:
            yield from self._search.iter(
                manager,
                collectionTypes=collectionTypes,
                done=done,
                flattenChains=flattenChains,
                includeChains=includeChains,
            )
            # Scanning every collection is only useful when there is at least
            # one pattern to match against.
            if self._patterns:
                for record in manager:
                    if record.name not in done and any(p.fullmatch(record.name) for p in self._patterns):
                        yield from _yieldCollectionRecords(
                            manager,
                            record,
                            collectionTypes=collectionTypes,
                            done=done,
                            flattenChains=flattenChains,
                            includeChains=includeChains,
                        )

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionQuery):
            return self._search == other._search and self._patterns == other._patterns
        else:
            return False