Coverage for python/lsst/daf/butler/registry/wildcards.py : 16%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "CategorizedWildcard",
25 "CollectionQuery",
26 "CollectionSearch",
27)
29from pydantic import BaseModel
30from dataclasses import dataclass
31import re
32from typing import (
33 AbstractSet,
34 Any,
35 Callable,
36 Iterator,
37 List,
38 Optional,
39 Sequence,
40 Set,
41 Tuple,
42 TYPE_CHECKING,
43 Union,
44)
46import sqlalchemy
48from ..core import DatasetType
49from ..core.utils import iterable
50from ._collectionType import CollectionType
if TYPE_CHECKING:
    from .interfaces import CollectionManager, CollectionRecord

    # Workaround for `...` not having an exposed type in Python, borrowed from
    # https://github.com/python/typing/issues/684#issuecomment-548203158
    # Along with that, we need to use `Ellipsis` instead of `...` for the
    # actual sentinel value internally, and tell MyPy to ignore conversions
    # from `...` to `Ellipsis` at the public-interface boundary.
    #
    # `Ellipsis` and `EllipsisType` should be directly imported from this
    # module by related code that needs them; hopefully that will stay confined
    # to `lsst.daf.butler.registry`.  Putting these in __all__ is bad for
    # Sphinx, and probably more confusing than helpful overall.
    from enum import Enum

    class EllipsisType(Enum):
        Ellipsis = "..."

    Ellipsis = EllipsisType.Ellipsis

else:
    # At runtime these are just the builtin sentinel and its type; rebinding
    # `Ellipsis` makes it an importable module-level name.
    EllipsisType = type(Ellipsis)
    Ellipsis = Ellipsis
@dataclass
class CategorizedWildcard:
    """The results of preprocessing a wildcard expression to separate match
    patterns from strings.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).
    """

    @classmethod
    def fromExpression(cls, expression: Any, *,
                       allowAny: bool = True,
                       allowPatterns: bool = True,
                       coerceUnrecognized: Optional[Callable[[Any], Union[Tuple[str, Any], str]]] = None,
                       coerceItemValue: Optional[Callable[[Any], Any]] = None,
                       defaultItemValue: Optional[Any] = None,
                       ) -> Union[CategorizedWildcard, EllipsisType]:
        """Categorize a wildcard expression.

        Parameters
        ----------
        expression
            The expression to categorize.  May be any of:
             - `str`;
             - `re.Pattern` (only if ``allowPatterns`` is `True`);
             - objects recognized by ``coerceUnrecognized`` (if provided);
             - two-element tuples of (`str`, value) where value is recognized
               by ``coerceItemValue`` (if provided);
             - a non-`str`, non-mapping iterable containing any of the above;
             - the special value `...` (only if ``allowAny`` is `True`), which
               matches anything;
             - a mapping from `str` to a value that is recognized by
               ``coerceItemValue`` (if provided);
             - a `CategorizedWildcard` instance (passed through unchanged if
               it meets the requirements specified by keyword arguments).
        allowAny: `bool`, optional
            If `False` (`True` is default) raise `TypeError` if `...` is
            encountered.
        allowPatterns: `bool`, optional
            If `False` (`True` is default) raise `TypeError` if a `re.Pattern`
            is encountered, or if ``expression`` is a `CategorizedWildcard`
            with `patterns` not empty.
        coerceUnrecognized: `Callable`, optional
            A callback that takes a single argument of arbitrary type and
            returns either a `str` (appended to `strings`, or to `items` when
            ``defaultItemValue`` is given) or a `tuple` of (`str`, `Any`)
            (appended to `items`; only recognized when ``coerceItemValue`` is
            provided).  It is called on objects of otherwise unrecognized
            type.  Exceptions will be reraised as `TypeError` (and chained).
        coerceItemValue: `Callable`, optional
            If provided, ``expression`` may be a mapping from `str` to any
            type that can be passed to this function; the result of that call
            will be stored instead as the value in ``self.items``.
        defaultItemValue: `Any`, optional
            If provided, combine this value with any string values encountered
            (including any returned by ``coerceUnrecognized``) to form a
            `tuple` and add it to `items`, guaranteeing that `strings` will be
            empty.  Patterns are never added to `items`.

        Returns
        -------
        categorized : `CategorizedWildcard` or ``...``.
            The struct describing the wildcard.  ``...`` is passed through
            unchanged.

        Raises
        ------
        TypeError
            Raised if an unsupported type is found in the expression.
        """
        assert expression is not None
        # See if we were given ...; just return that if we were.
        if expression is Ellipsis:
            if not allowAny:
                raise TypeError("This expression may not be unconstrained.")
            return Ellipsis
        if isinstance(expression, cls):
            # This is already a CategorizedWildcard.  Make sure it meets the
            # reqs. implied by the kwargs we got.
            if not allowPatterns and expression.patterns:
                raise TypeError(f"Regular expression(s) {expression.patterns} "
                                f"are not allowed in this context.")
            if defaultItemValue is not None and expression.strings:
                if expression.items:
                    raise TypeError("Incompatible preprocessed expression: an ordered sequence of str is "
                                    "needed, but the original order was lost in the preprocessing.")
                return cls(strings=[], patterns=expression.patterns,
                           items=[(k, defaultItemValue) for k in expression.strings])
            elif defaultItemValue is None and expression.items:
                if expression.strings:
                    raise TypeError("Incompatible preprocessed expression: an ordered sequence of items is "
                                    "needed, but the original order was lost in the preprocessing.")
                return cls(strings=[k for k, _ in expression.items], patterns=expression.patterns, items=[])
            else:
                # Original expression was created with keyword arguments that
                # were at least as restrictive as what we just got; pass it
                # through.
                return expression

        # If we get here, we know we'll be creating a new instance.
        # Initialize an empty one now.
        self = cls(strings=[], patterns=[], items=[])

        # If mappings are allowed, see if we were given a single mapping by
        # trying to get items.
        if coerceItemValue is not None:
            rawItems = None
            try:
                rawItems = expression.items()
            except AttributeError:
                pass
            if rawItems is not None:
                for k, v in rawItems:
                    try:
                        self.items.append((k, coerceItemValue(v)))
                    except Exception as err:
                        raise TypeError(f"Could not coerce mapping value '{v}' for key '{k}'.") from err
                return self

        # Not ..., a CategorizedWildcard instance, or a mapping.  Just
        # process scalars or an iterable.  We put the body of the loop inside
        # a local function so we can recurse after coercion.

        def process(element: Any, alreadyCoerced: bool = False) -> None:
            if isinstance(element, str):
                if defaultItemValue is not None:
                    self.items.append((element, defaultItemValue))
                else:
                    self.strings.append(element)
                return
            if allowPatterns and isinstance(element, re.Pattern):
                self.patterns.append(element)
                return
            if coerceItemValue is not None:
                try:
                    k, v = element
                except (TypeError, ValueError):
                    # TypeError: not iterable at all.  ValueError: iterable,
                    # but not of length two.  Either way this is not a
                    # (key, value) pair, so fall through to coercion or the
                    # documented TypeError below rather than leaking a
                    # ValueError to the caller.
                    pass
                else:
                    if not alreadyCoerced:
                        if not isinstance(k, str):
                            raise TypeError(f"Item key '{k}' is not a string.")
                        try:
                            v = coerceItemValue(v)
                        except Exception as err:
                            raise TypeError(f"Could not coerce tuple item value '{v}' for key '{k}'."
                                            ) from err
                    self.items.append((k, v))
                    return
            if alreadyCoerced:
                raise TypeError(f"Object '{element}' returned by coercion function is still unrecognized.")
            if coerceUnrecognized is not None:
                try:
                    process(coerceUnrecognized(element), alreadyCoerced=True)
                except Exception as err:
                    raise TypeError(f"Could not coerce expression element '{element}'.") from err
            else:
                raise TypeError(f"Unsupported object in wildcard expression: '{element}'.")

        for element in iterable(expression):
            process(element)
        return self

    def makeWhereExpression(self, column: sqlalchemy.sql.ColumnElement
                            ) -> Optional[sqlalchemy.sql.ColumnElement]:
        """Transform the wildcard into a SQLAlchemy boolean expression suitable
        for use in a WHERE clause.

        Parameters
        ----------
        column : `sqlalchemy.sql.ColumnElement`
            A string column in a table or query that should be compared to the
            wildcard expression.

        Returns
        -------
        where : `sqlalchemy.sql.ColumnElement` or `None`
            A boolean SQL expression that evaluates to true if and only if
            the value of ``column`` matches the wildcard.  `None` is returned
            if both `strings` and `patterns` are empty, and hence no match is
            possible.

        Raises
        ------
        NotImplementedError
            Raised if `items` or `patterns` is not empty.
        """
        if self.items:
            raise NotImplementedError("Expressions that are processed into items cannot be transformed "
                                      "automatically into queries.")
        if self.patterns:
            raise NotImplementedError("Regular expression patterns are not yet supported here.")
        terms = []
        if len(self.strings) == 1:
            terms.append(column == self.strings[0])
        elif len(self.strings) > 1:
            terms.append(column.in_(self.strings))
        # TODO: append terms for regular expressions
        if not terms:
            return None
        return sqlalchemy.sql.or_(*terms)

    strings: List[str]
    """Explicit string values found in the wildcard (`list` [ `str` ]).
    """

    patterns: List[re.Pattern]
    """Regular expression patterns found in the wildcard
    (`list` [ `re.Pattern` ]).
    """

    items: List[Tuple[str, Any]]
    """Two-item tuples that relate string values to other objects
    (`list` [ `tuple` [ `str`, `Any` ] ]).
    """
def _yieldCollectionRecords(
    manager: CollectionManager,
    record: CollectionRecord,
    collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
    done: Optional[Set[str]] = None,
    flattenChains: bool = True,
    includeChains: Optional[bool] = None,
) -> Iterator[CollectionRecord]:
    """A helper function containing common logic for `CollectionSearch.iter`
    and `CollectionQuery.iter`: recursively yield `CollectionRecord` only if
    they match the criteria given in other arguments.

    Parameters
    ----------
    manager : `CollectionManager`
        Object responsible for managing the collection tables in a `Registry`.
    record : `CollectionRecord`
        Record to conditionally yield.
    collectionTypes : `AbstractSet` [ `CollectionType` ], optional
        If provided, only yield collections of these types.
    done : `set` [ `str` ], optional
        A `set` of already-yielded collection names; if provided, ``record``
        will only be yielded if it is not already in ``done``, and ``done``
        will be updated to include it on return.
    flattenChains : `bool`, optional
        If `True` (default) recursively yield the child collections of
        `~CollectionType.CHAINED` collections.
    includeChains : `bool`, optional
        If `True`, yield records for `~CollectionType.CHAINED` collections
        themselves.  The default is the opposite of ``flattenChains``: either
        return records for CHAINED collections or their children, but not both.

    Yields
    ------
    record : `CollectionRecord`
        Matching collection records.
    """
    if done is None:
        done = set()
    elif record.name in done:
        # Honor the documented contract: a record that has already been
        # handled is neither yielded nor expanded again.
        return
    if includeChains is None:
        includeChains = not flattenChains
    isChain = record.type is CollectionType.CHAINED
    if record.type in collectionTypes:
        done.add(record.name)
        if includeChains or not isChain:
            yield record
    if flattenChains and isChain:
        # Mark the chain as handled even when its own type was filtered out,
        # so it is not expanded a second time.
        done.add(record.name)
        # We know this is a ChainedCollectionRecord because of the enum value,
        # but MyPy doesn't.
        yield from record.children.iter(  # type: ignore
            manager,
            collectionTypes=collectionTypes,
            done=done,
            flattenChains=flattenChains,
            includeChains=includeChains,
        )
class CollectionSearch(BaseModel, Sequence[str]):
    """An ordered search path of collections.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    collections : `tuple` [ `str` ]
        Tuple of collection names, ordered from the first searched to the last
        searched.  This is stored as the model's ``__root__`` value.

    Notes
    -----
    A `CollectionSearch` is used to find a single dataset (or set of datasets
    with different dataset types or data IDs) according to its dataset type and
    data ID, giving preference to collections in the order in which they are
    specified.  A `CollectionQuery` can be constructed from a broader range of
    expressions but does not order the collections to be searched.

    `CollectionSearch` is an immutable sequence of `str` collection names.

    A `CollectionSearch` instance constructed properly (e.g. via
    `fromExpression`) is a unique representation of a particular search path;
    it is exactly the same internally and compares as equal to any
    `CollectionSearch` constructed from an equivalent expression, regardless of
    how different the original expressions appear.
    """
    __root__: Tuple[str, ...]

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionSearch:
        """Process a general expression to construct a `CollectionSearch`
        instance.

        Parameters
        ----------
        expression
            May be:
             - a `str` collection name;
             - an iterable of `str` collection names;
             - another `CollectionSearch` instance (passed through
               unchanged).

            Duplicate entries will be removed (preserving the first appearance
            of each collection name).

        Returns
        -------
        collections : `CollectionSearch`
            A `CollectionSearch` instance.

        Raises
        ------
        TypeError
            Raised by `CategorizedWildcard.fromExpression` if the expression
            is ``...`` or contains regular expression patterns.
        """
        # First see if this is already a CollectionSearch; just pass that
        # through unchanged.  This lets us standardize expressions (and turn
        # single-pass iterators into multi-pass iterables) in advance and pass
        # them down to other routines that accept arbitrary expressions.
        if isinstance(expression, cls):
            return expression
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=False,
            allowPatterns=False,
        )
        assert wildcard is not Ellipsis
        assert not wildcard.patterns
        assert not wildcard.items
        # dict preserves insertion order, so this keeps the first appearance
        # of each name while deduplicating in linear time.
        return cls(__root__=tuple(dict.fromkeys(wildcard.strings)))

    def iter(
        self, manager: CollectionManager, *,
        datasetType: Optional[DatasetType] = None,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        done: Optional[Set[str]] = None,
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.findDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        datasetType : `DatasetType`, optional
            Accepted for interface compatibility; currently unused by this
            method.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        done : `set`, optional
            A `set` containing the names of all collections already yielded;
            any collections whose names are already present in this set will
            not be yielded again, and those yielded will be added to it while
            iterating.  If not provided, an empty `set` will be created and
            used internally to avoid duplicates.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, yield records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        if done is None:
            done = set()
        for name in self:
            if name not in done:
                yield from _yieldCollectionRecords(
                    manager,
                    manager.find(name),
                    collectionTypes=collectionTypes,
                    done=done,
                    flattenChains=flattenChains,
                    includeChains=includeChains,
                )

    def __iter__(self) -> Iterator[str]:  # type: ignore
        yield from self.__root__

    def __len__(self) -> int:
        return len(self.__root__)

    def __getitem__(self, index: Any) -> str:
        return self.__root__[index]

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionSearch):
            return self.__root__ == other.__root__
        return False

    def __str__(self) -> str:
        return "[{}]".format(", ".join(self))

    def __repr__(self) -> str:
        return f"CollectionSearch({self.__root__!r})"
class CollectionQuery:
    """An unordered query for collections and dataset type restrictions.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    search : `CollectionSearch` or `...`
        An object representing an ordered search for explicitly-named
        collections (to be interpreted here as unordered), or the special
        value `...` indicating all collections.  When ``search`` is `...`,
        ``patterns`` is ignored and should be left empty.
    patterns : `tuple` of `re.Pattern`
        Regular expression patterns to match against collection names.

    Notes
    -----
    A `CollectionQuery` is used to find all matching datasets in any number
    of collections, or to find collections themselves.

    `CollectionQuery` is expected to be rarely used outside of `Registry`
    (which uses it to back several of its "query" methods that take general
    expressions for collections), but it may occasionally be useful outside
    `Registry` as a way to preprocess expressions that contain single-pass
    iterators into a form that can be used to call those `Registry` methods
    multiple times.
    """

    __slots__ = ("_search", "_patterns")

    def __init__(
        self,
        search: Union[CollectionSearch, EllipsisType] = Ellipsis,
        patterns: Tuple[re.Pattern, ...] = (),
    ):
        self._search = search
        self._patterns = patterns

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionQuery:
        """Process a general expression to construct a `CollectionQuery`
        instance.

        Parameters
        ----------
        expression
            May be:
             - a `str` collection name;
             - an `re.Pattern` instance to match (with `re.Pattern.fullmatch`)
               against collection names;
             - any iterable containing any of the above;
             - the special value `...`, matching all collections;
             - a `CollectionSearch` instance;
             - another `CollectionQuery` instance (passed through unchanged).

            Duplicate collection names will be removed (preserving the first
            appearance of each collection name).

        Returns
        -------
        collections : `CollectionQuery`
            A `CollectionQuery` instance.
        """
        if isinstance(expression, cls):
            return expression
        if expression is Ellipsis:
            return cls()
        if isinstance(expression, CollectionSearch):
            return cls(search=expression, patterns=())
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=True,
            allowPatterns=True,
        )
        if wildcard is Ellipsis:
            return cls()
        assert not wildcard.items, \
            "We should no longer be transforming to (str, DatasetTypeRestriction) tuples."
        return cls(
            search=CollectionSearch.fromExpression(wildcard.strings),
            patterns=tuple(wildcard.patterns),
        )

    def iter(
        self, manager: CollectionManager, *,
        collectionTypes: AbstractSet[CollectionType] = CollectionType.all(),
        flattenChains: bool = True,
        includeChains: Optional[bool] = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in an arbitrary order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.queryDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, yield records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        # A single set of handled names is shared by every branch so that a
        # collection reachable both directly and through a flattened chain is
        # only yielded once.
        done: Set[str] = set()
        if self._search is Ellipsis:
            for record in manager:
                if record.name not in done:
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )
        else:
            yield from self._search.iter(
                manager,
                collectionTypes=collectionTypes,
                done=done,
                flattenChains=flattenChains,
                includeChains=includeChains,
            )
            # Only scan every collection when there is a pattern to match.
            if self._patterns:
                for record in manager:
                    if record.name not in done and any(p.fullmatch(record.name) for p in self._patterns):
                        yield from _yieldCollectionRecords(
                            manager,
                            record,
                            collectionTypes=collectionTypes,
                            done=done,
                            flattenChains=flattenChains,
                            includeChains=includeChains,
                        )

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionQuery):
            return self._search == other._search and self._patterns == other._patterns
        else:
            return False