Coverage for python/lsst/daf/butler/registry/wildcards.py: 17%
194 statements
« prev ^ index » next coverage.py v6.4.4, created at 2022-09-27 08:58 +0000
« prev ^ index » next coverage.py v6.4.4, created at 2022-09-27 08:58 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "CategorizedWildcard",
25 "CollectionQuery",
26 "CollectionSearch",
27)
29import re
30from collections.abc import Callable, Iterator, Sequence, Set
31from dataclasses import dataclass
32from typing import TYPE_CHECKING, Any
34from lsst.utils.ellipsis import Ellipsis, EllipsisType
35from lsst.utils.iteration import ensure_iterable
36from pydantic import BaseModel
38from ..core import DatasetType
39from ..core.utils import globToRegex
40from ._collectionType import CollectionType
42if TYPE_CHECKING: 42 ↛ 43line 42 didn't jump to line 43, because the condition on line 42 was never true
43 from .interfaces import CollectionManager, CollectionRecord
@dataclass
class CategorizedWildcard:
    """The results of preprocessing a wildcard expression to separate match
    patterns from strings.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).
    """

    @classmethod
    def fromExpression(
        cls,
        expression: Any,
        *,
        allowAny: bool = True,
        allowPatterns: bool = True,
        coerceUnrecognized: Callable[[Any], tuple[str, Any] | str] | None = None,
        coerceItemValue: Callable[[Any], Any] | None = None,
        defaultItemValue: Any | None = None,
    ) -> CategorizedWildcard | EllipsisType:
        """Categorize a wildcard expression.

        Parameters
        ----------
        expression
            The expression to categorize.  May be any of:

            - `str` (including glob patterns if ``allowPatterns`` is `True`);
            - `re.Pattern` (only if ``allowPatterns`` is `True`);
            - objects recognized by ``coerceUnrecognized`` (if provided);
            - two-element tuples of (`str`, value) where value is recognized
              by ``coerceItemValue`` (if provided);
            - a non-`str`, non-mapping iterable containing any of the above;
            - the special value `...` (only if ``allowAny`` is `True`), which
              matches anything;
            - a mapping from `str` to a value that is recognized by
              ``coerceItemValue`` (if provided);
            - a `CategorizedWildcard` instance (passed through unchanged, or
              converted between its `strings` and `items` forms as required
              by ``defaultItemValue``, provided it meets the requirements
              specified by the keyword arguments).
        allowAny : `bool`, optional
            If `False` (`True` is default) raise `TypeError` if `...` is
            encountered.
        allowPatterns : `bool`, optional
            If `False` (`True` is default) raise `TypeError` if a `re.Pattern`
            is encountered, or if ``expression`` is a `CategorizedWildcard`
            with `patterns` not empty.
        coerceUnrecognized : `Callable`, optional
            A callback that takes a single argument of arbitrary type and
            returns either a `str` - appended to `strings` - or a `tuple` of
            (`str`, `Any`) to be appended to `items`.  This will be called on
            objects of unrecognized type.  Exceptions will be reraised as
            `TypeError` (and chained).
        coerceItemValue : `Callable`, optional
            If provided, ``expression`` may be a mapping from `str` to any
            type that can be passed to this function; the result of that call
            will be stored instead as the value in ``self.items``.
        defaultItemValue : `Any`, optional
            If provided, combine this value with any string values encountered
            (including any returned by ``coerceUnrecognized``) to form a
            `tuple` and add it to `items`, guaranteeing that `strings` will be
            empty.  Patterns are never added to `items`.

        Returns
        -------
        categorized : `CategorizedWildcard` or ``...``
            The struct describing the wildcard.  ``...`` is passed through
            unchanged.

        Raises
        ------
        TypeError
            Raised if an unsupported type is found in the expression.
        """
        assert expression is not None
        # See if we were given ...; just return that if we were.
        if expression is Ellipsis:
            if not allowAny:
                raise TypeError("This expression may not be unconstrained.")
            return Ellipsis
        if isinstance(expression, cls):
            # This is already a CategorizedWildcard.  Make sure it meets the
            # reqs. implied by the kwargs we got.
            if not allowPatterns and expression.patterns:
                raise TypeError(
                    f"Regular expression(s) {expression.patterns} are not allowed in this context."
                )
            if defaultItemValue is not None and expression.strings:
                if expression.items:
                    raise TypeError(
                        "Incompatible preprocessed expression: an ordered sequence of str is "
                        "needed, but the original order was lost in the preprocessing."
                    )
                return cls(
                    strings=[],
                    patterns=expression.patterns,
                    items=[(k, defaultItemValue) for k in expression.strings],
                )
            elif defaultItemValue is None and expression.items:
                if expression.strings:
                    raise TypeError(
                        "Incompatible preprocessed expression: an ordered sequence of items is "
                        "needed, but the original order was lost in the preprocessing."
                    )
                return cls(strings=[k for k, _ in expression.items], patterns=expression.patterns, items=[])
            else:
                # Original expression was created with keyword arguments that
                # were at least as restrictive as what we just got; pass it
                # through.
                return expression

        # If we get here, we know we'll be creating a new instance.
        # Initialize an empty one now.
        self = cls(strings=[], patterns=[], items=[])

        # If mappings are allowed, see if we were given a single mapping by
        # trying to get items.
        if coerceItemValue is not None:
            rawItems = None
            try:
                rawItems = expression.items()
            except AttributeError:
                pass
            if rawItems is not None:
                for k, v in rawItems:
                    try:
                        self.items.append((k, coerceItemValue(v)))
                    except Exception as err:
                        raise TypeError(f"Could not coerce mapping value '{v}' for key '{k}'.") from err
                return self

        # Not ..., a CategorizedWildcard instance, or a mapping.  Just
        # process scalars or an iterable.  We put the body of the loop inside
        # a local function so we can recurse after coercion.

        def process(element: Any, alreadyCoerced: bool = False) -> EllipsisType | None:
            """Categorize one element into ``self``.

            Returns `...` if the element is a glob that matches everything,
            and `None` otherwise.
            """
            if isinstance(element, str):
                if defaultItemValue is not None:
                    self.items.append((element, defaultItemValue))
                    return None
                else:
                    # This returns a list but we know we only passed in
                    # single value.
                    converted = globToRegex(element)
                    if converted is Ellipsis:
                        return Ellipsis
                    element = converted[0]
                    # Let regex and ... go through to the next check
                    if isinstance(element, str):
                        self.strings.append(element)
                        return None
            if allowPatterns and isinstance(element, re.Pattern):
                self.patterns.append(element)
                return None
            if coerceItemValue is not None:
                try:
                    k, v = element
                except TypeError:
                    pass
                else:
                    if not alreadyCoerced:
                        if not isinstance(k, str):
                            raise TypeError(f"Item key '{k}' is not a string.")
                        try:
                            v = coerceItemValue(v)
                        except Exception as err:
                            raise TypeError(
                                f"Could not coerce tuple item value '{v}' for key '{k}'."
                            ) from err
                    self.items.append((k, v))
                    return None
            if alreadyCoerced:
                raise TypeError(f"Object '{element!r}' returned by coercion function is still unrecognized.")
            if coerceUnrecognized is not None:
                try:
                    # Return the recursive result rather than discarding it:
                    # if the coerced value is a glob that matches everything,
                    # the `...` marker must reach the caller's loop so that
                    # ``allowAny`` is honored instead of the element being
                    # silently dropped.
                    # This should be safe but flake8 cant tell that the
                    # function will be re-declared next function call
                    return process(coerceUnrecognized(element), alreadyCoerced=True)  # noqa: F821
                except Exception as err:
                    raise TypeError(f"Could not coerce expression element '{element!r}'.") from err
            else:
                extra = "."
                if isinstance(element, re.Pattern):
                    extra = " and patterns are not allowed."
                raise TypeError(f"Unsupported object in wildcard expression: '{element!r}'{extra}")

        for element in ensure_iterable(expression):
            retval = process(element)
            if retval is Ellipsis:
                # One of the globs matched everything
                if not allowAny:
                    raise TypeError("This expression may not be unconstrained.")
                return Ellipsis
        # Drop the local name for the recursive helper now that the loop is
        # finished.
        del process
        return self

    strings: list[str]
    """Explicit string values found in the wildcard (`list` [ `str` ]).
    """

    patterns: list[re.Pattern]
    """Regular expression patterns found in the wildcard
    (`list` [ `re.Pattern` ]).
    """

    items: list[tuple[str, Any]]
    """Two-item tuples that relate string values to other objects
    (`list` [ `tuple` [ `str`, `Any` ] ]).
    """
def _yieldCollectionRecords(
    manager: CollectionManager,
    record: CollectionRecord,
    collectionTypes: Set[CollectionType] = CollectionType.all(),
    done: set[str] | None = None,
    flattenChains: bool = True,
    includeChains: bool | None = None,
) -> Iterator[CollectionRecord]:
    """Yield ``record`` and/or its children, subject to the given criteria.

    This holds the logic shared by `CollectionSearch.iter` and
    `CollectionQuery.iter`.

    Parameters
    ----------
    manager : `CollectionManager`
        Object responsible for managing the collection tables in a `Registry`.
        It is only forwarded to the children's ``iter`` method here.
    record : `CollectionRecord`
        Record to conditionally yield.
    collectionTypes : `AbstractSet` [ `CollectionType` ], optional
        Only records whose type is in this set are yielded.
    done : `set` [ `str` ], optional
        Names of collections that have already been handled.  The name of
        ``record`` is added to it when the record passes the type filter or
        when its children are expanded.  This function does not itself check
        whether ``record`` is already present; the set is passed on to the
        children's ``iter`` method.  A new empty set is used if not provided.
    flattenChains : `bool`, optional
        If `True` (default) recursively yield the child collections of
        `~CollectionType.CHAINED` collections.
    includeChains : `bool`, optional
        If `True`, yield records for `~CollectionType.CHAINED` collections
        themselves.  The default is the opposite of ``flattenChains``: either
        yield records for CHAINED collections or their children, but not both.

    Yields
    ------
    record : `CollectionRecord`
        Matching collection records.
    """
    visited = set() if done is None else done
    if includeChains is None:
        includeChains = not flattenChains
    if record.type in collectionTypes:
        visited.add(record.name)
        if includeChains or record.type is not CollectionType.CHAINED:
            yield record
    if record.type is CollectionType.CHAINED and flattenChains:
        visited.add(record.name)
        # The enum value guarantees this is a ChainedCollectionRecord with a
        # ``children`` attribute, but MyPy cannot deduce that.
        children = record.children  # type: ignore
        yield from children.iter(
            manager,
            collectionTypes=collectionTypes,
            done=visited,
            flattenChains=flattenChains,
            includeChains=includeChains,
        )
class CollectionSearch(BaseModel, Sequence[str]):
    """An ordered search path of collections.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    collections : `tuple` [ `str` ]
        Tuple of collection names, ordered from the first searched to the last
        searched.

    Notes
    -----
    A `CollectionSearch` is used to find a single dataset (or set of datasets
    with different dataset types or data IDs) according to its dataset type and
    data ID, giving preference to collections in the order in which they are
    specified.  A `CollectionQuery` can be constructed from a broader range of
    expressions but does not order the collections to be searched.

    `CollectionSearch` is an immutable sequence of `str` collection names.

    A `CollectionSearch` instance constructed properly (e.g. via
    `fromExpression`) is a unique representation of a particular search path;
    it is exactly the same internally and compares as equal to any
    `CollectionSearch` constructed from an equivalent expression, regardless of
    how different the original expressions appear.
    """

    __root__: tuple[str, ...]

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionSearch:
        """Process a general expression to construct a `CollectionSearch`
        instance.

        Parameters
        ----------
        expression
            May be:

            - a `str` collection name;
            - an iterable of `str` collection names;
            - another `CollectionSearch` instance (passed through
              unchanged).

            Duplicate entries will be removed (preserving the first appearance
            of each collection name).

        Returns
        -------
        collections : `CollectionSearch`
            A `CollectionSearch` instance.
        """
        # First see if this is already a CollectionSearch; just pass that
        # through unchanged.  This lets us standardize expressions (and turn
        # single-pass iterators into multi-pass iterables) in advance and pass
        # them down to other routines that accept arbitrary expressions.
        if isinstance(expression, cls):
            return expression
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=False,
            allowPatterns=False,
        )
        assert wildcard is not Ellipsis
        assert not wildcard.patterns
        assert not wildcard.items
        # dict keys keep insertion order, so this drops repeated names while
        # preserving the first appearance of each, without the quadratic
        # cost of testing membership in a growing list.
        return cls(__root__=tuple(dict.fromkeys(wildcard.strings)))

    def iter(
        self,
        manager: CollectionManager,
        *,
        datasetType: DatasetType | None = None,
        collectionTypes: Set[CollectionType] = CollectionType.all(),
        done: set[str] | None = None,
        flattenChains: bool = True,
        includeChains: bool | None = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.findDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        datasetType : `DatasetType`, optional
            Accepted for interface compatibility; this method does not use it.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        done : `set`, optional
            A `set` containing the names of all collections already yielded;
            any collections whose names are already present in this set will
            not be yielded again, and those yielded will be added to it while
            iterating.  If not provided, an empty `set` will be created and
            used internally to avoid duplicates.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        if done is None:
            done = set()
        for name in self:
            # ``done`` grows while the nested generator runs, so this check
            # must be made per name rather than once up front.
            if name not in done:
                yield from _yieldCollectionRecords(
                    manager,
                    manager.find(name),
                    collectionTypes=collectionTypes,
                    done=done,
                    flattenChains=flattenChains,
                    includeChains=includeChains,
                )

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly."""
        yield from self.__root__

    def __iter__(self) -> Iterator[str]:  # type: ignore
        yield from self.__root__

    def __len__(self) -> int:
        return len(self.__root__)

    def __getitem__(self, index: Any) -> str:
        return self.__root__[index]

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionSearch):
            return self.__root__ == other.__root__
        return False

    def __str__(self) -> str:
        return "[{}]".format(", ".join(self))

    def __repr__(self) -> str:
        return f"CollectionSearch({self.__root__!r})"
class CollectionQuery:
    """An unordered query for collections and dataset type restrictions.

    The `fromExpression` method should almost always be used to construct
    instances, as the regular constructor performs no checking of inputs (and
    that can lead to confusing error messages downstream).

    Parameters
    ----------
    search : `CollectionSearch` or `...`
        An object representing an ordered search for explicitly-named
        collections (to be interpreted here as unordered), or the special
        value `...` indicating all collections.  When `...` is given,
        ``patterns`` is not consulted by `iter` and should be left empty.
    patterns : `tuple` of `re.Pattern`
        Regular expression patterns to match against collection names.

    Notes
    -----
    A `CollectionQuery` is used to find all matching datasets in any number
    of collections, or to find collections themselves.

    `CollectionQuery` is expected to be rarely used outside of `Registry`
    (which uses it to back several of its "query" methods that take general
    expressions for collections), but it may occasionally be useful outside
    `Registry` as a way to preprocess expressions that contain single-pass
    iterators into a form that can be used to call those `Registry` methods
    multiple times.
    """

    def __init__(
        self,
        search: CollectionSearch | EllipsisType = Ellipsis,
        patterns: tuple[re.Pattern, ...] = (),
    ):
        self._search = search
        self._patterns = patterns

    __slots__ = ("_search", "_patterns")

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionQuery:
        """Process a general expression to construct a `CollectionQuery`
        instance.

        Parameters
        ----------
        expression
            May be:

            - a `str` collection name;
            - an `re.Pattern` instance to match (with `re.Pattern.fullmatch`)
              against collection names;
            - any iterable containing any of the above;
            - the special value `...`, matching all collections;
            - a `CollectionSearch` instance;
            - another `CollectionQuery` instance (passed through unchanged).

            Duplicate collection names will be removed (preserving the first
            appearance of each collection name).

        Returns
        -------
        collections : `CollectionQuery`
            A `CollectionQuery` instance.
        """
        if isinstance(expression, cls):
            return expression
        if expression is Ellipsis:
            return cls()
        if isinstance(expression, CollectionSearch):
            return cls(search=expression, patterns=())
        wildcard = CategorizedWildcard.fromExpression(
            expression,
            allowAny=True,
            allowPatterns=True,
        )
        # A glob inside the expression may have matched everything.
        if wildcard is Ellipsis:
            return cls()
        assert (
            not wildcard.items
        ), "We should no longer be transforming to (str, DatasetTypeRestriction) tuples."
        return cls(
            search=CollectionSearch.fromExpression(wildcard.strings),
            patterns=tuple(wildcard.patterns),
        )

    def iter(
        self,
        manager: CollectionManager,
        *,
        collectionTypes: Set[CollectionType] = CollectionType.all(),
        flattenChains: bool = True,
        includeChains: bool | None = None,
    ) -> Iterator[CollectionRecord]:
        """Iterate over collection records that match this instance and the
        given criteria, in an arbitrary order.

        This method is primarily intended for internal use by `Registry`;
        other callers should generally prefer `Registry.queryDatasets` or
        other `Registry` query methods.

        Parameters
        ----------
        manager : `CollectionManager`
            Object responsible for managing the collection tables in a
            `Registry`.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        flattenChains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves.  The default is the opposite of
            ``flattenChains``: either return records for CHAINED collections or
            their children, but not both.

        Yields
        ------
        record : `CollectionRecord`
            Matching collection records.
        """
        # A single set of visited names is shared by every branch so that a
        # collection reached both directly and as the child of a flattened
        # CHAINED collection is only yielded once.
        done: set[str] = set()
        if self._search is Ellipsis:
            for record in manager:
                if record.name not in done:
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )
        else:
            # Explicitly named collections first, then anything the
            # patterns match that has not been visited yet.
            yield from self._search.iter(
                manager,
                collectionTypes=collectionTypes,
                done=done,
                flattenChains=flattenChains,
                includeChains=includeChains,
            )
            for record in manager:
                if record.name not in done and any(p.fullmatch(record.name) for p in self._patterns):
                    yield from _yieldCollectionRecords(
                        manager,
                        record,
                        collectionTypes=collectionTypes,
                        done=done,
                        flattenChains=flattenChains,
                        includeChains=includeChains,
                    )

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly."""
        # Nothing is yielded when the search is `...`.
        if isinstance(self._search, CollectionSearch):
            yield from self._search.explicitNames()

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, CollectionQuery):
            return self._search == other._search and self._patterns == other._patterns
        else:
            return False

    def __str__(self) -> str:
        if self._search is Ellipsis:
            return "..."
        else:
            terms = list(self._search)
            terms.extend(str(p) for p in self._patterns)
            return "[{}]".format(", ".join(terms))

    def __repr__(self) -> str:
        return f"CollectionQuery({self._search!r}, {self._patterns!r})"