Coverage for python/lsst/daf/butler/registry/wildcards.py: 25%
220 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-27 09:44 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-27 09:44 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29__all__ = (
30 "CategorizedWildcard",
31 "CollectionWildcard",
32 "CollectionSearch",
33 "DatasetTypeWildcard",
34)
36import contextlib
37import dataclasses
38import re
39from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
40from types import EllipsisType
41from typing import Any
43from deprecated.sphinx import deprecated
44from lsst.daf.butler._compat import PYDANTIC_V2
45from lsst.utils.iteration import ensure_iterable
47from .._dataset_type import DatasetType
48from ..utils import globToRegex
49from ._exceptions import CollectionExpressionError, DatasetTypeExpressionError
52@dataclasses.dataclass
53class CategorizedWildcard:
54 """The results of preprocessing a wildcard expression to separate match
55 patterns from strings.
57 The `fromExpression` method should almost always be used to construct
58 instances, as the regular constructor performs no checking of inputs (and
59 that can lead to confusing error messages downstream).
60 """
62 @classmethod
63 def fromExpression(
64 cls,
65 expression: Any,
66 *,
67 allowAny: bool = True,
68 allowPatterns: bool = True,
69 coerceUnrecognized: Callable[[Any], tuple[str, Any] | str] | None = None,
70 coerceItemValue: Callable[[Any], Any] | None = None,
71 defaultItemValue: Any | None = None,
72 ) -> CategorizedWildcard | EllipsisType:
73 """Categorize a wildcard expression.
75 Parameters
76 ----------
77 expression
78 The expression to categorize. May be any of:
79 - `str` (including glob patterns if ``allowPatterns`` is `True`);
80 - `re.Pattern` (only if ``allowPatterns`` is `True`);
81 - objects recognized by ``coerceUnrecognized`` (if provided);
82 - two-element tuples of (`str`, value) where value is recognized
83 by ``coerceItemValue`` (if provided);
84 - a non-`str`, non-mapping iterable containing any of the above;
85 - the special value `...` (only if ``allowAny`` is `True`), which
86 matches anything;
87 - a mapping from `str` to a value are recognized by
88 ``coerceItemValue`` (if provided);
89 - a `CategorizedWildcard` instance (passed through unchanged if
90 it meets the requirements specified by keyword arguments).
91 allowAny: `bool`, optional
92 If `False` (`True` is default) raise `TypeError` if `...` is
93 encountered.
94 allowPatterns: `bool`, optional
95 If `False` (`True` is default) raise `TypeError` if a `re.Pattern`
96 is encountered, or if ``expression`` is a `CategorizedWildcard`
97 with `patterns` not empty.
98 coerceUnrecognized: `~collections.abc.Callable`, optional
99 A callback that takes a single argument of arbitrary type and
100 returns either a `str` - appended to `strings` - or a `tuple` of
101 (`str`, `Any`) to be appended to `items`. This will be called on
102 objects of unrecognized type. Exceptions will be reraised as
103 `TypeError` (and chained).
104 coerceItemValue: `~collections.abc.Callable`, optional
105 If provided, ``expression`` may be a mapping from `str` to any
106 type that can be passed to this function; the result of that call
107 will be stored instead as the value in ``self.items``.
108 defaultItemValue: `Any`, optional
109 If provided, combine this value with any string values encountered
110 (including any returned by ``coerceUnrecognized``) to form a
111 `tuple` and add it to `items`, guaranteeing that `strings` will be
112 empty. Patterns are never added to `items`.
114 Returns
115 -------
116 categorized : `CategorizedWildcard` or ``...``.
117 The struct describing the wildcard. ``...`` is passed through
118 unchanged.
120 Raises
121 ------
122 TypeError
123 Raised if an unsupported type is found in the expression.
124 """
125 assert expression is not None
126 # See if we were given ...; just return that if we were.
127 if expression is ...:
128 if not allowAny:
129 raise TypeError("This expression may not be unconstrained.")
130 return ...
131 if isinstance(expression, cls):
132 # This is already a CategorizedWildcard. Make sure it meets the
133 # reqs. implied by the kwargs we got.
134 if not allowPatterns and expression.patterns:
135 raise TypeError(
136 f"Regular expression(s) {expression.patterns} are not allowed in this context."
137 )
138 if defaultItemValue is not None and expression.strings:
139 if expression.items:
140 raise TypeError(
141 "Incompatible preprocessed expression: an ordered sequence of str is "
142 "needed, but the original order was lost in the preprocessing."
143 )
144 return cls(
145 strings=[],
146 patterns=expression.patterns,
147 items=[(k, defaultItemValue) for k in expression.strings],
148 )
149 elif defaultItemValue is None and expression.items:
150 if expression.strings:
151 raise TypeError(
152 "Incompatible preprocessed expression: an ordered sequence of items is "
153 "needed, but the original order was lost in the preprocessing."
154 )
155 return cls(strings=[k for k, _ in expression.items], patterns=expression.patterns, items=[])
156 else:
157 # Original expression was created with keyword arguments that
158 # were at least as restrictive as what we just got; pass it
159 # through.
160 return expression
162 # If we get here, we know we'll be creating a new instance.
163 # Initialize an empty one now.
164 self = cls(strings=[], patterns=[], items=[])
166 # If mappings are allowed, see if we were given a single mapping by
167 # trying to get items.
168 if coerceItemValue is not None:
169 rawItems = None
170 with contextlib.suppress(AttributeError):
171 rawItems = expression.items()
173 if rawItems is not None:
174 for k, v in rawItems:
175 try:
176 self.items.append((k, coerceItemValue(v)))
177 except Exception as err:
178 raise TypeError(f"Could not coerce mapping value '{v}' for key '{k}'.") from err
179 return self
181 # Not ..., a CategorizedWildcard instance, or a mapping. Just
182 # process scalars or an iterable. We put the body of the loop inside
183 # a local function so we can recurse after coercion.
185 def process(element: Any, alreadyCoerced: bool = False) -> EllipsisType | None:
186 if isinstance(element, str):
187 if defaultItemValue is not None:
188 self.items.append((element, defaultItemValue))
189 return None
190 else:
191 # This returns a list but we know we only passed in
192 # single value.
193 converted = globToRegex(element)
194 if converted is ...:
195 return ...
196 element = converted[0]
197 # Let regex and ... go through to the next check
198 if isinstance(element, str):
199 self.strings.append(element)
200 return None
201 if allowPatterns and isinstance(element, re.Pattern):
202 self.patterns.append(element)
203 return None
204 if alreadyCoerced:
205 try:
206 k, v = element
207 except TypeError:
208 raise TypeError(
209 f"Object '{element!r}' returned by coercion function must be `str` or `tuple`."
210 ) from None
211 else:
212 self.items.append((k, v))
213 return None
214 if coerceItemValue is not None:
215 try:
216 k, v = element
217 except TypeError:
218 pass
219 else:
220 if not isinstance(k, str):
221 raise TypeError(f"Item key '{k}' is not a string.")
222 try:
223 v = coerceItemValue(v)
224 except Exception as err:
225 raise TypeError(f"Could not coerce tuple item value '{v}' for key '{k}'.") from err
226 self.items.append((k, v))
227 return None
228 if coerceUnrecognized is not None:
229 try:
230 # This should be safe but flake8 cant tell that the
231 # function will be re-declared next function call
232 process(coerceUnrecognized(element), alreadyCoerced=True) # noqa: F821
233 except Exception as err:
234 raise TypeError(f"Could not coerce expression element '{element!r}'.") from err
235 else:
236 extra = "."
237 if isinstance(element, re.Pattern):
238 extra = " and patterns are not allowed."
239 raise TypeError(f"Unsupported object in wildcard expression: '{element!r}'{extra}")
240 return None
242 for element in ensure_iterable(expression):
243 retval = process(element)
244 if retval is ...:
245 # One of the globs matched everything
246 if not allowAny:
247 raise TypeError("This expression may not be unconstrained.")
248 return ...
249 del process
250 return self
252 strings: list[str]
253 """Explicit string values found in the wildcard (`list` [ `str` ]).
254 """
256 patterns: list[re.Pattern]
257 """Regular expression patterns found in the wildcard
258 (`list` [ `re.Pattern` ]).
259 """
261 items: list[tuple[str, Any]]
262 """Two-item tuples that relate string values to other objects
263 (`list` [ `tuple` [ `str`, `Any` ] ]).
264 """
# The private base class for `CollectionSearch` is defined differently
# depending on the installed pydantic major version.  Both variants wrap a
# tuple of collection names and expose it through a ``root`` attribute.
if PYDANTIC_V2:
    from pydantic import RootModel  # type: ignore

    # Pydantic v2: the wrapped value is declared directly as the ``root``
    # field of a `RootModel`.
    class _CollectionSearch(RootModel):
        root: tuple[str, ...]

else:
    from pydantic import BaseModel

    # Pydantic v1: the wrapped value is declared as the ``__root__`` field;
    # the ``root`` property gives it the same spelling as the v2 variant.
    class _CollectionSearch(BaseModel, Sequence[str]):  # type: ignore
        __root__: tuple[str, ...]

        @property
        def root(self) -> tuple[str, ...]:
            # Return the wrapped tuple of collection names.
            return self.__root__
@deprecated(
    reason="Tuples of string collection names are now preferred. Will be removed after v26.",
    version="v25.0",
    category=FutureWarning,
)
class CollectionSearch(_CollectionSearch):
    """An ordered search path of collections.

    Prefer `fromExpression` over the regular constructor: the constructor
    does not validate its inputs, which can lead to confusing error messages
    downstream.

    Parameters
    ----------
    collections : `tuple` [ `str` ]
        Tuple of collection names, ordered from the first searched to the last
        searched.

    Notes
    -----
    A `CollectionSearch` is used to find a single dataset (or set of datasets
    with different dataset types or data IDs) according to its dataset type and
    data ID, giving preference to collections in the order in which they are
    specified.  A `CollectionWildcard` can be constructed from a broader range
    of expressions but does not order the collections to be searched.

    `CollectionSearch` is an immutable sequence of `str` collection names.

    A `CollectionSearch` instance constructed properly (e.g. via
    `fromExpression`) is a unique representation of a particular search path;
    it is exactly the same internally and compares as equal to any
    `CollectionSearch` constructed from an equivalent expression, regardless of
    how different the original expressions appear.
    """

    @classmethod
    def fromExpression(cls, expression: Any) -> CollectionSearch:
        """Build a `CollectionSearch` from a general expression.

        Parameters
        ----------
        expression
            May be:

            - a `str` collection name;
            - an iterable of `str` collection names;
            - another `CollectionSearch` instance (passed through
              unchanged).

            Duplicate entries will be removed (preserving the first appearance
            of each collection name).

        Returns
        -------
        collections : `CollectionSearch`
            A `CollectionSearch` instance.

        Raises
        ------
        CollectionExpressionError
            Raised if the expression cannot be categorized as a sequence of
            plain collection names.
        """
        # An existing instance is already standardized; hand it back as-is so
        # callers can normalize an expression once (e.g. to turn a single-pass
        # iterator into a multi-pass iterable) and reuse the result.
        if isinstance(expression, cls):
            return expression
        try:
            wildcard = CategorizedWildcard.fromExpression(
                expression,
                allowAny=False,
                allowPatterns=False,
            )
        except TypeError as err:
            raise CollectionExpressionError(str(err)) from None
        assert wildcard is not ...
        assert not wildcard.patterns
        assert not wildcard.items
        # dict keys keep insertion order, so this drops repeats while
        # preserving the first appearance of each name.
        names = tuple(dict.fromkeys(wildcard.strings))
        if PYDANTIC_V2:
            return cls(names)  # type: ignore
        return cls(__root__=names)  # type: ignore

    def explicitNames(self) -> Iterator[str]:
        """Iterate over collection names that were specified explicitly."""
        yield from self.root

    def __iter__(self) -> Iterator[str]:  # type: ignore
        yield from self.root

    def __len__(self) -> int:
        return len(self.root)

    def __getitem__(self, index: Any) -> str:
        return self.root[index]

    def __eq__(self, other: Any) -> bool:
        # Only another CollectionSearch with the same names can be equal.
        if not isinstance(other, CollectionSearch):
            return False
        return self.root == other.root

    def __str__(self) -> str:
        return f"[{', '.join(self)}]"

    def __repr__(self) -> str:
        return f"CollectionSearch({self.root!r})"
393@dataclasses.dataclass(frozen=True)
394class CollectionWildcard:
395 """A validated wildcard for collection names.
397 The `from_expression` method should almost always be used to construct
398 instances, as the regular constructor performs no checking of inputs (and
399 that can lead to confusing error messages downstream).
401 Notes
402 -----
403 `CollectionWildcard` is expected to be rarely used outside of `Registry`
404 (which uses it to back several of its "query" methods that take general
405 expressions for collections), but it may occasionally be useful outside
406 `Registry` as a way to preprocess expressions that contain single-pass
407 iterators into a form that can be used to call those `Registry` methods
408 multiple times.
409 """
411 strings: tuple[str, ...] = ()
412 """An an ordered list of explicitly-named collections. (`tuple` [ `str` ]).
413 """
415 patterns: tuple[re.Pattern, ...] | EllipsisType = ...
416 """Regular expression patterns to match against collection names, or the
417 special value ``...`` indicating all collections.
419 `...` must be accompanied by ``strings=()``.
420 """
422 def __post_init__(self) -> None:
423 if self.patterns is ... and self.strings:
424 raise ValueError(
425 f"Collection wildcard matches any string, but still has explicit strings {self.strings}."
426 )
428 @classmethod
429 def from_expression(cls, expression: Any, require_ordered: bool = False) -> CollectionWildcard:
430 """Process a general expression to construct a `CollectionWildcard`
431 instance.
433 Parameters
434 ----------
435 expression
436 May be:
437 - a `str` collection name;
438 - an `re.Pattern` instance to match (with `re.Pattern.fullmatch`)
439 against collection names;
440 - any iterable containing any of the above;
441 - another `CollectionWildcard` instance (passed through
442 unchanged).
444 Duplicate collection names will be removed (preserving the first
445 appearance of each collection name).
446 require_ordered : `bool`, optional
447 If `True` (`False` is default) require the expression to be
448 ordered, and raise `CollectionExpressionError` if it is not.
450 Returns
451 -------
452 wildcard : `CollectionWildcard`
453 A `CollectionWildcard` instance.
455 Raises
456 ------
457 CollectionExpressionError
458 Raised if the patterns has regular expression, glob patterns, or
459 the ``...`` wildcard, and ``require_ordered=True``.
460 """
461 if isinstance(expression, cls):
462 return expression
463 if expression is ...:
464 return cls()
465 wildcard = CategorizedWildcard.fromExpression(
466 expression,
467 allowAny=True,
468 allowPatterns=True,
469 )
470 if wildcard is ...:
471 return cls()
472 result = cls(
473 strings=tuple(wildcard.strings),
474 patterns=tuple(wildcard.patterns),
475 )
476 if require_ordered:
477 result.require_ordered()
478 return result
480 @classmethod
481 def from_names(cls, names: Iterable[str]) -> CollectionWildcard:
482 """Construct from an iterable of explicit collection names.
484 Parameters
485 ----------
486 names : `~collections.abc.Iterable` [ `str` ]
487 Iterable of collection names.
489 Returns
490 -------
491 wildcard : ~CollectionWildcard`
492 A `CollectionWildcard` instance. `require_ordered` is guaranteed
493 to succeed and return the given names in order.
494 """
495 return cls(strings=tuple(names), patterns=())
497 def require_ordered(self) -> tuple[str, ...]:
498 """Require that this wildcard contains no patterns, and return the
499 ordered tuple of names that it does hold.
501 Returns
502 -------
503 names : `tuple` [ `str` ]
504 Ordered tuple of collection names.
506 Raises
507 ------
508 CollectionExpressionError
509 Raised if the patterns has regular expression, glob patterns, or
510 the ``...`` wildcard.
511 """
512 if self.patterns:
513 raise CollectionExpressionError(
514 f"An ordered collection expression is required; got patterns {self.patterns}."
515 )
516 return self.strings
518 def empty(self) -> bool:
519 """Return true if both ``strings`` and ``patterns`` are empty."""
520 # bool(Ellipsis) is True
521 return not self.strings and not self.patterns
523 def __str__(self) -> str:
524 if self.patterns is ...:
525 return "..."
526 else:
527 terms = list(self.strings)
528 terms.extend(str(p) for p in self.patterns)
529 return "[{}]".format(", ".join(terms))
532@dataclasses.dataclass
533class DatasetTypeWildcard:
534 """A validated expression that resolves to one or more dataset types.
536 The `from_expression` method should almost always be used to construct
537 instances, as the regular constructor performs no checking of inputs (and
538 that can lead to confusing error messages downstream).
539 """
541 values: Mapping[str, DatasetType | None] = dataclasses.field(default_factory=dict)
542 """A mapping with `str` dataset type name keys and optional `DatasetType`
543 instances.
544 """
546 patterns: tuple[re.Pattern, ...] | EllipsisType = ...
547 """Regular expressions to be matched against dataset type names, or the
548 special value ``...`` indicating all dataset types.
550 Any pattern matching a dataset type is considered an overall match for
551 the expression.
552 """
554 @classmethod
555 def from_expression(cls, expression: Any) -> DatasetTypeWildcard:
556 """Construct an instance by analyzing the given expression.
558 Parameters
559 ----------
560 expression
561 Expression to analyze. May be any of the following:
563 - a `str` dataset type name;
564 - a `DatasetType` instance;
565 - a `re.Pattern` to match against dataset type names;
566 - an iterable whose elements may be any of the above (any dataset
567 type matching any element in the list is an overall match);
568 - an existing `DatasetTypeWildcard` instance;
569 - the special ``...`` ellipsis object, which matches any dataset
570 type.
572 Returns
573 -------
574 query : `DatasetTypeWildcard`
575 An instance of this class (new unless an existing instance was
576 passed in).
578 Raises
579 ------
580 DatasetTypeExpressionError
581 Raised if the given expression does not have one of the allowed
582 types.
583 """
584 if isinstance(expression, cls):
585 return expression
586 try:
587 wildcard = CategorizedWildcard.fromExpression(
588 expression, coerceUnrecognized=lambda d: (d.name, d)
589 )
590 except TypeError as err:
591 raise DatasetTypeExpressionError(f"Invalid dataset type expression: {expression!r}.") from err
592 if wildcard is ...:
593 return cls()
594 values: dict[str, DatasetType | None] = {}
595 for name in wildcard.strings:
596 values[name] = None
597 for name, item in wildcard.items:
598 if not isinstance(item, DatasetType):
599 raise DatasetTypeExpressionError(
600 f"Invalid value '{item}' of type {type(item)} in dataset type expression; "
601 "expected str, re.Pattern, DatasetType objects, iterables thereof, or '...'."
602 )
603 values[name] = item
604 return cls(values, patterns=tuple(wildcard.patterns))
606 def __str__(self) -> str:
607 if self.patterns is ...:
608 return "..."
609 else:
610 terms = list(self.values.keys())
611 terms.extend(str(p) for p in self.patterns)
612 return "[{}]".format(", ".join(terms))