# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""Support for file template string expansion."""

__all__ = ("FileTemplates", "FileTemplate", "FileTemplatesConfig", "FileTemplateValidationError")

import os.path
import string
import logging
from types import MappingProxyType

from typing import (
    TYPE_CHECKING,
    Any,
    Iterable,
    Mapping,
    Optional,
    Set,
    Tuple,
    Union,
)

from .config import Config
from .configSupport import processLookupConfigs, LookupKey
from .exceptions import ValidationError
from .dimensions import SkyPixDimension, DataCoordinate
from .datasets import DatasetRef
from .storageClass import StorageClass

if TYPE_CHECKING:
    from .dimensions import DimensionUniverse
    from .datasets import DatasetType

log = logging.getLogger(__name__)


class FileTemplateValidationError(ValidationError):
    """Exception for file template inconsistent with associated DatasetType."""

    pass


class FileTemplatesConfig(Config):
    """Configuration information for `FileTemplates`."""

    pass


class FileTemplates:
    """Collection of `FileTemplate` templates.

    Parameters
    ----------
    config : `FileTemplatesConfig` or `str`
        Load configuration.
    default : `str`, optional
        If not `None`, a default template to use if no template has
        been specified explicitly in the configuration.
    universe : `DimensionUniverse`
        The set of all known dimensions, used to normalize any lookup keys
        involving dimensions.

    Notes
    -----
    The configuration can include one level of hierarchy where an
    instrument-specific section can be defined to override more general
    template specifications. This is represented in YAML using a
    key of form ``instrument<name>`` which can then define templates
    that will be returned if a `DatasetRef` contains a matching instrument
    name in the data ID.

    A default fallback template can be specified using the key ``default``.
    Defaulting can be disabled in a child configuration by defining the
    value to be an empty string or a boolean `False`.

    The config is parsed using the function
    `~lsst.daf.butler.configSupport.processLookupConfigs`.
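
    Examples
    --------
    A configuration fragment of the kind described above might look like
    this in YAML (the dataset type names, instrument name, and template
    strings here are illustrative only)::

        default: "{run}/{datasetType}/{visit:?}"
        calexp: "{run}/{datasetType}/{visit:06d}"
        instrument<LSSTCam>:
          calexp: "{run}/{datasetType}/{detector:03d}/{visit:08d}"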
99 """

    defaultKey = LookupKey("default")
    """Configuration key associated with the default template."""

    def __init__(self, config: Union[FileTemplatesConfig, str],
                 default: Optional[str] = None, *,
                 universe: DimensionUniverse):
        self.config = FileTemplatesConfig(config)
        self._templates = {}

        contents = processLookupConfigs(self.config, universe=universe)

        # Determine the default to use -- defaults can be disabled if
        # we get a False or None
        defaultValue = contents.get(self.defaultKey, default)
        if defaultValue and not isinstance(defaultValue, str):
            raise RuntimeError("Default template value should be str, False, or None. "
                               f"Got '{defaultValue}'")
        self.default = FileTemplate(defaultValue) if isinstance(defaultValue, str) and defaultValue else None

        # Convert all the values to FileTemplate, handling defaults
        for key, templateStr in contents.items():
            if key == self.defaultKey:
                continue
            if not isinstance(templateStr, str):
                raise RuntimeError(f"Unexpected value in file template key {key}: {templateStr}")
            self._templates[key] = FileTemplate(templateStr)

    @property
    def templates(self) -> Mapping[LookupKey, FileTemplate]:
        """Return collection of templates indexed by lookup key (`dict`)."""
        return MappingProxyType(self._templates)

    def __contains__(self, key: LookupKey) -> bool:
        """Indicate whether the supplied key is present in the templates.

        Parameters
        ----------
        key : `LookupKey`
            Key to use to determine if a corresponding value is present
            in the templates.

        Returns
        -------
        in : `bool`
            `True` if the supplied key is present in the templates.
        """
        return key in self.templates

    def __getitem__(self, key: LookupKey) -> FileTemplate:
        return self.templates[key]

    def validateTemplates(self, entities: Iterable[Union[DatasetType, DatasetRef, StorageClass]],
                          logFailures: bool = False) -> None:
        """Validate the templates.

        Retrieves the template associated with each dataset type and
        validates the dimensions against the template.

        Parameters
        ----------
        entities : iterable of `DatasetType`, `DatasetRef`, or `StorageClass`
            Entities to validate against the matching templates. The
            entities can be of differing types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        FileTemplateValidationError
            Raised if an entity failed validation.

        Notes
        -----
        See `FileTemplate.validateTemplate()` for details on the validation.
        """
        unmatchedKeys = set(self.templates)
        failed = []
        for entity in entities:
            try:
                matchKey, template = self.getTemplateWithMatch(entity)
            except KeyError as e:
                # KeyError always quotes on stringification so strip here
                errMsg = str(e).strip('"\'')
                failed.append(errMsg)
                if logFailures:
                    log.critical("%s", errMsg)
                continue

            if matchKey in unmatchedKeys:
                unmatchedKeys.remove(matchKey)

            try:
                template.validateTemplate(entity)
            except FileTemplateValidationError as e:
                failed.append(f"{e} (via key '{matchKey}')")
                if logFailures:
                    log.critical("Template failure with key '%s': %s", matchKey, e)

        if logFailures and unmatchedKeys:
            log.warning("Unchecked keys: %s", ", ".join([str(k) for k in unmatchedKeys]))

        if failed:
            if len(failed) == 1:
                msg = str(failed[0])
            else:
                failMsg = ";\n".join(failed)
                msg = f"{len(failed)} template validation failures: {failMsg}"
            raise FileTemplateValidationError(msg)

    def getLookupKeys(self) -> Set[LookupKey]:
        """Retrieve the look up keys for all the template entries.

        Returns
        -------
        keys : `set` of `LookupKey`
            The keys available for matching a template.
        """
        return set(self.templates)

    def getTemplateWithMatch(self, entity: Union[DatasetRef, DatasetType,
                                                 StorageClass]) -> Tuple[LookupKey, FileTemplate]:
        """Retrieve the `FileTemplate` associated with the dataset type.

        Also retrieves the lookup key that was a match for this template.

        If the lookup name corresponds to a component, the base name for
        the component will be examined if the full component name does
        not match.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Instance to use to look for a corresponding template.
            A `DatasetType` name or a `StorageClass` name will be used
            depending on the supplied entity. Priority is given to a
            `DatasetType` name. An instrument-specific template can be
            selected if a `DatasetRef` is provided whose data ID includes
            an ``instrument`` value.

        Returns
        -------
        matchKey : `LookupKey`
            The key that resulted in the successful match.
        template : `FileTemplate`
            Template instance to use with that dataset type.

        Raises
        ------
        KeyError
            Raised if no template could be located for this dataset type.
        """
        # Get the names to use for lookup
        names = entity._lookupNames()

        # Get a location from the templates
        template = self.default
        source = self.defaultKey
        for name in names:
            if name in self.templates:
                template = self.templates[name]
                source = name
                break

        if template is None:
            raise KeyError(f"Unable to determine file template from supplied argument [{entity}]")

        log.debug("Got file template %s for %s via lookup key %s", template, entity, source)

        return source, template

    def getTemplate(self, entity: Union[DatasetType, DatasetRef, StorageClass]) -> FileTemplate:
        """Retrieve the `FileTemplate` associated with the dataset type.

        If the lookup name corresponds to a component, the base name for
        the component will be examined if the full component name does
        not match.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Instance to use to look for a corresponding template.
            A `DatasetType` name or a `StorageClass` name will be used
            depending on the supplied entity. Priority is given to a
            `DatasetType` name. An instrument-specific template can be
            selected if a `DatasetRef` is provided whose data ID includes
            an ``instrument`` value.

        Returns
        -------
        template : `FileTemplate`
            Template instance to use with that dataset type.

        Raises
        ------
        KeyError
            Raised if no template could be located for this dataset type.
        """
        _, template = self.getTemplateWithMatch(entity)
        return template


class FileTemplate:
    """Format a path template into a fully expanded path.

    Parameters
    ----------
    template : `str`
        Template string.

    Raises
    ------
    FileTemplateValidationError
        Raised if the template fails basic validation.

    Notes
    -----
    The templates use the standard Format Specification Mini-Language
    with the caveat that only named fields can be used. The field names
    are taken from the Dimensions along with several additional fields:

    - datasetType: `str`, `DatasetType.name`
    - component: `str`, name of the StorageClass component
    - run: `str`, name of the run this dataset was added with

    ``run`` must always be provided to ensure unique paths.

    More detailed information can be requested from dimensions by using a dot
    notation, so ``visit.name`` would use the name of the visit and
    ``detector.name_in_raft`` would use the name of the detector within the
    raft.

    The mini-language is extended to understand a "?" in the format
    specification. This indicates that a field is optional. If the
    corresponding dimension is missing, the field, along with the text
    preceding it (unless that text is a path separator), will be removed
    from the output path.

    By default any "/" in a dataId value will be replaced by "_" to prevent
    unexpected directories being created in the path. If the "/" should be
    retained then a special "/" format specifier can be included in the
    template.
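
    Examples
    --------
    A minimal illustration of construction and field introspection; the
    template string here is purely illustrative:

    >>> template = FileTemplate("{run}/{datasetType}/{visit.name:?}")
    >>> print(template)
    {run}/{datasetType}/{visit.name:?}
    >>> sorted(template.fields(optionals=True, specials=True))
    ['datasetType', 'run', 'visit']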
343 """

    mandatoryFields = {"run"}
    """A set of fields, one of which must be present in a template."""

    datasetFields = {"datasetType", "component"}
    """Fields related to the supplied dataset, not a dimension."""

    specialFields = mandatoryFields | datasetFields
    """Set of special fields that are available independently of the defined
    Dimensions."""

    def __init__(self, template: str):
        if not isinstance(template, str):
            raise FileTemplateValidationError(f"Template ('{template}') does "
                                              "not appear to be a string.")
        self.template = template

        # Do basic validation without access to dimensions
        self.validateTemplate(None)

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, FileTemplate):
            return False

        return self.template == other.template

    def __str__(self) -> str:
        return self.template

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}("{self.template}")'

    def fields(self, optionals: bool = False, specials: bool = False, subfields: bool = False) -> Set[str]:
        """Return the field names used in this template.

        Parameters
        ----------
        optionals : `bool`, optional
            If `True`, optional fields are included in the returned set.
        specials : `bool`, optional
            If `True`, non-dimension fields are included.
        subfields : `bool`, optional
            If `True`, fields with syntax ``a.b`` are included. If `False`,
            the default, only ``a`` would be returned.

        Returns
        -------
        names : `set`
            Names of fields used in this template.

        Notes
        -----
        The returned set will include the special values such as
        ``datasetType`` and ``component``.
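
        Examples
        --------
        A sketch of how the flags interact; the template here is purely
        illustrative:

        >>> t = FileTemplate("{run}/{datasetType}.{component:?}/{visit:?}")
        >>> sorted(t.fields())
        []
        >>> sorted(t.fields(optionals=True))
        ['visit']
        >>> sorted(t.fields(optionals=True, specials=True))
        ['component', 'datasetType', 'run', 'visit']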
398 """
399 fmt = string.Formatter()
400 parts = fmt.parse(self.template)
402 names = set()
403 for literal, field_name, format_spec, conversion in parts:
404 if field_name is not None and format_spec is not None:
405 if "?" in format_spec and not optionals:
406 continue
408 if not specials and field_name in self.specialFields:
409 continue
411 if "." in field_name and not subfields:
412 field_name, _ = field_name.split(".")
414 names.add(field_name)
416 return names

    def format(self, ref: DatasetRef) -> str:
        """Format a template string into a full path.

        Parameters
        ----------
        ref : `DatasetRef`
            The dataset to be formatted.

        Returns
        -------
        path : `str`
            Expanded path.

        Raises
        ------
        KeyError
            Raised if a requested field is not defined and the field is
            not optional. Also raised if a ``component`` is specified for
            the dataset but the template does not use it.
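
        Examples
        --------
        As an illustration (with a hypothetical dataset): for the template
        ``{run}/{datasetType}/{visit:06d}``, a dataset of dataset type
        ``calexp`` in run ``r1`` with data ID ``{"visit": 42}`` would
        expand to the path ``r1/calexp/000042``.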
437 """
438 # Extract defined non-None dimensions from the dataId.
439 # This guards against Nones being explicitly present in the data ID
440 # (which can happen if, say, an exposure has no filter), as well as
441 # the case where only required dimensions are present (which in this
442 # context should only happen in unit tests; in general we need all
443 # dimensions to fill out templates).
444 fields = {k: ref.dataId.get(k) for k in ref.datasetType.dimensions.names
445 if ref.dataId.get(k) is not None}
        # Extra information that can be included using . syntax
        extras = {}
        if isinstance(ref.dataId, DataCoordinate):
            if ref.dataId.hasRecords():
                extras = ref.dataId.records.byName()
            skypix_alias = self._determine_skypix_alias(ref)
            if skypix_alias is not None:
                fields["skypix"] = fields[skypix_alias]
                if extras:
                    extras["skypix"] = extras[skypix_alias]

        datasetType = ref.datasetType
        fields["datasetType"], component = datasetType.nameAndComponent()

        usedComponent = False
        if component is not None:
            fields["component"] = component

        usedRun = False
        fields["run"] = ref.run

        fmt = string.Formatter()
        parts = fmt.parse(self.template)
        output = ""

        for literal, field_name, format_spec, conversion in parts:

            if field_name == "component":
                usedComponent = True

            if format_spec is None:
                output = output + literal
                continue

            # field_name can only be None when format_spec is also None,
            # and that case was handled above, so this is purely defensive.
            if field_name is None:
                raise RuntimeError(f"Unexpected blank field_name encountered in {self.template} [{literal}]")

            if "?" in format_spec:
                optional = True
                # Remove the non-standard character from the spec
                format_spec = format_spec.replace("?", "")
            else:
                optional = False

            if field_name == "run":
                usedRun = True

            if field_name == "collection":
                raise KeyError("'collection' is no longer supported as a "
                               "file template placeholder; use 'run' instead.")

            # Check for request for additional information from the dataId
            if "." in field_name:
                primary, secondary = field_name.split(".")
                if primary in extras:
                    record = extras[primary]
                    # Only fill in the fields if we have a value; the
                    # KeyError will trigger below if the attribute is missing.
                    if hasattr(record, secondary):
                        fields[field_name] = getattr(record, secondary)

            if field_name in fields:
                value = fields[field_name]
            elif optional:
                # If this is optional ignore the format spec
                # and do not include the literal text prior to the optional
                # field unless it contains a "/" path separator
                format_spec = ""
                value = ""
                if "/" not in literal:
                    literal = ""
            else:
                raise KeyError(f"'{field_name}' requested in template via '{self.template}' "
                               "but not defined and not optional")

            # Handle "/" in values since we do not want to be surprised by
            # unexpected directories turning up
            replace_slash = True
            if "/" in format_spec:
                # Remove the non-standard character from the spec
                format_spec = format_spec.replace("/", "")
                replace_slash = False

            if isinstance(value, str):
                # Replace spaces with underscores for more friendly file paths
                value = value.replace(" ", "_")
                if replace_slash:
                    value = value.replace("/", "_")

            # Now use standard formatting
            output = output + literal + format(value, format_spec)

        # Replace periods with underscores in the non-directory part to
        # prevent file extension confusion. Also replace # in the non-dir
        # part to avoid confusion with URI fragments
        head, tail = os.path.split(output)
        tail = tail.replace(".", "_")
        tail = tail.replace("#", "HASH")
        output = os.path.join(head, tail)

        # Complain if we were meant to use a component
        if component is not None and not usedComponent:
            raise KeyError(f"Component '{component}' specified but "
                           f"template {self.template} did not use it")

        # Complain if there's no run
        if not usedRun:
            raise KeyError("Template does not include 'run'.")

        # Since this is known to be a path, normalize it in case some double
        # slashes have crept in
        path = os.path.normpath(output)

        # It should not be an absolute path (may happen with optionals)
        if os.path.isabs(path):
            path = os.path.relpath(path, start="/")

        return path

    def validateTemplate(self, entity: Union[DatasetRef, DatasetType, StorageClass, None]) -> None:
        """Compare the template against a supplied entity that wants to use it.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, `StorageClass`, or `None`
            Entity to compare against the template. If `None` is given, only
            very basic validation of the template will be performed.

        Raises
        ------
        FileTemplateValidationError
            Raised if the template is inconsistent with the supplied entity.

        Notes
        -----
        Validation will always include a check that mandatory fields
        are present and that at least one field refers to a dimension.
        If the supplied entity includes a `DimensionGraph` then it will be
        used to compare the available dimensions with those specified in the
        template.
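
        Examples
        --------
        A template without the mandatory ``run`` field fails the basic
        validation performed at construction time:

        >>> try:
        ...     FileTemplate("{visit}")
        ... except FileTemplateValidationError as e:
        ...     print(e)
        Template '{visit}' is missing a mandatory field from {'run'}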
587 """
588 # Check that the template has run
589 withSpecials = self.fields(specials=True, optionals=True)
590 if not withSpecials & self.mandatoryFields:
591 raise FileTemplateValidationError(f"Template '{self}' is missing a mandatory field"
592 f" from {self.mandatoryFields}")
594 # Check that there are some dimension fields in the template
595 allfields = self.fields(optionals=True)
596 if not allfields:
597 raise FileTemplateValidationError(f"Template '{self}' does not seem to have any fields"
598 " corresponding to dimensions.")
600 # If we do not have dimensions available then all we can do is shrug
601 if not hasattr(entity, "dimensions"):
602 return
604 # Mypy does not know about hasattr so help it out
605 if entity is None:
606 return

        # If this entity represents a component then insist that component
        # is present in the template. If the entity is not a component
        # make sure that component is not mandatory.
        try:
            # mypy does not see the except block so complains about
            # StorageClass not supporting isComponent
            if entity.isComponent():  # type: ignore
                if "component" not in withSpecials:
                    raise FileTemplateValidationError(f"Template '{self}' has no component but "
                                                      f"{entity} refers to a component.")
            else:
                mandatorySpecials = self.fields(specials=True)
                if "component" in mandatorySpecials:
                    raise FileTemplateValidationError(f"Template '{self}' has mandatory component but "
                                                      f"{entity} does not refer to a component.")
        except AttributeError:
            pass

        # From here on we need at least a DatasetType
        # Mypy doesn't understand the AttributeError clause below
        if isinstance(entity, StorageClass):
            return

        # Get the dimension links to get the full set of available field names
        # Fall back to dataId keys if we have them but no links.
        # dataId keys must still be present in the template
        try:
            minimal = set(entity.dimensions.required.names)
            maximal = set(entity.dimensions.names)
        except AttributeError:
            try:
                minimal = set(entity.dataId.keys().names)  # type: ignore
                maximal = minimal
            except AttributeError:
                return

        # Replace specific skypix dimensions with the generic one
        skypix_alias = self._determine_skypix_alias(entity)
        if skypix_alias is not None:
            minimal.add("skypix")
            maximal.add("skypix")
            minimal.remove(skypix_alias)
            maximal.remove(skypix_alias)

        required = self.fields(optionals=False)

        # Calculate any field usage that does not match a dimension
        if not required.issubset(maximal):
            raise FileTemplateValidationError(f"Template '{self}' is inconsistent with {entity}:"
                                              f" {required} is not a subset of {maximal}.")

        if not allfields.issuperset(minimal):
            raise FileTemplateValidationError(f"Template '{self}' is inconsistent with {entity}:"
                                              f" {allfields} is not a superset of {minimal}.")

        return

    def _determine_skypix_alias(self, entity: Union[DatasetRef, DatasetType]) -> Optional[str]:
        """Return the dimension name that refers to a sky pixel.

        Parameters
        ----------
        entity : `DatasetRef` or `DatasetType`
            The entity to examine.

        Returns
        -------
        alias : `str` or `None`
            If there is a sky pixelization in the supplied dataId, return
            its name; otherwise return `None`. `None` is also returned if
            there is more than one sky pix dimension in the data ID or if
            the dataId is not a `DataCoordinate`.
        """
        alias = None

        if isinstance(entity, DatasetRef):
            entity = entity.datasetType

        # If there is exactly one SkyPixDimension in the data ID, alias its
        # value with the key "skypix", so we can use that to match any
        # skypix dimension.
        # We restrict this behavior to the (real-world) case where the
        # data ID is a DataCoordinate, not just a dict. That should only
        # not be true in some test code, but that test code is a pain to
        # update to be more like the real world while still providing our
        # only tests of important behavior.
        skypix = [dimension for dimension in entity.dimensions
                  if isinstance(dimension, SkyPixDimension)]
        if len(skypix) == 1:
            alias = skypix[0].name
        return alias