# python/lsst/daf/butler/core/fileTemplates.py
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""Support for file template string expansion."""

__all__ = ("FileTemplates", "FileTemplate", "FileTemplatesConfig", "FileTemplateValidationError")

import logging
import os.path
import string
from types import MappingProxyType
from typing import TYPE_CHECKING, Any, Iterable, Mapping, Optional, Set, Tuple, Union

from .config import Config
from .configSupport import LookupKey, processLookupConfigs
from .datasets import DatasetRef
from .dimensions import DataCoordinate, SkyPixDimension
from .exceptions import ValidationError
from .storageClass import StorageClass

if TYPE_CHECKING:
    from .datasets import DatasetType
    from .dimensions import DimensionUniverse

log = logging.getLogger(__name__)


class FileTemplateValidationError(ValidationError):
    """Exception for file template inconsistent with associated DatasetType."""

    pass


class FileTemplatesConfig(Config):
    """Configuration information for `FileTemplates`."""

    pass


class FileTemplates:
    """Collection of `FileTemplate` templates.

    Parameters
    ----------
    config : `FileTemplatesConfig` or `str`
        Load configuration.
    default : `str`, optional
        If not `None`, a default template to use if no template has
        been specified explicitly in the configuration.
    universe : `DimensionUniverse`
        The set of all known dimensions, used to normalize any lookup keys
        involving dimensions.

    Notes
    -----
    The configuration can include one level of hierarchy where an
    instrument-specific section can be defined to override more general
    template specifications. This is represented in YAML using a
    key of the form ``instrument<name>`` which can then define templates
    that will be returned if a `DatasetRef` contains a matching instrument
    name in the data ID.

    A default fallback template can be specified using the key ``default``.
    Defaulting can be disabled in a child configuration by defining the
    value to be an empty string or a boolean `False`.

    The config is parsed using the function
    `~lsst.daf.butler.configSupport.processLookupConfigs`.
    """
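
    # A hypothetical sketch of a configuration consistent with the Notes
    # above; the dataset type names and template strings are illustrative
    # only:
    #
    #     default: "{run:/}/{datasetType}/{visit:08d}/{id}_{datasetType}"
    #     calexp: "{run:/}/{datasetType}/{visit:08d}/{id}_{detector}"
    #     instrument<HSC>:
    #       calexp: "{run:/}/hsc/{visit:08d}/{id}_{detector}"
    #
    # With such a config a ``calexp`` `DatasetRef` whose data ID has
    # ``instrument="HSC"`` matches the ``instrument<HSC>`` override, other
    # ``calexp`` refs match the general entry, and everything else falls
    # back to ``default``.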

    defaultKey = LookupKey("default")
    """Configuration key associated with the default template."""

    def __init__(
        self,
        config: Union[FileTemplatesConfig, str],
        default: Optional[str] = None,
        *,
        universe: DimensionUniverse,
    ):
        self.config = FileTemplatesConfig(config)
        self._templates = {}

        contents = processLookupConfigs(self.config, universe=universe)

        # Determine the default to use -- defaulting can be disabled if
        # we get a False or None
        defaultValue = contents.get(self.defaultKey, default)
        if defaultValue and not isinstance(defaultValue, str):
            raise RuntimeError(
                f"Default template value should be str, False, or None. Got '{defaultValue}'"
            )
        self.default = FileTemplate(defaultValue) if isinstance(defaultValue, str) and defaultValue else None

        # Convert all the values to FileTemplate, handling defaults
        for key, templateStr in contents.items():
            if key == self.defaultKey:
                continue
            if not isinstance(templateStr, str):
                raise RuntimeError(f"Unexpected value in file template key {key}: {templateStr}")
            self._templates[key] = FileTemplate(templateStr)

    @property
    def templates(self) -> Mapping[LookupKey, FileTemplate]:
        """Return collection of templates indexed by lookup key (`dict`)."""
        return MappingProxyType(self._templates)

    def __contains__(self, key: LookupKey) -> bool:
        """Indicate whether the supplied key is present in the templates.

        Parameters
        ----------
        key : `LookupKey`
            Key to use to determine if a corresponding value is present
            in the templates.

        Returns
        -------
        in : `bool`
            `True` if the supplied key is present in the templates.
        """
        return key in self.templates

    def __getitem__(self, key: LookupKey) -> FileTemplate:
        return self.templates[key]

    def validateTemplates(
        self, entities: Iterable[Union[DatasetType, DatasetRef, StorageClass]], logFailures: bool = False
    ) -> None:
        """Validate the templates.

        Retrieves the template associated with each dataset type and
        validates the dimensions against the template.

        Parameters
        ----------
        entities : iterable of `DatasetType`, `DatasetRef`, or `StorageClass`
            Entities to validate against the matching templates. Can be
            differing types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        FileTemplateValidationError
            Raised if an entity failed validation.

        Notes
        -----
        See `FileTemplate.validateTemplate()` for details on the validation.
        """
        unmatchedKeys = set(self.templates)
        failed = []
        for entity in entities:
            try:
                matchKey, template = self.getTemplateWithMatch(entity)
            except KeyError as e:
                # KeyError always quotes on stringification so strip here
                errMsg = str(e).strip("\"'")
                failed.append(errMsg)
                if logFailures:
                    log.critical("%s", errMsg)
                continue

            if matchKey in unmatchedKeys:
                unmatchedKeys.remove(matchKey)

            try:
                template.validateTemplate(entity)
            except FileTemplateValidationError as e:
                failed.append(f"{e} (via key '{matchKey}')")
                if logFailures:
                    log.critical("Template failure with key '%s': %s", matchKey, e)

        if logFailures and unmatchedKeys:
            log.warning("Unchecked keys: %s", ", ".join([str(k) for k in unmatchedKeys]))

        if failed:
            if len(failed) == 1:
                msg = str(failed[0])
            else:
                failMsg = ";\n".join(failed)
                msg = f"{len(failed)} template validation failures: {failMsg}"
            raise FileTemplateValidationError(msg)

    def getLookupKeys(self) -> Set[LookupKey]:
        """Retrieve the lookup keys for all the template entries.

        Returns
        -------
        keys : `set` of `LookupKey`
            The keys available for matching a template.
        """
        return set(self.templates)

    def getTemplateWithMatch(
        self, entity: Union[DatasetRef, DatasetType, StorageClass]
    ) -> Tuple[LookupKey, FileTemplate]:
        """Retrieve the `FileTemplate` associated with the dataset type.

        Also retrieves the lookup key that was a match for this template.

        If the lookup name corresponds to a component the base name for
        the component will be examined if the full component name does
        not match.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Instance to use to look for a corresponding template.
            A `DatasetType` name or a `StorageClass` name will be used
            depending on the supplied entity. Priority is given to a
            `DatasetType` name. Supports instrument override if a
            `DatasetRef` is provided configured with an ``instrument``
            value for the data ID.

        Returns
        -------
        matchKey : `LookupKey`
            The key that resulted in the successful match.
        template : `FileTemplate`
            Template instance to use with that dataset type.

        Raises
        ------
        KeyError
            Raised if no template could be located for this dataset type.
        """
        # Get the names to use for lookup
        names = entity._lookupNames()

        # Get a location from the templates
        template = self.default
        source = self.defaultKey
        for name in names:
            if name in self.templates:
                template = self.templates[name]
                source = name
                break

        if template is None:
            raise KeyError(f"Unable to determine file template from supplied argument [{entity}]")

        log.debug("Got file %s from %s via %s", template, entity, source)

        return source, template
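
    # A sketch of the lookup priority described above (the names are
    # hypothetical): for a ``calexp`` `DatasetRef` with storage class
    # ``ExposureF`` and ``instrument="HSC"`` in its data ID, the names from
    # ``entity._lookupNames()`` are tried in order, so an
    # ``instrument<HSC>``-specific ``calexp`` template is preferred over a
    # general ``calexp`` template, which in turn is preferred over an
    # ``ExposureF`` storage-class match or the ``default``.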

    def getTemplate(self, entity: Union[DatasetType, DatasetRef, StorageClass]) -> FileTemplate:
        """Retrieve the `FileTemplate` associated with the dataset type.

        If the lookup name corresponds to a component the base name for
        the component will be examined if the full component name does
        not match.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Instance to use to look for a corresponding template.
            A `DatasetType` name or a `StorageClass` name will be used
            depending on the supplied entity. Priority is given to a
            `DatasetType` name. Supports instrument override if a
            `DatasetRef` is provided configured with an ``instrument``
            value for the data ID.

        Returns
        -------
        template : `FileTemplate`
            Template instance to use with that dataset type.

        Raises
        ------
        KeyError
            Raised if no template could be located for this dataset type.
        """
        _, template = self.getTemplateWithMatch(entity)
        return template


class FileTemplate:
    """Format a path template into a fully expanded path.

    Parameters
    ----------
    template : `str`
        Template string.

    Raises
    ------
    FileTemplateValidationError
        Raised if the template fails basic validation.

    Notes
    -----
    The templates use the standard Format Specification Mini-Language
    with the caveat that only named fields can be used. The field names
    are taken from the Dimensions along with several additional fields:

    - datasetType: `str`, `DatasetType.name`
    - component: `str`, name of the StorageClass component
    - run: `str`, name of the run this dataset was added with
    - id: unique ID of the dataset

    At least one of `run` or `id` must always be provided to ensure
    unique paths.

    More detailed information can be requested from dimensions by using a dot
    notation, so ``visit.name`` would use the name of the visit and
    ``detector.name_in_raft`` would use the name of the detector within the
    raft.

    The mini-language is extended to understand a "?" in the format
    specification. This indicates that a field is optional. If that
    dimension is missing, the field, along with the text preceding it
    (unless that text contains a path separator), will be removed from the
    output path.

    By default any "/" in a dataId value will be replaced by "_" to prevent
    unexpected directories being created in the path. If the "/" should be
    retained then a special "/" format specifier can be included in the
    template.
    """
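
    # A hypothetical template illustrating the mini-language above (the
    # dimension names assume a universe defining ``visit``):
    #
    #     "{run:/}/{datasetType}/{visit:08d}/{id}_{datasetType}_{component:?}"
    #
    # ``{run:/}`` retains any "/" in the run name as directory separators,
    # ``{visit:08d}`` zero-pads the visit number, and ``{component:?}`` is
    # optional: for a non-component dataset both the field and the "_"
    # before it are dropped from the file name.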

    mandatoryFields = {"run", "id"}
    """A set of fields, one of which must be present in a template."""

    datasetFields = {"datasetType", "component"}
    """Fields related to the supplied dataset, not a dimension."""

    specialFields = mandatoryFields | datasetFields
    """Set of special fields that are available independently of the defined
    Dimensions."""

    def __init__(self, template: str):
        if not isinstance(template, str):
            raise FileTemplateValidationError(
                f"Template ('{template}') is not a string."
            )
        self.template = template

        # Do basic validation without access to dimensions
        self.validateTemplate(None)

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, FileTemplate):
            return False

        return self.template == other.template

    def __str__(self) -> str:
        return self.template

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}("{self.template}")'

    def fields(self, optionals: bool = False, specials: bool = False, subfields: bool = False) -> Set[str]:
        """Return the field names used in this template.

        Parameters
        ----------
        optionals : `bool`, optional
            If `True`, optional fields are included in the returned set.
        specials : `bool`, optional
            If `True`, non-dimension fields are included.
        subfields : `bool`, optional
            If `True`, fields with syntax ``a.b`` are included. If `False`,
            the default, only ``a`` would be returned.

        Returns
        -------
        names : `set`
            Names of fields used in this template.

        Notes
        -----
        The returned set can include the special values such as `datasetType`
        and `component` when ``specials`` is `True`.
        """
        fmt = string.Formatter()
        parts = fmt.parse(self.template)

        names = set()
        for literal, field_name, format_spec, conversion in parts:
            if field_name is not None and format_spec is not None:
                if "?" in format_spec and not optionals:
                    continue

                if not specials and field_name in self.specialFields:
                    continue

                if "." in field_name and not subfields:
                    field_name, _ = field_name.split(".")

                names.add(field_name)

        return names
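
    # A sketch of `fields` on a hypothetical template:
    #
    #     t = FileTemplate("{run:/}/{datasetType}/{visit:08d}_{component:?}_{id}")
    #     t.fields()                               # {"visit"}
    #     t.fields(specials=True)                  # adds "run", "datasetType", "id"
    #     t.fields(specials=True, optionals=True)  # adds "component" as well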

    def format(self, ref: DatasetRef) -> str:
        """Format a template string into a full path.

        Parameters
        ----------
        ref : `DatasetRef`
            The dataset to be formatted.

        Returns
        -------
        path : `str`
            Expanded path.

        Raises
        ------
        KeyError
            Raised if the requested field is not defined and the field is
            not optional. Also raised if the dataset type has a component
            but ``component`` is not part of the template.
        RuntimeError
            Raised if a template uses dimension record metadata but no
            records are attached to the `DatasetRef`.
        """
        # Extract defined non-None dimensions from the dataId.
        # This guards against Nones being explicitly present in the data ID
        # (which can happen if, say, an exposure has no filter), as well as
        # the case where only required dimensions are present (which in this
        # context should only happen in unit tests; in general we need all
        # dimensions to fill out templates).
        fields = {
            k: ref.dataId.get(k) for k in ref.datasetType.dimensions.names if ref.dataId.get(k) is not None
        }
        # Extra information that can be included using . syntax
        extras = {}
        if isinstance(ref.dataId, DataCoordinate):
            if ref.dataId.hasRecords():
                extras = ref.dataId.records.byName()
            skypix_alias = self._determine_skypix_alias(ref)
            if skypix_alias is not None:
                fields["skypix"] = fields[skypix_alias]
                if extras:
                    extras["skypix"] = extras[skypix_alias]

        datasetType = ref.datasetType
        fields["datasetType"], component = datasetType.nameAndComponent()

        usedComponent = False
        if component is not None:
            fields["component"] = component

        fields["run"] = ref.run
        fields["id"] = ref.id

        fmt = string.Formatter()
        parts = fmt.parse(self.template)
        output = ""

        for literal, field_name, format_spec, conversion in parts:

            if field_name == "component":
                usedComponent = True

            if format_spec is None:
                output = output + literal
                continue

            # A blank field name should only be possible when format_spec
            # is None, which was handled above
            if field_name is None:
                raise RuntimeError(f"Unexpected blank field_name encountered in {self.template} [{literal}]")

            if "?" in format_spec:
                optional = True
                # Remove the non-standard character from the spec
                format_spec = format_spec.replace("?", "")
            else:
                optional = False

            # Check for request for additional information from the dataId
            if "." in field_name:
                primary, secondary = field_name.split(".")
                if primary in extras:
                    record = extras[primary]
                    # Only fill in the fields if we have a value; the
                    # KeyError will trigger below if the attribute is missing,
                    # but only if it is not optional. This is most likely
                    # a typo in the metadata field and so should be reported
                    # even if optional.
                    if hasattr(record, secondary):
                        fields[field_name] = getattr(record, secondary)
                    else:
                        # Is a log message sufficient?
                        log.info(
                            "Template field %s could not be resolved because metadata field %s"
                            " is not understood for dimension %s. Template entry will be ignored",
                            field_name,
                            secondary,
                            primary,
                        )
                elif primary in fields:
                    # We do have an entry for the primary but do not have any
                    # secondary entries. This is likely a problem with the
                    # code failing to attach a record to the DatasetRef.
                    raise RuntimeError(
                        f"No metadata records attached to dataset {ref}"
                        f" when attempting to expand field {field_name}."
                        " Either expand the DatasetRef or change the template."
                    )

            if field_name in fields:
                value = fields[field_name]
            elif optional:
                # If this is optional ignore the format spec
                # and do not include the literal text prior to the optional
                # field unless it contains a "/" path separator
                format_spec = ""
                value = ""
                if "/" not in literal:
                    literal = ""
            else:
                raise KeyError(
                    f"'{field_name}' requested in template via '{self.template}' "
                    "but not defined and not optional"
                )

            # Handle "/" in values since we do not want to be surprised by
            # unexpected directories turning up
            replace_slash = True
            if "/" in format_spec:
                # Remove the non-standard character from the spec
                format_spec = format_spec.replace("/", "")
                replace_slash = False

            if isinstance(value, str):
                # Replace spaces with underscores for more friendly file paths
                value = value.replace(" ", "_")
                if replace_slash:
                    value = value.replace("/", "_")

            # Now use standard formatting
            output = output + literal + format(value, format_spec)

        # Replace periods with underscores in the non-directory part to
        # prevent file extension confusion. Also replace # in the non-dir
        # part to avoid confusion with URI fragments
        head, tail = os.path.split(output)
        tail = tail.replace(".", "_")
        tail = tail.replace("#", "HASH")
        output = os.path.join(head, tail)

        # Complain if we were meant to use a component
        if component is not None and not usedComponent:
            raise KeyError(
                f"Component '{component}' specified but template {self.template} did not use it"
            )

        # Since this is known to be a path, normalize it in case some double
        # slashes have crept in
        path = os.path.normpath(output)

        # It should not be an absolute path (may happen with optionals)
        if os.path.isabs(path):
            path = os.path.relpath(path, start="/")

        return path
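
    # A hypothetical sketch of the expansion: with the template
    # "{run:/}/{datasetType}/{visit:08d}_{component:?}_{id}", a ref with
    # run="u/alice/run1", dataset type "calexp.wcs" (parent "calexp",
    # component "wcs"), visit=1234, and id=42 would format to
    # "u/alice/run1/calexp/00001234_wcs_42". Without a component, the
    # optional "_{component:?}" segment would be dropped, giving
    # "u/alice/run1/calexp/00001234_42".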

    def validateTemplate(self, entity: Union[DatasetRef, DatasetType, StorageClass, None]) -> None:
        """Compare the template against supplied entity that wants to use it.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Entity to compare against template. If `None` is given only
            very basic validation of templates will be performed.

        Raises
        ------
        FileTemplateValidationError
            Raised if the template is inconsistent with the supplied entity.

        Notes
        -----
        Validation will always include a check that mandatory fields
        are present and that at least one field refers to a dimension.
        If the supplied entity includes a `DimensionGraph` then it will be
        used to compare the available dimensions with those specified in the
        template.
        """
        # Check that the template has run
        withSpecials = self.fields(specials=True, optionals=True)

        if "collection" in withSpecials:
            raise FileTemplateValidationError(
                "'collection' is no longer supported as a file template placeholder; use 'run' instead."
            )

        if not withSpecials & self.mandatoryFields:
            raise FileTemplateValidationError(
                f"Template '{self}' is missing a mandatory field from {self.mandatoryFields}"
            )

        # Check that there are some dimension fields in the template.
        # The id is allowed instead if present since that also uniquely
        # identifies the file in the datastore.
        allfields = self.fields(optionals=True)
        if not allfields and "id" not in withSpecials:
            raise FileTemplateValidationError(
                f"Template '{self}' does not seem to have any fields corresponding to dimensions."
            )

        # Require that if "id" is in the template then it must exist in the
        # file part -- this avoids templates like "{id}/fixed" where the file
        # name is fixed but the directory has the ID.
        if "id" in withSpecials:
            file_part = os.path.split(self.template)[-1]
            if "{id}" not in file_part:
                raise FileTemplateValidationError(
                    f"Template '{self}' includes the 'id' but that ID is not part of the file name."
                )

        # If we do not have dimensions available then all we can do is shrug
        if not hasattr(entity, "dimensions"):
            return

        # Mypy does not know about hasattr so help it out
        if entity is None:
            return

        # If this entity represents a component then insist that component
        # is present in the template. If the entity is not a component
        # make sure that component is not mandatory.
        try:
            # mypy does not see the except block so complains about
            # StorageClass not supporting isComponent
            if entity.isComponent():  # type: ignore
                if "component" not in withSpecials:
                    raise FileTemplateValidationError(
                        f"Template '{self}' has no component but {entity} refers to a component."
                    )
            else:
                mandatorySpecials = self.fields(specials=True)
                if "component" in mandatorySpecials:
                    raise FileTemplateValidationError(
                        f"Template '{self}' has mandatory component but "
                        f"{entity} does not refer to a component."
                    )
        except AttributeError:
            pass

        # From here on we need at least a DatasetType.
        # Mypy doesn't understand the AttributeError clause below
        if isinstance(entity, StorageClass):
            return

        # Get the dimension links to get the full set of available field
        # names. Fall back to dataId keys if we have them but no links.
        # dataId keys must still be present in the template.
        try:
            minimal = set(entity.dimensions.required.names)
            maximal = set(entity.dimensions.names)
        except AttributeError:
            try:
                minimal = set(entity.dataId.keys().names)  # type: ignore
                maximal = minimal
            except AttributeError:
                return

        # Replace specific skypix dimensions with generic one
        skypix_alias = self._determine_skypix_alias(entity)
        if skypix_alias is not None:
            minimal.add("skypix")
            maximal.add("skypix")
            minimal.remove(skypix_alias)
            maximal.remove(skypix_alias)

        required = self.fields(optionals=False)

        # Calculate any field usage that does not match a dimension
        if not required.issubset(maximal):
            raise FileTemplateValidationError(
                f"Template '{self}' is inconsistent with {entity}:"
                f" {required} is not a subset of {maximal}."
            )

        if not allfields.issuperset(minimal):
            raise FileTemplateValidationError(
                f"Template '{self}' is inconsistent with {entity}:"
                f" {allfields} is not a superset of {minimal}."
            )

        return
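
    # A sketch of the checks above on hypothetical templates: "{run}/fixed"
    # fails because no dimension field or {id} is present; "{id}/fixed"
    # fails because the id is not in the file name; and validating
    # "{run}/{visit}/{id}" against a dataset type whose dimensions are
    # {instrument, detector} fails because "visit" is not in the maximal
    # dimension set.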

    def _determine_skypix_alias(self, entity: Union[DatasetRef, DatasetType]) -> Optional[str]:
        """Return the dimension name that refers to a sky pixel.

        Parameters
        ----------
        entity : `DatasetRef` or `DatasetType`
            The entity to examine.

        Returns
        -------
        alias : `str` or `None`
            If there is a sky pixelization in the supplied dataId, return
            its name, else return `None`. Will also return `None` if there
            is more than one sky pix dimension in the data ID or if the
            dataID is not a `DataCoordinate`.
        """
        alias = None

        if isinstance(entity, DatasetRef):
            entity = entity.datasetType

        # If there is exactly one SkyPixDimension in the data ID, alias its
        # value with the key "skypix", so we can use that to match any
        # skypix dimension.
        # We restrict this behavior to the (real-world) case where the
        # data ID is a DataCoordinate, not just a dict. That should only
        # not be true in some test code, but that test code is a pain to
        # update to be more like the real world while still providing our
        # only tests of important behavior.
        skypix = [dimension for dimension in entity.dimensions if isinstance(dimension, SkyPixDimension)]
        if len(skypix) == 1:
            alias = skypix[0].name
        return alias
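
    # A hypothetical illustration: for a dataset type whose dimensions
    # include exactly one skypix dimension, say ``htm7``, this returns
    # "htm7"; `format` can then satisfy a template field written as
    # ``{skypix}`` with the ``htm7`` data ID value. If the dimensions
    # include two skypix dimensions, or none, it returns `None`.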