Coverage for python/lsst/daf/butler/core/fileTemplates.py: 13%
250 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Support for file template string expansion."""
24from __future__ import annotations
26__all__ = ("FileTemplates", "FileTemplate", "FileTemplatesConfig", "FileTemplateValidationError")
28import logging
29import os.path
30import string
31from collections.abc import Iterable, Mapping
32from types import MappingProxyType
33from typing import TYPE_CHECKING, Any
35from .config import Config
36from .configSupport import LookupKey, processLookupConfigs
37from .datasets import DatasetRef
38from .dimensions import DataCoordinate, SkyPixDimension
39from .exceptions import ValidationError
40from .storageClass import StorageClass
42if TYPE_CHECKING:
43 from .datasets import DatasetType
44 from .dimensions import DimensionUniverse
46log = logging.getLogger(__name__)


class FileTemplateValidationError(ValidationError):
    """Exception raised when a file template is inconsistent with its
    associated `DatasetType`."""

    pass


class FileTemplatesConfig(Config):
    """Configuration information for `FileTemplates`."""

    pass


class FileTemplates:
    """Collection of `FileTemplate` templates.

    Parameters
    ----------
    config : `FileTemplatesConfig` or `str`
        Load configuration.
    default : `str`, optional
        If not `None`, a default template to use if no template has
        been specified explicitly in the configuration.
    universe : `DimensionUniverse`
        The set of all known dimensions, used to normalize any lookup keys
        involving dimensions.

    Notes
    -----
    The configuration can include one level of hierarchy where an
    instrument-specific section can be defined to override more general
    template specifications. This is represented in YAML using a
    key of form ``instrument<name>`` which can then define templates
    that will be returned if a `DatasetRef` contains a matching instrument
    name in the data ID.

    A default fallback template can be specified using the key ``default``.
    Defaulting can be disabled in a child configuration by defining the
    value to be an empty string or a boolean `False`.

    The config is parsed using the function
    `~lsst.daf.butler.configSupport.processLookupConfigs`.
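
    Examples
    --------
    A sketch of a configuration in this form; the instrument name, dataset
    type names, and template strings below are purely illustrative::

        default: "{run}/{datasetType}/{id}"
        calexp: "{run}/{datasetType}/{visit}/{datasetType}_{visit}_{id}"
        instrument<MyCam>:
          calexp: "{run}/{datasetType}/{exposure}/{datasetType}_{id}"

    Here the nested ``instrument<MyCam>`` section would take precedence for
    any `DatasetRef` whose data ID has ``instrument == "MyCam"``.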
90 """

    defaultKey = LookupKey("default")
    """Configuration key associated with the default template."""

    def __init__(
        self,
        config: FileTemplatesConfig | str,
        default: str | None = None,
        *,
        universe: DimensionUniverse,
    ):
        self.config = FileTemplatesConfig(config)
        self._templates = {}

        contents = processLookupConfigs(self.config, universe=universe)

        # Determine default to use -- defaults can be disabled if
        # we get a False or None
        defaultValue = contents.get(self.defaultKey, default)
        if defaultValue and not isinstance(defaultValue, str):
            raise RuntimeError(
                f"Default template value should be a str, False, or None. Got '{defaultValue}'"
            )
        self.default = FileTemplate(defaultValue) if isinstance(defaultValue, str) and defaultValue else None

        # Convert all the values to FileTemplate, handling defaults
        for key, templateStr in contents.items():
            if key == self.defaultKey:
                continue
            if not isinstance(templateStr, str):
                raise RuntimeError(f"Unexpected value in file template key {key}: {templateStr}")
            self._templates[key] = FileTemplate(templateStr)

    @property
    def templates(self) -> Mapping[LookupKey, FileTemplate]:
        """Return the collection of templates indexed by lookup key
        (as a read-only mapping)."""
        return MappingProxyType(self._templates)

    def __contains__(self, key: LookupKey) -> bool:
        """Indicate whether the supplied key is present in the templates.

        Parameters
        ----------
        key : `LookupKey`
            Key to use to determine if a corresponding value is present
            in the templates.

        Returns
        -------
        in : `bool`
            `True` if the supplied key is present in the templates.
        """
        return key in self.templates

    def __getitem__(self, key: LookupKey) -> FileTemplate:
        return self.templates[key]

    def validateTemplates(
        self, entities: Iterable[DatasetType | DatasetRef | StorageClass], logFailures: bool = False
    ) -> None:
        """Validate the templates.

        Retrieves the template associated with each dataset type and
        validates the dimensions against the template.

        Parameters
        ----------
        entities : iterable of `DatasetType`, `DatasetRef`, or `StorageClass`
            Entities to validate against the matching templates. The
            entities can be of differing types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        FileTemplateValidationError
            Raised if an entity failed validation.

        Notes
        -----
        See `FileTemplate.validateTemplate()` for details on the validation.
        """
        unmatchedKeys = set(self.templates)
        failed = []
        for entity in entities:
            try:
                matchKey, template = self.getTemplateWithMatch(entity)
            except KeyError as e:
                # KeyError always quotes on stringification so strip here
                errMsg = str(e).strip("\"'")
                failed.append(errMsg)
                if logFailures:
                    log.critical("%s", errMsg)
                continue

            if matchKey in unmatchedKeys:
                unmatchedKeys.remove(matchKey)

            try:
                template.validateTemplate(entity)
            except FileTemplateValidationError as e:
                failed.append(f"{e} (via key '{matchKey}')")
                if logFailures:
                    log.critical("Template failure with key '%s': %s", matchKey, e)

        if logFailures and unmatchedKeys:
            log.warning("Unchecked keys: %s", ", ".join([str(k) for k in unmatchedKeys]))

        if failed:
            if len(failed) == 1:
                msg = str(failed[0])
            else:
                failMsg = ";\n".join(failed)
                msg = f"{len(failed)} template validation failures: {failMsg}"
            raise FileTemplateValidationError(msg)

    def getLookupKeys(self) -> set[LookupKey]:
        """Retrieve the lookup keys for all the template entries.

        Returns
        -------
        keys : `set` of `LookupKey`
            The keys available for matching a template.
        """
        return set(self.templates)

    def getTemplateWithMatch(
        self, entity: DatasetRef | DatasetType | StorageClass
    ) -> tuple[LookupKey, FileTemplate]:
221 """Retrieve the `FileTemplate` associated with the dataset type.
223 Also retrieves the lookup key that was a match for this template.
225 If the lookup name corresponds to a component the base name for
226 the component will be examined if the full component name does
227 not match.
229 Parameters
230 ----------
231 entity : `DatasetType`, `DatasetRef`, or `StorageClass`
232 Instance to use to look for a corresponding template.
233 A `DatasetType` name or a `StorageClass` name will be used
234 depending on the supplied entity. Priority is given to a
235 `DatasetType` name. Supports instrument override if a
236 `DatasetRef` is provided configured with an ``instrument``
237 value for the data ID.
239 Returns
240 -------
241 matchKey : `LookupKey`
242 The key that resulted in the successful match.
243 template : `FileTemplate`
244 Template instance to use with that dataset type.
246 Raises
247 ------
248 KeyError
249 Raised if no template could be located for this Dataset type.
250 """
        # Get the names to use for lookup
        names = entity._lookupNames()

        # Get a location from the templates
        template = self.default
        source = self.defaultKey
        for name in names:
            if name in self.templates:
                template = self.templates[name]
                source = name
                break

        if template is None:
            raise KeyError(f"Unable to determine file template from supplied argument [{entity}]")

        log.debug("Got file template %s for %s via %s", template, entity, source)

        return source, template

    def getTemplate(self, entity: DatasetType | DatasetRef | StorageClass) -> FileTemplate:
        """Retrieve the `FileTemplate` associated with the dataset type.

        If the lookup name corresponds to a component, the base name of
        the dataset is examined if the full component name does not match.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Instance to use to look for a corresponding template.
            A `DatasetType` name or a `StorageClass` name will be used
            depending on the supplied entity. Priority is given to a
            `DatasetType` name. An instrument-specific override is
            supported if a `DatasetRef` is provided with an ``instrument``
            value in its data ID.

        Returns
        -------
        template : `FileTemplate`
            Template instance to use with that dataset type.

        Raises
        ------
        KeyError
            Raised if no template could be located for this dataset type.
        """
        _, template = self.getTemplateWithMatch(entity)
        return template


class FileTemplate:
    """Format a path template into a fully expanded path.

    Parameters
    ----------
    template : `str`
        Template string.

    Raises
    ------
    FileTemplateValidationError
        Raised if the template fails basic validation.

    Notes
    -----
    The templates use the standard Format Specification Mini-Language
    with the caveat that only named fields can be used. The field names
    are taken from the Dimensions along with several additional fields:

    - datasetType: `str`, `DatasetType.name`
    - component: `str`, name of the StorageClass component
    - run: `str`, name of the run this dataset was added with

    At least one of ``run`` or ``id`` must always be provided to ensure
    unique paths.

    More detailed information can be requested from dimensions by using a dot
    notation, so ``visit.name`` would use the name of the visit and
    ``detector.name_in_raft`` would use the name of the detector within the
    raft.

    The mini-language is extended to understand a "?" in the format
    specification. This indicates that a field is optional. If the data ID
    has no value for that field, the field, along with the text preceding
    it (unless that text is a path separator), is removed from the output
    path.

    By default any "/" in a dataId value will be replaced by "_" to prevent
    unexpected directories being created in the path. If the "/" should be
    retained then a special "/" format specifier can be included in the
    template.
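
    Examples
    --------
    A sketch of a template using these conventions; the dimension names and
    directory layout are purely illustrative::

        {run}/{datasetType}/{visit.name}/{datasetType}_{visit}_{component:?}_{id}

    Here ``visit.name`` uses the dot notation to pull metadata from the
    visit record, and ``component`` is optional: if the dataset has no
    component, the field and the ``_`` preceding it are dropped from the
    expanded path.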
340 """

    mandatoryFields = {"run", "id"}
    """A set of fields, one of which must be present in a template."""

    datasetFields = {"datasetType", "component"}
    """Fields related to the supplied dataset, not a dimension."""

    specialFields = mandatoryFields | datasetFields
    """Set of special fields that are available independently of the defined
    Dimensions."""

    def __init__(self, template: str):
        if not isinstance(template, str):
            raise FileTemplateValidationError(f"Template ('{template}') is not a string")
        self.template = template

        # Do basic validation without access to dimensions
        self.validateTemplate(None)

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, FileTemplate):
            return False

        return self.template == other.template

    def __str__(self) -> str:
        return self.template

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}("{self.template}")'

    def fields(self, optionals: bool = False, specials: bool = False, subfields: bool = False) -> set[str]:
        """Return the field names used in this template.

        Parameters
        ----------
        optionals : `bool`
            If `True`, optional fields are included in the returned set.
        specials : `bool`
            If `True`, non-dimension fields are included.
        subfields : `bool`, optional
            If `True`, fields with syntax ``a.b`` are included. If `False`,
            the default, only ``a`` would be returned.

        Returns
        -------
        names : `set`
            Names of fields used in this template.

        Notes
        -----
        The returned set will include special values such as ``datasetType``
        and ``component`` only if ``specials`` is `True`.
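
        Examples
        --------
        A minimal sketch with an illustrative template::

            >>> t = FileTemplate("{run}/{datasetType}_{visit:08d}{component:?}")
            >>> t.fields()
            {'visit'}
            >>> sorted(t.fields(optionals=True, specials=True))
            ['component', 'datasetType', 'run', 'visit']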
396 """
        fmt = string.Formatter()
        parts = fmt.parse(self.template)

        names = set()
        for literal, field_name, format_spec, conversion in parts:
            if field_name is not None and format_spec is not None:
                if "?" in format_spec and not optionals:
                    continue

                if not specials and field_name in self.specialFields:
                    continue
409 if "." in field_name and not subfields:
410 field_name, _ = field_name.split(".")
412 names.add(field_name)
414 return names

    def format(self, ref: DatasetRef) -> str:
        """Format a template string into a full path.

        Parameters
        ----------
        ref : `DatasetRef`
            The dataset to be formatted.

        Returns
        -------
        path : `str`
            Expanded path.

        Raises
        ------
        KeyError
            Raised if the requested field is not defined and the field is
            not optional, or if a ``component`` is set for the dataset but
            the template does not use the ``component`` field.
        RuntimeError
            Raised if a template uses dimension record metadata but no
            records are attached to the `DatasetRef`.
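
        Notes
        -----
        As a sketch of the expansion rules, with illustrative names: given
        the template ``{run}/{datasetType}_{visit:?}`` and a data ID with
        no ``visit`` value, the optional field and the ``_`` preceding it
        are dropped, yielding ``<run>/<datasetType>``. A "/" in a value is
        replaced with "_" unless the field carries the "/" format
        specifier.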
438 """
        # Extract defined non-None dimensions from the dataId.
        # This guards against Nones being explicitly present in the data ID
        # (which can happen if, say, an exposure has no filter), as well as
        # the case where only required dimensions are present (which in this
        # context should only happen in unit tests; in general we need all
        # dimensions to fill out templates).
        fields = {
            k: ref.dataId.get(k) for k in ref.datasetType.dimensions.names if ref.dataId.get(k) is not None
        }
        # Extra information that can be included using . syntax
        extras = {}
        if isinstance(ref.dataId, DataCoordinate):
            if ref.dataId.hasRecords():
                extras = ref.dataId.records.byName()
            skypix_alias = self._determine_skypix_alias(ref)
            if skypix_alias is not None:
                fields["skypix"] = fields[skypix_alias]
                if extras:
                    extras["skypix"] = extras[skypix_alias]

        datasetType = ref.datasetType
        fields["datasetType"], component = datasetType.nameAndComponent()

        usedComponent = False
        if component is not None:
            fields["component"] = component

        fields["run"] = ref.run
        fields["id"] = ref.id

        fmt = string.Formatter()
        parts = fmt.parse(self.template)
        output = ""

        for literal, field_name, format_spec, conversion in parts:
            if field_name == "component":
                usedComponent = True

            if format_spec is None:
                output = output + literal
                continue

            # A blank field name should only be possible when format_spec
            # is None, which was handled above, so treat it as an error.
            if field_name is None:
                raise RuntimeError(f"Unexpected blank field_name encountered in {self.template} [{literal}]")
485 if "?" in format_spec:
486 optional = True
487 # Remove the non-standard character from the spec
488 format_spec = format_spec.replace("?", "")
489 else:
490 optional = False
492 # Check for request for additional information from the dataId
493 if "." in field_name:
494 primary, secondary = field_name.split(".")
                if primary in extras:
                    record = extras[primary]
                    # Only fill in the field if the record provides the
                    # requested attribute; otherwise report the unknown
                    # metadata field (most likely a typo in the template)
                    # and let the KeyError below trigger if the field is
                    # not optional.
                    if hasattr(record, secondary):
                        fields[field_name] = getattr(record, secondary)
                    else:
                        # Is a log message sufficient?
                        log.info(
                            "Template field %s could not be resolved because metadata field %s"
                            " is not understood for dimension %s. Template entry will be ignored",
                            field_name,
                            secondary,
                            primary,
                        )
                elif primary in fields:
                    # We do have an entry for the primary but do not have any
                    # secondary entries. This is likely a problem with the
                    # code failing to attach a record to the DatasetRef.
                    raise RuntimeError(
                        f"No metadata records attached to dataset {ref}"
                        f" when attempting to expand field {field_name}."
                        " Either expand the DatasetRef or change the template."
                    )

            if field_name in fields:
                value = fields[field_name]
            elif optional:
                # If this is optional ignore the format spec
                # and do not include the literal text prior to the optional
                # field unless it contains a "/" path separator
                format_spec = ""
                value = ""
                if "/" not in literal:
                    literal = ""
            else:
                raise KeyError(
                    f"'{field_name}' requested in template via '{self.template}' "
                    "but not defined and not optional"
                )

            # Handle "/" in values since we do not want to be surprised by
            # unexpected directories turning up
            replace_slash = True
            if "/" in format_spec:
                # Remove the non-standard character from the spec
                format_spec = format_spec.replace("/", "")
                replace_slash = False

            if isinstance(value, str):
                # Replace spaces with underscores for more friendly file paths
                value = value.replace(" ", "_")
                if replace_slash:
                    value = value.replace("/", "_")

            # Now use standard formatting
            output = output + literal + format(value, format_spec)

        # Replace periods with underscores in the non-directory part to
        # prevent file extension confusion. Also replace # in the non-dir
        # part to avoid confusion with URI fragments
        head, tail = os.path.split(output)
        tail = tail.replace(".", "_")
        tail = tail.replace("#", "HASH")
        output = os.path.join(head, tail)

        # Complain if we were meant to use a component
        if component is not None and not usedComponent:
            raise KeyError(f"Component '{component}' specified but template {self.template} did not use it")

        # Since this is known to be a path, normalize it in case some double
        # slashes have crept in
        path = os.path.normpath(output)

        # It should not be an absolute path (may happen with optionals)
        if os.path.isabs(path):
            path = os.path.relpath(path, start="/")

        return path

    def validateTemplate(self, entity: DatasetRef | DatasetType | StorageClass | None) -> None:
        """Compare the template against the supplied entity that wants to
        use it.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Entity to compare against template. If `None` is given only
            very basic validation of templates will be performed.

        Raises
        ------
        FileTemplateValidationError
            Raised if the template is inconsistent with the supplied entity.

        Notes
        -----
        Validation will always include a check that at least one mandatory
        field is present and that at least one field refers to a dimension.
        If the supplied entity includes a `DimensionGraph` then it will be
        used to compare the available dimensions with those specified in the
        template.
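
        Examples
        --------
        A sketch of the basic (entity-free) checks that also run when a
        template is constructed; the template strings are illustrative::

            FileTemplate("{run}/{datasetType}_{visit}")  # passes basic checks
            FileTemplate("{run}/fixed_name")  # raises FileTemplateValidationError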
599 """
        # Check that the template includes at least one mandatory field.
        withSpecials = self.fields(specials=True, optionals=True)

        if "collection" in withSpecials:
            raise FileTemplateValidationError(
                "'collection' is no longer supported as a file template placeholder; use 'run' instead."
            )

        if not withSpecials & self.mandatoryFields:
            raise FileTemplateValidationError(
                f"Template '{self}' is missing a mandatory field from {self.mandatoryFields}"
            )

        # Check that there are some dimension fields in the template
        # The id is allowed instead if present since that also uniquely
        # identifies the file in the datastore.
        allfields = self.fields(optionals=True)
        if not allfields and "id" not in withSpecials:
            raise FileTemplateValidationError(
                f"Template '{self}' does not seem to have any fields corresponding to dimensions."
            )

        # Require that if "id" is in the template then it must exist in the
        # file part -- this avoids templates like "{id}/fixed" where the file
        # name is fixed but the directory has the ID.
        if "id" in withSpecials:
            file_part = os.path.split(self.template)[-1]
            if "{id}" not in file_part:
                raise FileTemplateValidationError(
                    f"Template '{self}' includes the 'id' but that ID is not part of the file name."
                )

        # If we do not have dimensions available then all we can do is shrug
        if not hasattr(entity, "dimensions"):
            return

        # Mypy does not know about hasattr so help it out
        if entity is None:
            return

        # If this entity represents a component then insist that component
        # is present in the template. If the entity is not a component
        # make sure that component is not mandatory.
        try:
            # mypy does not see the except block so complains about
            # StorageClass not supporting isComponent
            if entity.isComponent():  # type: ignore
                if "component" not in withSpecials:
                    raise FileTemplateValidationError(
                        f"Template '{self}' has no component but {entity} refers to a component."
                    )
            else:
                mandatorySpecials = self.fields(specials=True)
                if "component" in mandatorySpecials:
                    raise FileTemplateValidationError(
                        f"Template '{self}' has mandatory component but "
                        f"{entity} does not refer to a component."
                    )
        except AttributeError:
            pass

        # From here on we need at least a DatasetType
        # Mypy doesn't understand the AttributeError clause below
        if isinstance(entity, StorageClass):
            return

        # Get the dimension links to get the full set of available field names
        # Fall back to dataId keys if we have them but no links.
        # dataId keys must still be present in the template
        try:
            minimal = set(entity.dimensions.required.names)
            maximal = set(entity.dimensions.names)
        except AttributeError:
            try:
                minimal = set(entity.dataId.keys().names)  # type: ignore
                maximal = minimal
            except AttributeError:
                return

        # Replace specific skypix dimensions with generic one
        skypix_alias = self._determine_skypix_alias(entity)
        if skypix_alias is not None:
            minimal.add("skypix")
            maximal.add("skypix")
            minimal.remove(skypix_alias)
            maximal.remove(skypix_alias)

        required = self.fields(optionals=False)

        # Calculate any field usage that does not match a dimension
        if not required.issubset(maximal):
            raise FileTemplateValidationError(
                f"Template '{self}' is inconsistent with {entity}: {required} is not a subset of {maximal}."
            )

        if not allfields.issuperset(minimal):
            raise FileTemplateValidationError(
                f"Template '{self}' is inconsistent with {entity}:"
                f" {allfields} is not a superset of {minimal}."
            )

        return

    def _determine_skypix_alias(self, entity: DatasetRef | DatasetType) -> str | None:
        """Return the dimension name that refers to a sky pixel.

        Parameters
        ----------
        entity : `DatasetRef` or `DatasetType`
            The entity to examine.

        Returns
        -------
        alias : `str` or `None`
            If there is a sky pixelization in the supplied dataId, return
            its name, else return `None`. `None` is also returned if there
            is more than one sky pix dimension in the data ID or if the
            data ID is not a `DataCoordinate`.
        """
        alias = None

        if isinstance(entity, DatasetRef):
            entity = entity.datasetType

        # If there is exactly one SkyPixDimension in the data ID, alias its
        # value with the key "skypix", so we can use that to match any
        # skypix dimension.
        # We restrict this behavior to the (real-world) case where the
        # data ID is a DataCoordinate, not just a dict. That should only
        # not be true in some test code, but that test code is a pain to
        # update to be more like the real world while still providing our
        # only tests of important behavior.
        skypix = [dimension for dimension in entity.dimensions if isinstance(dimension, SkyPixDimension)]
        if len(skypix) == 1:
            alias = skypix[0].name
        return alias