Coverage for python/lsst/daf/butler/datastore/file_templates.py: 13%
249 statements
coverage.py v7.3.2, created at 2023-12-06 10:53 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Support for file template string expansion."""
30from __future__ import annotations
32__all__ = ("FileTemplates", "FileTemplate", "FileTemplatesConfig", "FileTemplateValidationError")
34import logging
35import os.path
36import string
37from collections.abc import Iterable, Mapping
38from types import MappingProxyType
39from typing import TYPE_CHECKING, Any
41from .._config import Config
42from .._config_support import LookupKey, processLookupConfigs
43from .._dataset_ref import DatasetRef
44from .._exceptions import ValidationError
45from .._storage_class import StorageClass
46from ..dimensions import DataCoordinate
48if TYPE_CHECKING:
49 from .._dataset_type import DatasetType
50 from ..dimensions import DimensionUniverse
52log = logging.getLogger(__name__)


class FileTemplateValidationError(ValidationError):
    """Exception for file template inconsistent with associated DatasetType."""

    pass


class FileTemplatesConfig(Config):
    """Configuration information for `FileTemplates`."""

    pass


class FileTemplates:
    """Collection of `FileTemplate` templates.

    Parameters
    ----------
    config : `FileTemplatesConfig` or `str`
        Load configuration.
    default : `str`, optional
        If not `None`, a default template to use if no template has
        been specified explicitly in the configuration.
    universe : `DimensionUniverse`
        The set of all known dimensions, used to normalize any lookup keys
        involving dimensions.

    Notes
    -----
    The configuration can include one level of hierarchy where an
    instrument-specific section can be defined to override more general
    template specifications. This is represented in YAML using a
    key of form ``instrument<name>`` which can then define templates
    that will be returned if a `DatasetRef` contains a matching instrument
    name in the data ID.

    A default fallback template can be specified using the key ``default``.
    Defaulting can be disabled in a child configuration by defining the
    value to be an empty string or a boolean `False`.

    The config is parsed using the function
    `~lsst.daf.butler.configSubset.processLookupConfigs`.
    """

    defaultKey = LookupKey("default")
    """Configuration key associated with the default template."""

    def __init__(
        self,
        config: FileTemplatesConfig | str,
        default: str | None = None,
        *,
        universe: DimensionUniverse,
    ):
        self.config = FileTemplatesConfig(config)
        self._templates = {}

        contents = processLookupConfigs(self.config, universe=universe)

        # Determine the default to use -- defaults can be disabled if
        # we get a False or None.
        defaultValue = contents.get(self.defaultKey, default)
        if defaultValue and not isinstance(defaultValue, str):
            raise RuntimeError(
                f"Default template value should be str, False, or None. Got '{defaultValue}'"
            )
        self.default = FileTemplate(defaultValue) if isinstance(defaultValue, str) and defaultValue else None

        # Convert all the values to FileTemplate, handling defaults.
        for key, templateStr in contents.items():
            if key == self.defaultKey:
                continue
            if not isinstance(templateStr, str):
                raise RuntimeError(f"Unexpected value in file template key {key}: {templateStr}")
            self._templates[key] = FileTemplate(templateStr)

    @property
    def templates(self) -> Mapping[LookupKey, FileTemplate]:
        """Return collection of templates indexed by lookup key (`dict`)."""
        return MappingProxyType(self._templates)

    def __contains__(self, key: LookupKey) -> bool:
        """Indicate whether the supplied key is present in the templates.

        Parameters
        ----------
        key : `LookupKey`
            Key to use to determine if a corresponding value is present
            in the templates.

        Returns
        -------
        in : `bool`
            `True` if the supplied key is present in the templates.
        """
        return key in self.templates

    def __getitem__(self, key: LookupKey) -> FileTemplate:
        return self.templates[key]

    def validateTemplates(
        self, entities: Iterable[DatasetType | DatasetRef | StorageClass], logFailures: bool = False
    ) -> None:
        """Validate the templates.

        Retrieves the template associated with each dataset type and
        validates the dimensions against the template.

        Parameters
        ----------
        entities : `DatasetType`, `DatasetRef`, or `StorageClass`
            Entities to validate against the matching templates. Can be
            differing types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        FileTemplateValidationError
            Raised if an entity failed validation.

        Notes
        -----
        See `FileTemplate.validateTemplate()` for details on the validation.
        """
        unmatchedKeys = set(self.templates)
        failed = []
        for entity in entities:
            try:
                matchKey, template = self.getTemplateWithMatch(entity)
            except KeyError as e:
                # KeyError always quotes on stringification so strip here.
                errMsg = str(e).strip("\"'")
                failed.append(errMsg)
                if logFailures:
                    log.critical("%s", errMsg)
                continue

            if matchKey in unmatchedKeys:
                unmatchedKeys.remove(matchKey)

            try:
                template.validateTemplate(entity)
            except FileTemplateValidationError as e:
                failed.append(f"{e} (via key '{matchKey}')")
                if logFailures:
                    log.critical("Template failure with key '%s': %s", matchKey, e)

        if logFailures and unmatchedKeys:
            log.warning("Unchecked keys: '%s'", ", ".join([str(k) for k in unmatchedKeys]))

        if failed:
            if len(failed) == 1:
                msg = str(failed[0])
            else:
                failMsg = ";\n".join(failed)
                msg = f"{len(failed)} template validation failures: {failMsg}"
            raise FileTemplateValidationError(msg)
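
    # Usage sketch (hypothetical variables; not part of the original module):
    # given an iterable of `DatasetType` instances, all matching templates can
    # be checked in one pass, with each failure logged as it is found:
    #
    #     templates = FileTemplates(config, universe=universe)
    #     templates.validateTemplates(dataset_types, logFailures=True)
    #
    # A single FileTemplateValidationError summarizing every failure is raised
    # at the end rather than on the first problem.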

    def getLookupKeys(self) -> set[LookupKey]:
        """Retrieve the look up keys for all the template entries.

        Returns
        -------
        keys : `set` of `LookupKey`
            The keys available for matching a template.
        """
        return set(self.templates)

    def getTemplateWithMatch(
        self, entity: DatasetRef | DatasetType | StorageClass
    ) -> tuple[LookupKey, FileTemplate]:
        """Retrieve the `FileTemplate` associated with the dataset type.

        Also retrieves the lookup key that was a match for this template.

        If the lookup name corresponds to a component the base name for
        the component will be examined if the full component name does
        not match.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Instance to use to look for a corresponding template.
            A `DatasetType` name or a `StorageClass` name will be used
            depending on the supplied entity. Priority is given to a
            `DatasetType` name. Supports instrument override if a
            `DatasetRef` is provided configured with an ``instrument``
            value for the data ID.

        Returns
        -------
        matchKey : `LookupKey`
            The key that resulted in the successful match.
        template : `FileTemplate`
            Template instance to use with that dataset type.

        Raises
        ------
        KeyError
            Raised if no template could be located for this Dataset type.
        """
        # Get the names to use for lookup.
        names = entity._lookupNames()

        # Get a location from the templates.
        template = self.default
        source = self.defaultKey
        for name in names:
            if name in self.templates:
                template = self.templates[name]
                source = name
                break

        if template is None:
            raise KeyError(f"Unable to determine file template from supplied argument [{entity}]")

        log.debug("Got file %s from %s via %s", template, entity, source)

        return source, template

    def getTemplate(self, entity: DatasetType | DatasetRef | StorageClass) -> FileTemplate:
        """Retrieve the `FileTemplate` associated with the dataset type.

        If the lookup name corresponds to a component the base name for
        the component will be examined if the full component name does
        not match.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Instance to use to look for a corresponding template.
            A `DatasetType` name or a `StorageClass` name will be used
            depending on the supplied entity. Priority is given to a
            `DatasetType` name. Supports instrument override if a
            `DatasetRef` is provided configured with an ``instrument``
            value for the data ID.

        Returns
        -------
        template : `FileTemplate`
            Template instance to use with that dataset type.

        Raises
        ------
        KeyError
            Raised if no template could be located for this Dataset type.
        """
        _, template = self.getTemplateWithMatch(entity)
        return template
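
# Lookup sketch (hypothetical config path and dataset type; not part of the
# original module):
#
#     templates = FileTemplates("templates.yaml", universe=butler.dimensions)
#     key, template = templates.getTemplateWithMatch(dataset_type)
#     template = templates.getTemplate(dataset_type)
#
# Candidate lookup names are tried in priority order (dataset type name before
# storage class name, with an instrument-specific override when the entity is
# a DatasetRef carrying an ``instrument`` data ID value), falling back to the
# ``default`` template when nothing matches.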


class FileTemplate:
    """Format a path template into a fully expanded path.

    Parameters
    ----------
    template : `str`
        Template string.

    Raises
    ------
    FileTemplateValidationError
        Raised if the template fails basic validation.

    Notes
    -----
    The templates use the standard Format Specification Mini-Language
    with the caveat that only named fields can be used. The field names
    are taken from the Dimensions along with several additional fields:

    - datasetType: `str`, `DatasetType.name`
    - component: `str`, name of the StorageClass component
    - run: `str`, name of the run this dataset was added with

    ``run`` must always be provided to ensure unique paths.

    More detailed information can be requested from dimensions by using a dot
    notation, so ``visit.name`` would use the name of the visit and
    ``detector.name_in_raft`` would use the name of the detector within the
    raft.

    The mini-language is extended to understand a "?" in the format
    specification. This indicates that a field is optional. If that
    dimension is missing, the field, along with the text preceding it
    (unless it is a path separator), will be removed from the output path.

    By default any "/" in a dataId value will be replaced by "_" to prevent
    unexpected directories being created in the path. If the "/" should be
    retained then a special "/" format specifier can be included in the
    template.
    """

    mandatoryFields = {"run", "id"}
    """A set of fields, one of which must be present in a template."""

    datasetFields = {"datasetType", "component"}
    """Fields related to the supplied dataset, not a dimension."""

    specialFields = mandatoryFields | datasetFields
    """Set of special fields that are available independently of the defined
    Dimensions."""

    def __init__(self, template: str):
        if not isinstance(template, str):
            raise FileTemplateValidationError(
                f"Template ('{template}') does not contain any format specifiers"
            )
        self.template = template

        # Do basic validation without access to dimensions.
        self.validateTemplate(None)

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, FileTemplate):
            return False

        return self.template == other.template

    def __str__(self) -> str:
        return self.template

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}("{self.template}")'

    def fields(self, optionals: bool = False, specials: bool = False, subfields: bool = False) -> set[str]:
        """Return the field names used in this template.

        Parameters
        ----------
        optionals : `bool`
            If `True`, optional fields are included in the returned set.
        specials : `bool`
            If `True`, non-dimension fields are included.
        subfields : `bool`, optional
            If `True`, fields with syntax ``a.b`` are included. If `False`,
            the default, only ``a`` would be returned.

        Returns
        -------
        names : `set`
            Names of fields used in this template.

        Notes
        -----
        The returned set only includes the special values such as
        ``datasetType`` and ``component`` if ``specials`` is `True`.
        """
        fmt = string.Formatter()
        parts = fmt.parse(self.template)

        names = set()
        for _, field_name, format_spec, _ in parts:
            if field_name is not None and format_spec is not None:
                if "?" in format_spec and not optionals:
                    continue

                if not specials and field_name in self.specialFields:
                    continue

                if "." in field_name and not subfields:
                    field_name, _ = field_name.split(".")

                names.add(field_name)

        return names
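
    # Sketch of fields() behavior (hypothetical template; not part of the
    # original module). For
    # "{run}/{datasetType}/{visit.name}/{detector}_{physical_filter:?}_{id}":
    #
    #     fields()                -> {"visit", "detector"}
    #     fields(optionals=True)  -> {"visit", "detector", "physical_filter"}
    #     fields(specials=True)   -> {"visit", "detector", "run", "datasetType", "id"}
    #     fields(subfields=True)  -> {"visit.name", "detector"}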

    def format(self, ref: DatasetRef) -> str:
        """Format a template string into a full path.

        Parameters
        ----------
        ref : `DatasetRef`
            The dataset to be formatted.

        Returns
        -------
        path : `str`
            Expanded path.

        Raises
        ------
        KeyError
            Raised if the requested field is not defined and the field is
            not optional, or if ``component`` is specified but "component"
            was not part of the template.
        RuntimeError
            Raised if a template uses dimension record metadata but no
            records are attached to the `DatasetRef`.
        """
        # Extract defined non-None dimensions from the dataId.
        # This guards against Nones being explicitly present in the data ID
        # (which can happen if, say, an exposure has no filter), as well as
        # the case where only required dimensions are present (which in this
        # context should only happen in unit tests; in general we need all
        # dimensions to fill out templates).
        fields: dict[str, object] = {
            k: ref.dataId.get(k) for k in ref.datasetType.dimensions.names if ref.dataId.get(k) is not None
        }
        # Extra information that can be included using . syntax.
        extras = {}
        if isinstance(ref.dataId, DataCoordinate):
            if ref.dataId.hasRecords():
                extras = {k: ref.dataId.records[k] for k in ref.dataId.dimensions.elements}
            skypix_alias = self._determine_skypix_alias(ref)
            if skypix_alias is not None:
                fields["skypix"] = fields[skypix_alias]
                if extras:
                    extras["skypix"] = extras[skypix_alias]

        datasetType = ref.datasetType
        fields["datasetType"], component = datasetType.nameAndComponent()

        usedComponent = False
        if component is not None:
            fields["component"] = component

        fields["run"] = ref.run
        fields["id"] = ref.id

        fmt = string.Formatter()
        parts = fmt.parse(self.template)

        output = ""

        for literal, field_name, format_spec, _ in parts:
            if field_name == "component":
                usedComponent = True

            if format_spec is None:
                output = output + literal
                continue

            # Should only happen if format_spec is None.
            if field_name is None:
                raise RuntimeError(f"Unexpected blank field_name encountered in {self.template} [{literal}]")

            if "?" in format_spec:
                optional = True
                # Remove the non-standard character from the spec.
                format_spec = format_spec.replace("?", "")
            else:
                optional = False

            # Check for request for additional information from the dataId.
            if "." in field_name:
                primary, secondary = field_name.split(".")
                if primary in extras:
                    record = extras[primary]
                    # Only fill in the fields if we have a value; the
                    # KeyError will trigger below if the attribute is missing,
                    # but only if it is not optional. This is most likely
                    # a typo in the metadata field and so should be reported
                    # even if optional.
                    if hasattr(record, secondary):
                        fields[field_name] = getattr(record, secondary)
                    else:
                        # Is a log message sufficient?
                        log.info(
                            "Template field %s could not be resolved because metadata field %s"
                            " is not understood for dimension %s. Template entry will be ignored",
                            field_name,
                            secondary,
                            primary,
                        )
                elif primary in fields:
                    # We do have an entry for the primary but do not have any
                    # secondary entries. This is likely a problem with the
                    # code failing to attach a record to the DatasetRef.
                    raise RuntimeError(
                        f"No metadata records attached to dataset {ref}"
                        f" when attempting to expand field {field_name}."
                        " Either expand the DatasetRef or change the template."
                    )

            if field_name in fields:
                value = fields[field_name]
            elif optional:
                # If this is optional ignore the format spec
                # and do not include the literal text prior to the optional
                # field unless it contains a "/" path separator.
                format_spec = ""
                value = ""
                if "/" not in literal:
                    literal = ""
            else:
                raise KeyError(
                    f"'{field_name}' requested in template via '{self.template}' "
                    "but not defined and not optional"
                )

            # Handle "/" in values since we do not want to be surprised by
            # unexpected directories turning up.
            replace_slash = True
            if "/" in format_spec:
                # Remove the non-standard character from the spec.
                format_spec = format_spec.replace("/", "")
                replace_slash = False

            if isinstance(value, str):
                # Replace spaces with underscores for more friendly file paths.
                value = value.replace(" ", "_")
                if replace_slash:
                    value = value.replace("/", "_")

            # Now use standard formatting.
            output = output + literal + format(value, format_spec)

        # Replace periods with underscores in the non-directory part to
        # prevent file extension confusion. Also replace # in the non-dir
        # part to avoid confusion with URI fragments.
        head, tail = os.path.split(output)
        tail = tail.replace(".", "_")
        tail = tail.replace("#", "HASH")
        output = os.path.join(head, tail)

        # Complain if we were meant to use a component.
        if component is not None and not usedComponent:
            raise KeyError(f"Component '{component}' specified but template {self.template} did not use it")

        # Since this is known to be a path, normalize it in case some double
        # slashes have crept in.
        path = os.path.normpath(output)

        # It should not be an absolute path (may happen with optionals).
        if os.path.isabs(path):
            path = os.path.relpath(path, start="/")

        return path
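
    # Worked sketch (hypothetical values; not part of the original module):
    # with the template "{run}/{datasetType}/{visit:06d}/{datasetType}_{detector}_{id}"
    # and a ref with run "HSC/runs/test", dataset type "calexp", visit 903334,
    # detector 22 and dataset ID d1a4e7f0, format() would return
    #
    #     HSC_runs_test/calexp/903334/calexp_22_d1a4e7f0
    #
    # The "/" characters in the run value become "_" because no "/" format
    # specifier was used, and any "." or "#" in the file name part would be
    # rewritten to "_" and "HASH" respectively.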

    def validateTemplate(self, entity: DatasetRef | DatasetType | StorageClass | None) -> None:
        """Compare the template against supplied entity that wants to use it.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Entity to compare against template. If `None` is given only
            very basic validation of templates will be performed.

        Raises
        ------
        FileTemplateValidationError
            Raised if the template is inconsistent with the supplied entity.

        Notes
        -----
        Validation will always include a check that mandatory fields
        are present and that at least one field refers to a dimension.
        If the supplied entity includes a `DimensionGraph` then it will be
        used to compare the available dimensions with those specified in the
        template.
        """
        # Check that the template has run.
        withSpecials = self.fields(specials=True, optionals=True)

        if "collection" in withSpecials:
            raise FileTemplateValidationError(
                "'collection' is no longer supported as a file template placeholder; use 'run' instead."
            )

        if not withSpecials & self.mandatoryFields:
            raise FileTemplateValidationError(
                f"Template '{self}' is missing a mandatory field from {self.mandatoryFields}"
            )

        # Check that there are some dimension fields in the template.
        # The id is allowed instead if present since that also uniquely
        # identifies the file in the datastore.
        allfields = self.fields(optionals=True)
        if not allfields and "id" not in withSpecials:
            raise FileTemplateValidationError(
                f"Template '{self}' does not seem to have any fields corresponding to dimensions."
            )

        # Require that if "id" is in the template then it must exist in the
        # file part -- this avoids templates like "{id}/fixed" where the file
        # name is fixed but the directory has the ID.
        if "id" in withSpecials:
            file_part = os.path.split(self.template)[-1]
            if "{id}" not in file_part:
                raise FileTemplateValidationError(
                    f"Template '{self}' includes the 'id' but that ID is not part of the file name."
                )

        # If we do not have dimensions available then all we can do is shrug.
        if not hasattr(entity, "dimensions"):
            return

        # Mypy does not know about hasattr so help it out.
        if entity is None:
            return

        # If this entity represents a component then insist that component
        # is present in the template. If the entity is not a component
        # make sure that component is not mandatory.
        try:
            # mypy does not see the except block so complains about
            # StorageClass not supporting isComponent.
            if entity.isComponent():  # type: ignore
                if "component" not in withSpecials:
                    raise FileTemplateValidationError(
                        f"Template '{self}' has no component but {entity} refers to a component."
                    )
            else:
                mandatorySpecials = self.fields(specials=True)
                if "component" in mandatorySpecials:
                    raise FileTemplateValidationError(
                        f"Template '{self}' has mandatory component but "
                        f"{entity} does not refer to a component."
                    )
        except AttributeError:
            pass

        # From here on we need at least a DatasetType.
        # Mypy doesn't understand the AttributeError clause below.
        if isinstance(entity, StorageClass):
            return

        # Get the dimension links to get the full set of available field
        # names. Fall back to dataId keys if we have them but no links.
        # dataId keys must still be present in the template.
        try:
            minimal = set(entity.dimensions.required.names)
            maximal = set(entity.dimensions.names)
        except AttributeError:
            try:
                minimal = set(entity.dataId.keys().names)  # type: ignore
                maximal = minimal
            except AttributeError:
                return

        # Replace specific skypix dimensions with generic one.
        skypix_alias = self._determine_skypix_alias(entity)
        if skypix_alias is not None:
            minimal.add("skypix")
            maximal.add("skypix")
            minimal.remove(skypix_alias)
            maximal.remove(skypix_alias)

        required = self.fields(optionals=False)

        # Calculate any field usage that does not match a dimension.
        if not required.issubset(maximal):
            raise FileTemplateValidationError(
                f"Template '{self}' is inconsistent with {entity}: {required} is not a subset of {maximal}."
            )

        if not allfields.issuperset(minimal):
            raise FileTemplateValidationError(
                f"Template '{self}' is inconsistent with {entity}:"
                f" {allfields} is not a superset of {minimal}."
            )

        return
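
    # Validation sketch (hypothetical templates; not part of the original
    # module): "{datasetType}/{visit}" fails because neither "run" nor "id" is
    # present, and "{id}/fixed" fails because the dataset ID is not part of
    # the file name. When an entity with dimensions is supplied, every
    # non-optional dimension field must be one of its dimensions (the subset
    # check) and the template, counting optional fields, must cover all of its
    # required dimensions (the superset check).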

    def _determine_skypix_alias(self, entity: DatasetRef | DatasetType) -> str | None:
        """Return the dimension name that refers to a sky pixel.

        Parameters
        ----------
        entity : `DatasetRef` or `DatasetType`
            The entity to examine.

        Returns
        -------
        alias : `str`
            If there is a sky pixelization in the supplied dataId, return
            its name; otherwise return `None`. `None` is also returned if
            there is more than one sky pix dimension in the data ID or if the
            data ID is not a `DataCoordinate`.
        """
        alias = None

        if isinstance(entity, DatasetRef):
            entity = entity.datasetType

        # If there is exactly one SkyPixDimension in the data ID, alias its
        # value with the key "skypix", so we can use that to match any
        # skypix dimension.
        # We restrict this behavior to the (real-world) case where the
        # data ID is a DataCoordinate, not just a dict. That should only
        # not be true in some test code, but that test code is a pain to
        # update to be more like the real world while still providing our
        # only tests of important behavior.
        if len(entity.dimensions.skypix) == 1:
            (alias,) = entity.dimensions.skypix.names
        return alias
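
# Sketch of the skypix aliasing behavior (hypothetical dataset type; not part
# of the original module): a DatasetType whose dimensions include exactly one
# skypix dimension, e.g. "htm7", yields the alias "htm7", so a template field
# {skypix} is filled from the htm7 value of the data ID (format() only applies
# this when the data ID is a DataCoordinate). With zero or more than one
# skypix dimension the alias is `None` and the generic {skypix} field is not
# available.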