# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""Support for file template string expansion."""

__all__ = ("FileTemplates", "FileTemplate", "FileTemplatesConfig", "FileTemplateValidationError")

import logging
import os.path
import string
from types import MappingProxyType
from typing import TYPE_CHECKING, Any, Iterable, Mapping, Optional, Set, Tuple, Union

from .config import Config
from .configSupport import LookupKey, processLookupConfigs
from .datasets import DatasetRef
from .dimensions import DataCoordinate, SkyPixDimension
from .exceptions import ValidationError
from .storageClass import StorageClass

if TYPE_CHECKING:
    from .datasets import DatasetType
    from .dimensions import DimensionUniverse

log = logging.getLogger(__name__)


class FileTemplateValidationError(ValidationError):
    """Exception for file template inconsistent with associated DatasetType."""

    pass


class FileTemplatesConfig(Config):
    """Configuration information for `FileTemplates`."""

    pass


class FileTemplates:
    """Collection of `FileTemplate` templates.

    Parameters
    ----------
    config : `FileTemplatesConfig` or `str`
        Load configuration.
    default : `str`, optional
        If not `None`, a default template to use if no template has
        been specified explicitly in the configuration.
    universe : `DimensionUniverse`
        The set of all known dimensions, used to normalize any lookup keys
        involving dimensions.

    Notes
    -----
    The configuration can include one level of hierarchy where an
    instrument-specific section can be defined to override more general
    template specifications. This is represented in YAML using a
    key of form ``instrument<name>`` which can then define templates
    that will be returned if a `DatasetRef` contains a matching instrument
    name in the data ID.

    A default fallback template can be specified using the key ``default``.
    Defaulting can be disabled in a child configuration by defining the
    value to be an empty string or a boolean `False`.

    The config is parsed using the function
    `~lsst.daf.butler.configSupport.processLookupConfigs`.

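    As an illustration only, a configuration following this layout might
    look like the sketch below; the dataset type and instrument names are
    hypothetical::

        default: "{run}/{datasetType}/{id}"
        calexp: "{run}/{datasetType}/{visit}/{id}"
        instrument<HSC>:
          calexp: "{run}/{datasetType}/{visit}/{id}_{detector}"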
89 """ 

90 

91 defaultKey = LookupKey("default") 

92 """Configuration key associated with the default template.""" 

93 

94 def __init__( 

95 self, 

96 config: Union[FileTemplatesConfig, str], 

97 default: Optional[str] = None, 

98 *, 

99 universe: DimensionUniverse, 

100 ): 

101 self.config = FileTemplatesConfig(config) 

102 self._templates = {} 

103 

104 contents = processLookupConfigs(self.config, universe=universe) 

105 

106 # Determine default to use -- defaults can be disabled if 

107 # we get a False or None 

108 defaultValue = contents.get(self.defaultKey, default) 

109 if defaultValue and not isinstance(defaultValue, str): 

110 raise RuntimeError( 

111 f"Default template value should be str or False, or None. Got '{defaultValue}'" 

112 ) 

113 self.default = FileTemplate(defaultValue) if isinstance(defaultValue, str) and defaultValue else None 

114 

115 # Convert all the values to FileTemplate, handling defaults 

116 for key, templateStr in contents.items(): 

117 if key == self.defaultKey: 

118 continue 

119 if not isinstance(templateStr, str): 

120 raise RuntimeError(f"Unexpected value in file template key {key}: {templateStr}") 

121 self._templates[key] = FileTemplate(templateStr) 

122 

123 @property 

124 def templates(self) -> Mapping[LookupKey, FileTemplate]: 

125 """Return collection of templates indexed by lookup key (`dict`).""" 

126 return MappingProxyType(self._templates) 

127 

128 def __contains__(self, key: LookupKey) -> bool: 

129 """Indicate whether the supplied key is present in the templates. 

130 

131 Parameters 

132 ---------- 

133 key : `LookupKey` 

134 Key to use to determine if a corresponding value is present 

135 in the templates. 

136 

137 Returns 

138 ------- 

139 in : `bool` 

140 `True` if the supplied key is present in the templates. 

141 """ 

142 return key in self.templates 

143 

144 def __getitem__(self, key: LookupKey) -> FileTemplate: 

145 return self.templates[key] 

146 

147 def validateTemplates( 

148 self, entities: Iterable[Union[DatasetType, DatasetRef, StorageClass]], logFailures: bool = False 

149 ) -> None: 

150 """Validate the templates. 

151 

152 Retrieves the template associated with each dataset type and 

153 validates the dimensions against the template. 

154 

155 Parameters 

156 ---------- 

157 entities : `DatasetType`, `DatasetRef`, or `StorageClass` 

158 Entities to validate against the matching templates. Can be 

159 differing types. 

160 logFailures : `bool`, optional 

161 If `True`, output a log message for every validation error 

162 detected. 

163 

164 Raises 

165 ------ 

166 FileTemplateValidationError 

167 Raised if an entity failed validation. 

168 

169 Notes 

170 ----- 

171 See `FileTemplate.validateTemplate()` for details on the validation. 

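        As a usage sketch (the variable names here are hypothetical), a
        datastore holding a ``templates`` instance might call
        ``templates.validateTemplates(entities, logFailures=True)`` so that
        every template problem is logged and then reported in a single
        exception.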
172 """ 

173 unmatchedKeys = set(self.templates) 

174 failed = [] 

175 for entity in entities: 

176 try: 

177 matchKey, template = self.getTemplateWithMatch(entity) 

178 except KeyError as e: 

179 # KeyError always quotes on stringification so strip here 

180 errMsg = str(e).strip("\"'") 

181 failed.append(errMsg) 

182 if logFailures: 

183 log.critical("%s", errMsg) 

184 continue 

185 

186 if matchKey in unmatchedKeys: 

187 unmatchedKeys.remove(matchKey) 

188 

189 try: 

190 template.validateTemplate(entity) 

191 except FileTemplateValidationError as e: 

192 failed.append(f"{e} (via key '{matchKey}')") 

193 if logFailures: 

194 log.critical("Template failure with key '%s': %s", matchKey, e) 

195 

196 if logFailures and unmatchedKeys: 

197 log.warning("Unchecked keys: %s", ", ".join([str(k) for k in unmatchedKeys])) 

198 

199 if failed: 

200 if len(failed) == 1: 

201 msg = str(failed[0]) 

202 else: 

203 failMsg = ";\n".join(failed) 

204 msg = f"{len(failed)} template validation failures: {failMsg}" 

205 raise FileTemplateValidationError(msg) 

206 

207 def getLookupKeys(self) -> Set[LookupKey]: 

208 """Retrieve the look up keys for all the template entries. 

209 

210 Returns 

211 ------- 

212 keys : `set` of `LookupKey` 

213 The keys available for matching a template. 

214 """ 

215 return set(self.templates) 

216 

217 def getTemplateWithMatch( 

218 self, entity: Union[DatasetRef, DatasetType, StorageClass] 

219 ) -> Tuple[LookupKey, FileTemplate]: 

220 """Retrieve the `FileTemplate` associated with the dataset type. 

221 

222 Also retrieves the lookup key that was a match for this template. 

223 

224 If the lookup name corresponds to a component the base name for 

225 the component will be examined if the full component name does 

226 not match. 

227 

228 Parameters 

229 ---------- 

230 entity : `DatasetType`, `DatasetRef`, or `StorageClass` 

231 Instance to use to look for a corresponding template. 

232 A `DatasetType` name or a `StorageClass` name will be used 

233 depending on the supplied entity. Priority is given to a 

234 `DatasetType` name. Supports instrument override if a 

235 `DatasetRef` is provided configured with an ``instrument`` 

236 value for the data ID. 

237 

238 Returns 

239 ------- 

240 matchKey : `LookupKey` 

241 The key that resulted in the successful match. 

242 template : `FileTemplate` 

243 Template instance to use with that dataset type. 

244 

245 Raises 

246 ------ 

247 KeyError 

248 Raised if no template could be located for this Dataset type. 

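        Notes
        -----
        As a rough sketch of the lookup order (the dataset type and
        instrument names here are hypothetical): for a component dataset
        type such as ``calexp.wcs`` with ``instrument="HSC"`` in the data
        ID, instrument-specific keys are consulted before general ones,
        and the full component name before its parent name, with the
        storage class name and then the ``default`` template serving as
        final fallbacks.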
249 """ 

250 # Get the names to use for lookup 

251 names = entity._lookupNames() 

252 

253 # Get a location from the templates 

254 template = self.default 

255 source = self.defaultKey 

256 for name in names: 

257 if name in self.templates: 

258 template = self.templates[name] 

259 source = name 

260 break 

261 

262 if template is None: 

263 raise KeyError(f"Unable to determine file template from supplied argument [{entity}]") 

264 

265 log.debug("Got file %s from %s via %s", template, entity, source) 

266 

267 return source, template 

268 

269 def getTemplate(self, entity: Union[DatasetType, DatasetRef, StorageClass]) -> FileTemplate: 

270 """Retrieve the `FileTemplate` associated with the dataset type. 

271 

272 If the lookup name corresponds to a component the base name for 

273 the component will be examined if the full component name does 

274 not match. 

275 

276 Parameters 

277 ---------- 

278 entity : `DatasetType`, `DatasetRef`, or `StorageClass` 

279 Instance to use to look for a corresponding template. 

280 A `DatasetType` name or a `StorageClass` name will be used 

281 depending on the supplied entity. Priority is given to a 

282 `DatasetType` name. Supports instrument override if a 

283 `DatasetRef` is provided configured with an ``instrument`` 

284 value for the data ID. 

285 

286 Returns 

287 ------- 

288 template : `FileTemplate` 

289 Template instance to use with that dataset type. 

290 

291 Raises 

292 ------ 

293 KeyError 

294 Raised if no template could be located for this Dataset type. 

295 """ 

296 _, template = self.getTemplateWithMatch(entity) 

297 return template 

298 

299 

300class FileTemplate: 

301 """Format a path template into a fully expanded path. 

302 

303 Parameters 

304 ---------- 

305 template : `str` 

306 Template string. 

307 

308 Raises 

309 ------ 

310 FileTemplateValidationError 

311 Raised if the template fails basic validation. 

312 

313 Notes 

314 ----- 

315 The templates use the standard Format Specification Mini-Language 

316 with the caveat that only named fields can be used. The field names 

317 are taken from the Dimensions along with several additional fields: 

318 

319 - datasetType: `str`, `DatasetType.name` 

320 - component: `str`, name of the StorageClass component 

321 - run: `str`, name of the run this dataset was added with 

322 

323 `run` must always be provided to ensure unique paths. 

324 

325 More detailed information can be requested from dimensions by using a dot 

326 notation, so ``visit.name`` would use the name of the visit and 

327 ``detector.name_in_raft`` would use the name of the detector within the 

328 raft. 

329 

330 The mini-language is extended to understand a "?" in the format 

331 specification. This indicates that a field is optional. If that 

332 Dimension is missing the field, along with the text before the field, 

333 unless it is a path separator, will be removed from the output path. 

334 

335 By default any "/" in a dataId value will be replaced by "_" to prevent 

336 unexpected directories being created in the path. If the "/" should be 

337 retained then a special "/" format specifier can be included in the 

338 template. 

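    As an illustration only, consider the hypothetical template::

        {run}/{datasetType}/{visit}/{id}_{component:?}

    For a non-component dataset the ``component`` field, along with the
    literal ``_`` preceding it, would be dropped because of the ``?``
    optional marker, while a component dataset would retain both.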
339 """ 

340 

341 mandatoryFields = {"run", "id"} 

342 """A set of fields, one of which must be present in a template.""" 

343 

344 datasetFields = {"datasetType", "component"} 

345 """Fields related to the supplied dataset, not a dimension.""" 

346 

347 specialFields = mandatoryFields | datasetFields 

348 """Set of special fields that are available independently of the defined 

349 Dimensions.""" 

350 

351 def __init__(self, template: str): 

352 if not isinstance(template, str): 

353 raise FileTemplateValidationError( 

354 f"Template ('{template}') does not contain any format specifiers" 

355 ) 

356 self.template = template 

357 

358 # Do basic validation without access to dimensions 

359 self.validateTemplate(None) 

360 

361 def __eq__(self, other: Any) -> bool: 

362 if not isinstance(other, FileTemplate): 

363 return False 

364 

365 return self.template == other.template 

366 

367 def __str__(self) -> str: 

368 return self.template 

369 

370 def __repr__(self) -> str: 

371 return f'{self.__class__.__name__}("{self.template}")' 

372 

373 def fields(self, optionals: bool = False, specials: bool = False, subfields: bool = False) -> Set[str]: 

374 """Return the field names used in this template. 

375 

376 Parameters 

377 ---------- 

378 optionals : `bool` 

379 If `True`, optional fields are included in the returned set. 

380 specials : `bool` 

381 If `True`, non-dimension fields are included. 

382 subfields : `bool`, optional 

383 If `True`, fields with syntax ``a.b`` are included. If `False`, 

384 the default, only ``a`` would be returned. 

385 

386 Returns 

387 ------- 

388 names : `set` 

389 Names of fields used in this template 

390 

391 Notes 

392 ----- 

393 The returned set will include the special values such as `datasetType` 

394 and `component`. 

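        Examples
        --------
        A sketch of the expected behavior for a hypothetical template:

        >>> t = FileTemplate("{run}/{datasetType}/{visit}/{id}_{physical_filter:?}")
        >>> sorted(t.fields())
        ['visit']
        >>> sorted(t.fields(optionals=True))
        ['physical_filter', 'visit']
        >>> sorted(t.fields(specials=True, optionals=True))
        ['datasetType', 'id', 'physical_filter', 'run', 'visit']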
395 """ 

396 fmt = string.Formatter() 

397 parts = fmt.parse(self.template) 

398 

399 names = set() 

400 for literal, field_name, format_spec, conversion in parts: 

401 if field_name is not None and format_spec is not None: 

402 if "?" in format_spec and not optionals: 

403 continue 

404 

405 if not specials and field_name in self.specialFields: 

406 continue 

407 

408 if "." in field_name and not subfields: 

409 field_name, _ = field_name.split(".") 

410 

411 names.add(field_name) 

412 

413 return names 

414 

415 def format(self, ref: DatasetRef) -> str: 

416 """Format a template string into a full path. 

417 

418 Parameters 

419 ---------- 

420 ref : `DatasetRef` 

421 The dataset to be formatted. 

422 

423 Returns 

424 ------- 

425 path : `str` 

426 Expanded path. 

427 

428 Raises 

429 ------ 

430 KeyError 

431 Raised if the requested field is not defined and the field is 

432 not optional. Or, `component` is specified but "component" was 

433 not part of the template. 

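        Notes
        -----
        As an illustration only (all values here are hypothetical), a
        template of ``{run}/{datasetType}/{visit}/{id}`` formatted for a
        dataset with ``run="ingest"``, dataset type ``raw``, ``visit=42``
        and dataset ID ``3`` would expand to ``ingest/raw/42/3``.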
434 """ 

435 # Extract defined non-None dimensions from the dataId. 

436 # This guards against Nones being explicitly present in the data ID 

437 # (which can happen if, say, an exposure has no filter), as well as 

438 # the case where only required dimensions are present (which in this 

439 # context should only happen in unit tests; in general we need all 

440 # dimensions to fill out templates). 

441 fields = { 

442 k: ref.dataId.get(k) for k in ref.datasetType.dimensions.names if ref.dataId.get(k) is not None 

443 } 

444 # Extra information that can be included using . syntax 

445 extras = {} 

446 if isinstance(ref.dataId, DataCoordinate): 

447 if ref.dataId.hasRecords(): 

448 extras = ref.dataId.records.byName() 

449 skypix_alias = self._determine_skypix_alias(ref) 

450 if skypix_alias is not None: 

451 fields["skypix"] = fields[skypix_alias] 

452 if extras: 

453 extras["skypix"] = extras[skypix_alias] 

454 

455 datasetType = ref.datasetType 

456 fields["datasetType"], component = datasetType.nameAndComponent() 

457 

458 usedComponent = False 

459 if component is not None: 

460 fields["component"] = component 

461 

462 fields["run"] = ref.run 

463 fields["id"] = ref.id 

464 

465 fmt = string.Formatter() 

466 parts = fmt.parse(self.template) 

467 output = "" 

468 

469 for literal, field_name, format_spec, conversion in parts: 

470 

471 if field_name == "component": 

472 usedComponent = True 

473 

474 if format_spec is None: 

475 output = output + literal 

476 continue 

477 

478 # Should only happen if format_spec is None 

479 if field_name is None: 

480 raise RuntimeError(f"Unexpected blank field_name encountered in {self.template} [{literal}]") 

481 

482 if "?" in format_spec: 

483 optional = True 

484 # Remove the non-standard character from the spec 

485 format_spec = format_spec.replace("?", "") 

486 else: 

487 optional = False 

488 

489 # Check for request for additional information from the dataId 

490 if "." in field_name: 

491 primary, secondary = field_name.split(".") 

492 if primary in extras: 

493 record = extras[primary] 

494 # Only fill in the fields if we have a value, the 

495 # KeyError will trigger below if the attribute is missing. 

496 if hasattr(record, secondary): 

497 fields[field_name] = getattr(record, secondary) 

498 

499 if field_name in fields: 

500 value = fields[field_name] 

501 elif optional: 

502 # If this is optional ignore the format spec 

503 # and do not include the literal text prior to the optional 

504 # field unless it contains a "/" path separator 

505 format_spec = "" 

506 value = "" 

507 if "/" not in literal: 

508 literal = "" 

509 else: 

510 raise KeyError( 

511 f"'{field_name}' requested in template via '{self.template}' " 

512 "but not defined and not optional" 

513 ) 

514 

515 # Handle "/" in values since we do not want to be surprised by 

516 # unexpected directories turning up 

517 replace_slash = True 

518 if "/" in format_spec: 

519 # Remove the non-standard character from the spec 

520 format_spec = format_spec.replace("/", "") 

521 replace_slash = False 

522 

523 if isinstance(value, str): 

524 # Replace spaces with underscores for more friendly file paths 

525 value = value.replace(" ", "_") 

526 if replace_slash: 

527 value = value.replace("/", "_") 

528 

529 # Now use standard formatting 

530 output = output + literal + format(value, format_spec) 

531 

532 # Replace periods with underscores in the non-directory part to 

533 # prevent file extension confusion. Also replace # in the non-dir 

534 # part to avoid confusion with URI fragments 

535 head, tail = os.path.split(output) 

536 tail = tail.replace(".", "_") 

537 tail = tail.replace("#", "HASH") 

538 output = os.path.join(head, tail) 

539 

540 # Complain if we were meant to use a component 

541 if component is not None and not usedComponent: 

542 raise KeyError( 

543 "Component '{}' specified but template {} did not use it".format(component, self.template) 

544 ) 

545 

546 # Since this is known to be a path, normalize it in case some double 

547 # slashes have crept in 

548 path = os.path.normpath(output) 

549 

550 # It should not be an absolute path (may happen with optionals) 

551 if os.path.isabs(path): 

552 path = os.path.relpath(path, start="/") 

553 

554 return path 

555 


    def validateTemplate(self, entity: Union[DatasetRef, DatasetType, StorageClass, None]) -> None:
        """Compare the template against a supplied entity that wants to use it.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Entity to compare against the template. If `None` is given,
            only very basic validation of templates will be performed.

        Raises
        ------
        FileTemplateValidationError
            Raised if the template is inconsistent with the supplied entity.

        Notes
        -----
        Validation will always include a check that mandatory fields
        are present and that at least one field refers to a dimension.
        If the supplied entity includes a `DimensionGraph` then it will be
        used to compare the available dimensions with those specified in the
        template.

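        As an illustration only (hypothetical templates): ``{run}/fixed``
        would pass the mandatory-field check but fail validation because
        it contains neither a dimension field nor ``id``, while
        ``{run}/{visit}/{id}`` passes the basic checks.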
577 """ 

578 # Check that the template has run 

579 withSpecials = self.fields(specials=True, optionals=True) 

580 

581 if "collection" in withSpecials: 

582 raise FileTemplateValidationError( 

583 "'collection' is no longer supported as a file template placeholder; use 'run' instead." 

584 ) 

585 

586 if not withSpecials & self.mandatoryFields: 

587 raise FileTemplateValidationError( 

588 f"Template '{self}' is missing a mandatory field from {self.mandatoryFields}" 

589 ) 

590 

591 # Check that there are some dimension fields in the template 

592 # The id is allowed instead if present since that also uniquely 

593 # identifies the file in the datastore. 

594 allfields = self.fields(optionals=True) 

595 if not allfields and "id" not in withSpecials: 

596 raise FileTemplateValidationError( 

597 f"Template '{self}' does not seem to have any fields corresponding to dimensions." 

598 ) 

599 

600 # Require that if "id" is in the template then it must exist in the 

601 # file part -- this avoids templates like "{id}/fixed" where the file 

602 # name is fixed but the directory has the ID. 

603 if "id" in withSpecials: 

604 file_part = os.path.split(self.template)[-1] 

605 if "{id}" not in file_part: 

606 raise FileTemplateValidationError( 

607 f"Template '{self}' includes the 'id' but that ID is not part of the file name." 

608 ) 

609 

610 # If we do not have dimensions available then all we can do is shrug 

611 if not hasattr(entity, "dimensions"): 

612 return 

613 

614 # Mypy does not know about hasattr so help it out 

615 if entity is None: 

616 return 

617 

618 # if this entity represents a component then insist that component 

619 # is present in the template. If the entity is not a component 

620 # make sure that component is not mandatory. 

621 try: 

622 # mypy does not see the except block so complains about 

623 # StorageClass not supporting isComponent 

624 if entity.isComponent(): # type: ignore 

625 if "component" not in withSpecials: 

626 raise FileTemplateValidationError( 

627 f"Template '{self}' has no component but {entity} refers to a component." 

628 ) 

629 else: 

630 mandatorySpecials = self.fields(specials=True) 

631 if "component" in mandatorySpecials: 

632 raise FileTemplateValidationError( 

633 f"Template '{self}' has mandatory component but " 

634 f"{entity} does not refer to a component." 

635 ) 

636 except AttributeError: 

637 pass 

638 

639 # From here on we need at least a DatasetType 

640 # Mypy doesn't understand the AttributeError clause below 

641 if isinstance(entity, StorageClass): 

642 return 

643 

644 # Get the dimension links to get the full set of available field names 

645 # Fall back to dataId keys if we have them but no links. 

646 # dataId keys must still be present in the template 

647 try: 

648 minimal = set(entity.dimensions.required.names) 

649 maximal = set(entity.dimensions.names) 

650 except AttributeError: 

651 try: 

652 minimal = set(entity.dataId.keys().names) # type: ignore 

653 maximal = minimal 

654 except AttributeError: 

655 return 

656 

657 # Replace specific skypix dimensions with generic one 

658 skypix_alias = self._determine_skypix_alias(entity) 

659 if skypix_alias is not None: 

660 minimal.add("skypix") 

661 maximal.add("skypix") 

662 minimal.remove(skypix_alias) 

663 maximal.remove(skypix_alias) 

664 

665 required = self.fields(optionals=False) 

666 

667 # Calculate any field usage that does not match a dimension 

668 if not required.issubset(maximal): 

669 raise FileTemplateValidationError( 

670 f"Template '{self}' is inconsistent with {entity}:" 

671 f" {required} is not a subset of {maximal}." 

672 ) 

673 

674 if not allfields.issuperset(minimal): 

675 raise FileTemplateValidationError( 

676 f"Template '{self}' is inconsistent with {entity}:" 

677 f" {allfields} is not a superset of {minimal}." 

678 ) 

679 

680 return 

681 


    def _determine_skypix_alias(self, entity: Union[DatasetRef, DatasetType]) -> Optional[str]:
        """Return the dimension name that refers to a sky pixel.

        Parameters
        ----------
        entity : `DatasetRef` or `DatasetType`
            The entity to examine.

        Returns
        -------
        alias : `str`
            If there is a sky pixelization in the supplied dataId, return
            its name, else return `None`. `None` is also returned if there
            is more than one sky pix dimension in the data ID or if the
            data ID is not a `DataCoordinate`.
        """
        alias = None

        if isinstance(entity, DatasetRef):
            entity = entity.datasetType

        # If there is exactly one SkyPixDimension in the data ID, alias its
        # value with the key "skypix", so we can use that to match any
        # skypix dimension.
        # We restrict this behavior to the (real-world) case where the
        # data ID is a DataCoordinate, not just a dict. That should fail
        # to hold only in some test code, but that test code is a pain to
        # update to be more like the real world while still providing our
        # only tests of important behavior.
        skypix = [dimension for dimension in entity.dimensions if isinstance(dimension, SkyPixDimension)]
        if len(skypix) == 1:
            alias = skypix[0].name
        return alias