# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""Support for file template string expansion."""

__all__ = ("FileTemplates", "FileTemplate", "FileTemplatesConfig", "FileTemplateValidationError")

import os.path
import string
import logging
from types import MappingProxyType

from typing import (
    TYPE_CHECKING,
    Any,
    Iterable,
    Mapping,
    Optional,
    Set,
    Tuple,
    Union,
)

from .config import Config
from .configSupport import processLookupConfigs, LookupKey
from .exceptions import ValidationError
from .dimensions import SkyPixDimension, DataCoordinate
from .datasets import DatasetRef
from .storageClass import StorageClass

if TYPE_CHECKING:
    from .dimensions import DimensionUniverse
    from .datasets import DatasetType

log = logging.getLogger(__name__)


class FileTemplateValidationError(ValidationError):
    """Exception for file template inconsistent with associated DatasetType."""

    pass


class FileTemplatesConfig(Config):
    """Configuration information for `FileTemplates`."""

    pass


class FileTemplates:
    """Collection of `FileTemplate` templates.

    Parameters
    ----------
    config : `FileTemplatesConfig` or `str`
        Load configuration.
    default : `str`, optional
        If not `None`, a default template to use if no template has
        been specified explicitly in the configuration.
    universe : `DimensionUniverse`
        The set of all known dimensions, used to normalize any lookup keys
        involving dimensions.

    Notes
    -----
    The configuration can include one level of hierarchy where an
    instrument-specific section can be defined to override more general
    template specifications.  This is represented in YAML using a
    key of form ``instrument<name>`` which can then define templates
    that will be returned if a `DatasetRef` contains a matching instrument
    name in the data ID.

    A default fallback template can be specified using the key ``default``.
    Defaulting can be disabled in a child configuration by defining the
    value to be an empty string or a boolean `False`.

    The config is parsed using the function
    `~lsst.daf.butler.configSupport.processLookupConfigs`.
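
    As an illustrative sketch (the dataset type names, dimension names, and
    instrument name here are hypothetical, not taken from a shipped
    configuration), a template configuration could look like::

        default: "{run}/{datasetType}/{visit:08d}"
        calexp: "{run}/{datasetType}/{visit:08d}_{detector:03d}"
        instrument<HSC>:
          calexp: "{run}/{datasetType}/{visit:08d}/det{detector:03d}"

    Here ``calexp`` datasets taken with instrument ``HSC`` would match the
    override, while everything else would fall back to ``default``.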

99 """ 

100 

101 defaultKey = LookupKey("default") 

102 """Configuration key associated with the default template.""" 

103 

104 def __init__(self, config: Union[FileTemplatesConfig, str], 

105 default: Optional[str] = None, *, 

106 universe: DimensionUniverse): 

107 self.config = FileTemplatesConfig(config) 

108 self._templates = {} 

109 

110 contents = processLookupConfigs(self.config, universe=universe) 

111 

112 # Determine default to use -- defaults can be disabled if 

113 # we get a False or None 

114 defaultValue = contents.get(self.defaultKey, default) 

115 if defaultValue and not isinstance(defaultValue, str): 

116 raise RuntimeError("Default template value should be str or False, or None. " 

117 f"Got '{defaultValue}'") 

118 self.default = FileTemplate(defaultValue) if isinstance(defaultValue, str) and defaultValue else None 

119 

120 # Convert all the values to FileTemplate, handling defaults 

121 for key, templateStr in contents.items(): 

122 if key == self.defaultKey: 

123 continue 

124 if not isinstance(templateStr, str): 

125 raise RuntimeError(f"Unexpected value in file template key {key}: {templateStr}") 

126 self._templates[key] = FileTemplate(templateStr) 

127 

    @property
    def templates(self) -> Mapping[LookupKey, FileTemplate]:
        """Return collection of templates indexed by lookup key (`dict`)."""
        return MappingProxyType(self._templates)

    def __contains__(self, key: LookupKey) -> bool:
        """Indicate whether the supplied key is present in the templates.

        Parameters
        ----------
        key : `LookupKey`
            Key to use to determine if a corresponding value is present
            in the templates.

        Returns
        -------
        in : `bool`
            `True` if the supplied key is present in the templates.
        """
        return key in self.templates

    def __getitem__(self, key: LookupKey) -> FileTemplate:
        return self.templates[key]

    def validateTemplates(self, entities: Iterable[Union[DatasetType, DatasetRef, StorageClass]],
                          logFailures: bool = False) -> None:
        """Validate the templates.

        Retrieves the template associated with each dataset type and
        validates the dimensions against the template.

        Parameters
        ----------
        entities : iterable of `DatasetType`, `DatasetRef`, or `StorageClass`
            Entities to validate against the matching templates.  Can be
            of differing types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        FileTemplateValidationError
            Raised if an entity failed validation.

        Notes
        -----
        See `FileTemplate.validateTemplate()` for details on the validation.
        """
        unmatchedKeys = set(self.templates)
        failed = []
        for entity in entities:
            try:
                matchKey, template = self.getTemplateWithMatch(entity)
            except KeyError as e:
                # KeyError always quotes its message on stringification
                # so strip the quotes here
                errMsg = str(e).strip('"\'')
                failed.append(errMsg)
                if logFailures:
                    log.critical("%s", errMsg)
                continue

            if matchKey in unmatchedKeys:
                unmatchedKeys.remove(matchKey)

            try:
                template.validateTemplate(entity)
            except FileTemplateValidationError as e:
                failed.append(f"{e} (via key '{matchKey}')")
                if logFailures:
                    log.critical("Template failure with key '%s': %s", matchKey, e)

        if logFailures and unmatchedKeys:
            log.warning("Unchecked keys: %s", ", ".join([str(k) for k in unmatchedKeys]))

        if failed:
            if len(failed) == 1:
                msg = str(failed[0])
            else:
                failMsg = ";\n".join(failed)
                msg = f"{len(failed)} template validation failures: {failMsg}"
            raise FileTemplateValidationError(msg)

    def getLookupKeys(self) -> Set[LookupKey]:
        """Retrieve the look-up keys for all the template entries.

        Returns
        -------
        keys : `set` of `LookupKey`
            The keys available for matching a template.
        """
        return set(self.templates)

    def getTemplateWithMatch(self, entity: Union[DatasetRef, DatasetType,
                                                 StorageClass]) -> Tuple[LookupKey, FileTemplate]:
        """Retrieve the `FileTemplate` associated with the dataset type.

        Also retrieves the lookup key that was a match for this template.

        If the lookup name corresponds to a component, the base name for
        the component will be examined if the full component name does
        not match.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Instance to use to look for a corresponding template.
            A `DatasetType` name or a `StorageClass` name will be used
            depending on the supplied entity.  Priority is given to a
            `DatasetType` name.  Instrument overrides are supported if a
            `DatasetRef` with an ``instrument`` value in its data ID is
            provided.

        Returns
        -------
        matchKey : `LookupKey`
            The key that resulted in the successful match.
        template : `FileTemplate`
            Template instance to use with that dataset type.

        Raises
        ------
        KeyError
            Raised if no template could be located for this dataset type.
        """
        # Get the names to use for lookup
        names = entity._lookupNames()

        # Get a location from the templates
        template = self.default
        source = self.defaultKey
        for name in names:
            if name in self.templates:
                template = self.templates[name]
                source = name
                break

        if template is None:
            raise KeyError(f"Unable to determine file template from supplied argument [{entity}]")

        log.debug("Got file template %s for %s via %s", template, entity, source)

        return source, template

    def getTemplate(self, entity: Union[DatasetType, DatasetRef, StorageClass]) -> FileTemplate:
        """Retrieve the `FileTemplate` associated with the dataset type.

        If the lookup name corresponds to a component, the base name for
        the component will be examined if the full component name does
        not match.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Instance to use to look for a corresponding template.
            A `DatasetType` name or a `StorageClass` name will be used
            depending on the supplied entity.  Priority is given to a
            `DatasetType` name.  Instrument overrides are supported if a
            `DatasetRef` with an ``instrument`` value in its data ID is
            provided.

        Returns
        -------
        template : `FileTemplate`
            Template instance to use with that dataset type.

        Raises
        ------
        KeyError
            Raised if no template could be located for this dataset type.
        """
        _, template = self.getTemplateWithMatch(entity)
        return template


class FileTemplate:
    """Format a path template into a fully expanded path.

    Parameters
    ----------
    template : `str`
        Template string.

    Raises
    ------
    FileTemplateValidationError
        Raised if the template fails basic validation.

    Notes
    -----
    The templates use the standard Format Specification Mini-Language
    with the caveat that only named fields can be used.  The field names
    are taken from the Dimensions along with several additional fields:

    - datasetType: `str`, `DatasetType.name`
    - component: `str`, name of the StorageClass component
    - run: `str`, name of the run this dataset was added with

    ``run`` must always be provided to ensure unique paths.

    More detailed information can be requested from dimensions by using a
    dot notation, so ``visit.name`` would use the name of the visit and
    ``detector.name_in_raft`` would use the name of the detector within the
    raft.

    The mini-language is extended to understand a "?" in the format
    specification.  This indicates that the field is optional.  If the
    corresponding value is missing, the field, along with the literal text
    preceding it (unless that text contains a path separator), will be
    removed from the output path.

    By default any "/" in a dataId value will be replaced by "_" to prevent
    unexpected directories being created in the path.  If the "/" should be
    retained then a special "/" format specifier can be included in the
    template.
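
    As an illustrative sketch (the dimension names are chosen for the
    example and are not a recommended default), the template::

        "{run}/{datasetType}/{visit:08d}_{detector:03d}_{component:?}"

    zero-pads the visit and detector values, and the trailing component
    field, together with the "_" before it, is dropped whenever the dataset
    does not refer to a component.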

343 """ 

344 

345 mandatoryFields = {"run"} 

346 """A set of fields, one of which must be present in a template.""" 

347 

348 datasetFields = {"datasetType", "component"} 

349 """Fields related to the supplied dataset, not a dimension.""" 

350 

351 specialFields = mandatoryFields | datasetFields 

352 """Set of special fields that are available independently of the defined 

353 Dimensions.""" 

354 

355 def __init__(self, template: str): 

356 if not isinstance(template, str): 

357 raise FileTemplateValidationError(f"Template ('{template}') does " 

358 "not contain any format specifiers") 

359 self.template = template 

360 

361 # Do basic validation without access to dimensions 

362 self.validateTemplate(None) 

363 

364 def __eq__(self, other: Any) -> bool: 

365 if not isinstance(other, FileTemplate): 

366 return False 

367 

368 return self.template == other.template 

369 

370 def __str__(self) -> str: 

371 return self.template 

372 

373 def __repr__(self) -> str: 

374 return f'{self.__class__.__name__}("{self.template}")' 

375 

    def fields(self, optionals: bool = False, specials: bool = False, subfields: bool = False) -> Set[str]:
        """Return the field names used in this template.

        Parameters
        ----------
        optionals : `bool`, optional
            If `True`, optional fields are included in the returned set.
        specials : `bool`, optional
            If `True`, non-dimension fields are included.
        subfields : `bool`, optional
            If `True`, fields with syntax ``a.b`` are included.  If `False`,
            the default, only ``a`` would be returned.

        Returns
        -------
        names : `set`
            Names of fields used in this template.

        Notes
        -----
        Only if ``specials`` is `True` will the returned set include the
        special values such as ``datasetType`` and ``component``.
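
        Examples
        --------
        A short illustration using a hypothetical template:

        >>> t = FileTemplate("{run}/{datasetType}/{visit:08d}_{component:?}")
        >>> sorted(t.fields())
        ['visit']
        >>> sorted(t.fields(specials=True, optionals=True))
        ['component', 'datasetType', 'run', 'visit']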

398 """ 

399 fmt = string.Formatter() 

400 parts = fmt.parse(self.template) 

401 

402 names = set() 

403 for literal, field_name, format_spec, conversion in parts: 

404 if field_name is not None and format_spec is not None: 

405 if "?" in format_spec and not optionals: 

406 continue 

407 

408 if not specials and field_name in self.specialFields: 

409 continue 

410 

411 if "." in field_name and not subfields: 

412 field_name, _ = field_name.split(".") 

413 

414 names.add(field_name) 

415 

416 return names 

417 

    def format(self, ref: DatasetRef) -> str:
        """Format a template string into a full path.

        Parameters
        ----------
        ref : `DatasetRef`
            The dataset to be formatted.

        Returns
        -------
        path : `str`
            Expanded path.

        Raises
        ------
        KeyError
            Raised if a requested field is not defined and the field is
            not optional.  Also raised if a ``component`` is specified in
            the `DatasetRef` but the template does not include a
            "component" field.
        """
        # Extract defined non-None dimensions from the dataId.
        # This guards against Nones being explicitly present in the data ID
        # (which can happen if, say, an exposure has no filter), as well as
        # the case where only required dimensions are present (which in this
        # context should only happen in unit tests; in general we need all
        # dimensions to fill out templates).
        fields = {k: ref.dataId.get(k) for k in ref.datasetType.dimensions.names
                  if ref.dataId.get(k) is not None}
        # Extra information that can be included using . syntax
        extras = {}
        if isinstance(ref.dataId, DataCoordinate):
            if ref.dataId.hasRecords():
                extras = ref.dataId.records.byName()
            skypix_alias = self._determine_skypix_alias(ref)
            if skypix_alias is not None:
                fields["skypix"] = fields[skypix_alias]
                if extras:
                    extras["skypix"] = extras[skypix_alias]

        datasetType = ref.datasetType
        fields["datasetType"], component = datasetType.nameAndComponent()

        usedComponent = False
        if component is not None:
            fields["component"] = component

        usedRun = False
        fields["run"] = ref.run

        fmt = string.Formatter()
        parts = fmt.parse(self.template)
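
        # string.Formatter.parse yields (literal_text, field_name,
        # format_spec, conversion) tuples; for trailing literal text with no
        # substitution field the last three entries are None.  The loop
        # below relies on those None values to decide how each part is
        # handled.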

        output = ""

        for literal, field_name, format_spec, conversion in parts:

            if field_name == "component":
                usedComponent = True

            if format_spec is None:
                output = output + literal
                continue

            # A blank field_name should only occur when format_spec is also
            # None, and that case was handled above, so this is unexpected
            if field_name is None:
                raise RuntimeError(f"Unexpected blank field_name encountered in {self.template} [{literal}]")

            if "?" in format_spec:
                optional = True
                # Remove the non-standard character from the spec
                format_spec = format_spec.replace("?", "")
            else:
                optional = False

            if field_name == "run":
                usedRun = True

            if field_name == "collection":
                raise KeyError("'collection' is no longer supported as a "
                               "file template placeholder; use 'run' instead.")

            # Check for request for additional information from the dataId
            if "." in field_name:
                primary, secondary = field_name.split(".")
                if primary in extras:
                    record = extras[primary]
                    # Only fill in the fields if we have a value; the
                    # KeyError will trigger below if the attribute is missing.
                    if hasattr(record, secondary):
                        fields[field_name] = getattr(record, secondary)

            if field_name in fields:
                value = fields[field_name]
            elif optional:
                # If this is optional ignore the format spec
                # and do not include the literal text prior to the optional
                # field unless it contains a "/" path separator
                format_spec = ""
                value = ""
                if "/" not in literal:
                    literal = ""
            else:
                raise KeyError(f"'{field_name}' requested in template via '{self.template}' "
                               "but not defined and not optional")

            # Handle "/" in values since we do not want to be surprised by
            # unexpected directories turning up
            replace_slash = True
            if "/" in format_spec:
                # Remove the non-standard character from the spec
                format_spec = format_spec.replace("/", "")
                replace_slash = False

            if isinstance(value, str):
                # Replace spaces with underscores for more friendly file paths
                value = value.replace(" ", "_")
                if replace_slash:
                    value = value.replace("/", "_")

            # Now use standard formatting
            output = output + literal + format(value, format_spec)

        # Replace periods with underscores in the non-directory part to
        # prevent file extension confusion.  Also replace # in the non-dir
        # part to avoid confusion with URI fragments
        head, tail = os.path.split(output)
        tail = tail.replace(".", "_")
        tail = tail.replace("#", "HASH")
        output = os.path.join(head, tail)

        # Complain if we were meant to use a component
        if component is not None and not usedComponent:
            raise KeyError(f"Component '{component}' specified but "
                           f"template {self.template} did not use it")

        # Complain if there's no run
        if not usedRun:
            raise KeyError("Template does not include 'run'.")

        # Since this is known to be a path, normalize it in case some double
        # slashes have crept in
        path = os.path.normpath(output)

        # It should not be an absolute path (may happen with optionals)
        if os.path.isabs(path):
            path = os.path.relpath(path, start="/")

        return path

    def validateTemplate(self, entity: Union[DatasetRef, DatasetType, StorageClass, None]) -> None:
        """Compare the template against a supplied entity that wants to use it.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Entity to compare against template.  If `None` is given only
            very basic validation of templates will be performed.

        Raises
        ------
        FileTemplateValidationError
            Raised if the template is inconsistent with the supplied entity.

        Notes
        -----
        Validation will always include a check that mandatory fields
        are present and that at least one field refers to a dimension.
        If the supplied entity includes a `DimensionGraph` then it will be
        used to compare the available dimensions with those specified in the
        template.
        """
        # Check that the template includes the mandatory fields
        withSpecials = self.fields(specials=True, optionals=True)
        if not withSpecials & self.mandatoryFields:
            raise FileTemplateValidationError(f"Template '{self}' is missing a mandatory field"
                                              f" from {self.mandatoryFields}")

        # Check that there are some dimension fields in the template
        allfields = self.fields(optionals=True)
        if not allfields:
            raise FileTemplateValidationError(f"Template '{self}' does not seem to have any fields"
                                              " corresponding to dimensions.")

        # If we do not have dimensions available then all we can do is shrug
        if not hasattr(entity, "dimensions"):
            return

        # Mypy does not know about hasattr so help it out
        if entity is None:
            return

        # If this entity represents a component then insist that component
        # is present in the template.  If the entity is not a component
        # make sure that component is not mandatory.
        try:
            # mypy does not see the except block so complains about
            # StorageClass not supporting isComponent
            if entity.isComponent():  # type: ignore
                if "component" not in withSpecials:
                    raise FileTemplateValidationError(f"Template '{self}' has no component but "
                                                      f"{entity} refers to a component.")
            else:
                mandatorySpecials = self.fields(specials=True)
                if "component" in mandatorySpecials:
                    raise FileTemplateValidationError(f"Template '{self}' has mandatory component but "
                                                      f"{entity} does not refer to a component.")
        except AttributeError:
            pass

        # From here on we need at least a DatasetType
        # Mypy doesn't understand the AttributeError clause below
        if isinstance(entity, StorageClass):
            return

        # Get the dimension links to get the full set of available field
        # names.  Fall back to dataId keys if we have them but no links.
        # dataId keys must still be present in the template
        try:
            minimal = set(entity.dimensions.required.names)
            maximal = set(entity.dimensions.names)
        except AttributeError:
            try:
                minimal = set(entity.dataId.keys().names)  # type: ignore
                maximal = minimal
            except AttributeError:
                return

        # Replace specific skypix dimensions with the generic one
        skypix_alias = self._determine_skypix_alias(entity)
        if skypix_alias is not None:
            minimal.add("skypix")
            maximal.add("skypix")
            minimal.remove(skypix_alias)
            maximal.remove(skypix_alias)
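
        # ``minimal`` is the set of dimension names that must appear in the
        # template for paths to be unique; ``maximal`` is the full set of
        # dimension names that may appear.  The two checks below enforce
        # each direction.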

        required = self.fields(optionals=False)

        # Calculate any field usage that does not match a dimension
        if not required.issubset(maximal):
            raise FileTemplateValidationError(f"Template '{self}' is inconsistent with {entity}:"
                                              f" {required} is not a subset of {maximal}.")

        if not allfields.issuperset(minimal):
            raise FileTemplateValidationError(f"Template '{self}' is inconsistent with {entity}:"
                                              f" {allfields} is not a superset of {minimal}.")

        return

    def _determine_skypix_alias(self, entity: Union[DatasetRef, DatasetType]) -> Optional[str]:
        """Return the dimension name that refers to a sky pixel.

        Parameters
        ----------
        entity : `DatasetRef` or `DatasetType`
            The entity to examine.

        Returns
        -------
        alias : `str` or `None`
            If there is a sky pixelization in the supplied dataId, return
            its name, else return `None`.  Will also return `None` if there
            is more than one sky pix dimension in the data ID or if the
            dataId is not a `DataCoordinate`.
        """
        alias = None

        if isinstance(entity, DatasetRef):
            entity = entity.datasetType

        # If there is exactly one SkyPixDimension in the data ID, alias its
        # value with the key "skypix", so we can use that to match any
        # skypix dimension.
        # We restrict this behavior to the (real-world) case where the
        # data ID is a DataCoordinate, not just a dict.  That should only
        # not be true in some test code, but that test code is a pain to
        # update to be more like the real world while still providing our
        # only tests of important behavior.
        skypix = [dimension for dimension in entity.dimensions
                  if isinstance(dimension, SkyPixDimension)]
        if len(skypix) == 1:
            alias = skypix[0].name
        return alias
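

# A minimal usage sketch (comment only; ``universe`` and ``ref`` stand in
# for a configured DimensionUniverse and a resolved DatasetRef obtained
# elsewhere):
#
#     templates = FileTemplates("templates.yaml", universe=universe)
#     template = templates.getTemplate(ref.datasetType)
#     path = template.format(ref)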