# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""Support for file template string expansion."""

__all__ = ("FileTemplates", "FileTemplate", "FileTemplatesConfig", "FileTemplateValidationError")

import os.path
import string
import logging
from types import MappingProxyType

from typing import (
    TYPE_CHECKING,
    Any,
    Iterable,
    Mapping,
    Optional,
    Set,
    Tuple,
    Union,
)

from .config import Config
from .configSupport import processLookupConfigs, LookupKey
from .exceptions import ValidationError
from .dimensions import SkyPixDimension, DataCoordinate

if TYPE_CHECKING:
    from .dimensions import DimensionUniverse
    from .datasets import DatasetType, DatasetRef
    from .storageClass import StorageClass

log = logging.getLogger(__name__)


class FileTemplateValidationError(ValidationError):
    """Exception thrown when a file template is not consistent with the
    associated `DatasetType`."""
    pass


class FileTemplatesConfig(Config):
    """Configuration information for `FileTemplates`."""
    pass


class FileTemplates:
    """Collection of `FileTemplate` templates.

    Parameters
    ----------
    config : `FileTemplatesConfig` or `str`
        Load configuration.
    default : `str`, optional
        If not `None`, a default template to use if no template has
        been specified explicitly in the configuration.
    universe : `DimensionUniverse`
        The set of all known dimensions, used to normalize any lookup keys
        involving dimensions.

    Notes
    -----
    The configuration can include one level of hierarchy where an
    instrument-specific section can be defined to override more general
    template specifications. This is represented in YAML using a
    key of form ``instrument<name>`` which can then define templates
    that will be returned if a `DatasetRef` contains a matching instrument
    name in the data ID.

    A default fallback template can be specified using the key ``default``.
    Defaulting can be disabled in a child configuration by defining the
    value to be an empty string or a boolean `False`.

    The config is parsed using the function
    `~lsst.daf.butler.configSubset.processLookupConfigs`.
    """

    defaultKey = LookupKey("default")
    """Configuration key associated with the default template."""

    def __init__(self, config: Union[FileTemplatesConfig, str],
                 default: Optional[str] = None, *,
                 universe: DimensionUniverse):
        self.config = FileTemplatesConfig(config)
        self._templates = {}

        contents = processLookupConfigs(self.config, universe=universe)

        # Determine the default to use -- defaults can be disabled if
        # we get a False or None
        defaultValue = contents.get(self.defaultKey, default)
        if defaultValue and not isinstance(defaultValue, str):
            raise RuntimeError("Default template value should be str, False, or None. "
                               f"Got '{defaultValue}'")
        self.default = FileTemplate(defaultValue) if isinstance(defaultValue, str) and defaultValue else None

        # Convert all the values to FileTemplate, handling defaults
        for key, templateStr in contents.items():
            if key == self.defaultKey:
                continue
            if not isinstance(templateStr, str):
                raise RuntimeError(f"Unexpected value in file template key {key}: {templateStr}")
            self._templates[key] = FileTemplate(templateStr)

    @property
    def templates(self) -> Mapping[LookupKey, FileTemplate]:
        """Collection of templates indexed by lookup key (`dict`)."""
        return MappingProxyType(self._templates)

    def __contains__(self, key: LookupKey) -> bool:
        """Indicate whether the supplied key is present in the templates.

        Parameters
        ----------
        key : `LookupKey`
            Key to use to determine if a corresponding value is present
            in the templates.

        Returns
        -------
        in : `bool`
            `True` if the supplied key is present in the templates.
        """
        return key in self.templates

    def __getitem__(self, key: LookupKey) -> FileTemplate:
        return self.templates[key]

    def validateTemplates(self, entities: Iterable[Union[DatasetType, DatasetRef, StorageClass]],
                          logFailures: bool = False) -> None:
        """Retrieve the template associated with each dataset type and
        validate the dimensions against the template.

        Parameters
        ----------
        entities : `DatasetType`, `DatasetRef`, or `StorageClass`
            Entities to validate against the matching templates. Can be
            differing types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        FileTemplateValidationError
            Raised if an entity failed validation.

        Notes
        -----
        See `FileTemplate.validateTemplate()` for details on the validation.
        """
        unmatchedKeys = set(self.templates)
        failed = []
        for entity in entities:
            try:
                matchKey, template = self.getTemplateWithMatch(entity)
            except KeyError as e:
                # KeyError always quotes on stringification so strip here
                errMsg = str(e).strip('"\'')
                failed.append(errMsg)
                if logFailures:
                    log.fatal("%s", errMsg)
                continue

            if matchKey in unmatchedKeys:
                unmatchedKeys.remove(matchKey)

            try:
                template.validateTemplate(entity)
            except FileTemplateValidationError as e:
                failed.append(f"{e} (via key '{matchKey}')")
                if logFailures:
                    log.fatal("Template failure with key '%s': %s", matchKey, e)

        if logFailures and unmatchedKeys:
            log.warning("Unchecked keys: %s", ", ".join([str(k) for k in unmatchedKeys]))

        if failed:
            if len(failed) == 1:
                msg = str(failed[0])
            else:
                failMsg = ";\n".join(failed)
                msg = f"{len(failed)} template validation failures: {failMsg}"
            raise FileTemplateValidationError(msg)

    def getLookupKeys(self) -> Set[LookupKey]:
        """Retrieve the lookup keys for all the template entries.

        Returns
        -------
        keys : `set` of `LookupKey`
            The keys available for matching a template.
        """
        return set(self.templates)

    def getTemplateWithMatch(self, entity: Union[DatasetRef,
                                                 DatasetType, StorageClass]) -> Tuple[LookupKey,
                                                                                      FileTemplate]:
        """Retrieve the `FileTemplate` associated with the dataset type along
        with the lookup key that was a match for this template.

        If the lookup name corresponds to a component, the base name for
        the component will be examined if the full component name does
        not match.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Instance to use to look for a corresponding template.
            A `DatasetType` name or a `StorageClass` name will be used
            depending on the supplied entity. Priority is given to a
            `DatasetType` name. Supports instrument override if a
            `DatasetRef` is provided configured with an ``instrument``
            value for the data ID.

        Returns
        -------
        matchKey : `LookupKey`
            The key that resulted in the successful match.
        template : `FileTemplate`
            Template instance to use with that dataset type.

        Raises
        ------
        KeyError
            Raised if no template could be located for this dataset type.
        """
        # Get the names to use for lookup
        names = entity._lookupNames()

        # Get a location from the templates
        template = self.default
        source = self.defaultKey
        for name in names:
            if name in self.templates:
                template = self.templates[name]
                source = name
                break

        if template is None:
            raise KeyError(f"Unable to determine file template from supplied argument [{entity}]")

        log.debug("Got file %s from %s via %s", template, entity, source)

        return source, template

    def getTemplate(self, entity: Union[DatasetType, DatasetRef, StorageClass]) -> FileTemplate:
        """Retrieve the `FileTemplate` associated with the dataset type.

        If the lookup name corresponds to a component, the base name for
        the component will be examined if the full component name does
        not match.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Instance to use to look for a corresponding template.
            A `DatasetType` name or a `StorageClass` name will be used
            depending on the supplied entity. Priority is given to a
            `DatasetType` name. Supports instrument override if a
            `DatasetRef` is provided configured with an ``instrument``
            value for the data ID.

        Returns
        -------
        template : `FileTemplate`
            Template instance to use with that dataset type.

        Raises
        ------
        KeyError
            Raised if no template could be located for this dataset type.
        """
        _, template = self.getTemplateWithMatch(entity)
        return template


class FileTemplate:
    """Format a path template into a fully expanded path.

    Parameters
    ----------
    template : `str`
        Template string.

    Raises
    ------
    FileTemplateValidationError
        Raised if the template fails basic validation.

    Notes
    -----
    The templates use the standard Format Specification Mini-Language
    with the caveat that only named fields can be used. The field names
    are taken from the dimensions along with several additional fields:

    - datasetType: `str`, `DatasetType.name`
    - component: `str`, name of the `StorageClass` component
    - run: `str`, name of the run this dataset was added with

    ``run`` must always be provided to ensure unique paths.

    More detailed information can be requested from dimensions by using a
    dot notation, so ``visit.name`` would use the name of the visit and
    ``detector.name_in_raft`` would use the name of the detector within the
    raft.

    The mini-language is extended to understand a "?" in the format
    specification. This indicates that the field is optional. If the
    dimension is missing from the data ID, the field, along with any text
    preceding it that is not a path separator, will be removed from the
    output path.

    By default any "/" in a dataId value will be replaced by "_" to prevent
    unexpected directories being created in the path. If the "/" should be
    retained then a special "/" format specifier can be included in the
    template.
    """

    mandatoryFields = {"run"}
    """A set of fields, one of which must be present in a template."""

    datasetFields = {"datasetType", "component"}
    """Fields related to the supplied dataset, not a dimension."""

    specialFields = mandatoryFields | datasetFields
    """Set of special fields that are available independently of the defined
    Dimensions."""

    def __init__(self, template: str):
        if not isinstance(template, str):
            raise FileTemplateValidationError(f"Template ('{template}') does "
                                              "not appear to be a string")
        self.template = template

        # Do basic validation without access to dimensions
        self.validateTemplate(None)

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, FileTemplate):
            return False

        return self.template == other.template

    def __str__(self) -> str:
        return self.template

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}("{self.template}")'

    def fields(self, optionals: bool = False, specials: bool = False, subfields: bool = False) -> Set[str]:
        """Return the field names used in this template.

        Parameters
        ----------
        optionals : `bool`
            If `True`, optional fields are included in the returned set.
        specials : `bool`
            If `True`, non-dimension fields are included.
        subfields : `bool`, optional
            If `True`, fields with syntax ``a.b`` are included. If `False`,
            the default, only ``a`` would be returned.

        Returns
        -------
        names : `set`
            Names of fields used in this template.

        Notes
        -----
        The returned set will include special values such as ``datasetType``
        and ``component`` only if ``specials`` is `True`.
        """
        fmt = string.Formatter()
        parts = fmt.parse(self.template)

        names = set()
        for literal, field_name, format_spec, conversion in parts:
            if field_name is not None and format_spec is not None:
                if "?" in format_spec and not optionals:
                    continue

                if not specials and field_name in self.specialFields:
                    continue

                if "." in field_name and not subfields:
                    field_name, _ = field_name.split(".")

                names.add(field_name)

        return names
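
    # A sketch of what fields() reports for a hypothetical template,
    # assuming the semantics above:
    #
    #   t = FileTemplate("{run}/{datasetType}/{visit.name:?}")
    #   t.fields()                                # set(): "visit.name" is
    #                                             # optional, the rest special
    #   t.fields(optionals=True)                  # {"visit"}
    #   t.fields(optionals=True, specials=True)   # {"run", "datasetType",
    #                                             #  "visit"}
    #   t.fields(optionals=True, subfields=True)  # {"visit.name"}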

    def format(self, ref: DatasetRef) -> str:
        """Format a template string into a full path.

        Parameters
        ----------
        ref : `DatasetRef`
            The dataset to be formatted.

        Returns
        -------
        path : `str`
            Expanded path.

        Raises
        ------
        KeyError
            Raised if the requested field is not defined and the field is
            not optional. Also raised if a component is specified but
            "component" was not part of the template.
        """
        # Extract defined non-None dimensions from the dataId.
        # We attempt to get the "full" dict on the assumption that ref.dataId
        # is an ExpandedDataCoordinate, as it should be when running
        # PipelineTasks. We should probably just require that when formatting
        # templates (and possibly when constructing DatasetRefs), but doing so
        # would break a ton of otherwise-useful tests that would need to be
        # modified to provide a lot more metadata.
        fields = {k: v for k, v in getattr(ref.dataId, "full", ref.dataId).items() if v is not None}

        if isinstance(ref.dataId, DataCoordinate):
            # If there is exactly one SkyPixDimension in the data ID, alias
            # its value with the key "skypix", so we can use that to match any
            # skypix dimension.
            # We restrict this behavior to the (real-world) case where the
            # data ID is a DataCoordinate, not just a dict. That should only
            # not be true in some test code, but that test code is a pain to
            # update to be more like the real world while still providing our
            # only tests of important behavior.
            skypix = [dimension for dimension in ref.dataId.graph if isinstance(dimension, SkyPixDimension)]
            if len(skypix) == 1:
                fields["skypix"] = fields[skypix[0]]

        # Extra information that can be included using . syntax
        extras = getattr(ref.dataId, "records", {})

        datasetType = ref.datasetType
        fields["datasetType"], component = datasetType.nameAndComponent()

        usedComponent = False
        if component is not None:
            fields["component"] = component

        usedRun = False
        fields["run"] = ref.run

        fmt = string.Formatter()
        parts = fmt.parse(self.template)
        output = ""

        for literal, field_name, format_spec, conversion in parts:

            if field_name == "component":
                usedComponent = True

            if format_spec is None:
                output = output + literal
                continue

            # A blank field_name should only occur when format_spec is also
            # None, which was handled above, so this is unexpected.
            if field_name is None:
                raise RuntimeError(f"Unexpected blank field_name encountered in {self.template} [{literal}]")

            if "?" in format_spec:
                optional = True
                # Remove the non-standard character from the spec
                format_spec = format_spec.replace("?", "")
            else:
                optional = False

            if field_name == "run":
                usedRun = True

            if field_name == "collection":
                raise KeyError("'collection' is no longer supported as a "
                               "file template placeholder; use 'run' instead.")

            # Check for a request for additional information from the dataId
            if "." in field_name:
                primary, secondary = field_name.split(".")
                if primary in extras:
                    record = extras[primary]
                    # Only fill in the field if we have a value; the
                    # KeyError will trigger below if the attribute is missing.
                    if hasattr(record, secondary):
                        fields[field_name] = getattr(record, secondary)

            if field_name in fields:
                value = fields[field_name]
            elif optional:
                # If this is optional ignore the format spec
                # and do not include the literal text prior to the optional
                # field unless it contains a "/" path separator
                format_spec = ""
                value = ""
                if "/" not in literal:
                    literal = ""
            else:
                raise KeyError(f"'{field_name}' requested in template via '{self.template}' "
                               "but not defined and not optional")

            # Handle "/" in values since we do not want to be surprised by
            # unexpected directories turning up
            replace_slash = True
            if "/" in format_spec:
                # Remove the non-standard character from the spec
                format_spec = format_spec.replace("/", "")
                replace_slash = False

            if isinstance(value, str):
                if replace_slash:
                    value = value.replace("/", "_")

            # Now use standard formatting
            output = output + literal + format(value, format_spec)

        # Replace periods with underscores in the non-directory part to
        # prevent file extension confusion.
        head, tail = os.path.split(output)
        output = os.path.join(head, tail.replace(".", "_"))

        # Complain if we were meant to use a component
        if component is not None and not usedComponent:
            raise KeyError(f"Component '{component}' specified but template {self.template} "
                           "did not use it")

        # Complain if there's no run
        if not usedRun:
            raise KeyError("Template does not include 'run'.")

        # Since this is known to be a path, normalize it in case some double
        # slashes have crept in
        path = os.path.normpath(output)

        # It should not be an absolute path (may happen with optionals)
        if os.path.isabs(path):
            path = os.path.relpath(path, start="/")

        return path
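
    # A hedged usage sketch for format(). "ref" stands for a hypothetical
    # DatasetRef (not constructed here, since a realistic one needs registry
    # machinery) whose run is "myrun", dataset type name is "calexp", and
    # whose data ID carries instrument="HSC" and visit=42:
    #
    #   template = FileTemplate("{run}/{datasetType}/{instrument}_{visit:08d}")
    #   template.format(ref)  # -> "myrun/calexp/HSC_00000042"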

    def validateTemplate(self, entity: Union[DatasetRef, DatasetType, StorageClass, None]) -> None:
        """Compare the template against a representative entity that would
        like to use the template.

        Parameters
        ----------
        entity : `DatasetType`, `DatasetRef`, or `StorageClass`
            Entity to compare against the template. If `None` is given,
            only very basic validation of the template will be performed.

        Raises
        ------
        FileTemplateValidationError
            Raised if the template is inconsistent with the supplied entity.

        Notes
        -----
        Validation will always include a check that mandatory fields
        are present and that at least one field refers to a dimension.
        If the supplied entity includes a `DimensionGraph` then it will be
        used to compare the available dimensions with those specified in the
        template.
        """

        # Check that the template has run
        withSpecials = self.fields(specials=True, optionals=True)
        if not withSpecials & self.mandatoryFields:
            raise FileTemplateValidationError(f"Template '{self}' is missing a mandatory field"
                                              f" from {self.mandatoryFields}")

        # Check that there are some dimension fields in the template
        allfields = self.fields(optionals=True)
        if not allfields:
            raise FileTemplateValidationError(f"Template '{self}' does not seem to have any fields"
                                              " corresponding to dimensions.")

        # If we do not have dimensions available then all we can do is shrug
        if not hasattr(entity, "dimensions"):
            return

        # If this entity represents a component then insist that component
        # is present in the template. If the entity is not a component
        # make sure that component is not mandatory.
        try:
            # mypy does not see the except block so complains about
            # StorageClass not supporting isComponent
            if entity.isComponent():  # type: ignore
                if "component" not in withSpecials:
                    raise FileTemplateValidationError(f"Template '{self}' has no component but "
                                                      f"{entity} refers to a component.")
            else:
                mandatorySpecials = self.fields(specials=True)
                if "component" in mandatorySpecials:
                    raise FileTemplateValidationError(f"Template '{self}' has mandatory component but "
                                                      f"{entity} does not refer to a component.")
        except AttributeError:
            pass

        # Get the dimension links to get the full set of available field
        # names. Fall back to dataId keys if we have them but no links;
        # dataId keys must still be present in the template.
        # Ignore warnings from mypy concerning StorageClass and DatasetType
        # not supporting the full API.
        try:
            minimal = set(entity.dimensions.required.names)  # type: ignore
            maximal = set(entity.dimensions.names)  # type: ignore
        except AttributeError:
            try:
                minimal = set(entity.dataId.keys())  # type: ignore
                maximal = minimal
            except AttributeError:
                return

        required = self.fields(optionals=False)

        # Calculate any field usage that does not match a dimension
        if not required.issubset(maximal):
            raise FileTemplateValidationError(f"Template '{self}' is inconsistent with {entity}:"
                                              f" {required} is not a subset of {maximal}.")

        if not allfields.issuperset(minimal):
            raise FileTemplateValidationError(f"Template '{self}' is inconsistent with {entity}:"
                                              f" {allfields} is not a superset of {minimal}.")

        return
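

# A hedged example of the validation behavior described above:
# FileTemplate("{visit}") raises FileTemplateValidationError during
# construction, because the mandatory "run" field is absent, whereas
# FileTemplate("{run}/{visit}") passes the basic checks; a subsequent
# validateTemplate(entity) call would then compare the "visit" field
# against the entity's dimensions, when the entity defines any.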