# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Generic file-based datastore code."""

__all__ = ("FileLikeDatastore", )

import logging
import itertools
from abc import abstractmethod

from sqlalchemy import Integer, String

from dataclasses import dataclass
from typing import Optional, List, Type

from lsst.daf.butler import (
    Config,
    FileDataset,
    DatasetRef,
    DatasetTypeNotSupportedError,
    Datastore,
    DatastoreConfig,
    DatastoreValidationError,
    FileDescriptor,
    FileTemplates,
    FileTemplateValidationError,
    Formatter,
    FormatterFactory,
    Location,
    LocationFactory,
    StorageClass,
    StoredFileInfo,
)

from lsst.daf.butler import ddl
from lsst.daf.butler.registry.interfaces import ReadOnlyDatabaseError

from lsst.daf.butler.core.repoRelocation import replaceRoot
from lsst.daf.butler.core.utils import getInstanceOf, NamedValueSet, getClassOf, transactional
from .genericDatastore import GenericBaseDatastore

log = logging.getLogger(__name__)


class _IngestPrepData(Datastore.IngestPrepData):
    """Helper class for FileLikeDatastore ingest implementation.

    Parameters
    ----------
    datasets : `list` of `FileDataset`
        Files to be ingested by this datastore.
    """
    def __init__(self, datasets: List[FileDataset]):
        super().__init__(ref for dataset in datasets for ref in dataset.refs)
        self.datasets = datasets


@dataclass(frozen=True)
class DatastoreFileGetInformation:
    """Collection of useful parameters needed to retrieve a file from
    a Datastore.
    """

    location: Location
    """The location from which to read the dataset."""

    formatter: Formatter
    """The `Formatter` to use to deserialize the dataset."""

    info: StoredFileInfo
    """Stored information about this file and its formatter."""

    assemblerParams: dict
    """Parameters to use for post-processing the retrieved dataset."""

    component: Optional[str]
    """The component to be retrieved (can be `None`)."""

    readStorageClass: StorageClass
    """The `StorageClass` of the dataset being read."""


class FileLikeDatastore(GenericBaseDatastore):
    """Generic Datastore for file-based implementations.

    Should always be sub-classed since key abstract methods are missing.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration as either a `Config` object or URI to file.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.
    """

    defaultConfigFile = None
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    root: str
    """Root directory or URI of this `Datastore`."""

    locationFactory: LocationFactory
    """Factory for creating locations relative to the datastore root."""

    formatterFactory: FormatterFactory
    """Factory for creating instances of formatters."""

    templates: FileTemplates
    """File templates that can be used by this `Datastore`."""

    @classmethod
    def setConfigRoot(cls, root, config, full, overwrite=True):
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            URI to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        Config.updateParameters(DatastoreConfig, config, full,
                                toUpdate={"root": root},
                                toCopy=("cls", ("records", "table")), overwrite=overwrite)
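
    # Illustrative sketch (not part of the original source): for a new,
    # hypothetical repository the call above behaves roughly like
    #
    #     config = Config()
    #     FileLikeDatastore.setConfigRoot("file:///data/repo", config, full)
    #     # config now carries root="file:///data/repo"; the "cls" and
    #     # ("records", "table") entries are copied over from ``full``.
    #
    # The exact nesting of keys depends on `DatastoreConfig`, so treat this
    # only as a hedged example of the update/copy behaviour described above.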

    @classmethod
    def makeTableSpec(cls):
        return ddl.TableSpec(
            fields=NamedValueSet([
                ddl.FieldSpec(name="dataset_id", dtype=Integer, primaryKey=True),
                ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
                ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
                ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
                # TODO: should checksum be Base64Bytes instead?
                ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
                ddl.FieldSpec(name="file_size", dtype=Integer, nullable=True),
            ]),
            unique=frozenset(),
            foreignKeys=[ddl.ForeignKeySpec(table="dataset", source=("dataset_id",), target=("dataset_id",),
                                            onDelete="CASCADE")]
        )

    def __init__(self, config, registry, butlerRoot=None):
        super().__init__(config, registry)
        if "root" not in self.config:
            raise ValueError("No root directory specified in configuration")

        # Name ourselves either using an explicit name or a name
        # derived from the (unexpanded) root
        if "name" in self.config:
            self.name = self.config["name"]
        else:
            # We use the unexpanded root in the name to indicate that this
            # datastore can be moved without having to update registry.
            self.name = "{}@{}".format(type(self).__name__,
                                       self.config["root"])

        # Support repository relocation in config
        # Existence of self.root is checked in subclass
        self.root = replaceRoot(self.config["root"], butlerRoot)

        self.locationFactory = LocationFactory(self.root)
        self.formatterFactory = FormatterFactory()

        # Now associate formatters with storage classes
        self.formatterFactory.registerFormatters(self.config["formatters"],
                                                 universe=self.registry.dimensions)

        # Read the file naming templates
        self.templates = FileTemplates(self.config["templates"],
                                       universe=self.registry.dimensions)

        # Storage of paths and formatters, keyed by dataset_id
        self._tableName = self.config["records", "table"]
        try:
            registry.registerOpaqueTable(self._tableName, self.makeTableSpec())
        except ReadOnlyDatabaseError:
            # If the database is read only and we just tried and failed to
            # create a table, it means someone is trying to create a read-only
            # butler client for an empty repo. That should be okay, as long
            # as they don't then try to get any datasets before some other
            # client creates the table. Chances are they're just validating
            # configuration.
            pass

        # Determine whether checksums should be used
        self.useChecksum = self.config.get("checksum", True)

    def __str__(self):
        return self.root

    def addStoredItemInfo(self, refs, infos):
        # Docstring inherited from GenericBaseDatastore
        records = []
        for ref, info in zip(refs, infos):
            records.append(
                dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
                     storage_class=info.storageClass.name,
                     checksum=info.checksum, file_size=info.file_size)
            )
        self.registry.insertOpaqueData(self._tableName, *records)
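
    # Hedged illustration (not in the original source): each record inserted
    # above is a plain dict whose keys match `makeTableSpec`, for example
    #
    #     {"dataset_id": 42,
    #      "formatter": "mypackage.formatters.HypotheticalFormatter",
    #      "path": "some/relative/path/in/store",
    #      "storage_class": "StructuredData",
    #      "checksum": None,
    #      "file_size": 1024}
    #
    # The values shown are invented; only the keys come from the code above.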

    def getStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore
        records = list(self.registry.fetchOpaqueData(self._tableName, dataset_id=ref.id))
        if len(records) == 0:
            raise KeyError(f"Unable to retrieve location associated with Dataset {ref}.")
        assert len(records) == 1, "Primary key constraint should make more than one result impossible."
        record = records[0]
        # Convert name of StorageClass to instance
        storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
        return StoredFileInfo(formatter=record["formatter"],
                              path=record["path"],
                              storageClass=storageClass,
                              checksum=record["checksum"],
                              file_size=record["file_size"])

    def _registered_refs_per_artifact(self, pathInStore):
        """Return all dataset refs associated with the supplied path.

        Parameters
        ----------
        pathInStore : `str`
            Path of interest in the data store.

        Returns
        -------
        ids : `set` of `int`
            All `DatasetRef` IDs associated with this path.
        """
        records = list(self.registry.fetchOpaqueData(self._tableName, path=pathInStore))
        ids = {r["dataset_id"] for r in records}
        return ids

    def removeStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore
        self.registry.deleteOpaqueData(self._tableName, dataset_id=ref.id)

    def _get_dataset_location_info(self, ref):
        """Find the `Location` of the requested dataset in the
        `Datastore` and the associated stored file information.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required `Dataset`.

        Returns
        -------
        location : `Location`
            Location of the dataset within the datastore.
            Returns `None` if the dataset can not be located.
        info : `StoredFileInfo`
            Stored information about this file and its formatter.
        """
        # Get the file information (this will fail if no file)
        try:
            storedFileInfo = self.getStoredItemInfo(ref)
        except KeyError:
            return None, None

        # Use the path to determine the location
        location = self.locationFactory.fromPath(storedFileInfo.path)

        return location, storedFileInfo

    def _can_remove_dataset_artifact(self, ref):
        """Check that there is only one dataset associated with the
        specified artifact.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to be removed.

        Returns
        -------
        can_remove : `bool`
            `True` if the artifact can be safely removed.
        """
        storedFileInfo = self.getStoredItemInfo(ref)

        # Get all entries associated with this path
        allRefs = self._registered_refs_per_artifact(storedFileInfo.path)
        if not allRefs:
            raise RuntimeError(f"Datastore inconsistency error. {storedFileInfo.path} not in registry")

        # Get all the refs associated with this dataset if it is a composite
        theseRefs = {r.id for r in itertools.chain([ref], ref.components.values())}

        # Remove these refs from all the refs and if there is nothing left
        # then we can delete
        remainingRefs = allRefs - theseRefs

        if remainingRefs:
            return False
        return True
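
    # Worked example (illustrative only, with made-up IDs): if the artifact
    # is shared so that allRefs == {1, 2, 3} while the composite being removed
    # contributes theseRefs == {1, 2}, then remainingRefs == {3} and the
    # method returns False; only when the subtraction leaves an empty set can
    # the underlying file be deleted safely.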

    def _prepare_for_get(self, ref, parameters=None):
        """Check parameters for ``get`` and obtain formatter and
        location.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the Dataset to be loaded.

        Returns
        -------
        getInfo : `DatastoreFileGetInformation`
            Parameters needed to retrieve the file.
        """
        log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)

        # Get file metadata and internal metadata
        location, storedFileInfo = self._get_dataset_location_info(ref)
        if location is None:
            raise FileNotFoundError(f"Could not retrieve Dataset {ref}.")

        # We have a write storage class and a read storage class and they
        # can be different for concrete composites.
        readStorageClass = ref.datasetType.storageClass
        writeStorageClass = storedFileInfo.storageClass

        # Check that the supplied parameters are suitable for the type read
        readStorageClass.validateParameters(parameters)

        # Is this a component request?
        component = ref.datasetType.component()

        formatter = getInstanceOf(storedFileInfo.formatter,
                                  FileDescriptor(location, readStorageClass=readStorageClass,
                                                 storageClass=writeStorageClass, parameters=parameters),
                                  ref.dataId)
        formatterParams, assemblerParams = formatter.segregateParameters()

        return DatastoreFileGetInformation(location, formatter, storedFileInfo,
                                           assemblerParams, component, readStorageClass)

    def _prepare_for_put(self, inMemoryDataset, ref):
        """Check the arguments for ``put`` and obtain formatter and
        location.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Returns
        -------
        location : `Location`
            The location to write the dataset.
        formatter : `Formatter`
            The `Formatter` to use to write the dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.
        """
        self._validate_put_parameters(inMemoryDataset, ref)

        # Work out output file name
        try:
            template = self.templates.getTemplate(ref)
        except KeyError as e:
            raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e

        location = self.locationFactory.fromPath(template.format(ref))

        # Get the formatter based on the storage class
        storageClass = ref.datasetType.storageClass
        try:
            formatter = self.formatterFactory.getFormatter(ref,
                                                           FileDescriptor(location,
                                                                          storageClass=storageClass),
                                                           ref.dataId)
        except KeyError as e:
            raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref}") from e

        return location, formatter

    @abstractmethod
    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        """Standardize the path of a to-be-ingested file.

        Parameters
        ----------
        path : `str`
            Path of a file to be ingested.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            If `None` (default), the file must already be in a location
            appropriate for the datastore (e.g. within its root directory),
            and will not be moved. Other choices include "move", "copy",
            "symlink", and "hardlink". This is provided only so
            `NotImplementedError` can be raised if the mode is not supported;
            actual transfers are deferred to `_extractIngestInfo`.

        Returns
        -------
        path : `str`
            New path in what the datastore considers standard form.

        Notes
        -----
        Subclasses of `FileLikeDatastore` should implement this method instead
        of `_prepIngest`. It should not modify the data repository or given
        file in any way.

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        FileNotFoundError
            Raised if one of the given files does not exist.
        """
        raise NotImplementedError("Must be implemented by subclasses.")

    @abstractmethod
    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        """Relocate (if necessary) and extract `StoredFileInfo` from a
        to-be-ingested file.

        Parameters
        ----------
        path : `str`
            Path of a file to be ingested.
        ref : `DatasetRef`
            Reference for the dataset being ingested. Guaranteed to have
            ``dataset_id`` not `None`.
        formatter : `type`
            `Formatter` subclass to use for this dataset.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            If `None` (default), the file must already be in a location
            appropriate for the datastore (e.g. within its root directory),
            and will not be modified. Other choices include "move", "copy",
            "symlink", and "hardlink".

        Returns
        -------
        info : `StoredFileInfo`
            Internal datastore record for this file. This will be inserted by
            the caller; `_extractIngestInfo` is only responsible for
            creating and populating the struct.

        Raises
        ------
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.
        """
        raise NotImplementedError("Must be implemented by subclasses.")

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
        # Docstring inherited from Datastore._prepIngest.
        filtered = []
        for dataset in datasets:
            acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
            if not acceptable:
                continue
            else:
                dataset.refs = acceptable
            if dataset.formatter is None:
                dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
            else:
                dataset.formatter = getClassOf(dataset.formatter)
            dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
            filtered.append(dataset)
        return _IngestPrepData(filtered)

    @transactional
    def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None):
        # Docstring inherited from Datastore._finishIngest.
        refsAndInfos = []
        for dataset in prepData.datasets:
            # Do ingest as if the first dataset ref is associated with the file
            info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
                                           transfer=transfer)
            refsAndInfos.extend([(ref, info) for ref in dataset.refs])
        self._register_datasets(refsAndInfos)
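
    # Hedged usage sketch (not in the original source): these two hooks are
    # normally driven by the base-class ingest entry point, roughly
    #
    #     datasets = [FileDataset(path="file.fits", refs=[ref])]  # made-up args
    #     prepData = datastore._prepIngest(*datasets, transfer="copy")
    #     datastore._finishIngest(prepData, transfer="copy")
    #
    # `_prepIngest` filters refs against constraints and standardizes paths
    # without touching the repository; `_finishIngest` performs the transfer
    # and records the results inside a transaction.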

    def getUri(self, ref, predict=False):
        """URI to the Dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `str`
            URI string pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.

        Notes
        -----
        When a predicted URI is requested an attempt will be made to form
        a reasonable URI based on file templates and the expected formatter.
        """
        # if this has never been written then we have to guess
        if not self.exists(ref):
            if not predict:
                raise FileNotFoundError("Dataset {} not in this datastore".format(ref))

            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            storageClass = ref.datasetType.storageClass
            formatter = self.formatterFactory.getFormatter(ref, FileDescriptor(location,
                                                                               storageClass=storageClass))
            # Try to use the extension attribute but ignore problems if the
            # formatter does not define one.
            try:
                location = formatter.makeUpdatedLocation(location)
            except Exception:
                # Use the default extension
                pass

            # Add a URI fragment to indicate this is a guess
            return location.uri + "#predicted"

        # If this is a ref that we have written we can get the path.
        # Get file metadata and internal metadata
        storedFileInfo = self.getStoredItemInfo(ref)

        # Use the path to determine the location
        location = self.locationFactory.fromPath(storedFileInfo.path)

        return location.uri
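
    # Hedged example (paths and names are invented): for a dataset that has
    # not yet been written,
    #
    #     datastore.getUri(ref, predict=True)
    #
    # might return something like
    # "file:///data/repo/run/myDataset.fits#predicted", whereas the same call
    # with predict=False raises FileNotFoundError.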

    def validateConfiguration(self, entities, logFailures=False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method checks that all the supplied entities have valid file
        templates and also have formatters defined.
        """

        templateFailed = None
        try:
            self.templates.validateTemplates(entities, logFailures=logFailures)
        except FileTemplateValidationError as e:
            templateFailed = str(e)

        formatterFailed = []
        for entity in entities:
            try:
                self.formatterFactory.getFormatterClass(entity)
            except KeyError as e:
                formatterFailed.append(str(e))
                if logFailures:
                    log.fatal("Formatter failure: %s", e)

        if templateFailed or formatterFailed:
            messages = []
            if templateFailed:
                messages.append(templateFailed)
            if formatterFailed:
                messages.append(",".join(formatterFailed))
            msg = ";\n".join(messages)
            raise DatastoreValidationError(msg)
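
    # Hedged usage sketch (illustrative only): callers typically pass the
    # entities they care about and catch the aggregated error, e.g.
    #
    #     try:
    #         datastore.validateConfiguration([datasetType], logFailures=True)
    #     except DatastoreValidationError as err:
    #         print(err)  # every template/formatter problem in one message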

    def getLookupKeys(self):
        # Docstring is inherited from base class
        return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
            self.constraints.getLookupKeys()

    def validateKey(self, lookupKey, entity):
        # Docstring is inherited from base class
        # The key can be valid in either formatters or templates so we can
        # only check the template if it exists
        if lookupKey in self.templates:
            try:
                self.templates[lookupKey].validateTemplate(entity)
            except FileTemplateValidationError as e:
                raise DatastoreValidationError(e) from e