
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileLikeDatastore", ) 

26 

27import logging 

28from abc import abstractmethod 

29 

30from sqlalchemy import BigInteger, String 

31 

32from dataclasses import dataclass 

33from typing import ( 

34 TYPE_CHECKING, 

35 Any, 

36 ClassVar, 

37 Dict, 

38 Iterable, 

39 List, 

40 Mapping, 

41 Optional, 

42 Set, 

43 Tuple, 

44 Type, 

45 Union, 

46) 

47 

48from lsst.daf.butler import ( 

49 ButlerURI, 

50 CompositesMap, 

51 Config, 

52 FileDataset, 

53 DatasetRef, 

54 DatasetType, 

55 DatasetTypeNotSupportedError, 

56 Datastore, 

57 DatastoreConfig, 

58 DatastoreValidationError, 

59 FileDescriptor, 

60 FileTemplates, 

61 FileTemplateValidationError, 

62 Formatter, 

63 FormatterFactory, 

64 Location, 

65 LocationFactory, 

66 StorageClass, 

67 StoredFileInfo, 

68) 

69 

70from lsst.daf.butler import ddl 

71from lsst.daf.butler.registry.interfaces import ( 

72 ReadOnlyDatabaseError, 

73 DatastoreRegistryBridge, 

74) 

75 

76from lsst.daf.butler.core.repoRelocation import replaceRoot 

77from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

78from .genericDatastore import GenericBaseDatastore 

79 

80if TYPE_CHECKING:

81 from lsst.daf.butler import LookupKey 

82 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

83 

84log = logging.getLogger(__name__) 

85 

86# String to use when a Python None is encountered 

87NULLSTR = "__NULL_STRING__" 

88 

89 

90class _IngestPrepData(Datastore.IngestPrepData): 

91 """Helper class for FileLikeDatastore ingest implementation. 

92 

93 Parameters 

94 ---------- 

95 datasets : `list` of `FileDataset` 

96 Files to be ingested by this datastore. 

97 """ 

98 def __init__(self, datasets: List[FileDataset]): 

99 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

100 self.datasets = datasets 

101 

102 

103@dataclass(frozen=True) 

104class DatastoreFileGetInformation: 

105 """Collection of useful parameters needed to retrieve a file from 

106 a Datastore. 

107 """ 

108 

109 location: Location 

110 """The location from which to read the dataset.""" 

111 

112 formatter: Formatter 

113 """The `Formatter` to use to deserialize the dataset.""" 

114 

115 info: StoredFileInfo 

116 """Stored information about this file and its formatter.""" 

117 

118 assemblerParams: dict 

119 """Parameters to use for post-processing the retrieved dataset.""" 

120 

121 component: Optional[str] 

122 """The component to be retrieved (can be `None`).""" 

123 

124 readStorageClass: StorageClass 

125 """The `StorageClass` of the dataset being read.""" 

126 

127 

128class FileLikeDatastore(GenericBaseDatastore): 

129 """Generic Datastore for file-based implementations. 

130 

131 Should always be sub-classed since key abstract methods are missing. 

132 

133 Parameters 

134 ---------- 

135 config : `DatastoreConfig` or `str` 

136 Configuration as either a `Config` object or URI to file. 

137 bridgeManager : `DatastoreRegistryBridgeManager` 

138 Object that manages the interface between `Registry` and datastores. 

139 butlerRoot : `str`, optional 

140 New datastore root to use to override the configuration value. 

141 

142 Raises 

143 ------ 

144 ValueError 

145 If root location does not exist and ``create`` is `False` in the 

146 configuration. 

147 """ 

148 

149 defaultConfigFile: ClassVar[Optional[str]] = None 

150 """Path to configuration defaults. Accessed within the ``config`` resource 

151 or relative to a search path. Can be None if no defaults specified. 

152 """ 

153 

154 root: str 

155 """Root directory or URI of this `Datastore`.""" 

156 

157 locationFactory: LocationFactory 

158 """Factory for creating locations relative to the datastore root.""" 

159 

160 formatterFactory: FormatterFactory 

161 """Factory for creating instances of formatters.""" 

162 

163 templates: FileTemplates 

164 """File templates that can be used by this `Datastore`.""" 

165 

166 composites: CompositesMap 

167 """Determines whether a dataset should be disassembled on put.""" 

168 

169 @classmethod 

170 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

171 """Set any filesystem-dependent config options for this Datastore to 

172 be appropriate for a new empty repository with the given root. 

173 

174 Parameters 

175 ---------- 

176 root : `str` 

177 URI to the root of the data repository. 

178 config : `Config` 

179 A `Config` to update. Only the subset understood by 

180 this component will be updated. Will not expand 

181 defaults. 

182 full : `Config` 

183 A complete config with all defaults expanded that can be 

184 converted to a `DatastoreConfig`. Read-only and will not be 

185 modified by this method. 

186 Repository-specific options that should not be obtained 

187 from defaults when Butler instances are constructed 

188 should be copied from ``full`` to ``config``. 

189 overwrite : `bool`, optional 

190 If `False`, do not modify a value in ``config`` if the value 

191 already exists. Default is always to overwrite with the provided 

192 ``root``. 

193 

194 Notes 

195 ----- 

196 If a keyword is explicitly defined in the supplied ``config`` it 

197 will not be overridden by this method if ``overwrite`` is `False`. 

198 This allows explicit values set in external configs to be retained. 

199 """ 

200 Config.updateParameters(DatastoreConfig, config, full, 

201 toUpdate={"root": root}, 

202 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

203 
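# Illustrative sketch (not authoritative): the net effect of setConfigRoot()
# on ``config`` for a new repository, assuming the relevant keys are present
# in ``full``.
#
#     FileLikeDatastore.setConfigRoot(root, config, full)
#     config["root"] == root                                   # updated
#     config["cls"] == full["cls"]                             # copied
#     config["records", "table"] == full["records", "table"]   # copied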

204 @classmethod 

205 def makeTableSpec(cls) -> ddl.TableSpec: 

206 return ddl.TableSpec( 

207 fields=[ 

208 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True), 

209 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

210 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

211 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

212 # Use empty string to indicate no component 

213 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

214 # TODO: should checksum be Base64Bytes instead? 

215 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

216 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

217 ], 

218 unique=frozenset(), 

219 ) 

220 
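# Illustrative sketch (all values hypothetical): one row of the opaque records
# table described by makeTableSpec(), as later inserted by addStoredItemInfo().
#
#     {"dataset_id": 42,
#      "path": "run1/flat/flat_r_42.fits",
#      "formatter": "some.package.FitsFormatter",   # made-up formatter name
#      "storage_class": "ExposureF",
#      "component": NULLSTR,                         # sentinel for "no component"
#      "checksum": None,
#      "file_size": 12345}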

221 def __init__(self, config: Union[DatastoreConfig, str], 

222 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None): 

223 super().__init__(config, bridgeManager) 

224 if "root" not in self.config: 224 ↛ 225line 224 didn't jump to line 225, because the condition on line 224 was never true

225 raise ValueError("No root directory specified in configuration") 

226 

227 # Name ourselves either using an explicit name or a name 

228 # derived from the (unexpanded) root 

229 if "name" in self.config: 

230 self.name = self.config["name"] 

231 else: 

232 # We use the unexpanded root in the name to indicate that this 

233 # datastore can be moved without having to update registry. 

234 self.name = "{}@{}".format(type(self).__name__, 

235 self.config["root"]) 

236 

237 # Support repository relocation in config 

238 # Existence of self.root is checked in subclass 

239 self.root = replaceRoot(self.config["root"], butlerRoot) 

240 

241 self.locationFactory = LocationFactory(self.root) 

242 self.formatterFactory = FormatterFactory() 

243 

244 # Now associate formatters with storage classes 

245 self.formatterFactory.registerFormatters(self.config["formatters"], 

246 universe=bridgeManager.universe) 

247 

248 # Read the file naming templates 

249 self.templates = FileTemplates(self.config["templates"], 

250 universe=bridgeManager.universe) 

251 

252 # See if composites should be disassembled 

253 self.composites = CompositesMap(self.config["composites"], 

254 universe=bridgeManager.universe) 

255 

256 tableName = self.config["records", "table"] 

257 try: 

258 # Storage of paths and formatters, keyed by dataset_id 

259 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec()) 

260 # Interface to Registry. 

261 self._bridge = bridgeManager.register(self.name) 

262 except ReadOnlyDatabaseError: 

263 # If the database is read only and we just tried and failed to 

264 # create a table, it means someone is trying to create a read-only 

265 # butler client for an empty repo. That should be okay, as long 

266 # as they don't then try to get any datasets before some other client 

267 # creates the table. Chances are they're just validating 

268 # configuration. 

269 pass 

270 

271 # Determine whether checksums should be used 

272 self.useChecksum = self.config.get("checksum", True) 

273 
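# Minimal construction sketch. ``MyFileDatastore`` is a hypothetical concrete
# subclass; in practice Butler builds the datastore from its configuration and
# supplies the bridge manager itself.
#
#     config = DatastoreConfig("datastore.yaml")
#     datastore = MyFileDatastore(config, bridgeManager, butlerRoot="/path/to/repo")
#     print(datastore)      # __str__ returns the expanded root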

274 def __str__(self) -> str: 

275 return self.root 

276 

277 @property 

278 def bridge(self) -> DatastoreRegistryBridge: 

279 return self._bridge 

280 

281 @abstractmethod 

282 def _artifact_exists(self, location: Location) -> bool: 

283 """Check that an artifact exists in this datastore at the specified 

284 location. 

285 

286 Parameters 

287 ---------- 

288 location : `Location` 

289 Expected location of the artifact associated with this datastore. 

290 

291 Returns 

292 ------- 

293 exists : `bool` 

294 `True` if the location can be found, `False` otherwise. 

295 """ 

296 raise NotImplementedError() 

297 

298 @abstractmethod 

299 def _delete_artifact(self, location: Location) -> None: 

300 """Delete the artifact from the datastore. 

301 

302 Parameters 

303 ---------- 

304 location : `Location` 

305 Location of the artifact associated with this datastore. 

306 """ 

307 raise NotImplementedError() 

308 

309 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

310 # Docstring inherited from GenericBaseDatastore 

311 records = [] 

312 for ref, info in zip(refs, infos): 

313 # Component should come from ref and fall back on info 

314 component = ref.datasetType.component() 

315 if component is None and info.component is not None:

316 component = info.component 

317 if component is None: 

318 # Use empty string since we want this to be part of the 

319 # primary key. 

320 component = NULLSTR 

321 records.append( 

322 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path, 

323 storage_class=info.storageClass.name, component=component, 

324 checksum=info.checksum, file_size=info.file_size) 

325 ) 

326 self._table.insert(*records) 

327 

328 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

329 # Docstring inherited from GenericBaseDatastore 

330 

331 # Look for the dataset_id -- there might be multiple matches 

332 # if we have disassembled the dataset. 

333 records = list(self._table.fetch(dataset_id=ref.id)) 

334 

335 results = [] 

336 for record in records: 

337 # Convert name of StorageClass to instance 

338 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

339 component = record["component"] if (record["component"] 

340 and record["component"] != NULLSTR) else None 

341 

342 info = StoredFileInfo(formatter=record["formatter"], 

343 path=record["path"], 

344 storageClass=storageClass, 

345 component=component, 

346 checksum=record["checksum"], 

347 file_size=record["file_size"]) 

348 results.append(info) 

349 

350 return results 

351 
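# Sketch of the component round trip through the records table (illustrative,
# mirroring addStoredItemInfo/getStoredItemsInfo above): a missing component is
# stored as the NULLSTR sentinel because the column is part of the primary key,
# and converted back to None on read.
#
#     stored = ref.datasetType.component() or info.component or NULLSTR
#     ...
#     component = record["component"] if record["component"] != NULLSTR else None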

352 def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]: 

353 """Return all dataset refs associated with the supplied path. 

354 

355 Parameters 

356 ---------- 

357 pathInStore : `str` 

358 Path of interest in the data store. 

359 

360 Returns 

361 ------- 

362 ids : `set` of `int` 

363 All `DatasetRef` IDs associated with this path. 

364 """ 

365 records = list(self._table.fetch(path=pathInStore)) 

366 ids = {r["dataset_id"] for r in records} 

367 return ids 

368 

369 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

370 # Docstring inherited from GenericBaseDatastore 

371 self._table.delete(dataset_id=ref.id) 

372 

373 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

374 r"""Find all the `Location`\ s of the requested dataset in the 

375 `Datastore` and the associated stored file information. 

376 

377 Parameters 

378 ---------- 

379 ref : `DatasetRef` 

380 Reference to the required `Dataset`. 

381 

382 Returns 

383 ------- 

384 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

385 Location of the dataset within the datastore and 

386 stored information about each file and its formatter. 

387 """ 

388 # Get the file information (this will fail if no file) 

389 records = self.getStoredItemsInfo(ref) 

390 

391 # Use the path to determine the location 

392 return [(self.locationFactory.fromPath(r.path), r) for r in records] 

393 

394 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

395 """Check that there is only one dataset associated with the 

396 specified artifact. 

397 

398 Parameters 

399 ---------- 

400 ref : `DatasetRef` or `FakeDatasetRef` 

401 Dataset to be removed. 

402 location : `Location` 

403 The location of the artifact to be removed. 

404 

405 Returns 

406 ------- 

407 can_remove : `bool` 

408 True if the artifact can be safely removed. 

409 """ 

410 

411 # Get all entries associated with this path 

412 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

413 if not allRefs:

414 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

415 

416 # Remove these refs from all the refs and if there is nothing left 

417 # then we can delete 

418 remainingRefs = allRefs - {ref.id} 

419 

420 if remainingRefs: 

421 return False 

422 return True 

423 

424 def _prepare_for_get(self, ref: DatasetRef, 

425 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

426 """Check parameters for ``get`` and obtain formatter and 

427 location. 

428 

429 Parameters 

430 ---------- 

431 ref : `DatasetRef` 

432 Reference to the required Dataset. 

433 parameters : `dict` 

434 `StorageClass`-specific parameters that specify, for example, 

435 a slice of the dataset to be loaded. 

436 

437 Returns 

438 ------- 

439 getInfo : `list` [`DatastoreFileGetInformation`] 

440 Parameters needed to retrieve each file. 

441 """ 

442 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

443 

444 # Get file metadata and internal metadata 

445 fileLocations = self._get_dataset_locations_info(ref) 

446 if not fileLocations: 

447 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

448 

449 # The storage class we want to use eventually 

450 refStorageClass = ref.datasetType.storageClass 

451 

452 if len(fileLocations) > 1: 

453 disassembled = True 

454 else: 

455 disassembled = False 

456 

457 # Is this a component request? 

458 refComponent = ref.datasetType.component() 

459 

460 fileGetInfo = [] 

461 for location, storedFileInfo in fileLocations: 

462 

463 # The storage class used to write the file 

464 writeStorageClass = storedFileInfo.storageClass 

465 

466 # If this has been disassembled we need read to match the write 

467 if disassembled: 

468 readStorageClass = writeStorageClass 

469 else: 

470 readStorageClass = refStorageClass 

471 

472 formatter = getInstanceOf(storedFileInfo.formatter, 

473 FileDescriptor(location, readStorageClass=readStorageClass, 

474 storageClass=writeStorageClass, parameters=parameters), 

475 ref.dataId) 

476 

477 _, notFormatterParams = formatter.segregateParameters() 

478 

479 # Of the remaining parameters, extract the ones supported by 

480 # this StorageClass (for components not all will be handled) 

481 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

482 

483 # The ref itself could be a component if the dataset was 

484 # disassembled by butler, or we disassembled in datastore and 

485 # components came from the datastore records 

486 component = storedFileInfo.component if storedFileInfo.component else refComponent 

487 

488 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

489 assemblerParams, component, readStorageClass)) 

490 

491 return fileGetInfo 

492 

493 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

494 """Check the arguments for ``put`` and obtain formatter and 

495 location. 

496 

497 Parameters 

498 ---------- 

499 inMemoryDataset : `object` 

500 The dataset to store. 

501 ref : `DatasetRef` 

502 Reference to the associated Dataset. 

503 

504 Returns 

505 ------- 

506 location : `Location` 

507 The location to write the dataset. 

508 formatter : `Formatter` 

509 The `Formatter` to use to write the dataset. 

510 

511 Raises 

512 ------ 

513 TypeError 

514 Supplied object and storage class are inconsistent. 

515 DatasetTypeNotSupportedError 

516 The associated `DatasetType` is not handled by this datastore. 

517 """ 

518 self._validate_put_parameters(inMemoryDataset, ref) 

519 

520 # Work out output file name 

521 try: 

522 template = self.templates.getTemplate(ref) 

523 except KeyError as e: 

524 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

525 

526 location = self.locationFactory.fromPath(template.format(ref)) 

527 

528 # Get the formatter based on the storage class 

529 storageClass = ref.datasetType.storageClass 

530 try: 

531 formatter = self.formatterFactory.getFormatter(ref, 

532 FileDescriptor(location, 

533 storageClass=storageClass), 

534 ref.dataId) 

535 except KeyError as e: 

536 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

537 f"{self.name}") from e 

538 

539 # Now that we know the formatter, update the location 

540 location = formatter.makeUpdatedLocation(location) 

541 

542 return location, formatter 

543 

544 @abstractmethod 

545 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str: 

546 """Standardize the path of a to-be-ingested file. 

547 

548 Parameters 

549 ---------- 

550 path : `str` 

551 Path of a file to be ingested. 

552 transfer : `str`, optional 

553 How (and whether) the dataset should be added to the datastore. 

554 See `ingest` for details of transfer modes. 

555 This implementation is provided only so 

556 `NotImplementedError` can be raised if the mode is not supported; 

557 actual transfers are deferred to `_extractIngestInfo`. 

558 

559 Returns 

560 ------- 

561 path : `str` 

562 New path in what the datastore considers standard form. 

563 

564 Notes 

565 ----- 

566 Subclasses of `FileLikeDatastore` should implement this method instead 

567 of `_prepIngest`. It should not modify the data repository or given 

568 file in any way. 

569 

570 Raises 

571 ------ 

572 NotImplementedError 

573 Raised if the datastore does not support the given transfer mode 

574 (including the case where ingest is not supported at all). 

575 FileNotFoundError 

576 Raised if one of the given files does not exist. 

577 """ 

578 raise NotImplementedError("Must be implemented by subclasses.") 

579 

580 @abstractmethod 

581 def _extractIngestInfo(self, path: str, ref: DatasetRef, *, 

582 formatter: Union[Formatter, Type[Formatter]], 

583 transfer: Optional[str] = None) -> StoredFileInfo: 

584 """Relocate (if necessary) and extract `StoredFileInfo` from a 

585 to-be-ingested file. 

586 

587 Parameters 

588 ---------- 

589 path : `str` 

590 Path of a file to be ingested. 

591 ref : `DatasetRef` 

592 Reference for the dataset being ingested. Guaranteed to have 

593 ``dataset_id is not None``. 

594 formatter : `type` or `Formatter` 

595 `Formatter` subclass to use for this dataset or an instance. 

596 transfer : `str`, optional 

597 How (and whether) the dataset should be added to the datastore. 

598 See `ingest` for details of transfer modes. 

599 

600 Returns 

601 ------- 

602 info : `StoredFileInfo` 

603 Internal datastore record for this file. This will be inserted by 

604 the caller; `_extractIngestInfo` is only responsible for 

605 creating and populating the struct. 

606 

607 Raises 

608 ------ 

609 FileNotFoundError 

610 Raised if one of the given files does not exist. 

611 FileExistsError 

612 Raised if transfer is not `None` but the (internal) location the 

613 file would be moved to is already occupied. 

614 """ 

615 raise NotImplementedError("Must be implemented by subclasses.") 

616 

617 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

618 # Docstring inherited from Datastore._prepIngest. 

619 filtered = [] 

620 for dataset in datasets: 

621 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

622 if not acceptable: 

623 continue 

624 else: 

625 dataset.refs = acceptable 

626 if dataset.formatter is None: 

627 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

628 else: 

629 assert isinstance(dataset.formatter, (type, str)) 

630 dataset.formatter = getClassOf(dataset.formatter) 

631 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

632 filtered.append(dataset) 

633 return _IngestPrepData(filtered) 

634 

635 @transactional 

636 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

637 # Docstring inherited from Datastore._finishIngest. 

638 refsAndInfos = [] 

639 for dataset in prepData.datasets: 

640 # Do ingest as if the first dataset ref is associated with the file 

641 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

642 transfer=transfer) 

643 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

644 self._register_datasets(refsAndInfos) 

645 

646 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

647 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

648 """Given a source URI and a DatasetRef, determine the name the 

649 dataset will have inside datastore. 

650 

651 Parameters 

652 ---------- 

653 srcUri : `ButlerURI` 

654 URI to the source dataset file. 

655 ref : `DatasetRef` 

656 Ref associated with the newly-ingested dataset artifact. This 

657 is used to determine the name within the datastore. 

658 formatter : `Formatter` or `Formatter` class 

659 Formatter to use for validation. Can be a class or an instance. 

660 

661 Returns 

662 ------- 

663 location : `Location` 

664 Target location for the newly-ingested dataset. 

665 """ 

666 # Ingesting a file from outside the datastore. 

667 # This involves a new name. 

668 template = self.templates.getTemplate(ref) 

669 location = self.locationFactory.fromPath(template.format(ref)) 

670 

671 # Get the extension 

672 ext = srcUri.getExtension() 

673 

674 # Update the destination to include that extension 

675 location.updateExtension(ext) 

676 

677 # Ask the formatter to validate this extension 

678 formatter.validateExtension(location) 

679 

680 return location 

681 

682 @abstractmethod 

683 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

684 """Write out in memory dataset to datastore. 

685 

686 Parameters 

687 ---------- 

688 inMemoryDataset : `object` 

689 Dataset to write to datastore. 

690 ref : `DatasetRef` 

691 Registry information associated with this dataset. 

692 

693 Returns 

694 ------- 

695 info : `StoredFileInfo` 

696 Information describing the artifact written to the datastore. 

697 """ 

698 raise NotImplementedError() 

699 

700 @abstractmethod 

701 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

702 ref: DatasetRef, isComponent: bool = False) -> Any: 

703 """Read the artifact from datastore into in memory object. 

704 

705 Parameters 

706 ---------- 

707 getInfo : `DatastoreFileGetInformation` 

708 Information about the artifact within the datastore. 

709 ref : `DatasetRef` 

710 The registry information associated with this artifact. 

711 isComponent : `bool` 

712 Flag to indicate if a component is being read from this artifact. 

713 

714 Returns 

715 ------- 

716 inMemoryDataset : `object` 

717 The artifact as a python object. 

718 """ 

719 raise NotImplementedError() 

720 

721 def exists(self, ref: DatasetRef) -> bool: 

722 """Check if the dataset exists in the datastore. 

723 

724 Parameters 

725 ---------- 

726 ref : `DatasetRef` 

727 Reference to the required dataset. 

728 

729 Returns 

730 ------- 

731 exists : `bool` 

732 `True` if the entity exists in the `Datastore`. 

733 """ 

734 fileLocations = self._get_dataset_locations_info(ref) 

735 if not fileLocations: 

736 return False 

737 for location, _ in fileLocations: 

738 if not self._artifact_exists(location): 

739 return False 

740 

741 return True 

742 

743 def getURIs(self, ref: DatasetRef, 

744 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

745 """Return URIs associated with dataset. 

746 

747 Parameters 

748 ---------- 

749 ref : `DatasetRef` 

750 Reference to the required dataset. 

751 predict : `bool`, optional 

752 If the datastore does not know about the dataset, should it 

753 return a predicted URI or not? 

754 

755 Returns 

756 ------- 

757 primary : `ButlerURI` 

758 The URI to the primary artifact associated with this dataset. 

759 If the dataset was disassembled within the datastore this 

760 may be `None`. 

761 components : `dict` 

762 URIs to any components associated with the dataset artifact. 

763 Can be empty if there are no components. 

764 """ 

765 

766 primary: Optional[ButlerURI] = None 

767 components: Dict[str, ButlerURI] = {} 

768 

769 # if this has never been written then we have to guess 

770 if not self.exists(ref): 

771 if not predict: 

772 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

773 

774 def predictLocation(thisRef: DatasetRef) -> Location: 

775 template = self.templates.getTemplate(thisRef) 

776 location = self.locationFactory.fromPath(template.format(thisRef)) 

777 storageClass = ref.datasetType.storageClass 

778 formatter = self.formatterFactory.getFormatter(thisRef, 

779 FileDescriptor(location, 

780 storageClass=storageClass)) 

781 # Try to use the extension attribute but ignore problems if the 

782 # formatter does not define one. 

783 try: 

784 location = formatter.makeUpdatedLocation(location) 

785 except Exception: 

786 # Use the default extension 

787 pass 

788 return location 

789 

790 doDisassembly = self.composites.shouldBeDisassembled(ref) 

791 

792 if doDisassembly: 

793 

794 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

795 compRef = ref.makeComponentRef(component) 

796 compLocation = predictLocation(compRef) 

797 

798 # Add a URI fragment to indicate this is a guess 

799 components[component] = ButlerURI(compLocation.uri + "#predicted") 

800 

801 else: 

802 

803 location = predictLocation(ref) 

804 

805 # Add a URI fragment to indicate this is a guess 

806 primary = ButlerURI(location.uri + "#predicted") 

807 

808 return primary, components 

809 

810 # If this is a ref that we have written we can get the path. 

811 # Get file metadata and internal metadata 

812 fileLocations = self._get_dataset_locations_info(ref) 

813 

814 if not fileLocations:

815 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

816 

817 if len(fileLocations) == 1: 

818 # No disassembly so this is the primary URI 

819 primary = ButlerURI(fileLocations[0][0].uri) 

820 

821 else: 

822 for location, storedFileInfo in fileLocations: 

823 if storedFileInfo.component is None:

824 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

825 components[storedFileInfo.component] = ButlerURI(location.uri) 

826 

827 return primary, components 

828 
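# Usage sketch for getURIs() (assumes an existing ``datastore`` and ``ref``):
# a single-file dataset returns (uri, {}); a disassembled dataset returns
# (None, {component: uri, ...}); predicted URIs carry a "#predicted" fragment.
#
#     primary, components = datastore.getURIs(ref, predict=True)
#     if primary is None:
#         for name, uri in components.items():
#             print(name, uri)
#     else:
#         print(primary)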

829 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

830 """URI to the Dataset. 

831 

832 Parameters 

833 ---------- 

834 ref : `DatasetRef` 

835 Reference to the required Dataset. 

836 predict : `bool` 

837 If `True`, allow URIs to be returned of datasets that have not 

838 been written. 

839 

840 Returns 

841 ------- 

842 uri : `str` 

843 URI pointing to the dataset within the datastore. If the 

844 dataset does not exist in the datastore, and if ``predict`` is 

845 `True`, the URI will be a prediction and will include a URI 

846 fragment "#predicted". 

847 If the datastore does not have entities that relate well 

848 to the concept of a URI the returned URI will be 

849 descriptive. The returned URI is not guaranteed to be obtainable. 

850 

851 Raises 

852 ------ 

853 FileNotFoundError 

854 Raised if a URI has been requested for a dataset that does not 

855 exist and guessing is not allowed. 

856 RuntimeError 

857 Raised if a request is made for a single URI but multiple URIs 

858 are associated with this dataset. 

859 

860 Notes 

861 ----- 

862 When a predicted URI is requested an attempt will be made to form 

863 a reasonable URI based on file templates and the expected formatter. 

864 """ 

865 primary, components = self.getURIs(ref, predict) 

866 if primary is None or components:

867 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

868 "Use Dataastore.getURIs() instead.") 

869 return primary 

870 

871 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

872 """Load an InMemoryDataset from the store. 

873 

874 Parameters 

875 ---------- 

876 ref : `DatasetRef` 

877 Reference to the required Dataset. 

878 parameters : `dict` 

879 `StorageClass`-specific parameters that specify, for example, 

880 a slice of the dataset to be loaded. 

881 

882 Returns 

883 ------- 

884 inMemoryDataset : `object` 

885 Requested dataset or slice thereof as an InMemoryDataset. 

886 

887 Raises 

888 ------ 

889 FileNotFoundError 

890 Requested dataset can not be retrieved. 

891 TypeError 

892 Return value from formatter has unexpected type. 

893 ValueError 

894 Formatter failed to process the dataset. 

895 """ 

896 allGetInfo = self._prepare_for_get(ref, parameters) 

897 refComponent = ref.datasetType.component() 

898 

899 # Supplied storage class for the component being read 

900 refStorageClass = ref.datasetType.storageClass 

901 

902 # Create mapping from component name to related info 

903 allComponents = {i.component: i for i in allGetInfo} 

904 

905 # By definition the dataset is disassembled if we have more 

906 # than one record for it. 

907 isDisassembled = len(allGetInfo) > 1 

908 

909 # Look for the special case where we are disassembled but the 

910 # component is a read-only component that was not written during 

911 # disassembly. For this scenario we need to check that the 

912 # component requested is listed as a read-only component for the 

913 # composite storage class 

914 isDisassembledReadOnlyComponent = False 

915 if isDisassembled and refComponent: 

916 # The composite storage class should be accessible through 

917 # the component dataset type 

918 compositeStorageClass = ref.datasetType.parentStorageClass 

919 

920 # In the unlikely scenario where the composite storage 

921 # class is not known, we can only assume that this is a 

922 # normal component. If that assumption is wrong then the 

923 # branch below that reads a persisted component will fail 

924 # so there is no need to complain here. 

925 if compositeStorageClass is not None:

926 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.readComponents 

927 

928 if isDisassembled and not refComponent: 

929 # This was a disassembled dataset spread over multiple files 

930 # and we need to put them all back together again. 

931 # Read into memory and then assemble 

932 

933 # Check that the supplied parameters are suitable for the type read 

934 refStorageClass.validateParameters(parameters) 

935 

936 usedParams = set() 

937 components: Dict[str, Any] = {} 

938 for getInfo in allGetInfo: 

939 # assemblerParams are parameters not understood by the 

940 # associated formatter. 

941 usedParams.update(set(getInfo.assemblerParams)) 

942 

943 component = getInfo.component 

944 

945 if component is None:

946 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

947 

948 # We do not want the formatter to think it's reading 

949 # a component though because it is really reading a 

950 # standalone dataset -- always tell reader it is not a 

951 # component. 

952 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

953 

954 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components) 

955 

956 # Any unused parameters will have to be passed to the assembler 

957 if parameters: 

958 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

959 else: 

960 unusedParams = {} 

961 

962 # Process parameters 

963 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset, 

964 parameters=unusedParams) 

965 

966 elif isDisassembledReadOnlyComponent: 

967 

968 compositeStorageClass = ref.datasetType.parentStorageClass 

969 if compositeStorageClass is None:

970 raise RuntimeError(f"Unable to retrieve read-only component '{refComponent}' since" 

971 "no composite storage class is available.") 

972 

973 if refComponent is None:

974 # Mainly for mypy 

975 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

976 

977 # Assume that every read-only component can be calculated by 

978 # forwarding the request to a single read/write component. 

979 # Rather than guessing which rw component is the right one by 

980 # scanning each for a read-only component of the same name, 

981 # we ask the composite assembler directly which one is best to 

982 # use. 

983 compositeAssembler = compositeStorageClass.assembler() 

984 forwardedComponent = compositeAssembler.selectResponsibleComponent(refComponent, 

985 set(allComponents)) 

986 

987 # Select the relevant component 

988 rwInfo = allComponents[forwardedComponent] 

989 

990 # For now assume that read parameters are validated against 

991 # the real component and not the requested component 

992 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

993 forwardedStorageClass.validateParameters(parameters) 

994 

995 # Unfortunately the FileDescriptor inside the formatter will have 

996 # the wrong write storage class so we need to create a new one 

997 # given the immutability constraint. 

998 writeStorageClass = rwInfo.info.storageClass 

999 

1000 # We may need to put some thought into parameters for read 

1001 # components but for now forward them on as is 

1002 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1003 readStorageClass=refStorageClass, 

1004 storageClass=writeStorageClass, 

1005 parameters=parameters), 

1006 ref.dataId) 

1007 

1008 # The assembler can not receive any parameter requests for a 

1009 # read-only component at this time since the assembler will 

1010 # see the storage class of the read-only component and those 

1011 # parameters will have to be handled by the formatter on the 

1012 # forwarded storage class. 

1013 assemblerParams: Dict[str, Any] = {} 

1014 

1015 # Need to create a new info that specifies the read-only 

1016 # component and associated storage class 

1017 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1018 rwInfo.info, assemblerParams, 

1019 refComponent, refStorageClass) 

1020 

1021 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1022 

1023 else: 

1024 # Single file request or component from that composite file 

1025 for lookup in (refComponent, None):

1026 if lookup in allComponents:

1027 getInfo = allComponents[lookup] 

1028 break 

1029 else: 

1030 raise FileNotFoundError(f"Component {refComponent} not found " 

1031 f"for ref {ref} in datastore {self.name}") 

1032 

1033 # Do not need the component itself if already disassembled 

1034 if isDisassembled: 

1035 isComponent = False 

1036 else: 

1037 isComponent = getInfo.component is not None 

1038 

1039 # For a disassembled component we can validate parameters against 

1040 # the component storage class directly 

1041 if isDisassembled: 

1042 refStorageClass.validateParameters(parameters) 

1043 else: 

1044 # For an assembled composite this could be a read-only 

1045 # component derived from a real component. The validity 

1046 # of the parameters is not clear. For now validate against 

1047 # the composite storage class 

1048 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1049 

1050 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 

1051 
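# Usage sketch for get() (parameter names depend on the StorageClass and are
# illustrative only): parameters understood by the formatter are applied on
# read, the remainder are handed to the storage class assembler.
#
#     dataset = datastore.get(ref)
#     subset = datastore.get(ref, parameters={"bbox": bbox})   # hypothetical parameter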

1052 @transactional 

1053 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1054 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1055 

1056 Parameters 

1057 ---------- 

1058 inMemoryDataset : `object` 

1059 The dataset to store. 

1060 ref : `DatasetRef` 

1061 Reference to the associated Dataset. 

1062 

1063 Raises 

1064 ------ 

1065 TypeError 

1066 Supplied object and storage class are inconsistent. 

1067 DatasetTypeNotSupportedError 

1068 The associated `DatasetType` is not handled by this datastore. 

1069 

1070 Notes 

1071 ----- 

1072 If the datastore is configured to reject certain dataset types it 

1073 is possible that the put will fail and raise a 

1074 `DatasetTypeNotSupportedError`. The main use case for this is to 

1075 allow `ChainedDatastore` to put to multiple datastores without 

1076 requiring that every datastore accepts the dataset. 

1077 """ 

1078 

1079 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1080 # doDisassembly = True 

1081 

1082 artifacts = [] 

1083 if doDisassembly: 

1084 components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset) 

1085 for component, componentInfo in components.items(): 

1086 # Don't recurse because we want to take advantage of 

1087 # bulk insert -- need a new DatasetRef that refers to the 

1088 # same dataset_id but has the component DatasetType 

1089 # DatasetType does not refer to the types of components 

1090 # So we construct one ourselves. 

1091 compRef = ref.makeComponentRef(component) 

1092 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1093 artifacts.append((compRef, storedInfo)) 

1094 else: 

1095 # Write the entire thing out 

1096 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1097 artifacts.append((ref, storedInfo)) 

1098 

1099 self._register_datasets(artifacts) 

1100 
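# Usage sketch for put() (assumes ``inMemoryDataset`` matches the storage class
# of ``ref``): whether the dataset is written as one artifact or disassembled
# into per-component artifacts is decided by the configured CompositesMap.
#
#     datastore.put(inMemoryDataset, ref)
#     assert datastore.exists(ref)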

1101 @transactional 

1102 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None: 

1103 """Indicate to the datastore that a dataset can be removed. 

1104 

1105 Parameters 

1106 ---------- 

1107 ref : `DatasetRef` 

1108 Reference to the required Dataset. 

1109 ignore_errors : `bool` 

1110 If `True` return without error even if something went wrong. 

1111 Problems could occur if another process is simultaneously trying 

1112 to delete. 

1113 

1114 Raises 

1115 ------ 

1116 FileNotFoundError 

1117 Attempt to remove a dataset that does not exist. 

1118 """ 

1119 # Get file metadata and internal metadata 

1120 log.debug("Trashing %s in datastore %s", ref, self.name) 

1121 

1122 fileLocations = self._get_dataset_locations_info(ref) 

1123 

1124 if not fileLocations: 

1125 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1126 if ignore_errors: 

1127 log.warning(err_msg) 

1128 return 

1129 else: 

1130 raise FileNotFoundError(err_msg) 

1131 

1132 for location, storedFileInfo in fileLocations: 

1133 if not self._artifact_exists(location):

1134 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1135 f"associated artifact ({location.uri}) is missing" 

1136 if ignore_errors: 

1137 log.warning(err_msg) 

1138 return 

1139 else: 

1140 raise FileNotFoundError(err_msg) 

1141 

1142 # Mark dataset as trashed 

1143 try: 

1144 self._move_to_trash_in_registry(ref) 

1145 except Exception as e: 

1146 if ignore_errors: 

1147 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1148 f"but encountered an error: {e}") 

1149 pass 

1150 else: 

1151 raise 

1152 

1153 @transactional 

1154 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1155 """Remove all datasets from the trash. 

1156 

1157 Parameters 

1158 ---------- 

1159 ignore_errors : `bool` 

1160 If `True` return without error even if something went wrong. 

1161 Problems could occur if another process is simultaneously trying 

1162 to delete. 

1163 """ 

1164 log.debug("Emptying trash in datastore %s", self.name) 

1165 # Context manager will empty trash iff we finish it without raising. 

1166 with self._bridge.emptyTrash() as trashed: 

1167 for ref in trashed: 

1168 fileLocations = self._get_dataset_locations_info(ref) 

1169 

1170 if not fileLocations:

1171 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}" 

1172 if ignore_errors: 

1173 log.warning(err_msg) 

1174 continue 

1175 else: 

1176 raise FileNotFoundError(err_msg) 

1177 

1178 for location, _ in fileLocations: 

1179 

1180 if not self._artifact_exists(location):

1181 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}" 

1182 if ignore_errors: 

1183 log.warning(err_msg) 

1184 continue 

1185 else: 

1186 raise FileNotFoundError(err_msg) 

1187 

1188 # Can only delete the artifact if there are no references 

1189 # to the file from untrashed dataset refs. 

1190 if self._can_remove_dataset_artifact(ref, location): 

1191 # Point of no return for this artifact 

1192 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1193 try: 

1194 self._delete_artifact(location) 

1195 except Exception as e: 

1196 if ignore_errors: 

1197 log.critical("Encountered error removing artifact %s from datastore %s: %s", 

1198 location.uri, self.name, e) 

1199 else: 

1200 raise 

1201 

1202 # Now must remove the entry from the internal registry even if 

1203 # the artifact removal failed and was ignored, 

1204 # otherwise the removal check above will never be true 

1205 try: 

1206 # There may be multiple rows associated with this ref 

1207 # depending on disassembly 

1208 self.removeStoredItemInfo(ref) 

1209 except Exception as e: 

1210 if ignore_errors: 

1211 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s", 

1212 ref.id, location.uri, self.name, e) 

1213 continue 

1214 else: 

1215 raise 

1216 
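# Sketch of the two-phase removal implemented above: trash() only marks the
# dataset via the registry bridge; emptyTrash() later deletes an artifact once
# no untrashed ref still points at it (relevant for disassembled or shared files).
#
#     datastore.trash(ref)
#     datastore.emptyTrash()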

1217 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1218 logFailures: bool = False) -> None: 

1219 """Validate some of the configuration for this datastore. 

1220 

1221 Parameters 

1222 ---------- 

1223 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1224 Entities to test against this configuration. Can be differing 

1225 types. 

1226 logFailures : `bool`, optional 

1227 If `True`, output a log message for every validation error 

1228 detected. 

1229 

1230 Raises 

1231 ------ 

1232 DatastoreValidationError 

1233 Raised if there is a validation problem with a configuration. 

1234 All the problems are reported in a single exception. 

1235 

1236 Notes 

1237 ----- 

1238 This method checks that all the supplied entities have valid file 

1239 templates and also have formatters defined. 

1240 """ 

1241 

1242 templateFailed = None 

1243 try: 

1244 self.templates.validateTemplates(entities, logFailures=logFailures) 

1245 except FileTemplateValidationError as e: 

1246 templateFailed = str(e) 

1247 

1248 formatterFailed = [] 

1249 for entity in entities: 

1250 try: 

1251 self.formatterFactory.getFormatterClass(entity) 

1252 except KeyError as e: 

1253 formatterFailed.append(str(e)) 

1254 if logFailures:

1255 log.fatal("Formatter failure: %s", e) 

1256 

1257 if templateFailed or formatterFailed: 

1258 messages = [] 

1259 if templateFailed:

1260 messages.append(templateFailed) 

1261 if formatterFailed:

1262 messages.append(",".join(formatterFailed)) 

1263 msg = ";\n".join(messages) 

1264 raise DatastoreValidationError(msg) 

1265 
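# Usage sketch for validateConfiguration(): entities may be of mixed types and
# all template/formatter problems are reported together in a single exception.
#
#     try:
#         datastore.validateConfiguration([datasetType, storageClass], logFailures=True)
#     except DatastoreValidationError as e:
#         print(e)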

1266 def getLookupKeys(self) -> Set[LookupKey]: 

1267 # Docstring is inherited from base class 

1268 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1269 self.constraints.getLookupKeys() 

1270 

1271 def validateKey(self, lookupKey: LookupKey, 

1272 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1273 # Docstring is inherited from base class 

1274 # The key can be valid in either formatters or templates so we can 

1275 # only check the template if it exists 

1276 if lookupKey in self.templates: 

1277 try: 

1278 self.templates[lookupKey].validateTemplate(entity) 

1279 except FileTemplateValidationError as e: 

1280 raise DatastoreValidationError(e) from e