Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore", ) 

26 

27import hashlib 

28import logging 

29import os 

30import tempfile 

31 

32from sqlalchemy import BigInteger, String 

33 

34from dataclasses import dataclass 

35from typing import ( 

36 TYPE_CHECKING, 

37 Any, 

38 ClassVar, 

39 Dict, 

40 Iterable, 

41 List, 

42 Mapping, 

43 Optional, 

44 Set, 

45 Tuple, 

46 Type, 

47 Union, 

48) 

49 

50from lsst.daf.butler import ( 

51 ButlerURI, 

52 CompositesMap, 

53 Config, 

54 FileDataset, 

55 DatasetRef, 

56 DatasetType, 

57 DatasetTypeNotSupportedError, 

58 Datastore, 

59 DatastoreConfig, 

60 DatastoreValidationError, 

61 FileDescriptor, 

62 FileTemplates, 

63 FileTemplateValidationError, 

64 Formatter, 

65 FormatterFactory, 

66 Location, 

67 LocationFactory, 

68 StorageClass, 

69 StoredFileInfo, 

70) 

71 

72from lsst.daf.butler import ddl 

73from lsst.daf.butler.registry.interfaces import ( 

74 ReadOnlyDatabaseError, 

75 DatastoreRegistryBridge, 

76) 

77 

78from lsst.daf.butler.core.repoRelocation import replaceRoot 

79from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

80from .genericDatastore import GenericBaseDatastore 

81 

82if TYPE_CHECKING: 82 ↛ 83line 82 didn't jump to line 83, because the condition on line 82 was never true

83 from lsst.daf.butler import LookupKey 

84 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

85 

86log = logging.getLogger(__name__) 

87 

88# String to use when a Python None is encountered 

89NULLSTR = "__NULL_STRING__" 

90 

91 

92class _IngestPrepData(Datastore.IngestPrepData): 

93 """Helper class for FileDatastore ingest implementation. 

94 

95 Parameters 

96 ---------- 

97 datasets : `list` of `FileDataset` 

98 Files to be ingested by this datastore. 

99 """ 

100 def __init__(self, datasets: List[FileDataset]): 

101 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

102 self.datasets = datasets 

103 

104 

105@dataclass(frozen=True) 

106class DatastoreFileGetInformation: 

107 """Collection of useful parameters needed to retrieve a file from 

108 a Datastore. 

109 """ 

110 

111 location: Location 

112 """The location from which to read the dataset.""" 

113 

114 formatter: Formatter 

115 """The `Formatter` to use to deserialize the dataset.""" 

116 

117 info: StoredFileInfo 

118 """Stored information about this file and its formatter.""" 

119 

120 assemblerParams: Dict[str, Any] 

121 """Parameters to use for post-processing the retrieved dataset.""" 

122 

123 formatterParams: Dict[str, Any] 

124 """Parameters that were understood by the associated formatter.""" 

125 

126 component: Optional[str] 

127 """The component to be retrieved (can be `None`).""" 

128 

129 readStorageClass: StorageClass 

130 """The `StorageClass` of the dataset being read.""" 

131 

132 

133class FileDatastore(GenericBaseDatastore): 

134 """Generic Datastore for file-based implementations. 

135 

136 Should always be sub-classed since key abstract methods are missing. 

137 

138 Parameters 

139 ---------- 

140 config : `DatastoreConfig` or `str` 

141 Configuration as either a `Config` object or URI to file. 

142 bridgeManager : `DatastoreRegistryBridgeManager` 

143 Object that manages the interface between `Registry` and datastores. 

144 butlerRoot : `str`, optional 

145 New datastore root to use to override the configuration value. 

146 

147 Raises 

148 ------ 

149 ValueError 

150 If root location does not exist and ``create`` is `False` in the 

151 configuration. 

152 """ 

153 

154 defaultConfigFile: ClassVar[Optional[str]] = None 

155 """Path to configuration defaults. Accessed within the ``config`` resource 

156 or relative to a search path. Can be None if no defaults specified. 

157 """ 

158 

159 root: ButlerURI 

160 """Root directory URI of this `Datastore`.""" 

161 

162 locationFactory: LocationFactory 

163 """Factory for creating locations relative to the datastore root.""" 

164 

165 formatterFactory: FormatterFactory 

166 """Factory for creating instances of formatters.""" 

167 

168 templates: FileTemplates 

169 """File templates that can be used by this `Datastore`.""" 

170 

171 composites: CompositesMap 

172 """Determines whether a dataset should be disassembled on put.""" 

173 

174 defaultConfigFile = "datastores/fileDatastore.yaml" 

175 """Path to configuration defaults. Accessed within the ``config`` resource 

176 or relative to a search path. Can be None if no defaults specified. 

177 """ 

178 

179 @classmethod 

180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

181 """Set any filesystem-dependent config options for this Datastore to 

182 be appropriate for a new empty repository with the given root. 

183 

184 Parameters 

185 ---------- 

186 root : `str` 

187 URI to the root of the data repository. 

188 config : `Config` 

189 A `Config` to update. Only the subset understood by 

190 this component will be updated. Will not expand 

191 defaults. 

192 full : `Config` 

193 A complete config with all defaults expanded that can be 

194 converted to a `DatastoreConfig`. Read-only and will not be 

195 modified by this method. 

196 Repository-specific options that should not be obtained 

197 from defaults when Butler instances are constructed 

198 should be copied from ``full`` to ``config``. 

199 overwrite : `bool`, optional 

200 If `False`, do not modify a value in ``config`` if the value 

201 already exists. Default is always to overwrite with the provided 

202 ``root``. 

203 

204 Notes 

205 ----- 

206 If a keyword is explicitly defined in the supplied ``config`` it 

207 will not be overridden by this method if ``overwrite`` is `False`. 

208 This allows explicit values set in external configs to be retained. 

209 """ 

210 Config.updateParameters(DatastoreConfig, config, full, 

211 toUpdate={"root": root}, 

212 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

213 

214 @classmethod 

215 def makeTableSpec(cls) -> ddl.TableSpec: 

216 return ddl.TableSpec( 

217 fields=[ 

218 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True), 

219 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

220 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

221 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

222 # Use empty string to indicate no component 

223 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

224 # TODO: should checksum be Base64Bytes instead? 

225 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

226 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

227 ], 

228 unique=frozenset(), 

229 ) 

230 

231 def __init__(self, config: Union[DatastoreConfig, str], 

232 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

233 super().__init__(config, bridgeManager) 

234 if "root" not in self.config: 234 ↛ 235line 234 didn't jump to line 235, because the condition on line 234 was never true

235 raise ValueError("No root directory specified in configuration") 

236 

237 # Name ourselves either using an explicit name or a name 

238 # derived from the (unexpanded) root 

239 if "name" in self.config: 

240 self.name = self.config["name"] 

241 else: 

242 # We use the unexpanded root in the name to indicate that this 

243 # datastore can be moved without having to update registry. 

244 self.name = "{}@{}".format(type(self).__name__, 

245 self.config["root"]) 

246 

247 # Support repository relocation in config 

248 # Existence of self.root is checked in subclass 

249 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

250 forceDirectory=True, forceAbsolute=True) 

251 

252 self.locationFactory = LocationFactory(self.root) 

253 self.formatterFactory = FormatterFactory() 

254 

255 # Now associate formatters with storage classes 

256 self.formatterFactory.registerFormatters(self.config["formatters"], 

257 universe=bridgeManager.universe) 

258 

259 # Read the file naming templates 

260 self.templates = FileTemplates(self.config["templates"], 

261 universe=bridgeManager.universe) 

262 

263 # See if composites should be disassembled 

264 self.composites = CompositesMap(self.config["composites"], 

265 universe=bridgeManager.universe) 

266 

267 tableName = self.config["records", "table"] 

268 try: 

269 # Storage of paths and formatters, keyed by dataset_id 

270 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec()) 

271 # Interface to Registry. 

272 self._bridge = bridgeManager.register(self.name) 

273 except ReadOnlyDatabaseError: 

274 # If the database is read only and we just tried and failed to 

275 # create a table, it means someone is trying to create a read-only 

276 # butler client for an empty repo. That should be okay, as long 

277 # as they then try to get any datasets before some other client 

278 # creates the table. Chances are they'rejust validating 

279 # configuration. 

280 pass 

281 

282 # Determine whether checksums should be used - default to False 

283 self.useChecksum = self.config.get("checksum", False) 

284 

285 # Determine whether we can fall back to configuration if a 

286 # requested dataset is not known to registry 

287 self.trustGetRequest = self.config.get("trust_get_request", False) 

288 

289 # Check existence and create directory structure if necessary 

290 if not self.root.exists(): 

291 if "create" not in self.config or not self.config["create"]: 291 ↛ 292line 291 didn't jump to line 292, because the condition on line 291 was never true

292 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

293 try: 

294 self.root.mkdir() 

295 except Exception as e: 

296 raise ValueError(f"Can not create datastore root '{self.root}', check permissions." 

297 f" Got error: {e}") from e 

298 

299 def __str__(self) -> str: 

300 return str(self.root) 

301 

302 @property 

303 def bridge(self) -> DatastoreRegistryBridge: 

304 return self._bridge 

305 

306 def _artifact_exists(self, location: Location) -> bool: 

307 """Check that an artifact exists in this datastore at the specified 

308 location. 

309 

310 Parameters 

311 ---------- 

312 location : `Location` 

313 Expected location of the artifact associated with this datastore. 

314 

315 Returns 

316 ------- 

317 exists : `bool` 

318 True if the location can be found, false otherwise. 

319 """ 

320 log.debug("Checking if resource exists: %s", location.uri) 

321 return location.uri.exists() 

322 

323 def _delete_artifact(self, location: Location) -> None: 

324 """Delete the artifact from the datastore. 

325 

326 Parameters 

327 ---------- 

328 location : `Location` 

329 Location of the artifact associated with this datastore. 

330 """ 

331 if location.pathInStore.isabs(): 331 ↛ 332line 331 didn't jump to line 332, because the condition on line 331 was never true

332 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

333 log.debug("Deleting file: %s", location.uri) 

334 location.uri.remove() 

335 log.debug("Successfully deleted file: %s", location.uri) 

336 

337 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

338 # Docstring inherited from GenericBaseDatastore 

339 records = [] 

340 for ref, info in zip(refs, infos): 

341 # Component should come from ref and fall back on info 

342 component = ref.datasetType.component() 

343 if component is None and info.component is not None: 343 ↛ 344line 343 didn't jump to line 344, because the condition on line 343 was never true

344 component = info.component 

345 if component is None: 

346 # Use empty string since we want this to be part of the 

347 # primary key. 

348 component = NULLSTR 

349 records.append( 

350 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path, 

351 storage_class=info.storageClass.name, component=component, 

352 checksum=info.checksum, file_size=info.file_size) 

353 ) 

354 self._table.insert(*records) 

355 

356 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

357 # Docstring inherited from GenericBaseDatastore 

358 

359 # Look for the dataset_id -- there might be multiple matches 

360 # if we have disassembled the dataset. 

361 records = list(self._table.fetch(dataset_id=ref.id)) 

362 

363 results = [] 

364 for record in records: 

365 # Convert name of StorageClass to instance 

366 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

367 component = record["component"] if (record["component"] 

368 and record["component"] != NULLSTR) else None 

369 

370 info = StoredFileInfo(formatter=record["formatter"], 

371 path=record["path"], 

372 storageClass=storageClass, 

373 component=component, 

374 checksum=record["checksum"], 

375 file_size=record["file_size"]) 

376 results.append(info) 

377 

378 return results 

379 

380 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[int]: 

381 """Return all dataset refs associated with the supplied path. 

382 

383 Parameters 

384 ---------- 

385 pathInStore : `ButlerURI` 

386 Path of interest in the data store. 

387 

388 Returns 

389 ------- 

390 ids : `set` of `int` 

391 All `DatasetRef` IDs associated with this path. 

392 """ 

393 records = list(self._table.fetch(path=str(pathInStore))) 

394 ids = {r["dataset_id"] for r in records} 

395 return ids 

396 

397 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

398 # Docstring inherited from GenericBaseDatastore 

399 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

400 

401 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

402 r"""Find all the `Location`\ s of the requested dataset in the 

403 `Datastore` and the associated stored file information. 

404 

405 Parameters 

406 ---------- 

407 ref : `DatasetRef` 

408 Reference to the required `Dataset`. 

409 

410 Returns 

411 ------- 

412 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

413 Location of the dataset within the datastore and 

414 stored information about each file and its formatter. 

415 """ 

416 # Get the file information (this will fail if no file) 

417 records = self.getStoredItemsInfo(ref) 

418 

419 # Use the path to determine the location -- we need to take 

420 # into account absolute URIs in the datastore record 

421 locations: List[Tuple[Location, StoredFileInfo]] = [] 

422 for r in records: 

423 uriInStore = ButlerURI(r.path, forceAbsolute=False) 

424 if uriInStore.isabs(): 424 ↛ 425line 424 didn't jump to line 425, because the condition on line 424 was never true

425 location = Location(None, uriInStore) 

426 else: 

427 location = self.locationFactory.fromPath(r.path) 

428 locations.append((location, r)) 

429 return locations 

430 

431 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

432 """Check that there is only one dataset associated with the 

433 specified artifact. 

434 

435 Parameters 

436 ---------- 

437 ref : `DatasetRef` or `FakeDatasetRef` 

438 Dataset to be removed. 

439 location : `Location` 

440 The location of the artifact to be removed. 

441 

442 Returns 

443 ------- 

444 can_remove : `Bool` 

445 True if the artifact can be safely removed. 

446 """ 

447 # Can't ever delete absolute URIs. 

448 if location.pathInStore.isabs(): 448 ↛ 449line 448 didn't jump to line 449, because the condition on line 448 was never true

449 return False 

450 

451 # Get all entries associated with this path 

452 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

453 if not allRefs: 453 ↛ 454line 453 didn't jump to line 454, because the condition on line 453 was never true

454 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

455 

456 # Remove these refs from all the refs and if there is nothing left 

457 # then we can delete 

458 remainingRefs = allRefs - {ref.id} 

459 

460 if remainingRefs: 

461 return False 

462 return True 

463 

464 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, 

465 StoredFileInfo]]: 

466 """Predict the location and related file information of the requested 

467 dataset in this datastore. 

468 

469 Parameters 

470 ---------- 

471 ref : `DatasetRef` 

472 Reference to the required `Dataset`. 

473 

474 Returns 

475 ------- 

476 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

477 Expected Location of the dataset within the datastore and 

478 placeholder information about each file and its formatter. 

479 

480 Notes 

481 ----- 

482 Uses the current configuration to determine how we would expect the 

483 datastore files to have been written if we couldn't ask registry. 

484 This is safe so long as there has been no change to datastore 

485 configuration between writing the dataset and wanting to read it. 

486 Will not work for files that have been ingested without using the 

487 standard file template or default formatter. 

488 """ 

489 

490 # If we have a component ref we always need to ask the questions 

491 # of the composite. If the composite is disassembled this routine 

492 # should return all components. If the composite was not 

493 # disassembled the composite is what is stored regardless of 

494 # component request. Note that if the caller has disassembled 

495 # a composite there is no way for this guess to know that 

496 # without trying both the composite and component ref and seeing 

497 # if there is something at the component Location even without 

498 # disassembly being enabled. 

499 if ref.datasetType.isComponent(): 

500 ref = ref.makeCompositeRef() 

501 

502 # See if the ref is a composite that should be disassembled 

503 doDisassembly = self.composites.shouldBeDisassembled(ref) 

504 

505 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

506 

507 if doDisassembly: 

508 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

509 compRef = ref.makeComponentRef(component) 

510 location, formatter = self._determine_put_formatter_location(compRef) 

511 all_info.append((location, formatter, componentStorage, component)) 

512 

513 else: 

514 # Always use the composite ref if no disassembly 

515 location, formatter = self._determine_put_formatter_location(ref) 

516 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

517 

518 # Convert the list of tuples to have StoredFileInfo as second element 

519 return [(location, StoredFileInfo(formatter=formatter, 

520 path=location.pathInStore.path, 

521 storageClass=storageClass, 

522 component=component, 

523 checksum=None, 

524 file_size=-1)) 

525 for location, formatter, storageClass, component in all_info] 

526 

527 def _prepare_for_get(self, ref: DatasetRef, 

528 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

529 """Check parameters for ``get`` and obtain formatter and 

530 location. 

531 

532 Parameters 

533 ---------- 

534 ref : `DatasetRef` 

535 Reference to the required Dataset. 

536 parameters : `dict` 

537 `StorageClass`-specific parameters that specify, for example, 

538 a slice of the dataset to be loaded. 

539 

540 Returns 

541 ------- 

542 getInfo : `list` [`DatastoreFileGetInformation`] 

543 Parameters needed to retrieve each file. 

544 """ 

545 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

546 

547 # Get file metadata and internal metadata 

548 fileLocations = self._get_dataset_locations_info(ref) 

549 if not fileLocations: 

550 if not self.trustGetRequest: 

551 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

552 # Assume the dataset is where we think it should be 

553 fileLocations = self._get_expected_dataset_locations_info(ref) 

554 

555 # The storage class we want to use eventually 

556 refStorageClass = ref.datasetType.storageClass 

557 

558 if len(fileLocations) > 1: 

559 disassembled = True 

560 else: 

561 disassembled = False 

562 

563 # Is this a component request? 

564 refComponent = ref.datasetType.component() 

565 

566 fileGetInfo = [] 

567 for location, storedFileInfo in fileLocations: 

568 

569 # The storage class used to write the file 

570 writeStorageClass = storedFileInfo.storageClass 

571 

572 # If this has been disassembled we need read to match the write 

573 if disassembled: 

574 readStorageClass = writeStorageClass 

575 else: 

576 readStorageClass = refStorageClass 

577 

578 formatter = getInstanceOf(storedFileInfo.formatter, 

579 FileDescriptor(location, readStorageClass=readStorageClass, 

580 storageClass=writeStorageClass, parameters=parameters), 

581 ref.dataId) 

582 

583 formatterParams, notFormatterParams = formatter.segregateParameters() 

584 

585 # Of the remaining parameters, extract the ones supported by 

586 # this StorageClass (for components not all will be handled) 

587 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

588 

589 # The ref itself could be a component if the dataset was 

590 # disassembled by butler, or we disassembled in datastore and 

591 # components came from the datastore records 

592 component = storedFileInfo.component if storedFileInfo.component else refComponent 

593 

594 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

595 assemblerParams, formatterParams, 

596 component, readStorageClass)) 

597 

598 return fileGetInfo 

599 

600 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

601 """Check the arguments for ``put`` and obtain formatter and 

602 location. 

603 

604 Parameters 

605 ---------- 

606 inMemoryDataset : `object` 

607 The dataset to store. 

608 ref : `DatasetRef` 

609 Reference to the associated Dataset. 

610 

611 Returns 

612 ------- 

613 location : `Location` 

614 The location to write the dataset. 

615 formatter : `Formatter` 

616 The `Formatter` to use to write the dataset. 

617 

618 Raises 

619 ------ 

620 TypeError 

621 Supplied object and storage class are inconsistent. 

622 DatasetTypeNotSupportedError 

623 The associated `DatasetType` is not handled by this datastore. 

624 """ 

625 self._validate_put_parameters(inMemoryDataset, ref) 

626 return self._determine_put_formatter_location(ref) 

627 

628 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

629 """Calculate the formatter and output location to use for put. 

630 

631 Parameters 

632 ---------- 

633 ref : `DatasetRef` 

634 Reference to the associated Dataset. 

635 

636 Returns 

637 ------- 

638 location : `Location` 

639 The location to write the dataset. 

640 formatter : `Formatter` 

641 The `Formatter` to use to write the dataset. 

642 """ 

643 # Work out output file name 

644 try: 

645 template = self.templates.getTemplate(ref) 

646 except KeyError as e: 

647 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

648 

649 # Validate the template to protect against filenames from different 

650 # dataIds returning the same and causing overwrite confusion. 

651 template.validateTemplate(ref) 

652 

653 location = self.locationFactory.fromPath(template.format(ref)) 

654 

655 # Get the formatter based on the storage class 

656 storageClass = ref.datasetType.storageClass 

657 try: 

658 formatter = self.formatterFactory.getFormatter(ref, 

659 FileDescriptor(location, 

660 storageClass=storageClass), 

661 ref.dataId) 

662 except KeyError as e: 

663 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

664 f"{self.name}") from e 

665 

666 # Now that we know the formatter, update the location 

667 location = formatter.makeUpdatedLocation(location) 

668 

669 return location, formatter 

670 

671 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

672 # Docstring inherited from base class 

673 if transfer != "auto": 

674 return transfer 

675 

676 # See if the paths are within the datastore or not 

677 inside = [self._pathInStore(d.path) is not None for d in datasets] 

678 

679 if all(inside): 

680 transfer = None 

681 elif not any(inside): 681 ↛ 685line 681 didn't jump to line 685, because the condition on line 681 was never false

682 # Allow ButlerURI to use its own knowledge 

683 transfer = "auto" 

684 else: 

685 raise ValueError("Some datasets are inside the datastore and some are outside." 

686 " Please use an explicit transfer mode and not 'auto'.") 

687 

688 return transfer 

689 

690 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]: 

691 """Return path relative to datastore root 

692 

693 Parameters 

694 ---------- 

695 path : `str` or `ButlerURI` 

696 Path to dataset. Can be absolute URI. If relative assumed to 

697 be relative to the datastore. Returns path in datastore 

698 or raises an exception if the path it outside. 

699 

700 Returns 

701 ------- 

702 inStore : `str` 

703 Path relative to datastore root. Returns `None` if the file is 

704 outside the root. 

705 """ 

706 # Relative path will always be relative to datastore 

707 pathUri = ButlerURI(path, forceAbsolute=False) 

708 return pathUri.relative_to(self.root) 

709 

710 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *, 

711 transfer: Optional[str] = None) -> Union[str, ButlerURI]: 

712 """Standardize the path of a to-be-ingested file. 

713 

714 Parameters 

715 ---------- 

716 path : `str` or `ButlerURI` 

717 Path of a file to be ingested. 

718 transfer : `str`, optional 

719 How (and whether) the dataset should be added to the datastore. 

720 See `ingest` for details of transfer modes. 

721 This implementation is provided only so 

722 `NotImplementedError` can be raised if the mode is not supported; 

723 actual transfers are deferred to `_extractIngestInfo`. 

724 

725 Returns 

726 ------- 

727 path : `str` or `ButlerURI` 

728 New path in what the datastore considers standard form. If an 

729 absolute URI was given that will be returned unchanged. 

730 

731 Notes 

732 ----- 

733 Subclasses of `FileDatastore` can implement this method instead 

734 of `_prepIngest`. It should not modify the data repository or given 

735 file in any way. 

736 

737 Raises 

738 ------ 

739 NotImplementedError 

740 Raised if the datastore does not support the given transfer mode 

741 (including the case where ingest is not supported at all). 

742 FileNotFoundError 

743 Raised if one of the given files does not exist. 

744 """ 

745 if transfer not in (None, "direct") + self.root.transferModes: 745 ↛ 746line 745 didn't jump to line 746, because the condition on line 745 was never true

746 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

747 

748 # A relative URI indicates relative to datastore root 

749 srcUri = ButlerURI(path, forceAbsolute=False) 

750 if not srcUri.isabs(): 

751 srcUri = self.root.join(path) 

752 

753 if not srcUri.exists(): 

754 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

755 f"are assumed to be relative to {self.root} unless they are absolute.") 

756 

757 if transfer is None: 

758 relpath = srcUri.relative_to(self.root) 

759 if not relpath: 

760 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

761 f"within datastore ({self.root})") 

762 

763 # Return the relative path within the datastore for internal 

764 # transfer 

765 path = relpath 

766 

767 return path 

768 

769 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

770 formatter: Union[Formatter, Type[Formatter]], 

771 transfer: Optional[str] = None) -> StoredFileInfo: 

772 """Relocate (if necessary) and extract `StoredFileInfo` from a 

773 to-be-ingested file. 

774 

775 Parameters 

776 ---------- 

777 path : `str` or `ButlerURI` 

778 URI or path of a file to be ingested. 

779 ref : `DatasetRef` 

780 Reference for the dataset being ingested. Guaranteed to have 

781 ``dataset_id not None`. 

782 formatter : `type` or `Formatter` 

783 `Formatter` subclass to use for this dataset or an instance. 

784 transfer : `str`, optional 

785 How (and whether) the dataset should be added to the datastore. 

786 See `ingest` for details of transfer modes. 

787 

788 Returns 

789 ------- 

790 info : `StoredFileInfo` 

791 Internal datastore record for this file. This will be inserted by 

792 the caller; the `_extractIngestInfo` is only resposible for 

793 creating and populating the struct. 

794 

795 Raises 

796 ------ 

797 FileNotFoundError 

798 Raised if one of the given files does not exist. 

799 FileExistsError 

800 Raised if transfer is not `None` but the (internal) location the 

801 file would be moved to is already occupied. 

802 """ 

803 if self._transaction is None: 803 ↛ 804line 803 didn't jump to line 804, because the condition on line 803 was never true

804 raise RuntimeError("Ingest called without transaction enabled") 

805 

806 # Create URI of the source path, do not need to force a relative 

807 # path to absolute. 

808 srcUri = ButlerURI(path, forceAbsolute=False) 

809 

810 # Track whether we have read the size of the source yet 

811 have_sized = False 

812 

813 tgtLocation: Optional[Location] 

814 if transfer is None: 

815 # A relative path is assumed to be relative to the datastore 

816 # in this context 

817 if not srcUri.isabs(): 

818 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

819 else: 

820 # Work out the path in the datastore from an absolute URI 

821 # This is required to be within the datastore. 

822 pathInStore = srcUri.relative_to(self.root) 

823 if pathInStore is None: 823 ↛ 824line 823 didn't jump to line 824, because the condition on line 823 was never true

824 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

825 f"not within datastore {self.root}") 

826 tgtLocation = self.locationFactory.fromPath(pathInStore) 

827 elif transfer == "direct": 827 ↛ 832line 827 didn't jump to line 832, because the condition on line 827 was never true

828 # Want to store the full URI to the resource directly in 

829 # datastore. This is useful for referring to permanent archive 

830 # storage for raw data. 

831 # Trust that people know what they are doing. 

832 tgtLocation = None 

833 else: 

834 # Work out the name we want this ingested file to have 

835 # inside the datastore 

836 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

837 if not tgtLocation.uri.dirname().exists(): 

838 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

839 tgtLocation.uri.dirname().mkdir() 

840 

841 # if we are transferring from a local file to a remote location 

842 # it may be more efficient to get the size and checksum of the 

843 # local file rather than the transferred one 

844 if not srcUri.scheme or srcUri.scheme == "file": 844 ↛ 850line 844 didn't jump to line 850, because the condition on line 844 was never false

845 size = srcUri.size() 

846 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

847 have_sized = True 

848 

849 # transfer the resource to the destination 

850 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

851 

852 if tgtLocation is None: 852 ↛ 854line 852 didn't jump to line 854, because the condition on line 852 was never true

853 # This means we are using direct mode 

854 targetUri = srcUri 

855 targetPath = str(srcUri) 

856 else: 

857 targetUri = tgtLocation.uri 

858 targetPath = tgtLocation.pathInStore.path 

859 

860 # the file should exist in the datastore now 

861 if not have_sized: 

862 size = targetUri.size() 

863 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

864 

865 return StoredFileInfo(formatter=formatter, path=targetPath, 

866 storageClass=ref.datasetType.storageClass, 

867 component=ref.datasetType.component(), 

868 file_size=size, checksum=checksum) 

869 

870 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

871 # Docstring inherited from Datastore._prepIngest. 

872 filtered = [] 

873 for dataset in datasets: 

874 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

875 if not acceptable: 

876 continue 

877 else: 

878 dataset.refs = acceptable 

879 if dataset.formatter is None: 

880 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

881 else: 

882 assert isinstance(dataset.formatter, (type, str)) 

883 dataset.formatter = getClassOf(dataset.formatter) 

884 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

885 filtered.append(dataset) 

886 return _IngestPrepData(filtered) 

887 

888 @transactional 

889 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

890 # Docstring inherited from Datastore._finishIngest. 

891 refsAndInfos = [] 

892 for dataset in prepData.datasets: 

893 # Do ingest as if the first dataset ref is associated with the file 

894 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

895 transfer=transfer) 

896 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

897 self._register_datasets(refsAndInfos) 

898 

899 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

900 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

901 """Given a source URI and a DatasetRef, determine the name the 

902 dataset will have inside datastore. 

903 

904 Parameters 

905 ---------- 

906 srcUri : `ButlerURI` 

907 URI to the source dataset file. 

908 ref : `DatasetRef` 

909 Ref associated with the newly-ingested dataset artifact. This 

910 is used to determine the name within the datastore. 

911 formatter : `Formatter` or Formatter class. 

912 Formatter to use for validation. Can be a class or an instance. 

913 

914 Returns 

915 ------- 

916 location : `Location` 

917 Target location for the newly-ingested dataset. 

918 """ 

919 # Ingesting a file from outside the datastore. 

920 # This involves a new name. 

921 template = self.templates.getTemplate(ref) 

922 location = self.locationFactory.fromPath(template.format(ref)) 

923 

924 # Get the extension 

925 ext = srcUri.getExtension() 

926 

927 # Update the destination to include that extension 

928 location.updateExtension(ext) 

929 

930 # Ask the formatter to validate this extension 

931 formatter.validateExtension(location) 

932 

933 return location 

934 

935 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

936 """Write out in memory dataset to datastore. 

937 

938 Parameters 

939 ---------- 

940 inMemoryDataset : `object` 

941 Dataset to write to datastore. 

942 ref : `DatasetRef` 

943 Registry information associated with this dataset. 

944 

945 Returns 

946 ------- 

947 info : `StoredFileInfo` 

948 Information describin the artifact written to the datastore. 

949 """ 

950 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

951 uri = location.uri 

952 

953 if not uri.dirname().exists(): 

954 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

955 uri.dirname().mkdir() 

956 

957 if self._transaction is None: 957 ↛ 958line 957 didn't jump to line 958, because the condition on line 957 was never true

958 raise RuntimeError("Attempting to write artifact without transaction enabled") 

959 

960 def _removeFileExists(uri: ButlerURI) -> None: 

961 """Remove a file and do not complain if it is not there. 

962 

963 This is important since a formatter might fail before the file 

964 is written and we should not confuse people by writing spurious 

965 error messages to the log. 

966 """ 

967 try: 

968 uri.remove() 

969 except FileNotFoundError: 

970 pass 

971 

972 # Register a callback to try to delete the uploaded data if 

973 # something fails below 

974 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

975 

976 # For a local file, simply use the formatter directly 

977 if uri.isLocal: 

978 formatter.write(inMemoryDataset) 

979 log.debug("Successfully wrote python object to local file at %s", uri) 

980 else: 

981 # This is a remote URI, so first try bytes and write directly else 

982 # fallback to a temporary file 

983 try: 

984 serializedDataset = formatter.toBytes(inMemoryDataset) 

985 log.debug("Writing bytes directly to %s", uri) 

986 uri.write(serializedDataset, overwrite=True) 

987 log.debug("Successfully wrote bytes directly to %s", uri) 

988 except NotImplementedError: 

989 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile: 

990 # Need to configure the formatter to write to a different 

991 # location and that needs us to overwrite internals 

992 tmpLocation = Location(*os.path.split(tmpFile.name)) 

993 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri) 

994 with formatter._updateLocation(tmpLocation): 

995 formatter.write(inMemoryDataset) 

996 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True) 

997 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

998 

999 # URI is needed to resolve what ingest case are we dealing with 

1000 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1001 

1002 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

1003 ref: DatasetRef, isComponent: bool = False) -> Any: 

1004 """Read the artifact from datastore into in memory object. 

1005 

1006 Parameters 

1007 ---------- 

1008 getInfo : `DatastoreFileGetInformation` 

1009 Information about the artifact within the datastore. 

1010 ref : `DatasetRef` 

1011 The registry information associated with this artifact. 

1012 isComponent : `bool` 

1013 Flag to indicate if a component is being read from this artifact. 

1014 

1015 Returns 

1016 ------- 

1017 inMemoryDataset : `object` 

1018 The artifact as a python object. 

1019 """ 

1020 location = getInfo.location 

1021 uri = location.uri 

1022 log.debug("Accessing data from %s", uri) 

1023 

1024 # Cannot recalculate checksum but can compare size as a quick check 

1025 # Do not do this if the size is negative since that indicates 

1026 # we do not know. 

1027 recorded_size = getInfo.info.file_size 

1028 resource_size = uri.size() 

1029 if recorded_size >= 0 and resource_size != recorded_size: 1029 ↛ 1030line 1029 didn't jump to line 1030, because the condition on line 1029 was never true

1030 raise RuntimeError("Integrity failure in Datastore. " 

1031 f"Size of file {uri} ({resource_size}) " 

1032 f"does not match size recorded in registry of {recorded_size}") 

1033 

1034 # For the general case we have choices for how to proceed. 

1035 # 1. Always use a local file (downloading the remote resource to a 

1036 # temporary file if needed). 

1037 # 2. Use a threshold size and read into memory and use bytes. 

1038 # Use both for now with an arbitrary hand off size. 

1039 # This allows small datasets to be downloaded from remote object 

1040 # stores without requiring a temporary file. 

1041 

1042 formatter = getInfo.formatter 

1043 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1044 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1045 serializedDataset = uri.read() 

1046 log.debug("Deserializing %s from %d bytes from location %s with formatter %s", 

1047 f"component {getInfo.component}" if isComponent else "", 

1048 len(serializedDataset), uri, formatter.name()) 

1049 try: 

1050 result = formatter.fromBytes(serializedDataset, 

1051 component=getInfo.component if isComponent else None) 

1052 except Exception as e: 

1053 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1054 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1055 else: 

1056 # Read from file 

1057 with uri.as_local() as local_uri: 

1058 # Have to update the Location associated with the formatter 

1059 # because formatter.read does not allow an override. 

1060 # This could be improved. 

1061 msg = "" 

1062 newLocation = None 

1063 if uri != local_uri: 

1064 newLocation = Location(*local_uri.split()) 

1065 msg = "(via download to local file)" 

1066 

1067 log.debug("Reading %s from location %s %s with formatter %s", 

1068 f"component {getInfo.component}" if isComponent else "", 

1069 uri, msg, formatter.name()) 

1070 try: 

1071 with formatter._updateLocation(newLocation): 

1072 result = formatter.read(component=getInfo.component if isComponent else None) 

1073 except Exception as e: 

1074 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1075 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1076 

1077 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

1078 isComponent=isComponent) 

1079 

1080 def exists(self, ref: DatasetRef) -> bool: 

1081 """Check if the dataset exists in the datastore. 

1082 

1083 Parameters 

1084 ---------- 

1085 ref : `DatasetRef` 

1086 Reference to the required dataset. 

1087 

1088 Returns 

1089 ------- 

1090 exists : `bool` 

1091 `True` if the entity exists in the `Datastore`. 

1092 """ 

1093 fileLocations = self._get_dataset_locations_info(ref) 

1094 

1095 # if we are being asked to trust that registry might not be correct 

1096 # we ask for the expected locations and check them explicitly 

1097 if not fileLocations: 

1098 if not self.trustGetRequest: 

1099 return False 

1100 fileLocations = self._get_expected_dataset_locations_info(ref) 

1101 for location, _ in fileLocations: 

1102 if not self._artifact_exists(location): 

1103 return False 

1104 

1105 return True 

1106 

1107 def getURIs(self, ref: DatasetRef, 

1108 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1109 """Return URIs associated with dataset. 

1110 

1111 Parameters 

1112 ---------- 

1113 ref : `DatasetRef` 

1114 Reference to the required dataset. 

1115 predict : `bool`, optional 

1116 If the datastore does not know about the dataset, should it 

1117 return a predicted URI or not? 

1118 

1119 Returns 

1120 ------- 

1121 primary : `ButlerURI` 

1122 The URI to the primary artifact associated with this dataset. 

1123 If the dataset was disassembled within the datastore this 

1124 may be `None`. 

1125 components : `dict` 

1126 URIs to any components associated with the dataset artifact. 

1127 Can be empty if there are no components. 

1128 """ 

1129 

1130 primary: Optional[ButlerURI] = None 

1131 components: Dict[str, ButlerURI] = {} 

1132 

1133 # if this has never been written then we have to guess 

1134 if not self.exists(ref): 

1135 if not predict: 

1136 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1137 

1138 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1139 

1140 if doDisassembly: 

1141 

1142 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1143 compRef = ref.makeComponentRef(component) 

1144 compLocation, _ = self._determine_put_formatter_location(compRef) 

1145 

1146 # Add a URI fragment to indicate this is a guess 

1147 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1148 

1149 else: 

1150 

1151 location, _ = self._determine_put_formatter_location(ref) 

1152 

1153 # Add a URI fragment to indicate this is a guess 

1154 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1155 

1156 return primary, components 

1157 

1158 # If this is a ref that we have written we can get the path. 

1159 # Get file metadata and internal metadata 

1160 fileLocations = self._get_dataset_locations_info(ref) 

1161 

1162 guessing = False 

1163 if not fileLocations: 

1164 if not self.trustGetRequest: 1164 ↛ 1165line 1164 didn't jump to line 1165, because the condition on line 1164 was never true

1165 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1166 fileLocations = self._get_expected_dataset_locations_info(ref) 

1167 guessing = True 

1168 

1169 if len(fileLocations) == 1: 

1170 # No disassembly so this is the primary URI 

1171 uri = fileLocations[0][0].uri 

1172 if guessing and not uri.exists(): 1172 ↛ 1173line 1172 didn't jump to line 1173, because the condition on line 1172 was never true

1173 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1174 primary = uri 

1175 

1176 else: 

1177 for location, storedFileInfo in fileLocations: 

1178 if storedFileInfo.component is None: 1178 ↛ 1179line 1178 didn't jump to line 1179, because the condition on line 1178 was never true

1179 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1180 uri = location.uri 

1181 if guessing and not uri.exists(): 1181 ↛ 1182line 1181 didn't jump to line 1182, because the condition on line 1181 was never true

1182 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1183 components[storedFileInfo.component] = uri 

1184 

1185 return primary, components 

1186 

1187 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1188 """URI to the Dataset. 

1189 

1190 Parameters 

1191 ---------- 

1192 ref : `DatasetRef` 

1193 Reference to the required Dataset. 

1194 predict : `bool` 

1195 If `True`, allow URIs to be returned of datasets that have not 

1196 been written. 

1197 

1198 Returns 

1199 ------- 

1200 uri : `str` 

1201 URI pointing to the dataset within the datastore. If the 

1202 dataset does not exist in the datastore, and if ``predict`` is 

1203 `True`, the URI will be a prediction and will include a URI 

1204 fragment "#predicted". 

1205 If the datastore does not have entities that relate well 

1206 to the concept of a URI the returned URI will be 

1207 descriptive. The returned URI is not guaranteed to be obtainable. 

1208 

1209 Raises 

1210 ------ 

1211 FileNotFoundError 

1212 Raised if a URI has been requested for a dataset that does not 

1213 exist and guessing is not allowed. 

1214 RuntimeError 

1215 Raised if a request is made for a single URI but multiple URIs 

1216 are associated with this dataset. 

1217 

1218 Notes 

1219 ----- 

1220 When a predicted URI is requested an attempt will be made to form 

1221 a reasonable URI based on file templates and the expected formatter. 

1222 """ 

1223 primary, components = self.getURIs(ref, predict) 

1224 if primary is None or components: 1224 ↛ 1225line 1224 didn't jump to line 1225, because the condition on line 1224 was never true

1225 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

1226 "Use Dataastore.getURIs() instead.") 

1227 return primary 

1228 

1229 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1230 """Load an InMemoryDataset from the store. 

1231 

1232 Parameters 

1233 ---------- 

1234 ref : `DatasetRef` 

1235 Reference to the required Dataset. 

1236 parameters : `dict` 

1237 `StorageClass`-specific parameters that specify, for example, 

1238 a slice of the dataset to be loaded. 

1239 

1240 Returns 

1241 ------- 

1242 inMemoryDataset : `object` 

1243 Requested dataset or slice thereof as an InMemoryDataset. 

1244 

1245 Raises 

1246 ------ 

1247 FileNotFoundError 

1248 Requested dataset can not be retrieved. 

1249 TypeError 

1250 Return value from formatter has unexpected type. 

1251 ValueError 

1252 Formatter failed to process the dataset. 

1253 """ 

1254 allGetInfo = self._prepare_for_get(ref, parameters) 

1255 refComponent = ref.datasetType.component() 

1256 

1257 # Supplied storage class for the component being read 

1258 refStorageClass = ref.datasetType.storageClass 

1259 

1260 # Create mapping from component name to related info 

1261 allComponents = {i.component: i for i in allGetInfo} 

1262 

1263 # By definition the dataset is disassembled if we have more 

1264 # than one record for it. 

1265 isDisassembled = len(allGetInfo) > 1 

1266 

1267 # Look for the special case where we are disassembled but the 

1268 # component is a derived component that was not written during 

1269 # disassembly. For this scenario we need to check that the 

1270 # component requested is listed as a derived component for the 

1271 # composite storage class 

1272 isDisassembledReadOnlyComponent = False 

1273 if isDisassembled and refComponent: 

1274 # The composite storage class should be accessible through 

1275 # the component dataset type 

1276 compositeStorageClass = ref.datasetType.parentStorageClass 

1277 

1278 # In the unlikely scenario where the composite storage 

1279 # class is not known, we can only assume that this is a 

1280 # normal component. If that assumption is wrong then the 

1281 # branch below that reads a persisted component will fail 

1282 # so there is no need to complain here. 

1283 if compositeStorageClass is not None: 1283 ↛ 1286line 1283 didn't jump to line 1286, because the condition on line 1283 was never false

1284 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1285 

1286 if isDisassembled and not refComponent: 

1287 # This was a disassembled dataset spread over multiple files 

1288 # and we need to put them all back together again. 

1289 # Read into memory and then assemble 

1290 

1291 # Check that the supplied parameters are suitable for the type read 

1292 refStorageClass.validateParameters(parameters) 

1293 

1294 # We want to keep track of all the parameters that were not used 

1295 # by formatters. We assume that if any of the component formatters 

1296 # use a parameter that we do not need to apply it again in the 

1297 # assembler. 

1298 usedParams = set() 

1299 

1300 components: Dict[str, Any] = {} 

1301 for getInfo in allGetInfo: 

1302 # assemblerParams are parameters not understood by the 

1303 # associated formatter. 

1304 usedParams.update(set(getInfo.formatterParams)) 

1305 

1306 component = getInfo.component 

1307 

1308 if component is None: 1308 ↛ 1309line 1308 didn't jump to line 1309, because the condition on line 1308 was never true

1309 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1310 

1311 # We do not want the formatter to think it's reading 

1312 # a component though because it is really reading a 

1313 # standalone dataset -- always tell reader it is not a 

1314 # component. 

1315 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1316 

1317 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1318 

1319 # Any unused parameters will have to be passed to the assembler 

1320 if parameters: 

1321 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1322 else: 

1323 unusedParams = {} 

1324 

1325 # Process parameters 

1326 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1327 parameters=unusedParams) 

1328 

1329 elif isDisassembledReadOnlyComponent: 

1330 

1331 compositeStorageClass = ref.datasetType.parentStorageClass 

1332 if compositeStorageClass is None: 1332 ↛ 1333line 1332 didn't jump to line 1333, because the condition on line 1332 was never true

1333 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since" 

1334 "no composite storage class is available.") 

1335 

1336 if refComponent is None: 1336 ↛ 1338line 1336 didn't jump to line 1338, because the condition on line 1336 was never true

1337 # Mainly for mypy 

1338 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1339 

1340 # Assume that every derived component can be calculated by 

1341 # forwarding the request to a single read/write component. 

1342 # Rather than guessing which rw component is the right one by 

1343 # scanning each for a derived component of the same name, 

1344 # we ask the storage class delegate directly which one is best to 

1345 # use. 

1346 compositeDelegate = compositeStorageClass.delegate() 

1347 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1348 set(allComponents)) 

1349 

1350 # Select the relevant component 

1351 rwInfo = allComponents[forwardedComponent] 

1352 

1353 # For now assume that read parameters are validated against 

1354 # the real component and not the requested component 

1355 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1356 forwardedStorageClass.validateParameters(parameters) 

1357 

1358 # Unfortunately the FileDescriptor inside the formatter will have 

1359 # the wrong write storage class so we need to create a new one 

1360 # given the immutability constraint. 

1361 writeStorageClass = rwInfo.info.storageClass 

1362 

1363 # We may need to put some thought into parameters for read 

1364 # components but for now forward them on as is 

1365 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1366 readStorageClass=refStorageClass, 

1367 storageClass=writeStorageClass, 

1368 parameters=parameters), 

1369 ref.dataId) 

1370 

1371 # The assembler can not receive any parameter requests for a 

1372 # derived component at this time since the assembler will 

1373 # see the storage class of the derived component and those 

1374 # parameters will have to be handled by the formatter on the 

1375 # forwarded storage class. 

1376 assemblerParams: Dict[str, Any] = {} 

1377 

1378 # Need to created a new info that specifies the derived 

1379 # component and associated storage class 

1380 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1381 rwInfo.info, assemblerParams, {}, 

1382 refComponent, refStorageClass) 

1383 

1384 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1385 

1386 else: 

1387 # Single file request or component from that composite file 

1388 for lookup in (refComponent, None): 1388 ↛ 1393line 1388 didn't jump to line 1393, because the loop on line 1388 didn't complete

1389 if lookup in allComponents: 1389 ↛ 1388line 1389 didn't jump to line 1388, because the condition on line 1389 was never false

1390 getInfo = allComponents[lookup] 

1391 break 

1392 else: 

1393 raise FileNotFoundError(f"Component {refComponent} not found " 

1394 f"for ref {ref} in datastore {self.name}") 

1395 

1396 # Do not need the component itself if already disassembled 

1397 if isDisassembled: 

1398 isComponent = False 

1399 else: 

1400 isComponent = getInfo.component is not None 

1401 

1402 # For a disassembled component we can validate parametersagainst 

1403 # the component storage class directly 

1404 if isDisassembled: 

1405 refStorageClass.validateParameters(parameters) 

1406 else: 

1407 # For an assembled composite this could be a derived 

1408 # component derived from a real component. The validity 

1409 # of the parameters is not clear. For now validate against 

1410 # the composite storage class 

1411 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1412 

1413 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 

1414 

1415 @transactional 

1416 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1417 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1418 

1419 Parameters 

1420 ---------- 

1421 inMemoryDataset : `object` 

1422 The dataset to store. 

1423 ref : `DatasetRef` 

1424 Reference to the associated Dataset. 

1425 

1426 Raises 

1427 ------ 

1428 TypeError 

1429 Supplied object and storage class are inconsistent. 

1430 DatasetTypeNotSupportedError 

1431 The associated `DatasetType` is not handled by this datastore. 

1432 

1433 Notes 

1434 ----- 

1435 If the datastore is configured to reject certain dataset types it 

1436 is possible that the put will fail and raise a 

1437 `DatasetTypeNotSupportedError`. The main use case for this is to 

1438 allow `ChainedDatastore` to put to multiple datastores without 

1439 requiring that every datastore accepts the dataset. 

1440 """ 

1441 

1442 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1443 # doDisassembly = True 

1444 

1445 artifacts = [] 

1446 if doDisassembly: 

1447 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1448 for component, componentInfo in components.items(): 

1449 # Don't recurse because we want to take advantage of 

1450 # bulk insert -- need a new DatasetRef that refers to the 

1451 # same dataset_id but has the component DatasetType 

1452 # DatasetType does not refer to the types of components 

1453 # So we construct one ourselves. 

1454 compRef = ref.makeComponentRef(component) 

1455 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1456 artifacts.append((compRef, storedInfo)) 

1457 else: 

1458 # Write the entire thing out 

1459 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1460 artifacts.append((ref, storedInfo)) 

1461 

1462 self._register_datasets(artifacts) 

1463 

1464 @transactional 

1465 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None: 

1466 """Indicate to the datastore that a dataset can be removed. 

1467 

1468 Parameters 

1469 ---------- 

1470 ref : `DatasetRef` 

1471 Reference to the required Dataset. 

1472 ignore_errors : `bool` 

1473 If `True` return without error even if something went wrong. 

1474 Problems could occur if another process is simultaneously trying 

1475 to delete. 

1476 

1477 Raises 

1478 ------ 

1479 FileNotFoundError 

1480 Attempt to remove a dataset that does not exist. 

1481 """ 

1482 # Get file metadata and internal metadata 

1483 log.debug("Trashing %s in datastore %s", ref, self.name) 

1484 

1485 fileLocations = self._get_dataset_locations_info(ref) 

1486 

1487 if not fileLocations: 

1488 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1489 if ignore_errors: 

1490 log.warning(err_msg) 

1491 return 

1492 else: 

1493 raise FileNotFoundError(err_msg) 

1494 

1495 for location, storedFileInfo in fileLocations: 

1496 if not self._artifact_exists(location): 1496 ↛ 1497line 1496 didn't jump to line 1497, because the condition on line 1496 was never true

1497 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1498 f"associated artifact ({location.uri}) is missing" 

1499 if ignore_errors: 

1500 log.warning(err_msg) 

1501 return 

1502 else: 

1503 raise FileNotFoundError(err_msg) 

1504 

1505 # Mark dataset as trashed 

1506 try: 

1507 self._move_to_trash_in_registry(ref) 

1508 except Exception as e: 

1509 if ignore_errors: 

1510 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1511 f"but encountered an error: {e}") 

1512 pass 

1513 else: 

1514 raise 

1515 

1516 @transactional 

1517 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1518 """Remove all datasets from the trash. 

1519 

1520 Parameters 

1521 ---------- 

1522 ignore_errors : `bool` 

1523 If `True` return without error even if something went wrong. 

1524 Problems could occur if another process is simultaneously trying 

1525 to delete. 

1526 """ 

1527 log.debug("Emptying trash in datastore %s", self.name) 

1528 # Context manager will empty trash iff we finish it without raising. 

1529 with self.bridge.emptyTrash() as trashed: 

1530 for ref in trashed: 

1531 fileLocations = self._get_dataset_locations_info(ref) 

1532 

1533 if not fileLocations: 1533 ↛ 1534line 1533 didn't jump to line 1534, because the condition on line 1533 was never true

1534 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}" 

1535 if ignore_errors: 

1536 log.warning(err_msg) 

1537 continue 

1538 else: 

1539 raise FileNotFoundError(err_msg) 

1540 

1541 for location, _ in fileLocations: 

1542 

1543 if not self._artifact_exists(location): 1543 ↛ 1544line 1543 didn't jump to line 1544, because the condition on line 1543 was never true

1544 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}" 

1545 if ignore_errors: 

1546 log.warning(err_msg) 

1547 continue 

1548 else: 

1549 raise FileNotFoundError(err_msg) 

1550 

1551 # Can only delete the artifact if there are no references 

1552 # to the file from untrashed dataset refs. 

1553 if self._can_remove_dataset_artifact(ref, location): 

1554 # Point of no return for this artifact 

1555 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1556 try: 

1557 self._delete_artifact(location) 

1558 except Exception as e: 

1559 if ignore_errors: 

1560 log.critical("Encountered error removing artifact %s from datastore %s: %s", 

1561 location.uri, self.name, e) 

1562 else: 

1563 raise 

1564 

1565 # Now must remove the entry from the internal registry even if 

1566 # the artifact removal failed and was ignored, 

1567 # otherwise the removal check above will never be true 

1568 try: 

1569 # There may be multiple rows associated with this ref 

1570 # depending on disassembly 

1571 self.removeStoredItemInfo(ref) 

1572 except Exception as e: 

1573 if ignore_errors: 

1574 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s", 

1575 ref.id, location.uri, self.name, e) 

1576 continue 

1577 else: 

1578 raise FileNotFoundError( 

1579 f"Error removing dataset {ref.id} ({location.uri}) from internal registry " 

1580 f"of {self.name}" 

1581 ) from e 

1582 

1583 @transactional 

1584 def forget(self, refs: Iterable[DatasetRef]) -> None: 

1585 # Docstring inherited. 

1586 refs = list(refs) 

1587 self.bridge.forget(refs) 

1588 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

1589 

1590 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1591 logFailures: bool = False) -> None: 

1592 """Validate some of the configuration for this datastore. 

1593 

1594 Parameters 

1595 ---------- 

1596 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1597 Entities to test against this configuration. Can be differing 

1598 types. 

1599 logFailures : `bool`, optional 

1600 If `True`, output a log message for every validation error 

1601 detected. 

1602 

1603 Raises 

1604 ------ 

1605 DatastoreValidationError 

1606 Raised if there is a validation problem with a configuration. 

1607 All the problems are reported in a single exception. 

1608 

1609 Notes 

1610 ----- 

1611 This method checks that all the supplied entities have valid file 

1612 templates and also have formatters defined. 

1613 """ 

1614 

1615 templateFailed = None 

1616 try: 

1617 self.templates.validateTemplates(entities, logFailures=logFailures) 

1618 except FileTemplateValidationError as e: 

1619 templateFailed = str(e) 

1620 

1621 formatterFailed = [] 

1622 for entity in entities: 

1623 try: 

1624 self.formatterFactory.getFormatterClass(entity) 

1625 except KeyError as e: 

1626 formatterFailed.append(str(e)) 

1627 if logFailures: 1627 ↛ 1622line 1627 didn't jump to line 1622, because the condition on line 1627 was never false

1628 log.critical("Formatter failure: %s", e) 

1629 

1630 if templateFailed or formatterFailed: 

1631 messages = [] 

1632 if templateFailed: 1632 ↛ 1633line 1632 didn't jump to line 1633, because the condition on line 1632 was never true

1633 messages.append(templateFailed) 

1634 if formatterFailed: 1634 ↛ 1636line 1634 didn't jump to line 1636, because the condition on line 1634 was never false

1635 messages.append(",".join(formatterFailed)) 

1636 msg = ";\n".join(messages) 

1637 raise DatastoreValidationError(msg) 

1638 

1639 def getLookupKeys(self) -> Set[LookupKey]: 

1640 # Docstring is inherited from base class 

1641 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1642 self.constraints.getLookupKeys() 

1643 

1644 def validateKey(self, lookupKey: LookupKey, 

1645 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1646 # Docstring is inherited from base class 

1647 # The key can be valid in either formatters or templates so we can 

1648 # only check the template if it exists 

1649 if lookupKey in self.templates: 

1650 try: 

1651 self.templates[lookupKey].validateTemplate(entity) 

1652 except FileTemplateValidationError as e: 

1653 raise DatastoreValidationError(e) from e 

1654 

1655 def export(self, refs: Iterable[DatasetRef], *, 

1656 directory: Optional[Union[ButlerURI, str]] = None, 

1657 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

1658 # Docstring inherited from Datastore.export. 

1659 if transfer is not None and directory is None: 1659 ↛ 1660line 1659 didn't jump to line 1660, because the condition on line 1659 was never true

1660 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

1661 "export directory given") 

1662 

1663 # Force the directory to be a URI object 

1664 directoryUri: Optional[ButlerURI] = None 

1665 if directory is not None: 1665 ↛ 1668line 1665 didn't jump to line 1668, because the condition on line 1665 was never false

1666 directoryUri = ButlerURI(directory, forceDirectory=True) 

1667 

1668 if transfer is not None and directoryUri is not None: 1668 ↛ 1673line 1668 didn't jump to line 1673, because the condition on line 1668 was never false

1669 # mypy needs the second test 

1670 if not directoryUri.exists(): 1670 ↛ 1671line 1670 didn't jump to line 1671, because the condition on line 1670 was never true

1671 raise FileNotFoundError(f"Export location {directory} does not exist") 

1672 

1673 for ref in refs: 

1674 fileLocations = self._get_dataset_locations_info(ref) 

1675 if not fileLocations: 1675 ↛ 1676line 1675 didn't jump to line 1676, because the condition on line 1675 was never true

1676 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

1677 # For now we can not export disassembled datasets 

1678 if len(fileLocations) > 1: 

1679 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

1680 location, storedFileInfo = fileLocations[0] 

1681 

1682 pathInStore = location.pathInStore.path 

1683 if transfer is None: 1683 ↛ 1686line 1683 didn't jump to line 1686, because the condition on line 1683 was never true

1684 # TODO: do we also need to return the readStorageClass somehow? 

1685 # We will use the path in store directly 

1686 pass 

1687 elif transfer == "direct": 1687 ↛ 1689line 1687 didn't jump to line 1689, because the condition on line 1687 was never true

1688 # Use full URIs to the remote store in the export 

1689 pathInStore = str(location.uri) 

1690 else: 

1691 # mypy needs help 

1692 assert directoryUri is not None, "directoryUri must be defined to get here" 

1693 storeUri = ButlerURI(location.uri) 

1694 

1695 # if the datastore has an absolute URI to a resource, we 

1696 # have two options: 

1697 # 1. Keep the absolute URI in the exported YAML 

1698 # 2. Allocate a new name in the local datastore and transfer 

1699 # it. 

1700 # For now go with option 2 

1701 if location.pathInStore.isabs(): 1701 ↛ 1702line 1701 didn't jump to line 1702, because the condition on line 1701 was never true

1702 template = self.templates.getTemplate(ref) 

1703 pathInStore = template.format(ref) 

1704 

1705 exportUri = directoryUri.join(pathInStore) 

1706 exportUri.transfer_from(storeUri, transfer=transfer) 

1707 

1708 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

1709 

1710 @staticmethod 

1711 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

1712 """Compute the checksum of the supplied file. 

1713 

1714 Parameters 

1715 ---------- 

1716 uri : `ButlerURI` 

1717 Name of resource to calculate checksum from. 

1718 algorithm : `str`, optional 

1719 Name of algorithm to use. Must be one of the algorithms supported 

1720 by :py:class`hashlib`. 

1721 block_size : `int` 

1722 Number of bytes to read from file at one time. 

1723 

1724 Returns 

1725 ------- 

1726 hexdigest : `str` 

1727 Hex digest of the file. 

1728 

1729 Notes 

1730 ----- 

1731 Currently returns None if the URI is for a remote resource. 

1732 """ 

1733 if algorithm not in hashlib.algorithms_guaranteed: 1733 ↛ 1734line 1733 didn't jump to line 1734, because the condition on line 1733 was never true

1734 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

1735 

1736 if not uri.isLocal: 1736 ↛ 1737line 1736 didn't jump to line 1737, because the condition on line 1736 was never true

1737 return None 

1738 

1739 hasher = hashlib.new(algorithm) 

1740 

1741 with uri.as_local() as local_uri: 

1742 with open(local_uri.ospath, "rb") as f: 

1743 for chunk in iter(lambda: f.read(block_size), b""): 

1744 hasher.update(chunk) 

1745 

1746 return hasher.hexdigest()