Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore", ) 

26 

27import hashlib 

28import logging 

29import os 

30import tempfile 

31 

32from sqlalchemy import BigInteger, String 

33 

34from dataclasses import dataclass 

35from typing import ( 

36 TYPE_CHECKING, 

37 Any, 

38 ClassVar, 

39 Dict, 

40 Iterable, 

41 List, 

42 Mapping, 

43 Optional, 

44 Set, 

45 Tuple, 

46 Type, 

47 Union, 

48) 

49 

50from lsst.daf.butler import ( 

51 ButlerURI, 

52 CompositesMap, 

53 Config, 

54 FileDataset, 

55 DatasetRef, 

56 DatasetType, 

57 DatasetTypeNotSupportedError, 

58 Datastore, 

59 DatastoreConfig, 

60 DatastoreValidationError, 

61 FileDescriptor, 

62 FileTemplates, 

63 FileTemplateValidationError, 

64 Formatter, 

65 FormatterFactory, 

66 Location, 

67 LocationFactory, 

68 StorageClass, 

69 StoredFileInfo, 

70) 

71 

72from lsst.daf.butler import ddl 

73from lsst.daf.butler.registry.interfaces import ( 

74 ReadOnlyDatabaseError, 

75 DatastoreRegistryBridge, 

76) 

77 

78from lsst.daf.butler.core.repoRelocation import replaceRoot 

79from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

80from .genericDatastore import GenericBaseDatastore 

81 

82if TYPE_CHECKING: 82 ↛ 83line 82 didn't jump to line 83, because the condition on line 82 was never true

83 from lsst.daf.butler import LookupKey 

84 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

85 

86log = logging.getLogger(__name__) 

87 

88# String to use when a Python None is encountered 

89NULLSTR = "__NULL_STRING__" 

90 

91 

92class _IngestPrepData(Datastore.IngestPrepData): 

93 """Helper class for FileDatastore ingest implementation. 

94 

95 Parameters 

96 ---------- 

97 datasets : `list` of `FileDataset` 

98 Files to be ingested by this datastore. 

99 """ 

100 def __init__(self, datasets: List[FileDataset]): 

101 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

102 self.datasets = datasets 

103 

104 

105@dataclass(frozen=True) 

106class DatastoreFileGetInformation: 

107 """Collection of useful parameters needed to retrieve a file from 

108 a Datastore. 

109 """ 

110 

111 location: Location 

112 """The location from which to read the dataset.""" 

113 

114 formatter: Formatter 

115 """The `Formatter` to use to deserialize the dataset.""" 

116 

117 info: StoredFileInfo 

118 """Stored information about this file and its formatter.""" 

119 

120 assemblerParams: Dict[str, Any] 

121 """Parameters to use for post-processing the retrieved dataset.""" 

122 

123 formatterParams: Dict[str, Any] 

124 """Parameters that were understood by the associated formatter.""" 

125 

126 component: Optional[str] 

127 """The component to be retrieved (can be `None`).""" 

128 

129 readStorageClass: StorageClass 

130 """The `StorageClass` of the dataset being read.""" 

131 

132 

133class FileDatastore(GenericBaseDatastore): 

134 """Generic Datastore for file-based implementations. 

135 

136 Should always be sub-classed since key abstract methods are missing. 

137 

138 Parameters 

139 ---------- 

140 config : `DatastoreConfig` or `str` 

141 Configuration as either a `Config` object or URI to file. 

142 bridgeManager : `DatastoreRegistryBridgeManager` 

143 Object that manages the interface between `Registry` and datastores. 

144 butlerRoot : `str`, optional 

145 New datastore root to use to override the configuration value. 

146 

147 Raises 

148 ------ 

149 ValueError 

150 If root location does not exist and ``create`` is `False` in the 

151 configuration. 

152 """ 

153 

154 defaultConfigFile: ClassVar[Optional[str]] = None 

155 """Path to configuration defaults. Accessed within the ``config`` resource 

156 or relative to a search path. Can be None if no defaults specified. 

157 """ 

158 

159 root: ButlerURI 

160 """Root directory URI of this `Datastore`.""" 

161 

162 locationFactory: LocationFactory 

163 """Factory for creating locations relative to the datastore root.""" 

164 

165 formatterFactory: FormatterFactory 

166 """Factory for creating instances of formatters.""" 

167 

168 templates: FileTemplates 

169 """File templates that can be used by this `Datastore`.""" 

170 

171 composites: CompositesMap 

172 """Determines whether a dataset should be disassembled on put.""" 

173 

174 defaultConfigFile = "datastores/fileDatastore.yaml" 

175 """Path to configuration defaults. Accessed within the ``config`` resource 

176 or relative to a search path. Can be None if no defaults specified. 

177 """ 

178 

179 @classmethod 

180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

181 """Set any filesystem-dependent config options for this Datastore to 

182 be appropriate for a new empty repository with the given root. 

183 

184 Parameters 

185 ---------- 

186 root : `str` 

187 URI to the root of the data repository. 

188 config : `Config` 

189 A `Config` to update. Only the subset understood by 

190 this component will be updated. Will not expand 

191 defaults. 

192 full : `Config` 

193 A complete config with all defaults expanded that can be 

194 converted to a `DatastoreConfig`. Read-only and will not be 

195 modified by this method. 

196 Repository-specific options that should not be obtained 

197 from defaults when Butler instances are constructed 

198 should be copied from ``full`` to ``config``. 

199 overwrite : `bool`, optional 

200 If `False`, do not modify a value in ``config`` if the value 

201 already exists. Default is always to overwrite with the provided 

202 ``root``. 

203 

204 Notes 

205 ----- 

206 If a keyword is explicitly defined in the supplied ``config`` it 

207 will not be overridden by this method if ``overwrite`` is `False`. 

208 This allows explicit values set in external configs to be retained. 

209 """ 

210 Config.updateParameters(DatastoreConfig, config, full, 

211 toUpdate={"root": root}, 

212 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

213 

214 @classmethod 

215 def makeTableSpec(cls) -> ddl.TableSpec: 

216 return ddl.TableSpec( 

217 fields=[ 

218 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True), 

219 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

220 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

221 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

222 # Use empty string to indicate no component 

223 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

224 # TODO: should checksum be Base64Bytes instead? 

225 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

226 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

227 ], 

228 unique=frozenset(), 

229 ) 

230 

231 def __init__(self, config: Union[DatastoreConfig, str], 

232 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

233 super().__init__(config, bridgeManager) 

234 if "root" not in self.config: 234 ↛ 235line 234 didn't jump to line 235, because the condition on line 234 was never true

235 raise ValueError("No root directory specified in configuration") 

236 

237 # Name ourselves either using an explicit name or a name 

238 # derived from the (unexpanded) root 

239 if "name" in self.config: 

240 self.name = self.config["name"] 

241 else: 

242 # We use the unexpanded root in the name to indicate that this 

243 # datastore can be moved without having to update registry. 

244 self.name = "{}@{}".format(type(self).__name__, 

245 self.config["root"]) 

246 

247 # Support repository relocation in config 

248 # Existence of self.root is checked in subclass 

249 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

250 forceDirectory=True, forceAbsolute=True) 

251 

252 self.locationFactory = LocationFactory(self.root) 

253 self.formatterFactory = FormatterFactory() 

254 

255 # Now associate formatters with storage classes 

256 self.formatterFactory.registerFormatters(self.config["formatters"], 

257 universe=bridgeManager.universe) 

258 

259 # Read the file naming templates 

260 self.templates = FileTemplates(self.config["templates"], 

261 universe=bridgeManager.universe) 

262 

263 # See if composites should be disassembled 

264 self.composites = CompositesMap(self.config["composites"], 

265 universe=bridgeManager.universe) 

266 

267 tableName = self.config["records", "table"] 

268 try: 

269 # Storage of paths and formatters, keyed by dataset_id 

270 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec()) 

271 # Interface to Registry. 

272 self._bridge = bridgeManager.register(self.name) 

273 except ReadOnlyDatabaseError: 

274 # If the database is read only and we just tried and failed to 

275 # create a table, it means someone is trying to create a read-only 

276 # butler client for an empty repo. That should be okay, as long 

277 # as they then try to get any datasets before some other client 

278 # creates the table. Chances are they'rejust validating 

279 # configuration. 

280 pass 

281 

282 # Determine whether checksums should be used - default to False 

283 self.useChecksum = self.config.get("checksum", False) 

284 

285 # Determine whether we can fall back to configuration if a 

286 # requested dataset is not known to registry 

287 self.trustGetRequest = self.config.get("trust_get_request", False) 

288 

289 # Check existence and create directory structure if necessary 

290 if not self.root.exists(): 

291 if "create" not in self.config or not self.config["create"]: 291 ↛ 292line 291 didn't jump to line 292, because the condition on line 291 was never true

292 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

293 try: 

294 self.root.mkdir() 

295 except Exception as e: 

296 raise ValueError(f"Can not create datastore root '{self.root}', check permissions." 

297 f" Got error: {e}") from e 

298 

299 def __str__(self) -> str: 

300 return str(self.root) 

301 

302 @property 

303 def bridge(self) -> DatastoreRegistryBridge: 

304 return self._bridge 

305 

306 def _artifact_exists(self, location: Location) -> bool: 

307 """Check that an artifact exists in this datastore at the specified 

308 location. 

309 

310 Parameters 

311 ---------- 

312 location : `Location` 

313 Expected location of the artifact associated with this datastore. 

314 

315 Returns 

316 ------- 

317 exists : `bool` 

318 True if the location can be found, false otherwise. 

319 """ 

320 log.debug("Checking if resource exists: %s", location.uri) 

321 return location.uri.exists() 

322 

323 def _delete_artifact(self, location: Location) -> None: 

324 """Delete the artifact from the datastore. 

325 

326 Parameters 

327 ---------- 

328 location : `Location` 

329 Location of the artifact associated with this datastore. 

330 """ 

331 log.debug("Deleting file: %s", location.uri) 

332 location.uri.remove() 

333 log.debug("Successfully deleted file: %s", location.uri) 

334 

335 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

336 # Docstring inherited from GenericBaseDatastore 

337 records = [] 

338 for ref, info in zip(refs, infos): 

339 # Component should come from ref and fall back on info 

340 component = ref.datasetType.component() 

341 if component is None and info.component is not None: 341 ↛ 342line 341 didn't jump to line 342, because the condition on line 341 was never true

342 component = info.component 

343 if component is None: 

344 # Use empty string since we want this to be part of the 

345 # primary key. 

346 component = NULLSTR 

347 records.append( 

348 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path, 

349 storage_class=info.storageClass.name, component=component, 

350 checksum=info.checksum, file_size=info.file_size) 

351 ) 

352 self._table.insert(*records) 

353 

354 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

355 # Docstring inherited from GenericBaseDatastore 

356 

357 # Look for the dataset_id -- there might be multiple matches 

358 # if we have disassembled the dataset. 

359 records = list(self._table.fetch(dataset_id=ref.id)) 

360 

361 results = [] 

362 for record in records: 

363 # Convert name of StorageClass to instance 

364 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

365 component = record["component"] if (record["component"] 

366 and record["component"] != NULLSTR) else None 

367 

368 info = StoredFileInfo(formatter=record["formatter"], 

369 path=record["path"], 

370 storageClass=storageClass, 

371 component=component, 

372 checksum=record["checksum"], 

373 file_size=record["file_size"]) 

374 results.append(info) 

375 

376 return results 

377 

378 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[int]: 

379 """Return all dataset refs associated with the supplied path. 

380 

381 Parameters 

382 ---------- 

383 pathInStore : `ButlerURI` 

384 Path of interest in the data store. 

385 

386 Returns 

387 ------- 

388 ids : `set` of `int` 

389 All `DatasetRef` IDs associated with this path. 

390 """ 

391 records = list(self._table.fetch(path=str(pathInStore))) 

392 ids = {r["dataset_id"] for r in records} 

393 return ids 

394 

395 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

396 # Docstring inherited from GenericBaseDatastore 

397 self._table.delete(dataset_id=ref.id) 

398 

399 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

400 r"""Find all the `Location`\ s of the requested dataset in the 

401 `Datastore` and the associated stored file information. 

402 

403 Parameters 

404 ---------- 

405 ref : `DatasetRef` 

406 Reference to the required `Dataset`. 

407 

408 Returns 

409 ------- 

410 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

411 Location of the dataset within the datastore and 

412 stored information about each file and its formatter. 

413 """ 

414 # Get the file information (this will fail if no file) 

415 records = self.getStoredItemsInfo(ref) 

416 

417 # Use the path to determine the location -- we need to take 

418 # into account absolute URIs in the datastore record 

419 locations: List[Tuple[Location, StoredFileInfo]] = [] 

420 for r in records: 

421 uriInStore = ButlerURI(r.path, forceAbsolute=False) 

422 if uriInStore.isabs(): 422 ↛ 423line 422 didn't jump to line 423, because the condition on line 422 was never true

423 location = Location(None, uriInStore) 

424 else: 

425 location = self.locationFactory.fromPath(r.path) 

426 locations.append((location, r)) 

427 return locations 

428 

429 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

430 """Check that there is only one dataset associated with the 

431 specified artifact. 

432 

433 Parameters 

434 ---------- 

435 ref : `DatasetRef` or `FakeDatasetRef` 

436 Dataset to be removed. 

437 location : `Location` 

438 The location of the artifact to be removed. 

439 

440 Returns 

441 ------- 

442 can_remove : `Bool` 

443 True if the artifact can be safely removed. 

444 """ 

445 

446 # Get all entries associated with this path 

447 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

448 if not allRefs: 448 ↛ 449line 448 didn't jump to line 449, because the condition on line 448 was never true

449 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

450 

451 # Remove these refs from all the refs and if there is nothing left 

452 # then we can delete 

453 remainingRefs = allRefs - {ref.id} 

454 

455 if remainingRefs: 

456 return False 

457 return True 

458 

459 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, 

460 StoredFileInfo]]: 

461 """Predict the location and related file information of the requested 

462 dataset in this datastore. 

463 

464 Parameters 

465 ---------- 

466 ref : `DatasetRef` 

467 Reference to the required `Dataset`. 

468 

469 Returns 

470 ------- 

471 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

472 Expected Location of the dataset within the datastore and 

473 placeholder information about each file and its formatter. 

474 

475 Notes 

476 ----- 

477 Uses the current configuration to determine how we would expect the 

478 datastore files to have been written if we couldn't ask registry. 

479 This is safe so long as there has been no change to datastore 

480 configuration between writing the dataset and wanting to read it. 

481 Will not work for files that have been ingested without using the 

482 standard file template or default formatter. 

483 """ 

484 

485 # If we have a component ref we always need to ask the questions 

486 # of the composite. If the composite is disassembled this routine 

487 # should return all components. If the composite was not 

488 # disassembled the composite is what is stored regardless of 

489 # component request. Note that if the caller has disassembled 

490 # a composite there is no way for this guess to know that 

491 # without trying both the composite and component ref and seeing 

492 # if there is something at the component Location even without 

493 # disassembly being enabled. 

494 if ref.datasetType.isComponent(): 

495 ref = ref.makeCompositeRef() 

496 

497 # See if the ref is a composite that should be disassembled 

498 doDisassembly = self.composites.shouldBeDisassembled(ref) 

499 

500 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

501 

502 if doDisassembly: 

503 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

504 compRef = ref.makeComponentRef(component) 

505 location, formatter = self._determine_put_formatter_location(compRef) 

506 all_info.append((location, formatter, componentStorage, component)) 

507 

508 else: 

509 # Always use the composite ref if no disassembly 

510 location, formatter = self._determine_put_formatter_location(ref) 

511 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

512 

513 # Convert the list of tuples to have StoredFileInfo as second element 

514 return [(location, StoredFileInfo(formatter=formatter, 

515 path=location.pathInStore.path, 

516 storageClass=storageClass, 

517 component=component, 

518 checksum=None, 

519 file_size=-1)) 

520 for location, formatter, storageClass, component in all_info] 

521 

522 def _prepare_for_get(self, ref: DatasetRef, 

523 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

524 """Check parameters for ``get`` and obtain formatter and 

525 location. 

526 

527 Parameters 

528 ---------- 

529 ref : `DatasetRef` 

530 Reference to the required Dataset. 

531 parameters : `dict` 

532 `StorageClass`-specific parameters that specify, for example, 

533 a slice of the dataset to be loaded. 

534 

535 Returns 

536 ------- 

537 getInfo : `list` [`DatastoreFileGetInformation`] 

538 Parameters needed to retrieve each file. 

539 """ 

540 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

541 

542 # Get file metadata and internal metadata 

543 fileLocations = self._get_dataset_locations_info(ref) 

544 if not fileLocations: 

545 if not self.trustGetRequest: 

546 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

547 # Assume the dataset is where we think it should be 

548 fileLocations = self._get_expected_dataset_locations_info(ref) 

549 

550 # The storage class we want to use eventually 

551 refStorageClass = ref.datasetType.storageClass 

552 

553 if len(fileLocations) > 1: 

554 disassembled = True 

555 else: 

556 disassembled = False 

557 

558 # Is this a component request? 

559 refComponent = ref.datasetType.component() 

560 

561 fileGetInfo = [] 

562 for location, storedFileInfo in fileLocations: 

563 

564 # The storage class used to write the file 

565 writeStorageClass = storedFileInfo.storageClass 

566 

567 # If this has been disassembled we need read to match the write 

568 if disassembled: 

569 readStorageClass = writeStorageClass 

570 else: 

571 readStorageClass = refStorageClass 

572 

573 formatter = getInstanceOf(storedFileInfo.formatter, 

574 FileDescriptor(location, readStorageClass=readStorageClass, 

575 storageClass=writeStorageClass, parameters=parameters), 

576 ref.dataId) 

577 

578 formatterParams, notFormatterParams = formatter.segregateParameters() 

579 

580 # Of the remaining parameters, extract the ones supported by 

581 # this StorageClass (for components not all will be handled) 

582 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

583 

584 # The ref itself could be a component if the dataset was 

585 # disassembled by butler, or we disassembled in datastore and 

586 # components came from the datastore records 

587 component = storedFileInfo.component if storedFileInfo.component else refComponent 

588 

589 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

590 assemblerParams, formatterParams, 

591 component, readStorageClass)) 

592 

593 return fileGetInfo 

594 

595 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

596 """Check the arguments for ``put`` and obtain formatter and 

597 location. 

598 

599 Parameters 

600 ---------- 

601 inMemoryDataset : `object` 

602 The dataset to store. 

603 ref : `DatasetRef` 

604 Reference to the associated Dataset. 

605 

606 Returns 

607 ------- 

608 location : `Location` 

609 The location to write the dataset. 

610 formatter : `Formatter` 

611 The `Formatter` to use to write the dataset. 

612 

613 Raises 

614 ------ 

615 TypeError 

616 Supplied object and storage class are inconsistent. 

617 DatasetTypeNotSupportedError 

618 The associated `DatasetType` is not handled by this datastore. 

619 """ 

620 self._validate_put_parameters(inMemoryDataset, ref) 

621 return self._determine_put_formatter_location(ref) 

622 

623 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

624 """Calculate the formatter and output location to use for put. 

625 

626 Parameters 

627 ---------- 

628 ref : `DatasetRef` 

629 Reference to the associated Dataset. 

630 

631 Returns 

632 ------- 

633 location : `Location` 

634 The location to write the dataset. 

635 formatter : `Formatter` 

636 The `Formatter` to use to write the dataset. 

637 """ 

638 # Work out output file name 

639 try: 

640 template = self.templates.getTemplate(ref) 

641 except KeyError as e: 

642 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

643 

644 # Validate the template to protect against filenames from different 

645 # dataIds returning the same and causing overwrite confusion. 

646 template.validateTemplate(ref) 

647 

648 location = self.locationFactory.fromPath(template.format(ref)) 

649 

650 # Get the formatter based on the storage class 

651 storageClass = ref.datasetType.storageClass 

652 try: 

653 formatter = self.formatterFactory.getFormatter(ref, 

654 FileDescriptor(location, 

655 storageClass=storageClass), 

656 ref.dataId) 

657 except KeyError as e: 

658 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

659 f"{self.name}") from e 

660 

661 # Now that we know the formatter, update the location 

662 location = formatter.makeUpdatedLocation(location) 

663 

664 return location, formatter 

665 

666 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

667 # Docstring inherited from base class 

668 if transfer != "auto": 

669 return transfer 

670 

671 # See if the paths are within the datastore or not 

672 inside = [self._pathInStore(d.path) is not None for d in datasets] 

673 

674 if all(inside): 

675 transfer = None 

676 elif not any(inside): 676 ↛ 680line 676 didn't jump to line 680, because the condition on line 676 was never false

677 # Allow ButlerURI to use its own knowledge 

678 transfer = "auto" 

679 else: 

680 raise ValueError("Some datasets are inside the datastore and some are outside." 

681 " Please use an explicit transfer mode and not 'auto'.") 

682 

683 return transfer 

684 

685 def _pathInStore(self, path: str) -> Optional[str]: 

686 """Return path relative to datastore root 

687 

688 Parameters 

689 ---------- 

690 path : `str` 

691 Path to dataset. Can be absolute. If relative assumed to 

692 be relative to the datastore. Returns path in datastore 

693 or raises an exception if the path it outside. 

694 

695 Returns 

696 ------- 

697 inStore : `str` 

698 Path relative to datastore root. Returns `None` if the file is 

699 outside the root. 

700 """ 

701 # Relative path will always be relative to datastore 

702 pathUri = ButlerURI(path, forceAbsolute=False) 

703 return pathUri.relative_to(self.root) 

704 

705 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str: 

706 """Standardize the path of a to-be-ingested file. 

707 

708 Parameters 

709 ---------- 

710 path : `str` 

711 Path of a file to be ingested. 

712 transfer : `str`, optional 

713 How (and whether) the dataset should be added to the datastore. 

714 See `ingest` for details of transfer modes. 

715 This implementation is provided only so 

716 `NotImplementedError` can be raised if the mode is not supported; 

717 actual transfers are deferred to `_extractIngestInfo`. 

718 

719 Returns 

720 ------- 

721 path : `str` 

722 New path in what the datastore considers standard form. 

723 

724 Notes 

725 ----- 

726 Subclasses of `FileDatastore` can implement this method instead 

727 of `_prepIngest`. It should not modify the data repository or given 

728 file in any way. 

729 

730 Raises 

731 ------ 

732 NotImplementedError 

733 Raised if the datastore does not support the given transfer mode 

734 (including the case where ingest is not supported at all). 

735 FileNotFoundError 

736 Raised if one of the given files does not exist. 

737 """ 

738 if transfer not in (None, "direct") + self.root.transferModes: 738 ↛ 739line 738 didn't jump to line 739, because the condition on line 738 was never true

739 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

740 

741 # A relative URI indicates relative to datastore root 

742 srcUri = ButlerURI(path, forceAbsolute=False) 

743 if not srcUri.isabs(): 

744 srcUri = self.root.join(path) 

745 

746 if not srcUri.exists(): 

747 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

748 f"are assumed to be relative to {self.root} unless they are absolute.") 

749 

750 if transfer is None: 

751 relpath = srcUri.relative_to(self.root) 

752 if not relpath: 

753 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

754 f"within datastore ({self.root})") 

755 

756 # Return the relative path within the datastore for internal 

757 # transfer 

758 path = relpath 

759 

760 return path 

761 

762 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

763 formatter: Union[Formatter, Type[Formatter]], 

764 transfer: Optional[str] = None) -> StoredFileInfo: 

765 """Relocate (if necessary) and extract `StoredFileInfo` from a 

766 to-be-ingested file. 

767 

768 Parameters 

769 ---------- 

770 path : `str` or `ButlerURI` 

771 URI or path of a file to be ingested. 

772 ref : `DatasetRef` 

773 Reference for the dataset being ingested. Guaranteed to have 

774 ``dataset_id not None`. 

775 formatter : `type` or `Formatter` 

776 `Formatter` subclass to use for this dataset or an instance. 

777 transfer : `str`, optional 

778 How (and whether) the dataset should be added to the datastore. 

779 See `ingest` for details of transfer modes. 

780 

781 Returns 

782 ------- 

783 info : `StoredFileInfo` 

784 Internal datastore record for this file. This will be inserted by 

785 the caller; the `_extractIngestInfo` is only resposible for 

786 creating and populating the struct. 

787 

788 Raises 

789 ------ 

790 FileNotFoundError 

791 Raised if one of the given files does not exist. 

792 FileExistsError 

793 Raised if transfer is not `None` but the (internal) location the 

794 file would be moved to is already occupied. 

795 """ 

796 if self._transaction is None: 796 ↛ 797line 796 didn't jump to line 797, because the condition on line 796 was never true

797 raise RuntimeError("Ingest called without transaction enabled") 

798 

799 # Create URI of the source path, do not need to force a relative 

800 # path to absolute. 

801 srcUri = ButlerURI(path, forceAbsolute=False) 

802 

803 # Track whether we have read the size of the source yet 

804 have_sized = False 

805 

806 tgtLocation: Optional[Location] 

807 if transfer is None: 

808 # A relative path is assumed to be relative to the datastore 

809 # in this context 

810 if not srcUri.isabs(): 

811 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

812 else: 

813 # Work out the path in the datastore from an absolute URI 

814 # This is required to be within the datastore. 

815 pathInStore = srcUri.relative_to(self.root) 

816 if pathInStore is None: 816 ↛ 817line 816 didn't jump to line 817, because the condition on line 816 was never true

817 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

818 f"not within datastore {self.root}") 

819 tgtLocation = self.locationFactory.fromPath(pathInStore) 

820 elif transfer == "direct": 820 ↛ 825line 820 didn't jump to line 825, because the condition on line 820 was never true

821 # Want to store the full URI to the resource directly in 

822 # datastore. This is useful for referring to permanent archive 

823 # storage for raw data. 

824 # Trust that people know what they are doing. 

825 tgtLocation = None 

826 else: 

827 # Work out the name we want this ingested file to have 

828 # inside the datastore 

829 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

830 if not tgtLocation.uri.dirname().exists(): 

831 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

832 tgtLocation.uri.dirname().mkdir() 

833 

834 # if we are transferring from a local file to a remote location 

835 # it may be more efficient to get the size and checksum of the 

836 # local file rather than the transferred one 

837 if not srcUri.scheme or srcUri.scheme == "file": 837 ↛ 843line 837 didn't jump to line 843, because the condition on line 837 was never false

838 size = srcUri.size() 

839 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

840 have_sized = True 

841 

842 # transfer the resource to the destination 

843 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

844 

845 if tgtLocation is None: 845 ↛ 847line 845 didn't jump to line 847, because the condition on line 845 was never true

846 # This means we are using direct mode 

847 targetUri = srcUri 

848 targetPath = str(srcUri) 

849 else: 

850 targetUri = tgtLocation.uri 

851 targetPath = tgtLocation.pathInStore.path 

852 

853 # the file should exist in the datastore now 

854 if not have_sized: 

855 size = targetUri.size() 

856 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

857 

858 return StoredFileInfo(formatter=formatter, path=targetPath, 

859 storageClass=ref.datasetType.storageClass, 

860 component=ref.datasetType.component(), 

861 file_size=size, checksum=checksum) 

862 

863 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

864 # Docstring inherited from Datastore._prepIngest. 

865 filtered = [] 

866 for dataset in datasets: 

867 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

868 if not acceptable: 

869 continue 

870 else: 

871 dataset.refs = acceptable 

872 if dataset.formatter is None: 

873 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

874 else: 

875 assert isinstance(dataset.formatter, (type, str)) 

876 dataset.formatter = getClassOf(dataset.formatter) 

877 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

878 filtered.append(dataset) 

879 return _IngestPrepData(filtered) 

880 

881 @transactional 

882 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

883 # Docstring inherited from Datastore._finishIngest. 

884 refsAndInfos = [] 

885 for dataset in prepData.datasets: 

886 # Do ingest as if the first dataset ref is associated with the file 

887 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

888 transfer=transfer) 

889 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

890 self._register_datasets(refsAndInfos) 

891 

892 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

893 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

894 """Given a source URI and a DatasetRef, determine the name the 

895 dataset will have inside datastore. 

896 

897 Parameters 

898 ---------- 

899 srcUri : `ButlerURI` 

900 URI to the source dataset file. 

901 ref : `DatasetRef` 

902 Ref associated with the newly-ingested dataset artifact. This 

903 is used to determine the name within the datastore. 

904 formatter : `Formatter` or Formatter class. 

905 Formatter to use for validation. Can be a class or an instance. 

906 

907 Returns 

908 ------- 

909 location : `Location` 

910 Target location for the newly-ingested dataset. 

911 """ 

912 # Ingesting a file from outside the datastore. 

913 # This involves a new name. 

914 template = self.templates.getTemplate(ref) 

915 location = self.locationFactory.fromPath(template.format(ref)) 

916 

917 # Get the extension 

918 ext = srcUri.getExtension() 

919 

920 # Update the destination to include that extension 

921 location.updateExtension(ext) 

922 

923 # Ask the formatter to validate this extension 

924 formatter.validateExtension(location) 

925 

926 return location 

927 

928 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

929 """Write out in memory dataset to datastore. 

930 

931 Parameters 

932 ---------- 

933 inMemoryDataset : `object` 

934 Dataset to write to datastore. 

935 ref : `DatasetRef` 

936 Registry information associated with this dataset. 

937 

938 Returns 

939 ------- 

940 info : `StoredFileInfo` 

941 Information describin the artifact written to the datastore. 

942 """ 

943 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

944 uri = location.uri 

945 

946 if not uri.dirname().exists(): 

947 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

948 uri.dirname().mkdir() 

949 

950 if self._transaction is None: 950 ↛ 951line 950 didn't jump to line 951, because the condition on line 950 was never true

951 raise RuntimeError("Attempting to write artifact without transaction enabled") 

952 

953 def _removeFileExists(uri: ButlerURI) -> None: 

954 """Remove a file and do not complain if it is not there. 

955 

956 This is important since a formatter might fail before the file 

957 is written and we should not confuse people by writing spurious 

958 error messages to the log. 

959 """ 

960 try: 

961 uri.remove() 

962 except FileNotFoundError: 

963 pass 

964 

965 # Register a callback to try to delete the uploaded data if 

966 # something fails below 

967 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

968 

969 # For a local file, simply use the formatter directly 

970 if uri.isLocal: 

971 formatter.write(inMemoryDataset) 

972 log.debug("Successfully wrote python object to local file at %s", uri) 

973 else: 

974 # This is a remote URI, so first try bytes and write directly else 

975 # fallback to a temporary file 

976 try: 

977 serializedDataset = formatter.toBytes(inMemoryDataset) 

978 log.debug("Writing bytes directly to %s", uri) 

979 uri.write(serializedDataset, overwrite=True) 

980 log.debug("Successfully wrote bytes directly to %s", uri) 

981 except NotImplementedError: 

982 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile: 

983 # Need to configure the formatter to write to a different 

984 # location and that needs us to overwrite internals 

985 tmpLocation = Location(*os.path.split(tmpFile.name)) 

986 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri) 

987 with formatter._updateLocation(tmpLocation): 

988 formatter.write(inMemoryDataset) 

989 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True) 

990 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

991 

992 # URI is needed to resolve what ingest case are we dealing with 

993 return self._extractIngestInfo(uri, ref, formatter=formatter) 

994 

995 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

996 ref: DatasetRef, isComponent: bool = False) -> Any: 

997 """Read the artifact from datastore into in memory object. 

998 

999 Parameters 

1000 ---------- 

1001 getInfo : `DatastoreFileGetInformation` 

1002 Information about the artifact within the datastore. 

1003 ref : `DatasetRef` 

1004 The registry information associated with this artifact. 

1005 isComponent : `bool` 

1006 Flag to indicate if a component is being read from this artifact. 

1007 

1008 Returns 

1009 ------- 

1010 inMemoryDataset : `object` 

1011 The artifact as a python object. 

1012 """ 

1013 location = getInfo.location 

1014 uri = location.uri 

1015 log.debug("Accessing data from %s", uri) 

1016 

1017 # Cannot recalculate checksum but can compare size as a quick check 

1018 # Do not do this if the size is negative since that indicates 

1019 # we do not know. 

1020 recorded_size = getInfo.info.file_size 

1021 resource_size = uri.size() 

1022 if recorded_size >= 0 and resource_size != recorded_size: 1022 ↛ 1023line 1022 didn't jump to line 1023, because the condition on line 1022 was never true

1023 raise RuntimeError("Integrity failure in Datastore. " 

1024 f"Size of file {uri} ({resource_size}) " 

1025 f"does not match size recorded in registry of {recorded_size}") 

1026 

1027 # For the general case we have choices for how to proceed. 

1028 # 1. Always use a local file (downloading the remote resource to a 

1029 # temporary file if needed). 

1030 # 2. Use a threshold size and read into memory and use bytes. 

1031 # Use both for now with an arbitrary hand off size. 

1032 # This allows small datasets to be downloaded from remote object 

1033 # stores without requiring a temporary file. 

1034 

1035 formatter = getInfo.formatter 

1036 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1037 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1038 serializedDataset = uri.read() 

1039 log.debug("Deserializing %s from %d bytes from location %s with formatter %s", 

1040 f"component {getInfo.component}" if isComponent else "", 

1041 len(serializedDataset), uri, formatter.name()) 

1042 try: 

1043 result = formatter.fromBytes(serializedDataset, 

1044 component=getInfo.component if isComponent else None) 

1045 except Exception as e: 

1046 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1047 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1048 else: 

1049 # Read from file 

1050 with uri.as_local() as local_uri: 

1051 # Have to update the Location associated with the formatter 

1052 # because formatter.read does not allow an override. 

1053 # This could be improved. 

1054 msg = "" 

1055 newLocation = None 

1056 if uri != local_uri: 

1057 newLocation = Location(*local_uri.split()) 

1058 msg = "(via download to local file)" 

1059 

1060 log.debug("Reading %s from location %s %s with formatter %s", 

1061 f"component {getInfo.component}" if isComponent else "", 

1062 uri, msg, formatter.name()) 

1063 try: 

1064 with formatter._updateLocation(newLocation): 

1065 result = formatter.read(component=getInfo.component if isComponent else None) 

1066 except Exception as e: 

1067 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1068 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1069 

1070 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

1071 isComponent=isComponent) 

1072 

1073 def exists(self, ref: DatasetRef) -> bool: 

1074 """Check if the dataset exists in the datastore. 

1075 

1076 Parameters 

1077 ---------- 

1078 ref : `DatasetRef` 

1079 Reference to the required dataset. 

1080 

1081 Returns 

1082 ------- 

1083 exists : `bool` 

1084 `True` if the entity exists in the `Datastore`. 

1085 """ 

1086 fileLocations = self._get_dataset_locations_info(ref) 

1087 

1088 # if we are being asked to trust that registry might not be correct 

1089 # we ask for the expected locations and check them explicitly 

1090 if not fileLocations: 

1091 if not self.trustGetRequest: 

1092 return False 

1093 fileLocations = self._get_expected_dataset_locations_info(ref) 

1094 for location, _ in fileLocations: 

1095 if not self._artifact_exists(location): 

1096 return False 

1097 

1098 return True 

1099 

1100 def getURIs(self, ref: DatasetRef, 

1101 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1102 """Return URIs associated with dataset. 

1103 

1104 Parameters 

1105 ---------- 

1106 ref : `DatasetRef` 

1107 Reference to the required dataset. 

1108 predict : `bool`, optional 

1109 If the datastore does not know about the dataset, should it 

1110 return a predicted URI or not? 

1111 

1112 Returns 

1113 ------- 

1114 primary : `ButlerURI` 

1115 The URI to the primary artifact associated with this dataset. 

1116 If the dataset was disassembled within the datastore this 

1117 may be `None`. 

1118 components : `dict` 

1119 URIs to any components associated with the dataset artifact. 

1120 Can be empty if there are no components. 

1121 """ 

1122 

1123 primary: Optional[ButlerURI] = None 

1124 components: Dict[str, ButlerURI] = {} 

1125 

1126 # if this has never been written then we have to guess 

1127 if not self.exists(ref): 

1128 if not predict: 

1129 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1130 

1131 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1132 

1133 if doDisassembly: 

1134 

1135 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1136 compRef = ref.makeComponentRef(component) 

1137 compLocation, _ = self._determine_put_formatter_location(compRef) 

1138 

1139 # Add a URI fragment to indicate this is a guess 

1140 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1141 

1142 else: 

1143 

1144 location, _ = self._determine_put_formatter_location(ref) 

1145 

1146 # Add a URI fragment to indicate this is a guess 

1147 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1148 

1149 return primary, components 

1150 

1151 # If this is a ref that we have written we can get the path. 

1152 # Get file metadata and internal metadata 

1153 fileLocations = self._get_dataset_locations_info(ref) 

1154 

1155 guessing = False 

1156 if not fileLocations: 

1157 if not self.trustGetRequest: 1157 ↛ 1158line 1157 didn't jump to line 1158, because the condition on line 1157 was never true

1158 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1159 fileLocations = self._get_expected_dataset_locations_info(ref) 

1160 guessing = True 

1161 

1162 if len(fileLocations) == 1: 

1163 # No disassembly so this is the primary URI 

1164 uri = fileLocations[0][0].uri 

1165 if guessing and not uri.exists(): 1165 ↛ 1166line 1165 didn't jump to line 1166, because the condition on line 1165 was never true

1166 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1167 primary = uri 

1168 

1169 else: 

1170 for location, storedFileInfo in fileLocations: 

1171 if storedFileInfo.component is None: 1171 ↛ 1172line 1171 didn't jump to line 1172, because the condition on line 1171 was never true

1172 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1173 uri = location.uri 

1174 if guessing and not uri.exists(): 1174 ↛ 1175line 1174 didn't jump to line 1175, because the condition on line 1174 was never true

1175 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1176 components[storedFileInfo.component] = uri 

1177 

1178 return primary, components 

1179 

1180 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1181 """URI to the Dataset. 

1182 

1183 Parameters 

1184 ---------- 

1185 ref : `DatasetRef` 

1186 Reference to the required Dataset. 

1187 predict : `bool` 

1188 If `True`, allow URIs to be returned of datasets that have not 

1189 been written. 

1190 

1191 Returns 

1192 ------- 

1193 uri : `str` 

1194 URI pointing to the dataset within the datastore. If the 

1195 dataset does not exist in the datastore, and if ``predict`` is 

1196 `True`, the URI will be a prediction and will include a URI 

1197 fragment "#predicted". 

1198 If the datastore does not have entities that relate well 

1199 to the concept of a URI the returned URI will be 

1200 descriptive. The returned URI is not guaranteed to be obtainable. 

1201 

1202 Raises 

1203 ------ 

1204 FileNotFoundError 

1205 Raised if a URI has been requested for a dataset that does not 

1206 exist and guessing is not allowed. 

1207 RuntimeError 

1208 Raised if a request is made for a single URI but multiple URIs 

1209 are associated with this dataset. 

1210 

1211 Notes 

1212 ----- 

1213 When a predicted URI is requested an attempt will be made to form 

1214 a reasonable URI based on file templates and the expected formatter. 

1215 """ 

1216 primary, components = self.getURIs(ref, predict) 

1217 if primary is None or components: 1217 ↛ 1218line 1217 didn't jump to line 1218, because the condition on line 1217 was never true

1218 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

1219 "Use Dataastore.getURIs() instead.") 

1220 return primary 

1221 

1222 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1223 """Load an InMemoryDataset from the store. 

1224 

1225 Parameters 

1226 ---------- 

1227 ref : `DatasetRef` 

1228 Reference to the required Dataset. 

1229 parameters : `dict` 

1230 `StorageClass`-specific parameters that specify, for example, 

1231 a slice of the dataset to be loaded. 

1232 

1233 Returns 

1234 ------- 

1235 inMemoryDataset : `object` 

1236 Requested dataset or slice thereof as an InMemoryDataset. 

1237 

1238 Raises 

1239 ------ 

1240 FileNotFoundError 

1241 Requested dataset can not be retrieved. 

1242 TypeError 

1243 Return value from formatter has unexpected type. 

1244 ValueError 

1245 Formatter failed to process the dataset. 

1246 """ 

1247 allGetInfo = self._prepare_for_get(ref, parameters) 

1248 refComponent = ref.datasetType.component() 

1249 

1250 # Supplied storage class for the component being read 

1251 refStorageClass = ref.datasetType.storageClass 

1252 

1253 # Create mapping from component name to related info 

1254 allComponents = {i.component: i for i in allGetInfo} 

1255 

1256 # By definition the dataset is disassembled if we have more 

1257 # than one record for it. 

1258 isDisassembled = len(allGetInfo) > 1 

1259 

1260 # Look for the special case where we are disassembled but the 

1261 # component is a derived component that was not written during 

1262 # disassembly. For this scenario we need to check that the 

1263 # component requested is listed as a derived component for the 

1264 # composite storage class 

1265 isDisassembledReadOnlyComponent = False 

1266 if isDisassembled and refComponent: 

1267 # The composite storage class should be accessible through 

1268 # the component dataset type 

1269 compositeStorageClass = ref.datasetType.parentStorageClass 

1270 

1271 # In the unlikely scenario where the composite storage 

1272 # class is not known, we can only assume that this is a 

1273 # normal component. If that assumption is wrong then the 

1274 # branch below that reads a persisted component will fail 

1275 # so there is no need to complain here. 

1276 if compositeStorageClass is not None: 1276 ↛ 1279line 1276 didn't jump to line 1279, because the condition on line 1276 was never false

1277 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1278 

1279 if isDisassembled and not refComponent: 

1280 # This was a disassembled dataset spread over multiple files 

1281 # and we need to put them all back together again. 

1282 # Read into memory and then assemble 

1283 

1284 # Check that the supplied parameters are suitable for the type read 

1285 refStorageClass.validateParameters(parameters) 

1286 

1287 # We want to keep track of all the parameters that were not used 

1288 # by formatters. We assume that if any of the component formatters 

1289 # use a parameter that we do not need to apply it again in the 

1290 # assembler. 

1291 usedParams = set() 

1292 

1293 components: Dict[str, Any] = {} 

1294 for getInfo in allGetInfo: 

1295 # assemblerParams are parameters not understood by the 

1296 # associated formatter. 

1297 usedParams.update(set(getInfo.formatterParams)) 

1298 

1299 component = getInfo.component 

1300 

1301 if component is None: 1301 ↛ 1302line 1301 didn't jump to line 1302, because the condition on line 1301 was never true

1302 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1303 

1304 # We do not want the formatter to think it's reading 

1305 # a component though because it is really reading a 

1306 # standalone dataset -- always tell reader it is not a 

1307 # component. 

1308 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1309 

1310 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1311 

1312 # Any unused parameters will have to be passed to the assembler 

1313 if parameters: 

1314 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1315 else: 

1316 unusedParams = {} 

1317 

1318 # Process parameters 

1319 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1320 parameters=unusedParams) 

1321 

1322 elif isDisassembledReadOnlyComponent: 

1323 

1324 compositeStorageClass = ref.datasetType.parentStorageClass 

1325 if compositeStorageClass is None: 1325 ↛ 1326line 1325 didn't jump to line 1326, because the condition on line 1325 was never true

1326 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since" 

1327 "no composite storage class is available.") 

1328 

1329 if refComponent is None: 1329 ↛ 1331line 1329 didn't jump to line 1331, because the condition on line 1329 was never true

1330 # Mainly for mypy 

1331 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1332 

1333 # Assume that every derived component can be calculated by 

1334 # forwarding the request to a single read/write component. 

1335 # Rather than guessing which rw component is the right one by 

1336 # scanning each for a derived component of the same name, 

1337 # we ask the storage class delegate directly which one is best to 

1338 # use. 

1339 compositeDelegate = compositeStorageClass.delegate() 

1340 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1341 set(allComponents)) 

1342 

1343 # Select the relevant component 

1344 rwInfo = allComponents[forwardedComponent] 

1345 

1346 # For now assume that read parameters are validated against 

1347 # the real component and not the requested component 

1348 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1349 forwardedStorageClass.validateParameters(parameters) 

1350 

1351 # Unfortunately the FileDescriptor inside the formatter will have 

1352 # the wrong write storage class so we need to create a new one 

1353 # given the immutability constraint. 

1354 writeStorageClass = rwInfo.info.storageClass 

1355 

1356 # We may need to put some thought into parameters for read 

1357 # components but for now forward them on as is 

1358 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1359 readStorageClass=refStorageClass, 

1360 storageClass=writeStorageClass, 

1361 parameters=parameters), 

1362 ref.dataId) 

1363 

1364 # The assembler can not receive any parameter requests for a 

1365 # derived component at this time since the assembler will 

1366 # see the storage class of the derived component and those 

1367 # parameters will have to be handled by the formatter on the 

1368 # forwarded storage class. 

1369 assemblerParams: Dict[str, Any] = {} 

1370 

1371 # Need to created a new info that specifies the derived 

1372 # component and associated storage class 

1373 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1374 rwInfo.info, assemblerParams, {}, 

1375 refComponent, refStorageClass) 

1376 

1377 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1378 

1379 else: 

1380 # Single file request or component from that composite file 

1381 for lookup in (refComponent, None): 1381 ↛ 1386line 1381 didn't jump to line 1386, because the loop on line 1381 didn't complete

1382 if lookup in allComponents: 1382 ↛ 1381line 1382 didn't jump to line 1381, because the condition on line 1382 was never false

1383 getInfo = allComponents[lookup] 

1384 break 

1385 else: 

1386 raise FileNotFoundError(f"Component {refComponent} not found " 

1387 f"for ref {ref} in datastore {self.name}") 

1388 

1389 # Do not need the component itself if already disassembled 

1390 if isDisassembled: 

1391 isComponent = False 

1392 else: 

1393 isComponent = getInfo.component is not None 

1394 

1395 # For a disassembled component we can validate parametersagainst 

1396 # the component storage class directly 

1397 if isDisassembled: 

1398 refStorageClass.validateParameters(parameters) 

1399 else: 

1400 # For an assembled composite this could be a derived 

1401 # component derived from a real component. The validity 

1402 # of the parameters is not clear. For now validate against 

1403 # the composite storage class 

1404 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1405 

1406 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 

1407 

1408 @transactional 

1409 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1410 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1411 

1412 Parameters 

1413 ---------- 

1414 inMemoryDataset : `object` 

1415 The dataset to store. 

1416 ref : `DatasetRef` 

1417 Reference to the associated Dataset. 

1418 

1419 Raises 

1420 ------ 

1421 TypeError 

1422 Supplied object and storage class are inconsistent. 

1423 DatasetTypeNotSupportedError 

1424 The associated `DatasetType` is not handled by this datastore. 

1425 

1426 Notes 

1427 ----- 

1428 If the datastore is configured to reject certain dataset types it 

1429 is possible that the put will fail and raise a 

1430 `DatasetTypeNotSupportedError`. The main use case for this is to 

1431 allow `ChainedDatastore` to put to multiple datastores without 

1432 requiring that every datastore accepts the dataset. 

1433 """ 

1434 

1435 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1436 # doDisassembly = True 

1437 

1438 artifacts = [] 

1439 if doDisassembly: 

1440 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1441 for component, componentInfo in components.items(): 

1442 # Don't recurse because we want to take advantage of 

1443 # bulk insert -- need a new DatasetRef that refers to the 

1444 # same dataset_id but has the component DatasetType 

1445 # DatasetType does not refer to the types of components 

1446 # So we construct one ourselves. 

1447 compRef = ref.makeComponentRef(component) 

1448 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1449 artifacts.append((compRef, storedInfo)) 

1450 else: 

1451 # Write the entire thing out 

1452 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1453 artifacts.append((ref, storedInfo)) 

1454 

1455 self._register_datasets(artifacts) 

1456 

1457 @transactional 

1458 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None: 

1459 """Indicate to the datastore that a dataset can be removed. 

1460 

1461 Parameters 

1462 ---------- 

1463 ref : `DatasetRef` 

1464 Reference to the required Dataset. 

1465 ignore_errors : `bool` 

1466 If `True` return without error even if something went wrong. 

1467 Problems could occur if another process is simultaneously trying 

1468 to delete. 

1469 

1470 Raises 

1471 ------ 

1472 FileNotFoundError 

1473 Attempt to remove a dataset that does not exist. 

1474 """ 

1475 # Get file metadata and internal metadata 

1476 log.debug("Trashing %s in datastore %s", ref, self.name) 

1477 

1478 fileLocations = self._get_dataset_locations_info(ref) 

1479 

1480 if not fileLocations: 

1481 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1482 if ignore_errors: 

1483 log.warning(err_msg) 

1484 return 

1485 else: 

1486 raise FileNotFoundError(err_msg) 

1487 

1488 for location, storedFileInfo in fileLocations: 

1489 if not self._artifact_exists(location): 1489 ↛ 1490line 1489 didn't jump to line 1490, because the condition on line 1489 was never true

1490 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1491 f"associated artifact ({location.uri}) is missing" 

1492 if ignore_errors: 

1493 log.warning(err_msg) 

1494 return 

1495 else: 

1496 raise FileNotFoundError(err_msg) 

1497 

1498 # Mark dataset as trashed 

1499 try: 

1500 self._move_to_trash_in_registry(ref) 

1501 except Exception as e: 

1502 if ignore_errors: 

1503 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1504 f"but encountered an error: {e}") 

1505 pass 

1506 else: 

1507 raise 

1508 

1509 @transactional 

1510 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1511 """Remove all datasets from the trash. 

1512 

1513 Parameters 

1514 ---------- 

1515 ignore_errors : `bool` 

1516 If `True` return without error even if something went wrong. 

1517 Problems could occur if another process is simultaneously trying 

1518 to delete. 

1519 """ 

1520 log.debug("Emptying trash in datastore %s", self.name) 

1521 # Context manager will empty trash iff we finish it without raising. 

1522 with self.bridge.emptyTrash() as trashed: 

1523 for ref in trashed: 

1524 fileLocations = self._get_dataset_locations_info(ref) 

1525 

1526 if not fileLocations: 1526 ↛ 1527line 1526 didn't jump to line 1527, because the condition on line 1526 was never true

1527 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}" 

1528 if ignore_errors: 

1529 log.warning(err_msg) 

1530 continue 

1531 else: 

1532 raise FileNotFoundError(err_msg) 

1533 

1534 for location, _ in fileLocations: 

1535 

1536 if not self._artifact_exists(location): 1536 ↛ 1537line 1536 didn't jump to line 1537, because the condition on line 1536 was never true

1537 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}" 

1538 if ignore_errors: 

1539 log.warning(err_msg) 

1540 continue 

1541 else: 

1542 raise FileNotFoundError(err_msg) 

1543 

1544 # Can only delete the artifact if there are no references 

1545 # to the file from untrashed dataset refs. 

1546 if self._can_remove_dataset_artifact(ref, location): 

1547 # Point of no return for this artifact 

1548 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1549 try: 

1550 self._delete_artifact(location) 

1551 except Exception as e: 

1552 if ignore_errors: 

1553 log.critical("Encountered error removing artifact %s from datastore %s: %s", 

1554 location.uri, self.name, e) 

1555 else: 

1556 raise 

1557 

1558 # Now must remove the entry from the internal registry even if 

1559 # the artifact removal failed and was ignored, 

1560 # otherwise the removal check above will never be true 

1561 try: 

1562 # There may be multiple rows associated with this ref 

1563 # depending on disassembly 

1564 self.removeStoredItemInfo(ref) 

1565 except Exception as e: 

1566 if ignore_errors: 

1567 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s", 

1568 ref.id, location.uri, self.name, e) 

1569 continue 

1570 else: 

1571 raise FileNotFoundError(err_msg) 

1572 

1573 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1574 logFailures: bool = False) -> None: 

1575 """Validate some of the configuration for this datastore. 

1576 

1577 Parameters 

1578 ---------- 

1579 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1580 Entities to test against this configuration. Can be differing 

1581 types. 

1582 logFailures : `bool`, optional 

1583 If `True`, output a log message for every validation error 

1584 detected. 

1585 

1586 Raises 

1587 ------ 

1588 DatastoreValidationError 

1589 Raised if there is a validation problem with a configuration. 

1590 All the problems are reported in a single exception. 

1591 

1592 Notes 

1593 ----- 

1594 This method checks that all the supplied entities have valid file 

1595 templates and also have formatters defined. 

1596 """ 

1597 

1598 templateFailed = None 

1599 try: 

1600 self.templates.validateTemplates(entities, logFailures=logFailures) 

1601 except FileTemplateValidationError as e: 

1602 templateFailed = str(e) 

1603 

1604 formatterFailed = [] 

1605 for entity in entities: 

1606 try: 

1607 self.formatterFactory.getFormatterClass(entity) 

1608 except KeyError as e: 

1609 formatterFailed.append(str(e)) 

1610 if logFailures: 1610 ↛ 1605line 1610 didn't jump to line 1605, because the condition on line 1610 was never false

1611 log.critical("Formatter failure: %s", e) 

1612 

1613 if templateFailed or formatterFailed: 

1614 messages = [] 

1615 if templateFailed: 1615 ↛ 1616line 1615 didn't jump to line 1616, because the condition on line 1615 was never true

1616 messages.append(templateFailed) 

1617 if formatterFailed: 1617 ↛ 1619line 1617 didn't jump to line 1619, because the condition on line 1617 was never false

1618 messages.append(",".join(formatterFailed)) 

1619 msg = ";\n".join(messages) 

1620 raise DatastoreValidationError(msg) 

1621 

1622 def getLookupKeys(self) -> Set[LookupKey]: 

1623 # Docstring is inherited from base class 

1624 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1625 self.constraints.getLookupKeys() 

1626 

1627 def validateKey(self, lookupKey: LookupKey, 

1628 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1629 # Docstring is inherited from base class 

1630 # The key can be valid in either formatters or templates so we can 

1631 # only check the template if it exists 

1632 if lookupKey in self.templates: 

1633 try: 

1634 self.templates[lookupKey].validateTemplate(entity) 

1635 except FileTemplateValidationError as e: 

1636 raise DatastoreValidationError(e) from e 

1637 

1638 def export(self, refs: Iterable[DatasetRef], *, 

1639 directory: Optional[Union[ButlerURI, str]] = None, 

1640 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

1641 # Docstring inherited from Datastore.export. 

1642 if transfer is not None and directory is None: 1642 ↛ 1643line 1642 didn't jump to line 1643, because the condition on line 1642 was never true

1643 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

1644 "export directory given") 

1645 

1646 # Force the directory to be a URI object 

1647 directoryUri: Optional[ButlerURI] = None 

1648 if directory is not None: 1648 ↛ 1651line 1648 didn't jump to line 1651, because the condition on line 1648 was never false

1649 directoryUri = ButlerURI(directory, forceDirectory=True) 

1650 

1651 if transfer is not None and directoryUri is not None: 1651 ↛ 1656line 1651 didn't jump to line 1656, because the condition on line 1651 was never false

1652 # mypy needs the second test 

1653 if not directoryUri.exists(): 1653 ↛ 1654line 1653 didn't jump to line 1654, because the condition on line 1653 was never true

1654 raise FileNotFoundError(f"Export location {directory} does not exist") 

1655 

1656 for ref in refs: 

1657 fileLocations = self._get_dataset_locations_info(ref) 

1658 if not fileLocations: 1658 ↛ 1659line 1658 didn't jump to line 1659, because the condition on line 1658 was never true

1659 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

1660 # For now we can not export disassembled datasets 

1661 if len(fileLocations) > 1: 

1662 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

1663 location, storedFileInfo = fileLocations[0] 

1664 

1665 pathInStore = location.pathInStore.path 

1666 if transfer is None: 1666 ↛ 1669line 1666 didn't jump to line 1669, because the condition on line 1666 was never true

1667 # TODO: do we also need to return the readStorageClass somehow? 

1668 # We will use the path in store directly 

1669 pass 

1670 elif transfer == "direct": 1670 ↛ 1672line 1670 didn't jump to line 1672, because the condition on line 1670 was never true

1671 # Use full URIs to the remote store in the export 

1672 pathInStore = str(location.uri) 

1673 else: 

1674 # mypy needs help 

1675 assert directoryUri is not None, "directoryUri must be defined to get here" 

1676 storeUri = ButlerURI(location.uri) 

1677 

1678 # if the datastore has an absolute URI to a resource, we 

1679 # have two options: 

1680 # 1. Keep the absolute URI in the exported YAML 

1681 # 2. Allocate a new name in the local datastore and transfer 

1682 # it. 

1683 # For now go with option 2 

1684 if location.pathInStore.isabs(): 1684 ↛ 1685line 1684 didn't jump to line 1685, because the condition on line 1684 was never true

1685 template = self.templates.getTemplate(ref) 

1686 pathInStore = template.format(ref) 

1687 

1688 exportUri = directoryUri.join(pathInStore) 

1689 exportUri.transfer_from(storeUri, transfer=transfer) 

1690 

1691 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

1692 

1693 @staticmethod 

1694 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

1695 """Compute the checksum of the supplied file. 

1696 

1697 Parameters 

1698 ---------- 

1699 uri : `ButlerURI` 

1700 Name of resource to calculate checksum from. 

1701 algorithm : `str`, optional 

1702 Name of algorithm to use. Must be one of the algorithms supported 

1703 by :py:class`hashlib`. 

1704 block_size : `int` 

1705 Number of bytes to read from file at one time. 

1706 

1707 Returns 

1708 ------- 

1709 hexdigest : `str` 

1710 Hex digest of the file. 

1711 

1712 Notes 

1713 ----- 

1714 Currently returns None if the URI is for a remote resource. 

1715 """ 

1716 if algorithm not in hashlib.algorithms_guaranteed: 1716 ↛ 1717line 1716 didn't jump to line 1717, because the condition on line 1716 was never true

1717 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

1718 

1719 if not uri.isLocal: 1719 ↛ 1720line 1719 didn't jump to line 1720, because the condition on line 1719 was never true

1720 return None 

1721 

1722 hasher = hashlib.new(algorithm) 

1723 

1724 with uri.as_local() as local_uri: 

1725 with open(local_uri.ospath, "rb") as f: 

1726 for chunk in iter(lambda: f.read(block_size), b""): 

1727 hasher.update(chunk) 

1728 

1729 return hasher.hexdigest()