
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore", ) 

26 

27import hashlib 

28import logging 

29import os 

30import tempfile 

31 

32from sqlalchemy import BigInteger, String 

33 

34from dataclasses import dataclass 

35from typing import ( 

36 TYPE_CHECKING, 

37 Any, 

38 ClassVar, 

39 Dict, 

40 Iterable, 

41 List, 

42 Mapping, 

43 Optional, 

44 Set, 

45 Tuple, 

46 Type, 

47 Union, 

48) 

49 

50from lsst.daf.butler import ( 

51 ButlerURI, 

52 CompositesMap, 

53 Config, 

54 FileDataset, 

55 DatasetRef, 

56 DatasetType, 

57 DatasetTypeNotSupportedError, 

58 Datastore, 

59 DatastoreConfig, 

60 DatastoreValidationError, 

61 FileDescriptor, 

62 FileTemplates, 

63 FileTemplateValidationError, 

64 Formatter, 

65 FormatterFactory, 

66 Location, 

67 LocationFactory, 

68 StorageClass, 

69 StoredFileInfo, 

70) 

71 

72from lsst.daf.butler import ddl 

73from lsst.daf.butler.registry.interfaces import ( 

74 ReadOnlyDatabaseError, 

75 DatastoreRegistryBridge, 

76) 

77 

78from lsst.daf.butler.core.repoRelocation import replaceRoot 

79from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

80from .genericDatastore import GenericBaseDatastore 

81 

82if TYPE_CHECKING: 

83 from lsst.daf.butler import LookupKey 

84 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

85 

86log = logging.getLogger(__name__) 

87 

88# String to use when a Python None is encountered 

89NULLSTR = "__NULL_STRING__" 

90 

91 

92class _IngestPrepData(Datastore.IngestPrepData): 

93 """Helper class for FileDatastore ingest implementation. 

94 

95 Parameters 

96 ---------- 

97 datasets : `list` of `FileDataset` 

98 Files to be ingested by this datastore. 

99 """ 

100 def __init__(self, datasets: List[FileDataset]): 

101 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

102 self.datasets = datasets 

103 

104 

105@dataclass(frozen=True) 

106class DatastoreFileGetInformation: 

107 """Collection of useful parameters needed to retrieve a file from 

108 a Datastore. 

109 """ 

110 

111 location: Location 

112 """The location from which to read the dataset.""" 

113 

114 formatter: Formatter 

115 """The `Formatter` to use to deserialize the dataset.""" 

116 

117 info: StoredFileInfo 

118 """Stored information about this file and its formatter.""" 

119 

120 assemblerParams: Dict[str, Any] 

121 """Parameters to use for post-processing the retrieved dataset.""" 

122 

123 formatterParams: Dict[str, Any] 

124 """Parameters that were understood by the associated formatter.""" 

125 

126 component: Optional[str] 

127 """The component to be retrieved (can be `None`).""" 

128 

129 readStorageClass: StorageClass 

130 """The `StorageClass` of the dataset being read.""" 

131 

132 

133class FileDatastore(GenericBaseDatastore): 

134 """Generic Datastore for file-based implementations. 

135 

136 Should always be sub-classed since key abstract methods are missing. 

137 

138 Parameters 

139 ---------- 

140 config : `DatastoreConfig` or `str` 

141 Configuration as either a `Config` object or URI to file. 

142 bridgeManager : `DatastoreRegistryBridgeManager` 

143 Object that manages the interface between `Registry` and datastores. 

144 butlerRoot : `str`, optional 

145 New datastore root to use to override the configuration value. 

146 

147 Raises 

148 ------ 

149 ValueError 

150 If root location does not exist and ``create`` is `False` in the 

151 configuration. 

152 """ 

153 

154 defaultConfigFile: ClassVar[Optional[str]] = None 

155 """Path to configuration defaults. Accessed within the ``config`` resource 

156 or relative to a search path. Can be None if no defaults specified. 

157 """ 

158 

159 root: ButlerURI 

160 """Root directory URI of this `Datastore`.""" 

161 

162 locationFactory: LocationFactory 

163 """Factory for creating locations relative to the datastore root.""" 

164 

165 formatterFactory: FormatterFactory 

166 """Factory for creating instances of formatters.""" 

167 

168 templates: FileTemplates 

169 """File templates that can be used by this `Datastore`.""" 

170 

171 composites: CompositesMap 

172 """Determines whether a dataset should be disassembled on put.""" 

173 

174 defaultConfigFile = "datastores/fileDatastore.yaml" 

175 """Path to configuration defaults. Accessed within the ``config`` resource 

176 or relative to a search path. Can be None if no defaults specified. 

177 """ 

178 

179 @classmethod 

180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

181 """Set any filesystem-dependent config options for this Datastore to 

182 be appropriate for a new empty repository with the given root. 

183 

184 Parameters 

185 ---------- 

186 root : `str` 

187 URI to the root of the data repository. 

188 config : `Config` 

189 A `Config` to update. Only the subset understood by 

190 this component will be updated. Will not expand 

191 defaults. 

192 full : `Config` 

193 A complete config with all defaults expanded that can be 

194 converted to a `DatastoreConfig`. Read-only and will not be 

195 modified by this method. 

196 Repository-specific options that should not be obtained 

197 from defaults when Butler instances are constructed 

198 should be copied from ``full`` to ``config``. 

199 overwrite : `bool`, optional 

200 If `False`, do not modify a value in ``config`` if the value 

201 already exists. Default is always to overwrite with the provided 

202 ``root``. 

203 

204 Notes 

205 ----- 

206 If a keyword is explicitly defined in the supplied ``config`` it 

207 will not be overridden by this method if ``overwrite`` is `False`. 

208 This allows explicit values set in external configs to be retained. 

209 """ 

210 Config.updateParameters(DatastoreConfig, config, full, 

211 toUpdate={"root": root}, 

212 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

213 
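# Illustrative sketch, not part of the original module: after calling
# setConfigRoot("/repo", config, full) on a fresh repository config, the
# datastore subset of ``config`` would look roughly like this (values are
# hypothetical and ultimately come from ``full``):
#
#     config["root"]               # -> "/repo"  (updated via toUpdate)
#     config["cls"]                # copied from ``full`` via toCopy
#     config["records", "table"]   # copied from ``full`` via toCopy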

214 @classmethod 

215 def makeTableSpec(cls) -> ddl.TableSpec: 

216 return ddl.TableSpec( 

217 fields=[ 

218 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True), 

219 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

220 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

221 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

222 # Use empty string to indicate no component 

223 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

224 # TODO: should checksum be Base64Bytes instead? 

225 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

226 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

227 ], 

228 unique=frozenset(), 

229 ) 

230 
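# Illustrative sketch, not part of the original module: a row in the opaque
# table defined by makeTableSpec(), as later inserted by addStoredItemInfo().
# All values here are hypothetical.
#
#     {
#         "dataset_id": 42,
#         "path": "mydatasettype/r/mydatasettype_r_00042.fits",
#         "formatter": "mypackage.formatters.MyFormatter",
#         "storage_class": "StructuredDataDict",
#         "component": "__NULL_STRING__",   # NULLSTR sentinel for "no component"
#         "checksum": None,                 # only filled in when checksums are enabled
#         "file_size": 1024,
#     }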

231 def __init__(self, config: Union[DatastoreConfig, str], 

232 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

233 super().__init__(config, bridgeManager) 

234 if "root" not in self.config: 234 ↛ 235line 234 didn't jump to line 235, because the condition on line 234 was never true

235 raise ValueError("No root directory specified in configuration") 

236 

237 # Name ourselves either using an explicit name or a name 

238 # derived from the (unexpanded) root 

239 if "name" in self.config: 

240 self.name = self.config["name"] 

241 else: 

242 # We use the unexpanded root in the name to indicate that this 

243 # datastore can be moved without having to update registry. 

244 self.name = "{}@{}".format(type(self).__name__, 

245 self.config["root"]) 

246 

247 # Support repository relocation in config 

248 # Existence of self.root is checked in subclass 

249 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

250 forceDirectory=True, forceAbsolute=True) 

251 

252 self.locationFactory = LocationFactory(self.root) 

253 self.formatterFactory = FormatterFactory() 

254 

255 # Now associate formatters with storage classes 

256 self.formatterFactory.registerFormatters(self.config["formatters"], 

257 universe=bridgeManager.universe) 

258 

259 # Read the file naming templates 

260 self.templates = FileTemplates(self.config["templates"], 

261 universe=bridgeManager.universe) 

262 

263 # See if composites should be disassembled 

264 self.composites = CompositesMap(self.config["composites"], 

265 universe=bridgeManager.universe) 

266 

267 tableName = self.config["records", "table"] 

268 try: 

269 # Storage of paths and formatters, keyed by dataset_id 

270 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec()) 

271 # Interface to Registry. 

272 self._bridge = bridgeManager.register(self.name) 

273 except ReadOnlyDatabaseError: 

274 # If the database is read only and we just tried and failed to 

275 # create a table, it means someone is trying to create a read-only 

276 # butler client for an empty repo. That should be okay, as long 

277 # as they then try to get any datasets before some other client 

278 # creates the table. Chances are they're just validating 

279 # configuration. 

280 pass 

281 

282 # Determine whether checksums should be used - default to False 

283 self.useChecksum = self.config.get("checksum", False) 

284 

285 # Check existence and create directory structure if necessary 

286 if not self.root.exists(): 

287 if "create" not in self.config or not self.config["create"]: 287 ↛ 288line 287 didn't jump to line 288, because the condition on line 287 was never true

288 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

289 try: 

290 self.root.mkdir() 

291 except Exception as e: 

292 raise ValueError(f"Can not create datastore root '{self.root}', check permissions." 

293 f" Got error: {e}") from e 

294 

295 def __str__(self) -> str: 

296 return str(self.root) 

297 

298 @property 

299 def bridge(self) -> DatastoreRegistryBridge: 

300 return self._bridge 

301 

302 def _artifact_exists(self, location: Location) -> bool: 

303 """Check that an artifact exists in this datastore at the specified 

304 location. 

305 

306 Parameters 

307 ---------- 

308 location : `Location` 

309 Expected location of the artifact associated with this datastore. 

310 

311 Returns 

312 ------- 

313 exists : `bool` 

314 `True` if the location can be found, `False` otherwise. 

315 """ 

316 log.debug("Checking if resource exists: %s", location.uri) 

317 return location.uri.exists() 

318 

319 def _delete_artifact(self, location: Location) -> None: 

320 """Delete the artifact from the datastore. 

321 

322 Parameters 

323 ---------- 

324 location : `Location` 

325 Location of the artifact associated with this datastore. 

326 """ 

327 log.debug("Deleting file: %s", location.uri) 

328 location.uri.remove() 

329 log.debug("Successfully deleted file: %s", location.uri) 

330 

331 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

332 # Docstring inherited from GenericBaseDatastore 

333 records = [] 

334 for ref, info in zip(refs, infos): 

335 # Component should come from ref and fall back on info 

336 component = ref.datasetType.component() 

337 if component is None and info.component is not None: 

338 component = info.component 

339 if component is None: 

340 # Use empty string since we want this to be part of the 

341 # primary key. 

342 component = NULLSTR 

343 records.append( 

344 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path, 

345 storage_class=info.storageClass.name, component=component, 

346 checksum=info.checksum, file_size=info.file_size) 

347 ) 

348 self._table.insert(*records) 

349 

350 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

351 # Docstring inherited from GenericBaseDatastore 

352 

353 # Look for the dataset_id -- there might be multiple matches 

354 # if we have disassembled the dataset. 

355 records = list(self._table.fetch(dataset_id=ref.id)) 

356 

357 results = [] 

358 for record in records: 

359 # Convert name of StorageClass to instance 

360 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

361 component = record["component"] if (record["component"] 

362 and record["component"] != NULLSTR) else None 

363 

364 info = StoredFileInfo(formatter=record["formatter"], 

365 path=record["path"], 

366 storageClass=storageClass, 

367 component=component, 

368 checksum=record["checksum"], 

369 file_size=record["file_size"]) 

370 results.append(info) 

371 

372 return results 

373 

374 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[int]: 

375 """Return all dataset refs associated with the supplied path. 

376 

377 Parameters 

378 ---------- 

379 pathInStore : `ButlerURI` 

380 Path of interest in the data store. 

381 

382 Returns 

383 ------- 

384 ids : `set` of `int` 

385 All `DatasetRef` IDs associated with this path. 

386 """ 

387 records = list(self._table.fetch(path=str(pathInStore))) 

388 ids = {r["dataset_id"] for r in records} 

389 return ids 

390 

391 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

392 # Docstring inherited from GenericBaseDatastore 

393 self._table.delete(dataset_id=ref.id) 

394 

395 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

396 r"""Find all the `Location`\ s of the requested dataset in the 

397 `Datastore` and the associated stored file information. 

398 

399 Parameters 

400 ---------- 

401 ref : `DatasetRef` 

402 Reference to the required `Dataset`. 

403 

404 Returns 

405 ------- 

406 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

407 Location of the dataset within the datastore and 

408 stored information about each file and its formatter. 

409 """ 

410 # Get the file information (this will fail if no file) 

411 records = self.getStoredItemsInfo(ref) 

412 

413 # Use the path to determine the location -- we need to take 

414 # into account absolute URIs in the datastore record 

415 locations: List[Tuple[Location, StoredFileInfo]] = [] 

416 for r in records: 

417 uriInStore = ButlerURI(r.path, forceAbsolute=False) 

418 if uriInStore.isabs(): 

419 location = Location(None, uriInStore) 

420 else: 

421 location = self.locationFactory.fromPath(r.path) 

422 locations.append((location, r)) 

423 return locations 

424 

425 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

426 """Check that there is only one dataset associated with the 

427 specified artifact. 

428 

429 Parameters 

430 ---------- 

431 ref : `DatasetRef` or `FakeDatasetRef` 

432 Dataset to be removed. 

433 location : `Location` 

434 The location of the artifact to be removed. 

435 

436 Returns 

437 ------- 

438 can_remove : `bool` 

439 True if the artifact can be safely removed. 

440 """ 

441 

442 # Get all entries associated with this path 

443 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

444 if not allRefs: 

445 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

446 

447 # Remove these refs from all the refs and if there is nothing left 

448 # then we can delete 

449 remainingRefs = allRefs - {ref.id} 

450 

451 if remainingRefs: 

452 return False 

453 return True 

454 

455 def _prepare_for_get(self, ref: DatasetRef, 

456 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

457 """Check parameters for ``get`` and obtain formatter and 

458 location. 

459 

460 Parameters 

461 ---------- 

462 ref : `DatasetRef` 

463 Reference to the required Dataset. 

464 parameters : `dict` 

465 `StorageClass`-specific parameters that specify, for example, 

466 a slice of the dataset to be loaded. 

467 

468 Returns 

469 ------- 

470 getInfo : `list` [`DatastoreFileGetInformation`] 

471 Parameters needed to retrieve each file. 

472 """ 

473 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

474 

475 # Get file metadata and internal metadata 

476 fileLocations = self._get_dataset_locations_info(ref) 

477 if not fileLocations: 

478 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

479 

480 # The storage class we want to use eventually 

481 refStorageClass = ref.datasetType.storageClass 

482 

483 if len(fileLocations) > 1: 

484 disassembled = True 

485 else: 

486 disassembled = False 

487 

488 # Is this a component request? 

489 refComponent = ref.datasetType.component() 

490 

491 fileGetInfo = [] 

492 for location, storedFileInfo in fileLocations: 

493 

494 # The storage class used to write the file 

495 writeStorageClass = storedFileInfo.storageClass 

496 

497 # If this has been disassembled we need read to match the write 

498 if disassembled: 

499 readStorageClass = writeStorageClass 

500 else: 

501 readStorageClass = refStorageClass 

502 

503 formatter = getInstanceOf(storedFileInfo.formatter, 

504 FileDescriptor(location, readStorageClass=readStorageClass, 

505 storageClass=writeStorageClass, parameters=parameters), 

506 ref.dataId) 

507 

508 formatterParams, notFormatterParams = formatter.segregateParameters() 

509 

510 # Of the remaining parameters, extract the ones supported by 

511 # this StorageClass (for components not all will be handled) 

512 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

513 

514 # The ref itself could be a component if the dataset was 

515 # disassembled by butler, or we disassembled in datastore and 

516 # components came from the datastore records 

517 component = storedFileInfo.component if storedFileInfo.component else refComponent 

518 

519 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

520 assemblerParams, formatterParams, 

521 component, readStorageClass)) 

522 

523 return fileGetInfo 

524 

525 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

526 """Check the arguments for ``put`` and obtain formatter and 

527 location. 

528 

529 Parameters 

530 ---------- 

531 inMemoryDataset : `object` 

532 The dataset to store. 

533 ref : `DatasetRef` 

534 Reference to the associated Dataset. 

535 

536 Returns 

537 ------- 

538 location : `Location` 

539 The location to write the dataset. 

540 formatter : `Formatter` 

541 The `Formatter` to use to write the dataset. 

542 

543 Raises 

544 ------ 

545 TypeError 

546 Supplied object and storage class are inconsistent. 

547 DatasetTypeNotSupportedError 

548 The associated `DatasetType` is not handled by this datastore. 

549 """ 

550 self._validate_put_parameters(inMemoryDataset, ref) 

551 

552 # Work out output file name 

553 try: 

554 template = self.templates.getTemplate(ref) 

555 except KeyError as e: 

556 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

557 

558 # Validate the template to protect against different dataIds 

559 # resolving to the same filename and causing overwrite confusion. 

560 template.validateTemplate(ref) 

561 

562 location = self.locationFactory.fromPath(template.format(ref)) 

563 

564 # Get the formatter based on the storage class 

565 storageClass = ref.datasetType.storageClass 

566 try: 

567 formatter = self.formatterFactory.getFormatter(ref, 

568 FileDescriptor(location, 

569 storageClass=storageClass), 

570 ref.dataId) 

571 except KeyError as e: 

572 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

573 f"{self.name}") from e 

574 

575 # Now that we know the formatter, update the location 

576 location = formatter.makeUpdatedLocation(location) 

577 

578 return location, formatter 

579 

580 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

581 # Docstring inherited from base class 

582 if transfer != "auto": 

583 return transfer 

584 

585 # See if the paths are within the datastore or not 

586 inside = [self._pathInStore(d.path) is not None for d in datasets] 

587 

588 if all(inside): 

589 transfer = None 

590 elif not any(inside): 

591 # Allow ButlerURI to use its own knowledge 

592 transfer = "auto" 

593 else: 

594 raise ValueError("Some datasets are inside the datastore and some are outside." 

595 " Please use an explicit transfer mode and not 'auto'.") 

596 

597 return transfer 

598 
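# Illustrative sketch, not part of the original module, of how "auto" is
# resolved above (paths and refs are hypothetical):
#
#     inside = [FileDataset(path="a/b.fits", refs=[ref1])]
#     self._overrideTransferMode(*inside, transfer="auto")     # -> None
#
#     outside = [FileDataset(path="/elsewhere/c.fits", refs=[ref2])]
#     self._overrideTransferMode(*outside, transfer="auto")    # -> "auto"
#
# A mix of inside and outside paths with "auto" raises ValueError.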

599 def _pathInStore(self, path: str) -> Optional[str]: 

600 """Return path relative to datastore root 

601 

602 Parameters 

603 ---------- 

604 path : `str` 

605 Path to dataset. Can be absolute. If relative, assumed to 

606 be relative to the datastore. Returns the path within the 

607 datastore, or `None` if the path is outside it. 

608 

609 Returns 

610 ------- 

611 inStore : `str` 

612 Path relative to datastore root. Returns `None` if the file is 

613 outside the root. 

614 """ 

615 # Relative path will always be relative to datastore 

616 pathUri = ButlerURI(path, forceAbsolute=False) 

617 return pathUri.relative_to(self.root) 

618 
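# Illustrative sketch, not part of the original module: with a datastore
# rooted at file:///repo/datastore (hypothetical), _pathInStore behaves
# roughly as
#
#     self._pathInStore("a/b/c.fits")                   # -> "a/b/c.fits"
#     self._pathInStore("/repo/datastore/a/b/c.fits")   # -> "a/b/c.fits"
#     self._pathInStore("/other/place/c.fits")          # -> None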

619 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str: 

620 """Standardize the path of a to-be-ingested file. 

621 

622 Parameters 

623 ---------- 

624 path : `str` 

625 Path of a file to be ingested. 

626 transfer : `str`, optional 

627 How (and whether) the dataset should be added to the datastore. 

628 See `ingest` for details of transfer modes. 

629 This implementation is provided only so 

630 `NotImplementedError` can be raised if the mode is not supported; 

631 actual transfers are deferred to `_extractIngestInfo`. 

632 

633 Returns 

634 ------- 

635 path : `str` 

636 New path in what the datastore considers standard form. 

637 

638 Notes 

639 ----- 

640 Subclasses of `FileDatastore` can implement this method instead 

641 of `_prepIngest`. It should not modify the data repository or given 

642 file in any way. 

643 

644 Raises 

645 ------ 

646 NotImplementedError 

647 Raised if the datastore does not support the given transfer mode 

648 (including the case where ingest is not supported at all). 

649 FileNotFoundError 

650 Raised if one of the given files does not exist. 

651 """ 

652 if transfer not in (None, "direct") + self.root.transferModes: 

653 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

654 

655 # A relative URI indicates relative to datastore root 

656 srcUri = ButlerURI(path, forceAbsolute=False) 

657 if not srcUri.isabs(): 

658 srcUri = self.root.join(path) 

659 

660 if not srcUri.exists(): 

661 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

662 f"are assumed to be relative to {self.root} unless they are absolute.") 

663 

664 if transfer is None: 

665 relpath = srcUri.relative_to(self.root) 

666 if not relpath: 

667 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

668 f"within datastore ({self.root})") 

669 

670 # Return the relative path within the datastore for internal 

671 # transfer 

672 path = relpath 

673 

674 return path 

675 

676 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

677 formatter: Union[Formatter, Type[Formatter]], 

678 transfer: Optional[str] = None) -> StoredFileInfo: 

679 """Relocate (if necessary) and extract `StoredFileInfo` from a 

680 to-be-ingested file. 

681 

682 Parameters 

683 ---------- 

684 path : `str` or `ButlerURI` 

685 URI or path of a file to be ingested. 

686 ref : `DatasetRef` 

687 Reference for the dataset being ingested. Guaranteed to have 

688 ``dataset_id is not None``. 

689 formatter : `type` or `Formatter` 

690 `Formatter` subclass to use for this dataset or an instance. 

691 transfer : `str`, optional 

692 How (and whether) the dataset should be added to the datastore. 

693 See `ingest` for details of transfer modes. 

694 

695 Returns 

696 ------- 

697 info : `StoredFileInfo` 

698 Internal datastore record for this file. This will be inserted by 

699 the caller; `_extractIngestInfo` is only responsible for 

700 creating and populating the struct. 

701 

702 Raises 

703 ------ 

704 FileNotFoundError 

705 Raised if one of the given files does not exist. 

706 FileExistsError 

707 Raised if transfer is not `None` but the (internal) location the 

708 file would be moved to is already occupied. 

709 """ 

710 if self._transaction is None: 

711 raise RuntimeError("Ingest called without transaction enabled") 

712 

713 # Create URI of the source path, do not need to force a relative 

714 # path to absolute. 

715 srcUri = ButlerURI(path, forceAbsolute=False) 

716 

717 # Track whether we have read the size of the source yet 

718 have_sized = False 

719 

720 tgtLocation: Optional[Location] 

721 if transfer is None: 

722 # A relative path is assumed to be relative to the datastore 

723 # in this context 

724 if not srcUri.isabs(): 

725 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

726 else: 

727 # Work out the path in the datastore from an absolute URI 

728 # This is required to be within the datastore. 

729 pathInStore = srcUri.relative_to(self.root) 

730 if pathInStore is None: 

731 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

732 f"not within datastore {self.root}") 

733 tgtLocation = self.locationFactory.fromPath(pathInStore) 

734 elif transfer == "direct": 

735 # Want to store the full URI to the resource directly in 

736 # datastore. This is useful for referring to permanent archive 

737 # storage for raw data. 

738 # Trust that people know what they are doing. 

739 tgtLocation = None 

740 else: 

741 # Work out the name we want this ingested file to have 

742 # inside the datastore 

743 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

744 if not tgtLocation.uri.dirname().exists(): 

745 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

746 tgtLocation.uri.dirname().mkdir() 

747 

748 # if we are transferring from a local file to a remote location 

749 # it may be more efficient to get the size and checksum of the 

750 # local file rather than the transferred one 

751 if not srcUri.scheme or srcUri.scheme == "file": 

752 size = srcUri.size() 

753 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

754 have_sized = True 

755 

756 # transfer the resource to the destination 

757 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

758 

759 if tgtLocation is None: 

760 # This means we are using direct mode 

761 targetUri = srcUri 

762 targetPath = str(srcUri) 

763 else: 

764 targetUri = tgtLocation.uri 

765 targetPath = tgtLocation.pathInStore.path 

766 

767 # the file should exist in the datastore now 

768 if not have_sized: 

769 size = targetUri.size() 

770 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

771 

772 return StoredFileInfo(formatter=formatter, path=targetPath, 

773 storageClass=ref.datasetType.storageClass, 

774 component=ref.datasetType.component(), 

775 file_size=size, checksum=checksum) 

776 

777 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

778 # Docstring inherited from Datastore._prepIngest. 

779 filtered = [] 

780 for dataset in datasets: 

781 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

782 if not acceptable: 

783 continue 

784 else: 

785 dataset.refs = acceptable 

786 if dataset.formatter is None: 

787 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

788 else: 

789 assert isinstance(dataset.formatter, (type, str)) 

790 dataset.formatter = getClassOf(dataset.formatter) 

791 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

792 filtered.append(dataset) 

793 return _IngestPrepData(filtered) 

794 

795 @transactional 

796 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

797 # Docstring inherited from Datastore._finishIngest. 

798 refsAndInfos = [] 

799 for dataset in prepData.datasets: 

800 # Do ingest as if the first dataset ref is associated with the file 

801 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

802 transfer=transfer) 

803 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

804 self._register_datasets(refsAndInfos) 

805 

806 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

807 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

808 """Given a source URI and a DatasetRef, determine the name the 

809 dataset will have inside the datastore. 

810 

811 Parameters 

812 ---------- 

813 srcUri : `ButlerURI` 

814 URI to the source dataset file. 

815 ref : `DatasetRef` 

816 Ref associated with the newly-ingested dataset artifact. This 

817 is used to determine the name within the datastore. 

818 formatter : `Formatter` or Formatter class. 

819 Formatter to use for validation. Can be a class or an instance. 

820 

821 Returns 

822 ------- 

823 location : `Location` 

824 Target location for the newly-ingested dataset. 

825 """ 

826 # Ingesting a file from outside the datastore. 

827 # This involves a new name. 

828 template = self.templates.getTemplate(ref) 

829 location = self.locationFactory.fromPath(template.format(ref)) 

830 

831 # Get the extension 

832 ext = srcUri.getExtension() 

833 

834 # Update the destination to include that extension 

835 location.updateExtension(ext) 

836 

837 # Ask the formatter to validate this extension 

838 formatter.validateExtension(location) 

839 

840 return location 

841 

842 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

843 """Write out in memory dataset to datastore. 

844 

845 Parameters 

846 ---------- 

847 inMemoryDataset : `object` 

848 Dataset to write to datastore. 

849 ref : `DatasetRef` 

850 Registry information associated with this dataset. 

851 

852 Returns 

853 ------- 

854 info : `StoredFileInfo` 

855 Information describing the artifact written to the datastore. 

856 """ 

857 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

858 uri = location.uri 

859 

860 if not uri.dirname().exists(): 

861 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

862 uri.dirname().mkdir() 

863 

864 if self._transaction is None: 

865 raise RuntimeError("Attempting to write artifact without transaction enabled") 

866 

867 def _removeFileExists(uri: ButlerURI) -> None: 

868 """Remove a file and do not complain if it is not there. 

869 

870 This is important since a formatter might fail before the file 

871 is written and we should not confuse people by writing spurious 

872 error messages to the log. 

873 """ 

874 try: 

875 uri.remove() 

876 except FileNotFoundError: 

877 pass 

878 

879 # Register a callback to try to delete the uploaded data if 

880 # something fails below 

881 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

882 

883 # For a local file, simply use the formatter directly 

884 if uri.isLocal: 

885 formatter.write(inMemoryDataset) 

886 log.debug("Successfully wrote python object to local file at %s", uri) 

887 else: 

888 # This is a remote URI, so first try bytes and write directly else 

889 # fallback to a temporary file 

890 try: 

891 serializedDataset = formatter.toBytes(inMemoryDataset) 

892 log.debug("Writing bytes directly to %s", uri) 

893 uri.write(serializedDataset, overwrite=True) 

894 log.debug("Successfully wrote bytes directly to %s", uri) 

895 except NotImplementedError: 

896 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile: 

897 # Need to configure the formatter to write to a different 

898 # location and that needs us to overwrite internals 

899 tmpLocation = Location(*os.path.split(tmpFile.name)) 

900 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri) 

901 with formatter._updateLocation(tmpLocation): 

902 formatter.write(inMemoryDataset) 

903 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True) 

904 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

905 

906 # URI is needed to resolve which ingest case we are dealing with 

907 return self._extractIngestInfo(uri, ref, formatter=formatter) 

908 

909 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

910 ref: DatasetRef, isComponent: bool = False) -> Any: 

911 """Read the artifact from datastore into in memory object. 

912 

913 Parameters 

914 ---------- 

915 getInfo : `DatastoreFileGetInformation` 

916 Information about the artifact within the datastore. 

917 ref : `DatasetRef` 

918 The registry information associated with this artifact. 

919 isComponent : `bool` 

920 Flag to indicate if a component is being read from this artifact. 

921 

922 Returns 

923 ------- 

924 inMemoryDataset : `object` 

925 The artifact as a python object. 

926 """ 

927 location = getInfo.location 

928 uri = location.uri 

929 log.debug("Accessing data from %s", uri) 

930 

931 # Cannot recalculate checksum but can compare size as a quick check 

932 recorded_size = getInfo.info.file_size 

933 resource_size = uri.size() 

934 if resource_size != recorded_size: 

935 raise RuntimeError("Integrity failure in Datastore. " 

936 f"Size of file {uri} ({resource_size}) " 

937 f"does not match size recorded in registry of {recorded_size}") 

938 

939 # For the general case we have choices for how to proceed. 

940 # 1. Always use a local file (downloading the remote resource to a 

941 # temporary file if needed). 

942 # 2. Use a threshold size and read into memory and use bytes. 

943 # Use both for now with an arbitrary hand off size. 

944 # This allows small datasets to be downloaded from remote object 

945 # stores without requiring a temporary file. 

946 

947 formatter = getInfo.formatter 

948 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

949 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

950 serializedDataset = uri.read() 

951 log.debug("Deserializing %s from %d bytes from location %s with formatter %s", 

952 f"component {getInfo.component}" if isComponent else "", 

953 len(serializedDataset), uri, formatter.name()) 

954 try: 

955 result = formatter.fromBytes(serializedDataset, 

956 component=getInfo.component if isComponent else None) 

957 except Exception as e: 

958 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

959 f" ({ref.datasetType.name} from {uri}): {e}") from e 

960 else: 

961 # Read from file 

962 with uri.as_local() as local_uri: 

963 # Have to update the Location associated with the formatter 

964 # because formatter.read does not allow an override. 

965 # This could be improved. 

966 msg = "" 

967 newLocation = None 

968 if uri != local_uri: 

969 newLocation = Location(*local_uri.split()) 

970 msg = "(via download to local file)" 

971 

972 log.debug("Reading %s from location %s %s with formatter %s", 

973 f"component {getInfo.component}" if isComponent else "", 

974 uri, msg, formatter.name()) 

975 try: 

976 with formatter._updateLocation(newLocation): 

977 result = formatter.read(component=getInfo.component if isComponent else None) 

978 except Exception as e: 

979 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

980 f" ({ref.datasetType.name} from {uri}): {e}") from e 

981 

982 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

983 isComponent=isComponent) 

984 

985 def exists(self, ref: DatasetRef) -> bool: 

986 """Check if the dataset exists in the datastore. 

987 

988 Parameters 

989 ---------- 

990 ref : `DatasetRef` 

991 Reference to the required dataset. 

992 

993 Returns 

994 ------- 

995 exists : `bool` 

996 `True` if the entity exists in the `Datastore`. 

997 """ 

998 fileLocations = self._get_dataset_locations_info(ref) 

999 if not fileLocations: 

1000 return False 

1001 for location, _ in fileLocations: 

1002 if not self._artifact_exists(location): 

1003 return False 

1004 

1005 return True 

1006 

1007 def getURIs(self, ref: DatasetRef, 

1008 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1009 """Return URIs associated with dataset. 

1010 

1011 Parameters 

1012 ---------- 

1013 ref : `DatasetRef` 

1014 Reference to the required dataset. 

1015 predict : `bool`, optional 

1016 If the datastore does not know about the dataset, should it 

1017 return a predicted URI or not? 

1018 

1019 Returns 

1020 ------- 

1021 primary : `ButlerURI` 

1022 The URI to the primary artifact associated with this dataset. 

1023 If the dataset was disassembled within the datastore this 

1024 may be `None`. 

1025 components : `dict` 

1026 URIs to any components associated with the dataset artifact. 

1027 Can be empty if there are no components. 

1028 """ 

1029 

1030 primary: Optional[ButlerURI] = None 

1031 components: Dict[str, ButlerURI] = {} 

1032 

1033 # if this has never been written then we have to guess 

1034 if not self.exists(ref): 

1035 if not predict: 

1036 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1037 

1038 def predictLocation(thisRef: DatasetRef) -> Location: 

1039 template = self.templates.getTemplate(thisRef) 

1040 location = self.locationFactory.fromPath(template.format(thisRef)) 

1041 storageClass = ref.datasetType.storageClass 

1042 formatter = self.formatterFactory.getFormatter(thisRef, 

1043 FileDescriptor(location, 

1044 storageClass=storageClass)) 

1045 # Try to use the extension attribute but ignore problems if the 

1046 # formatter does not define one. 

1047 try: 

1048 location = formatter.makeUpdatedLocation(location) 

1049 except Exception: 

1050 # Use the default extension 

1051 pass 

1052 return location 

1053 

1054 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1055 

1056 if doDisassembly: 

1057 

1058 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1059 compRef = ref.makeComponentRef(component) 

1060 compLocation = predictLocation(compRef) 

1061 

1062 # Add a URI fragment to indicate this is a guess 

1063 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1064 

1065 else: 

1066 

1067 location = predictLocation(ref) 

1068 

1069 # Add a URI fragment to indicate this is a guess 

1070 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1071 

1072 return primary, components 

1073 

1074 # If this is a ref that we have written we can get the path. 

1075 # Get file metadata and internal metadata 

1076 fileLocations = self._get_dataset_locations_info(ref) 

1077 

1078 if not fileLocations: 

1079 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1080 

1081 if len(fileLocations) == 1: 

1082 # No disassembly so this is the primary URI 

1083 primary = ButlerURI(fileLocations[0][0].uri) 

1084 

1085 else: 

1086 for location, storedFileInfo in fileLocations: 

1087 if storedFileInfo.component is None: 

1088 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1089 components[storedFileInfo.component] = ButlerURI(location.uri) 

1090 

1091 return primary, components 

1092 
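# Illustrative sketch, not part of the original module (URIs hypothetical):
#
#     primary, components = datastore.getURIs(ref)
#     # undisassembled dataset:  (ButlerURI("file:///repo/datastore/a.fits"), {})
#     # disassembled composite:  (None, {"image": ButlerURI(...), "mask": ButlerURI(...)})
#     # with predict=True and an unwritten dataset, the URIs carry a "#predicted" fragment.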

1093 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1094 """URI to the Dataset. 

1095 

1096 Parameters 

1097 ---------- 

1098 ref : `DatasetRef` 

1099 Reference to the required Dataset. 

1100 predict : `bool` 

1101 If `True`, allow URIs to be returned of datasets that have not 

1102 been written. 

1103 

1104 Returns 

1105 ------- 

1106 uri : `str` 

1107 URI pointing to the dataset within the datastore. If the 

1108 dataset does not exist in the datastore, and if ``predict`` is 

1109 `True`, the URI will be a prediction and will include a URI 

1110 fragment "#predicted". 

1111 If the datastore does not have entities that relate well 

1112 to the concept of a URI the returned URI will be 

1113 descriptive. The returned URI is not guaranteed to be obtainable. 

1114 

1115 Raises 

1116 ------ 

1117 FileNotFoundError 

1118 Raised if a URI has been requested for a dataset that does not 

1119 exist and guessing is not allowed. 

1120 RuntimeError 

1121 Raised if a request is made for a single URI but multiple URIs 

1122 are associated with this dataset. 

1123 

1124 Notes 

1125 ----- 

1126 When a predicted URI is requested an attempt will be made to form 

1127 a reasonable URI based on file templates and the expected formatter. 

1128 """ 

1129 primary, components = self.getURIs(ref, predict) 

1130 if primary is None or components: 

1131 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

1132 "Use Dataastore.getURIs() instead.") 

1133 return primary 

1134 

1135 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1136 """Load an InMemoryDataset from the store. 

1137 

1138 Parameters 

1139 ---------- 

1140 ref : `DatasetRef` 

1141 Reference to the required Dataset. 

1142 parameters : `dict` 

1143 `StorageClass`-specific parameters that specify, for example, 

1144 a slice of the dataset to be loaded. 

1145 

1146 Returns 

1147 ------- 

1148 inMemoryDataset : `object` 

1149 Requested dataset or slice thereof as an InMemoryDataset. 

1150 

1151 Raises 

1152 ------ 

1153 FileNotFoundError 

1154 Requested dataset can not be retrieved. 

1155 TypeError 

1156 Return value from formatter has unexpected type. 

1157 ValueError 

1158 Formatter failed to process the dataset. 

1159 """ 

1160 allGetInfo = self._prepare_for_get(ref, parameters) 

1161 refComponent = ref.datasetType.component() 

1162 

1163 # Supplied storage class for the component being read 

1164 refStorageClass = ref.datasetType.storageClass 

1165 

1166 # Create mapping from component name to related info 

1167 allComponents = {i.component: i for i in allGetInfo} 

1168 

1169 # By definition the dataset is disassembled if we have more 

1170 # than one record for it. 

1171 isDisassembled = len(allGetInfo) > 1 

1172 

1173 # Look for the special case where we are disassembled but the 

1174 # component is a derived component that was not written during 

1175 # disassembly. For this scenario we need to check that the 

1176 # component requested is listed as a derived component for the 

1177 # composite storage class 

1178 isDisassembledReadOnlyComponent = False 

1179 if isDisassembled and refComponent: 

1180 # The composite storage class should be accessible through 

1181 # the component dataset type 

1182 compositeStorageClass = ref.datasetType.parentStorageClass 

1183 

1184 # In the unlikely scenario where the composite storage 

1185 # class is not known, we can only assume that this is a 

1186 # normal component. If that assumption is wrong then the 

1187 # branch below that reads a persisted component will fail 

1188 # so there is no need to complain here. 

1189 if compositeStorageClass is not None: 

1190 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1191 

1192 if isDisassembled and not refComponent: 

1193 # This was a disassembled dataset spread over multiple files 

1194 # and we need to put them all back together again. 

1195 # Read into memory and then assemble 

1196 

1197 # Check that the supplied parameters are suitable for the type read 

1198 refStorageClass.validateParameters(parameters) 

1199 

1200 # We want to keep track of all the parameters that were not used 

1201 # by formatters. We assume that if any of the component formatters 

1202 # use a parameter that we do not need to apply it again in the 

1203 # assembler. 

1204 usedParams = set() 

1205 

1206 components: Dict[str, Any] = {} 

1207 for getInfo in allGetInfo: 

1208 # assemblerParams are parameters not understood by the 

1209 # associated formatter. 

1210 usedParams.update(set(getInfo.formatterParams)) 

1211 

1212 component = getInfo.component 

1213 

1214 if component is None: 

1215 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1216 

1217 # We do not want the formatter to think it's reading 

1218 # a component though because it is really reading a 

1219 # standalone dataset -- always tell reader it is not a 

1220 # component. 

1221 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1222 

1223 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1224 

1225 # Any unused parameters will have to be passed to the assembler 

1226 if parameters: 

1227 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1228 else: 

1229 unusedParams = {} 

1230 

1231 # Process parameters 

1232 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1233 parameters=unusedParams) 

1234 

1235 elif isDisassembledReadOnlyComponent: 

1236 

1237 compositeStorageClass = ref.datasetType.parentStorageClass 

1238 if compositeStorageClass is None: 

1239 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since " 

1240 "no composite storage class is available.") 

1241 

1242 if refComponent is None: 

1243 # Mainly for mypy 

1244 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1245 

1246 # Assume that every derived component can be calculated by 

1247 # forwarding the request to a single read/write component. 

1248 # Rather than guessing which rw component is the right one by 

1249 # scanning each for a derived component of the same name, 

1250 # we ask the storage class delegate directly which one is best to 

1251 # use. 

1252 compositeDelegate = compositeStorageClass.delegate() 

1253 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1254 set(allComponents)) 

1255 

1256 # Select the relevant component 

1257 rwInfo = allComponents[forwardedComponent] 

1258 

1259 # For now assume that read parameters are validated against 

1260 # the real component and not the requested component 

1261 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1262 forwardedStorageClass.validateParameters(parameters) 

1263 

1264 # Unfortunately the FileDescriptor inside the formatter will have 

1265 # the wrong write storage class so we need to create a new one 

1266 # given the immutability constraint. 

1267 writeStorageClass = rwInfo.info.storageClass 

1268 

1269 # We may need to put some thought into parameters for read 

1270 # components but for now forward them on as is 

1271 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1272 readStorageClass=refStorageClass, 

1273 storageClass=writeStorageClass, 

1274 parameters=parameters), 

1275 ref.dataId) 

1276 

1277 # The assembler can not receive any parameter requests for a 

1278 # derived component at this time since the assembler will 

1279 # see the storage class of the derived component and those 

1280 # parameters will have to be handled by the formatter on the 

1281 # forwarded storage class. 

1282 assemblerParams: Dict[str, Any] = {} 

1283 

1284 # Need to create a new info that specifies the derived 

1285 # component and associated storage class 

1286 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1287 rwInfo.info, assemblerParams, {}, 

1288 refComponent, refStorageClass) 

1289 

1290 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1291 

1292 else: 

1293 # Single file request or component from that composite file 

1294 for lookup in (refComponent, None): 

1295 if lookup in allComponents: 

1296 getInfo = allComponents[lookup] 

1297 break 

1298 else: 

1299 raise FileNotFoundError(f"Component {refComponent} not found " 

1300 f"for ref {ref} in datastore {self.name}") 

1301 

1302 # Do not need the component itself if already disassembled 

1303 if isDisassembled: 

1304 isComponent = False 

1305 else: 

1306 isComponent = getInfo.component is not None 

1307 

1308 # For a disassembled component we can validate parameters against 

1309 # the component storage class directly 

1310 if isDisassembled: 

1311 refStorageClass.validateParameters(parameters) 

1312 else: 

1313 # For an assembled composite this could be a derived 

1314 # component derived from a real component. The validity 

1315 # of the parameters is not clear. For now validate against 

1316 # the composite storage class 

1317 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1318 

1319 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 

1320 

1321 @transactional 

1322 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1323 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1324 

1325 Parameters 

1326 ---------- 

1327 inMemoryDataset : `object` 

1328 The dataset to store. 

1329 ref : `DatasetRef` 

1330 Reference to the associated Dataset. 

1331 

1332 Raises 

1333 ------ 

1334 TypeError 

1335 Supplied object and storage class are inconsistent. 

1336 DatasetTypeNotSupportedError 

1337 The associated `DatasetType` is not handled by this datastore. 

1338 

1339 Notes 

1340 ----- 

1341 If the datastore is configured to reject certain dataset types it 

1342 is possible that the put will fail and raise a 

1343 `DatasetTypeNotSupportedError`. The main use case for this is to 

1344 allow `ChainedDatastore` to put to multiple datastores without 

1345 requiring that every datastore accepts the dataset. 

1346 """ 

1347 

1348 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1349 # doDisassembly = True 

1350 

1351 artifacts = [] 

1352 if doDisassembly: 

1353 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1354 for component, componentInfo in components.items(): 

1355 # Don't recurse because we want to take advantage of 

1356 # bulk insert -- need a new DatasetRef that refers to the 

1357 # same dataset_id but has the component DatasetType 

1358 # DatasetType does not refer to the types of components 

1359 # So we construct one ourselves. 

1360 compRef = ref.makeComponentRef(component) 

1361 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1362 artifacts.append((compRef, storedInfo)) 

1363 else: 

1364 # Write the entire thing out 

1365 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1366 artifacts.append((ref, storedInfo)) 

1367 

1368 self._register_datasets(artifacts) 

1369 
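# Illustrative sketch, not part of the original module: put() and get() are
# symmetric with respect to disassembly (objects and parameters hypothetical).
#
#     datastore.put(exposure, ref)       # may write one artifact per component
#     restored = datastore.get(ref)      # components are reassembled on read
#     cutout = datastore.get(ref, parameters={"bbox": bbox})   # parameters are
#                                        # split between formatter and delegate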

1370 @transactional 

1371 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None: 

1372 """Indicate to the datastore that a dataset can be removed. 

1373 

1374 Parameters 

1375 ---------- 

1376 ref : `DatasetRef` 

1377 Reference to the required Dataset. 

1378 ignore_errors : `bool` 

1379 If `True` return without error even if something went wrong. 

1380 Problems could occur if another process is simultaneously trying 

1381 to delete. 

1382 

1383 Raises 

1384 ------ 

1385 FileNotFoundError 

1386 Attempt to remove a dataset that does not exist. 

1387 """ 

1388 # Get file metadata and internal metadata 

1389 log.debug("Trashing %s in datastore %s", ref, self.name) 

1390 

1391 fileLocations = self._get_dataset_locations_info(ref) 

1392 

1393 if not fileLocations: 

1394 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1395 if ignore_errors: 

1396 log.warning(err_msg) 

1397 return 

1398 else: 

1399 raise FileNotFoundError(err_msg) 

1400 

1401 for location, storedFileInfo in fileLocations: 

1402 if not self._artifact_exists(location): 

1403 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1404 f"associated artifact ({location.uri}) is missing" 

1405 if ignore_errors: 

1406 log.warning(err_msg) 

1407 return 

1408 else: 

1409 raise FileNotFoundError(err_msg) 

1410 

1411 # Mark dataset as trashed 

1412 try: 

1413 self._move_to_trash_in_registry(ref) 

1414 except Exception as e: 

1415 if ignore_errors: 

1416 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1417 f"but encountered an error: {e}") 

1418 pass 

1419 else: 

1420 raise 

1421 

1422 @transactional 

1423 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1424 """Remove all datasets from the trash. 

1425 

1426 Parameters 

1427 ---------- 

1428 ignore_errors : `bool` 

1429 If `True` return without error even if something went wrong. 

1430 Problems could occur if another process is simultaneously trying 

1431 to delete. 

1432 """ 

1433 log.debug("Emptying trash in datastore %s", self.name) 

1434 # Context manager will empty trash iff we finish it without raising. 

1435 with self.bridge.emptyTrash() as trashed: 

1436 for ref in trashed: 

1437 fileLocations = self._get_dataset_locations_info(ref) 

1438 

1439 if not fileLocations:

1440 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}" 

1441 if ignore_errors: 

1442 log.warning(err_msg) 

1443 continue 

1444 else: 

1445 raise FileNotFoundError(err_msg) 

1446 

1447 for location, _ in fileLocations: 

1448 

1449 if not self._artifact_exists(location):

1450 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}" 

1451 if ignore_errors: 

1452 log.warning(err_msg) 

1453 continue 

1454 else: 

1455 raise FileNotFoundError(err_msg) 

1456 

1457 # Can only delete the artifact if there are no references 

1458 # to the file from untrashed dataset refs. 

1459 if self._can_remove_dataset_artifact(ref, location): 

1460 # Point of no return for this artifact 

1461 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1462 try: 

1463 self._delete_artifact(location) 

1464 except Exception as e: 

1465 if ignore_errors: 

1466 log.critical("Encountered error removing artifact %s from datastore %s: %s", 

1467 location.uri, self.name, e) 

1468 else: 

1469 raise 

1470 

1471 # We must now remove the entry from the internal registry even if

1472 # the artifact removal failed and was ignored; otherwise the

1473 # removal check above will never be true.

1474 try: 

1475 # There may be multiple rows associated with this ref 

1476 # depending on disassembly 

1477 self.removeStoredItemInfo(ref) 

1478 except Exception as e: 

1479 if ignore_errors: 

1480 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s", 

1481 ref.id, location.uri, self.name, e) 

1482 continue 

1483 else: 

1484 raise

1485 
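# Illustrative sketch of the two-phase removal, assuming "datastore"
# and "ref" exist: an artifact is only deleted once no untrashed ref
# still points at it, and the internal record is always removed so the
# check can succeed on a later pass:
#
#     datastore.trash(ref)
#     datastore.emptyTrash(ignore_errors=True)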

1486 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1487 logFailures: bool = False) -> None: 

1488 """Validate some of the configuration for this datastore. 

1489 

1490 Parameters 

1491 ---------- 

1492 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1493 Entities to test against this configuration. Can be differing 

1494 types. 

1495 logFailures : `bool`, optional 

1496 If `True`, output a log message for every validation error 

1497 detected. 

1498 

1499 Raises 

1500 ------ 

1501 DatastoreValidationError 

1502 Raised if there is a validation problem with a configuration. 

1503 All the problems are reported in a single exception. 

1504 

1505 Notes 

1506 ----- 

1507 This method checks that all the supplied entities have valid file 

1508 templates and also have formatters defined. 

1509 """ 

1510 

1511 templateFailed = None 

1512 try: 

1513 self.templates.validateTemplates(entities, logFailures=logFailures) 

1514 except FileTemplateValidationError as e: 

1515 templateFailed = str(e) 

1516 

1517 formatterFailed = [] 

1518 for entity in entities: 

1519 try: 

1520 self.formatterFactory.getFormatterClass(entity) 

1521 except KeyError as e: 

1522 formatterFailed.append(str(e)) 

1523 if logFailures:

1524 log.critical("Formatter failure: %s", e) 

1525 

1526 if templateFailed or formatterFailed: 

1527 messages = [] 

1528 if templateFailed:

1529 messages.append(templateFailed) 

1530 if formatterFailed:

1531 messages.append(",".join(formatterFailed)) 

1532 msg = ";\n".join(messages) 

1533 raise DatastoreValidationError(msg) 

1534 
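# Illustrative sketch, assuming "datastore" and "datasetType" exist:
# all template and formatter failures are gathered and reported in a
# single DatastoreValidationError:
#
#     try:
#         datastore.validateConfiguration([datasetType], logFailures=True)
#     except DatastoreValidationError as err:
#         print(f"Datastore configuration problems: {err}")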

1535 def getLookupKeys(self) -> Set[LookupKey]: 

1536 # Docstring is inherited from base class 

1537 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1538 self.constraints.getLookupKeys() 

1539 

1540 def validateKey(self, lookupKey: LookupKey, 

1541 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1542 # Docstring is inherited from base class 

1543 # The key can be valid in either formatters or templates, so we can

1544 # only check the template if it exists.

1545 if lookupKey in self.templates: 

1546 try: 

1547 self.templates[lookupKey].validateTemplate(entity) 

1548 except FileTemplateValidationError as e: 

1549 raise DatastoreValidationError(e) from e 

1550 
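# Illustrative sketch, assuming "datastore", "lookupKey", and
# "datasetType" exist: a key is only checked against a file template
# when one is defined for it, and template problems surface as
# DatastoreValidationError:
#
#     datastore.validateKey(lookupKey, datasetType)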

1551 def export(self, refs: Iterable[DatasetRef], *, 

1552 directory: Optional[Union[ButlerURI, str]] = None, 

1553 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

1554 # Docstring inherited from Datastore.export. 

1555 if transfer is not None and directory is None:

1556 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

1557 "export directory given") 

1558 

1559 # Force the directory to be a URI object 

1560 directoryUri: Optional[ButlerURI] = None 

1561 if directory is not None:

1562 directoryUri = ButlerURI(directory, forceDirectory=True) 

1563 

1564 if transfer is not None and directoryUri is not None:

1565 # mypy needs the second test 

1566 if not directoryUri.exists():

1567 raise FileNotFoundError(f"Export location {directory} does not exist") 

1568 

1569 for ref in refs: 

1570 fileLocations = self._get_dataset_locations_info(ref) 

1571 if not fileLocations:

1572 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

1573 # For now we can not export disassembled datasets 

1574 if len(fileLocations) > 1: 

1575 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

1576 location, storedFileInfo = fileLocations[0] 

1577 

1578 pathInStore = location.pathInStore.path 

1579 if transfer is None:

1580 # TODO: do we also need to return the readStorageClass somehow? 

1581 # We will use the path in store directly 

1582 pass 

1583 elif transfer == "direct":

1584 # Use full URIs to the remote store in the export 

1585 pathInStore = str(location.uri) 

1586 else: 

1587 # mypy needs help 

1588 assert directoryUri is not None, "directoryUri must be defined to get here" 

1589 storeUri = ButlerURI(location.uri) 

1590 

1591 # if the datastore has an absolute URI to a resource, we 

1592 # have two options: 

1593 # 1. Keep the absolute URI in the exported YAML 

1594 # 2. Allocate a new name in the local datastore and transfer 

1595 # it. 

1596 # For now go with option 2 

1597 if location.pathInStore.isabs():

1598 template = self.templates.getTemplate(ref) 

1599 pathInStore = template.format(ref) 

1600 

1601 exportUri = directoryUri.join(pathInStore) 

1602 exportUri.transfer_from(storeUri, transfer=transfer) 

1603 

1604 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

1605 
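# Illustrative sketch, assuming "datastore" and "refs" exist and that
# "/tmp/export" is an existing directory; disassembled datasets are
# currently rejected with NotImplementedError:
#
#     for dataset in datastore.export(refs, directory="/tmp/export",
#                                     transfer="copy"):
#         print(dataset.path, dataset.formatter)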

1606 @staticmethod 

1607 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

1608 """Compute the checksum of the supplied file. 

1609 

1610 Parameters 

1611 ---------- 

1612 uri : `ButlerURI` 

1613 Name of resource to calculate checksum from. 

1614 algorithm : `str`, optional 

1615 Name of algorithm to use. Must be one of the algorithms supported 

1616 by :py:mod:`hashlib`.

1617 block_size : `int`, optional

1618 Number of bytes to read from file at one time. 

1619 

1620 Returns 

1621 ------- 

1622 hexdigest : `str` or `None`

1623 Hex digest of the file. 

1624 

1625 Notes 

1626 ----- 

1627 Currently returns None if the URI is for a remote resource. 

1628 """ 

1629 if algorithm not in hashlib.algorithms_guaranteed:

1630 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

1631 

1632 if not uri.isLocal:

1633 return None 

1634 

1635 hasher = hashlib.new(algorithm) 

1636 

1637 with uri.as_local() as local_uri: 

1638 with open(local_uri.ospath, "rb") as f: 

1639 for chunk in iter(lambda: f.read(block_size), b""): 

1640 hasher.update(chunk) 

1641 

1642 return hasher.hexdigest()
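# Illustrative sketch; the file path below is a hypothetical example,
# and remote (non-local) URIs currently return None:
#
#     digest = FileDatastore.computeChecksum(ButlerURI("/tmp/file.fits"),
#                                            algorithm="blake2b",
#                                            block_size=8192)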