
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileLikeDatastore", ) 

26 

27import hashlib 

28import logging 

29import os 

30from abc import abstractmethod 

31 

32from sqlalchemy import BigInteger, String 

33 

34from dataclasses import dataclass 

35from typing import ( 

36 TYPE_CHECKING, 

37 Any, 

38 ClassVar, 

39 Dict, 

40 Iterable, 

41 List, 

42 Mapping, 

43 Optional, 

44 Set, 

45 Tuple, 

46 Type, 

47 Union, 

48) 

49 

50from lsst.daf.butler import ( 

51 ButlerURI, 

52 CompositesMap, 

53 Config, 

54 FileDataset, 

55 DatasetRef, 

56 DatasetType, 

57 DatasetTypeNotSupportedError, 

58 Datastore, 

59 DatastoreConfig, 

60 DatastoreValidationError, 

61 FileDescriptor, 

62 FileTemplates, 

63 FileTemplateValidationError, 

64 Formatter, 

65 FormatterFactory, 

66 Location, 

67 LocationFactory, 

68 StorageClass, 

69 StoredFileInfo, 

70) 

71 

72from lsst.daf.butler import ddl 

73from lsst.daf.butler.registry.interfaces import ( 

74 ReadOnlyDatabaseError, 

75 DatastoreRegistryBridge, 

76) 

77 

78from lsst.daf.butler.core.repoRelocation import replaceRoot 

79from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

80from .genericDatastore import GenericBaseDatastore 

81 

82 if TYPE_CHECKING: 

83 from lsst.daf.butler import LookupKey 

84 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

85 

86log = logging.getLogger(__name__) 

87 

88# String to use when a Python None is encountered 

89NULLSTR = "__NULL_STRING__" 

90 

91 

92class _IngestPrepData(Datastore.IngestPrepData): 

93 """Helper class for FileLikeDatastore ingest implementation. 

94 

95 Parameters 

96 ---------- 

97 datasets : `list` of `FileDataset` 

98 Files to be ingested by this datastore. 

99 """ 

100 def __init__(self, datasets: List[FileDataset]): 

101 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

102 self.datasets = datasets 

103 

104 

105@dataclass(frozen=True) 

106class DatastoreFileGetInformation: 

107 """Collection of useful parameters needed to retrieve a file from 

108 a Datastore. 

109 """ 

110 

111 location: Location 

112 """The location from which to read the dataset.""" 

113 

114 formatter: Formatter 

115 """The `Formatter` to use to deserialize the dataset.""" 

116 

117 info: StoredFileInfo 

118 """Stored information about this file and its formatter.""" 

119 

120 assemblerParams: Dict[str, Any] 

121 """Parameters to use for post-processing the retrieved dataset.""" 

122 

123 formatterParams: Dict[str, Any] 

124 """Parameters that were understood by the associated formatter.""" 

125 

126 component: Optional[str] 

127 """The component to be retrieved (can be `None`).""" 

128 

129 readStorageClass: StorageClass 

130 """The `StorageClass` of the dataset being read.""" 

131 

132 

133class FileLikeDatastore(GenericBaseDatastore): 

134 """Generic Datastore for file-based implementations. 

135 

136 Should always be sub-classed since key abstract methods are missing. 

137 

138 Parameters 

139 ---------- 

140 config : `DatastoreConfig` or `str` 

141 Configuration as either a `Config` object or URI to file. 

142 bridgeManager : `DatastoreRegistryBridgeManager` 

143 Object that manages the interface between `Registry` and datastores. 

144 butlerRoot : `str`, optional 

145 New datastore root to use to override the configuration value. 

146 

147 Raises 

148 ------ 

149 ValueError 

150 If root location does not exist and ``create`` is `False` in the 

151 configuration. 

152 """ 

153 

154 defaultConfigFile: ClassVar[Optional[str]] = None 

155 """Path to configuration defaults. Accessed within the ``config`` resource 

156 or relative to a search path. Can be None if no defaults specified. 

157 """ 

158 

159 root: ButlerURI 

160 """Root directory URI of this `Datastore`.""" 

161 

162 locationFactory: LocationFactory 

163 """Factory for creating locations relative to the datastore root.""" 

164 

165 formatterFactory: FormatterFactory 

166 """Factory for creating instances of formatters.""" 

167 

168 templates: FileTemplates 

169 """File templates that can be used by this `Datastore`.""" 

170 

171 composites: CompositesMap 

172 """Determines whether a dataset should be disassembled on put.""" 

173 

174 @classmethod 

175 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

176 """Set any filesystem-dependent config options for this Datastore to 

177 be appropriate for a new empty repository with the given root. 

178 

179 Parameters 

180 ---------- 

181 root : `str` 

182 URI to the root of the data repository. 

183 config : `Config` 

184 A `Config` to update. Only the subset understood by 

185 this component will be updated. Will not expand 

186 defaults. 

187 full : `Config` 

188 A complete config with all defaults expanded that can be 

189 converted to a `DatastoreConfig`. Read-only and will not be 

190 modified by this method. 

191 Repository-specific options that should not be obtained 

192 from defaults when Butler instances are constructed 

193 should be copied from ``full`` to ``config``. 

194 overwrite : `bool`, optional 

195 If `False`, do not modify a value in ``config`` if the value 

196 already exists. Default is always to overwrite with the provided 

197 ``root``. 

198 

199 Notes 

200 ----- 

201 If a keyword is explicitly defined in the supplied ``config`` it 

202 will not be overridden by this method if ``overwrite`` is `False`. 

203 This allows explicit values set in external configs to be retained. 

204 """ 

205 Config.updateParameters(DatastoreConfig, config, full, 

206 toUpdate={"root": root}, 

207 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

208 

209 @classmethod 

210 def makeTableSpec(cls) -> ddl.TableSpec: 

211 return ddl.TableSpec( 

212 fields=[ 

213 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True), 

214 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

215 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

216 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

217 # Use empty string to indicate no component 

218 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

219 # TODO: should checksum be Base64Bytes instead? 

220 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

221 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

222 ], 

223 unique=frozenset(), 

224 ) 
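# Illustrative sketch (not part of the original source; all values are assumed):
# a row in the opaque records table described by this spec, with the NULLSTR
# sentinel standing in for "no component" so the column can be part of the
# primary key.
#
#     {"dataset_id": 42, "path": "raw/r/file.fits",
#      "formatter": "mypackage.formatters.MyFormatter",
#      "storage_class": "ExposureF", "component": "__NULL_STRING__",
#      "checksum": None, "file_size": 123456}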

225 

226 def __init__(self, config: Union[DatastoreConfig, str], 

227 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

228 super().__init__(config, bridgeManager) 

229 if "root" not in self.config: 

230 raise ValueError("No root directory specified in configuration") 

231 

232 # Name ourselves either using an explicit name or a name 

233 # derived from the (unexpanded) root 

234 if "name" in self.config: 

235 self.name = self.config["name"] 

236 else: 

237 # We use the unexpanded root in the name to indicate that this 

238 # datastore can be moved without having to update registry. 

239 self.name = "{}@{}".format(type(self).__name__, 

240 self.config["root"]) 

241 

242 # Support repository relocation in config 

243 # Existence of self.root is checked in subclass 

244 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

245 forceDirectory=True, forceAbsolute=True) 

246 

247 self.locationFactory = LocationFactory(self.root) 

248 self.formatterFactory = FormatterFactory() 

249 

250 # Now associate formatters with storage classes 

251 self.formatterFactory.registerFormatters(self.config["formatters"], 

252 universe=bridgeManager.universe) 

253 

254 # Read the file naming templates 

255 self.templates = FileTemplates(self.config["templates"], 

256 universe=bridgeManager.universe) 

257 

258 # See if composites should be disassembled 

259 self.composites = CompositesMap(self.config["composites"], 

260 universe=bridgeManager.universe) 

261 

262 tableName = self.config["records", "table"] 

263 try: 

264 # Storage of paths and formatters, keyed by dataset_id 

265 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec()) 

266 # Interface to Registry. 

267 self._bridge = bridgeManager.register(self.name) 

268 except ReadOnlyDatabaseError: 

269 # If the database is read only and we just tried and failed to 

270 # create a table, it means someone is trying to create a read-only 

271 # butler client for an empty repo. That should be okay, as long 

272 # as they then try to get any datasets before some other client 

273 # creates the table. Chances are they're just validating 

274 # configuration. 

275 pass 

276 

277 # Determine whether checksums should be used 

278 self.useChecksum = self.config.get("checksum", True) 

279 

280 def __str__(self) -> str: 

281 return str(self.root) 

282 

283 @property 

284 def bridge(self) -> DatastoreRegistryBridge: 

285 return self._bridge 

286 

287 def _artifact_exists(self, location: Location) -> bool: 

288 """Check that an artifact exists in this datastore at the specified 

289 location. 

290 

291 Parameters 

292 ---------- 

293 location : `Location` 

294 Expected location of the artifact associated with this datastore. 

295 

296 Returns 

297 ------- 

298 exists : `bool` 

299 `True` if the location can be found, `False` otherwise. 

300 """ 

301 log.debug("Checking if resource exists: %s", location.uri) 

302 return location.uri.exists() 

303 

304 def _delete_artifact(self, location: Location) -> None: 

305 """Delete the artifact from the datastore. 

306 

307 Parameters 

308 ---------- 

309 location : `Location` 

310 Location of the artifact associated with this datastore. 

311 """ 

312 log.debug("Deleting file: %s", location.uri) 

313 location.uri.remove() 

314 log.debug("Successfully deleted file: %s", location.uri) 

315 

316 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

317 # Docstring inherited from GenericBaseDatastore 

318 records = [] 

319 for ref, info in zip(refs, infos): 

320 # Component should come from ref and fall back on info 

321 component = ref.datasetType.component() 

322 if component is None and info.component is not None: 

323 component = info.component 

324 if component is None: 

325 # Use empty string since we want this to be part of the 

326 # primary key. 

327 component = NULLSTR 

328 records.append( 

329 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path, 

330 storage_class=info.storageClass.name, component=component, 

331 checksum=info.checksum, file_size=info.file_size) 

332 ) 

333 self._table.insert(*records) 

334 

335 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

336 # Docstring inherited from GenericBaseDatastore 

337 

338 # Look for the dataset_id -- there might be multiple matches 

339 # if we have disassembled the dataset. 

340 records = list(self._table.fetch(dataset_id=ref.id)) 

341 

342 results = [] 

343 for record in records: 

344 # Convert name of StorageClass to instance 

345 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

346 component = record["component"] if (record["component"] 

347 and record["component"] != NULLSTR) else None 

348 

349 info = StoredFileInfo(formatter=record["formatter"], 

350 path=record["path"], 

351 storageClass=storageClass, 

352 component=component, 

353 checksum=record["checksum"], 

354 file_size=record["file_size"]) 

355 results.append(info) 

356 

357 return results 

358 

359 def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]: 

360 """Return all dataset refs associated with the supplied path. 

361 

362 Parameters 

363 ---------- 

364 pathInStore : `str` 

365 Path of interest in the data store. 

366 

367 Returns 

368 ------- 

369 ids : `set` of `int` 

370 All `DatasetRef` IDs associated with this path. 

371 """ 

372 records = list(self._table.fetch(path=pathInStore)) 

373 ids = {r["dataset_id"] for r in records} 

374 return ids 

375 

376 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

377 # Docstring inherited from GenericBaseDatastore 

378 self._table.delete(dataset_id=ref.id) 

379 

380 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

381 r"""Find all the `Location`\ s of the requested dataset in the 

382 `Datastore` and the associated stored file information. 

383 

384 Parameters 

385 ---------- 

386 ref : `DatasetRef` 

387 Reference to the required `Dataset`. 

388 

389 Returns 

390 ------- 

391 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

392 Location of the dataset within the datastore and 

393 stored information about each file and its formatter. 

394 """ 

395 # Get the file information (this will fail if no file) 

396 records = self.getStoredItemsInfo(ref) 

397 

398 # Use the path to determine the location 

399 return [(self.locationFactory.fromPath(r.path), r) for r in records] 

400 

401 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

402 """Check that there is only one dataset associated with the 

403 specified artifact. 

404 

405 Parameters 

406 ---------- 

407 ref : `DatasetRef` or `FakeDatasetRef` 

408 Dataset to be removed. 

409 location : `Location` 

410 The location of the artifact to be removed. 

411 

412 Returns 

413 ------- 

414 can_remove : `bool` 

415 True if the artifact can be safely removed. 

416 """ 

417 

418 # Get all entries associated with this path 

419 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

420 if not allRefs: 

421 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

422 

423 # Remove these refs from all the refs and if there is nothing left 

424 # then we can delete 

425 remainingRefs = allRefs - {ref.id} 

426 

427 if remainingRefs: 

428 return False 

429 return True 

430 

431 def _prepare_for_get(self, ref: DatasetRef, 

432 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

433 """Check parameters for ``get`` and obtain formatter and 

434 location. 

435 

436 Parameters 

437 ---------- 

438 ref : `DatasetRef` 

439 Reference to the required Dataset. 

440 parameters : `dict` 

441 `StorageClass`-specific parameters that specify, for example, 

442 a slice of the dataset to be loaded. 

443 

444 Returns 

445 ------- 

446 getInfo : `list` [`DatastoreFileGetInformation`] 

447 Parameters needed to retrieve each file. 

448 """ 

449 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

450 

451 # Get file metadata and internal metadata 

452 fileLocations = self._get_dataset_locations_info(ref) 

453 if not fileLocations: 

454 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

455 

456 # The storage class we want to use eventually 

457 refStorageClass = ref.datasetType.storageClass 

458 

459 if len(fileLocations) > 1: 

460 disassembled = True 

461 else: 

462 disassembled = False 

463 

464 # Is this a component request? 

465 refComponent = ref.datasetType.component() 

466 

467 fileGetInfo = [] 

468 for location, storedFileInfo in fileLocations: 

469 

470 # The storage class used to write the file 

471 writeStorageClass = storedFileInfo.storageClass 

472 

473 # If this has been disassembled we need read to match the write 

474 if disassembled: 

475 readStorageClass = writeStorageClass 

476 else: 

477 readStorageClass = refStorageClass 

478 

479 formatter = getInstanceOf(storedFileInfo.formatter, 

480 FileDescriptor(location, readStorageClass=readStorageClass, 

481 storageClass=writeStorageClass, parameters=parameters), 

482 ref.dataId) 

483 

484 formatterParams, notFormatterParams = formatter.segregateParameters() 

485 

486 # Of the remaining parameters, extract the ones supported by 

487 # this StorageClass (for components not all will be handled) 

488 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

489 

490 # The ref itself could be a component if the dataset was 

491 # disassembled by butler, or we disassembled in datastore and 

492 # components came from the datastore records 

493 component = storedFileInfo.component if storedFileInfo.component else refComponent 

494 

495 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

496 assemblerParams, formatterParams, 

497 component, readStorageClass)) 

498 

499 return fileGetInfo 

500 

501 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

502 """Check the arguments for ``put`` and obtain formatter and 

503 location. 

504 

505 Parameters 

506 ---------- 

507 inMemoryDataset : `object` 

508 The dataset to store. 

509 ref : `DatasetRef` 

510 Reference to the associated Dataset. 

511 

512 Returns 

513 ------- 

514 location : `Location` 

515 The location to write the dataset. 

516 formatter : `Formatter` 

517 The `Formatter` to use to write the dataset. 

518 

519 Raises 

520 ------ 

521 TypeError 

522 Supplied object and storage class are inconsistent. 

523 DatasetTypeNotSupportedError 

524 The associated `DatasetType` is not handled by this datastore. 

525 """ 

526 self._validate_put_parameters(inMemoryDataset, ref) 

527 

528 # Work out output file name 

529 try: 

530 template = self.templates.getTemplate(ref) 

531 except KeyError as e: 

532 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

533 

534 # Validate the template to protect against filenames from different 

535 # dataIds returning the same and causing overwrite confusion. 

536 template.validateTemplate(ref) 

537 

538 location = self.locationFactory.fromPath(template.format(ref)) 

539 

540 # Get the formatter based on the storage class 

541 storageClass = ref.datasetType.storageClass 

542 try: 

543 formatter = self.formatterFactory.getFormatter(ref, 

544 FileDescriptor(location, 

545 storageClass=storageClass), 

546 ref.dataId) 

547 except KeyError as e: 

548 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

549 f"{self.name}") from e 

550 

551 # Now that we know the formatter, update the location 

552 location = formatter.makeUpdatedLocation(location) 

553 

554 return location, formatter 

555 

556 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

557 # Docstring inherited from base class 

558 if transfer != "auto": 

559 return transfer 

560 

561 # See if the paths are within the datastore or not 

562 inside = [self._pathInStore(d.path) is not None for d in datasets] 

563 

564 if all(inside): 

565 transfer = None 

566 elif not any(inside): 

567 # Allow ButlerURI to use its own knowledge 

568 transfer = "auto" 

569 else: 

570 raise ValueError("Some datasets are inside the datastore and some are outside." 

571 " Please use an explicit transfer mode and not 'auto'.") 

572 

573 return transfer 

574 

575 def _pathInStore(self, path: str) -> Optional[str]: 

576 """Return path relative to datastore root 

577 

578 Parameters 

579 ---------- 

580 path : `str` 

581 Path to dataset. Can be absolute. If relative assumed to 

582 be relative to the datastore. Returns path in datastore 

583 or `None` if the path is outside. 

584 

585 Returns 

586 ------- 

587 inStore : `str` 

588 Path relative to datastore root. Returns `None` if the file is 

589 outside the root. 

590 """ 

591 # Relative path will always be relative to datastore 

592 pathUri = ButlerURI(path, forceAbsolute=False) 

593 return pathUri.relative_to(self.root) 
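# Illustrative sketch (not part of the original source; the root and paths are
# assumed): with the datastore rooted at file:///repo/main/,
#
#     self._pathInStore("raw/file.fits")                    # -> "raw/file.fits"
#     self._pathInStore("file:///repo/main/raw/file.fits")  # -> "raw/file.fits"
#     self._pathInStore("file:///elsewhere/file.fits")      # -> None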

594 

595 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str: 

596 """Standardize the path of a to-be-ingested file. 

597 

598 Parameters 

599 ---------- 

600 path : `str` 

601 Path of a file to be ingested. 

602 transfer : `str`, optional 

603 How (and whether) the dataset should be added to the datastore. 

604 See `ingest` for details of transfer modes. 

605 This implementation is provided only so 

606 `NotImplementedError` can be raised if the mode is not supported; 

607 actual transfers are deferred to `_extractIngestInfo`. 

608 

609 Returns 

610 ------- 

611 path : `str` 

612 New path in what the datastore considers standard form. 

613 

614 Notes 

615 ----- 

616 Subclasses of `FileLikeDatastore` can implement this method instead 

617 of `_prepIngest`. It should not modify the data repository or given 

618 file in any way. 

619 

620 Raises 

621 ------ 

622 NotImplementedError 

623 Raised if the datastore does not support the given transfer mode 

624 (including the case where ingest is not supported at all). 

625 FileNotFoundError 

626 Raised if one of the given files does not exist. 

627 """ 

628 if transfer not in (None,) + self.root.transferModes: 

629 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

630 

631 # A relative URI indicates relative to datastore root 

632 srcUri = ButlerURI(path, forceAbsolute=False) 

633 if not srcUri.isabs(): 

634 srcUri = self.root.join(path) 

635 

636 if not srcUri.exists(): 

637 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

638 f"are assumed to be relative to {self.root} unless they are absolute.") 

639 

640 if transfer is None: 

641 relpath = srcUri.relative_to(self.root) 

642 if not relpath: 

643 raise RuntimeError(f"Transfer mode is None but source file ({srcUri}) is not " 

644 f"within datastore ({self.root})") 

645 

646 # Return the relative path within the datastore for internal 

647 # transfer 

648 path = relpath 

649 

650 return path 

651 

652 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

653 formatter: Union[Formatter, Type[Formatter]], 

654 transfer: Optional[str] = None) -> StoredFileInfo: 

655 """Relocate (if necessary) and extract `StoredFileInfo` from a 

656 to-be-ingested file. 

657 

658 Parameters 

659 ---------- 

660 path : `str` or `ButlerURI` 

661 URI or path of a file to be ingested. 

662 ref : `DatasetRef` 

663 Reference for the dataset being ingested. Guaranteed to have 

664 ``dataset_id`` not `None`. 

665 formatter : `type` or `Formatter` 

666 `Formatter` subclass to use for this dataset or an instance. 

667 transfer : `str`, optional 

668 How (and whether) the dataset should be added to the datastore. 

669 See `ingest` for details of transfer modes. 

670 

671 Returns 

672 ------- 

673 info : `StoredFileInfo` 

674 Internal datastore record for this file. This will be inserted by 

675 the caller; `_extractIngestInfo` is only responsible for 

676 creating and populating the struct. 

677 

678 Raises 

679 ------ 

680 FileNotFoundError 

681 Raised if one of the given files does not exist. 

682 FileExistsError 

683 Raised if transfer is not `None` but the (internal) location the 

684 file would be moved to is already occupied. 

685 """ 

686 if self._transaction is None: 

687 raise RuntimeError("Ingest called without transaction enabled") 

688 

689 # Create URI of the source path, do not need to force a relative 

690 # path to absolute. 

691 srcUri = ButlerURI(path, forceAbsolute=False) 

692 

693 # Track whether we have read the size of the source yet 

694 have_sized = False 

695 

696 if transfer is None: 

697 # A relative path is assumed to be relative to the datastore 

698 # in this context 

699 if not srcUri.isabs(): 

700 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

701 else: 

702 # Work out the path in the datastore from an absolute URI 

703 # This is required to be within the datastore. 

704 pathInStore = srcUri.relative_to(self.root) 

705 if pathInStore is None: 

706 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

707 f"not within datastore {self.root}") 

708 tgtLocation = self.locationFactory.fromPath(pathInStore) 

709 else: 

710 # Work out the name we want this ingested file to have 

711 # inside the datastore 

712 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

713 

714 # if we are transferring from a local file to a remote location 

715 # it may be more efficient to get the size and checksum of the 

716 # local file rather than the transferred one 

717 if not srcUri.scheme or srcUri.scheme == "file": 

718 size = srcUri.size() 

719 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

720 have_sized = True 

721 # transfer the resource to the destination 

722 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

723 

724 # the file should exist in the datastore now 

725 if not have_sized: 

726 size = tgtLocation.uri.size() 

727 checksum = self.computeChecksum(tgtLocation.uri) if self.useChecksum else None 

728 

729 return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore, 

730 storageClass=ref.datasetType.storageClass, 

731 component=ref.datasetType.component(), 

732 file_size=size, checksum=checksum) 

733 

734 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

735 # Docstring inherited from Datastore._prepIngest. 

736 filtered = [] 

737 for dataset in datasets: 

738 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

739 if not acceptable: 

740 continue 

741 else: 

742 dataset.refs = acceptable 

743 if dataset.formatter is None: 

744 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

745 else: 

746 assert isinstance(dataset.formatter, (type, str)) 

747 dataset.formatter = getClassOf(dataset.formatter) 

748 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

749 filtered.append(dataset) 

750 return _IngestPrepData(filtered) 

751 

752 @transactional 

753 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

754 # Docstring inherited from Datastore._finishIngest. 

755 refsAndInfos = [] 

756 for dataset in prepData.datasets: 

757 # Do ingest as if the first dataset ref is associated with the file 

758 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

759 transfer=transfer) 

760 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

761 self._register_datasets(refsAndInfos) 

762 

763 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

764 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

765 """Given a source URI and a DatasetRef, determine the name the 

766 dataset will have inside datastore. 

767 

768 Parameters 

769 ---------- 

770 srcUri : `ButlerURI` 

771 URI to the source dataset file. 

772 ref : `DatasetRef` 

773 Ref associated with the newly-ingested dataset artifact. This 

774 is used to determine the name within the datastore. 

775 formatter : `Formatter` or Formatter class. 

776 Formatter to use for validation. Can be a class or an instance. 

777 

778 Returns 

779 ------- 

780 location : `Location` 

781 Target location for the newly-ingested dataset. 

782 """ 

783 # Ingesting a file from outside the datastore. 

784 # This involves a new name. 

785 template = self.templates.getTemplate(ref) 

786 location = self.locationFactory.fromPath(template.format(ref)) 

787 

788 # Get the extension 

789 ext = srcUri.getExtension() 

790 

791 # Update the destination to include that extension 

792 location.updateExtension(ext) 

793 

794 # Ask the formatter to validate this extension 

795 formatter.validateExtension(location) 

796 

797 return location 

798 

799 @abstractmethod 

800 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

801 """Write out in memory dataset to datastore. 

802 

803 Parameters 

804 ---------- 

805 inMemoryDataset : `object` 

806 Dataset to write to datastore. 

807 ref : `DatasetRef` 

808 Registry information associated with this dataset. 

809 

810 Returns 

811 ------- 

812 info : `StoredFileInfo` 

813 Information describing the artifact written to the datastore. 

814 """ 

815 raise NotImplementedError() 

816 

817 @abstractmethod 

818 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

819 ref: DatasetRef, isComponent: bool = False) -> Any: 

820 """Read the artifact from datastore into in memory object. 

821 

822 Parameters 

823 ---------- 

824 getInfo : `DatastoreFileGetInformation` 

825 Information about the artifact within the datastore. 

826 ref : `DatasetRef` 

827 The registry information associated with this artifact. 

828 isComponent : `bool` 

829 Flag to indicate if a component is being read from this artifact. 

830 

831 Returns 

832 ------- 

833 inMemoryDataset : `object` 

834 The artifact as a python object. 

835 """ 

836 raise NotImplementedError() 

837 

838 def exists(self, ref: DatasetRef) -> bool: 

839 """Check if the dataset exists in the datastore. 

840 

841 Parameters 

842 ---------- 

843 ref : `DatasetRef` 

844 Reference to the required dataset. 

845 

846 Returns 

847 ------- 

848 exists : `bool` 

849 `True` if the entity exists in the `Datastore`. 

850 """ 

851 fileLocations = self._get_dataset_locations_info(ref) 

852 if not fileLocations: 

853 return False 

854 for location, _ in fileLocations: 

855 if not self._artifact_exists(location): 

856 return False 

857 

858 return True 

859 

860 def getURIs(self, ref: DatasetRef, 

861 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

862 """Return URIs associated with dataset. 

863 

864 Parameters 

865 ---------- 

866 ref : `DatasetRef` 

867 Reference to the required dataset. 

868 predict : `bool`, optional 

869 If the datastore does not know about the dataset, should it 

870 return a predicted URI or not? 

871 

872 Returns 

873 ------- 

874 primary : `ButlerURI` 

875 The URI to the primary artifact associated with this dataset. 

876 If the dataset was disassembled within the datastore this 

877 may be `None`. 

878 components : `dict` 

879 URIs to any components associated with the dataset artifact. 

880 Can be empty if there are no components. 

881 """ 

882 

883 primary: Optional[ButlerURI] = None 

884 components: Dict[str, ButlerURI] = {} 

885 

886 # if this has never been written then we have to guess 

887 if not self.exists(ref): 

888 if not predict: 

889 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

890 

891 def predictLocation(thisRef: DatasetRef) -> Location: 

892 template = self.templates.getTemplate(thisRef) 

893 location = self.locationFactory.fromPath(template.format(thisRef)) 

894 storageClass = ref.datasetType.storageClass 

895 formatter = self.formatterFactory.getFormatter(thisRef, 

896 FileDescriptor(location, 

897 storageClass=storageClass)) 

898 # Try to use the extension attribute but ignore problems if the 

899 # formatter does not define one. 

900 try: 

901 location = formatter.makeUpdatedLocation(location) 

902 except Exception: 

903 # Use the default extension 

904 pass 

905 return location 

906 

907 doDisassembly = self.composites.shouldBeDisassembled(ref) 

908 

909 if doDisassembly: 

910 

911 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

912 compRef = ref.makeComponentRef(component) 

913 compLocation = predictLocation(compRef) 

914 

915 # Add a URI fragment to indicate this is a guess 

916 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

917 

918 else: 

919 

920 location = predictLocation(ref) 

921 

922 # Add a URI fragment to indicate this is a guess 

923 primary = ButlerURI(location.uri.geturl() + "#predicted") 

924 

925 return primary, components 

926 

927 # If this is a ref that we have written we can get the path. 

928 # Get file metadata and internal metadata 

929 fileLocations = self._get_dataset_locations_info(ref) 

930 

931 if not fileLocations: 

932 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

933 

934 if len(fileLocations) == 1: 

935 # No disassembly so this is the primary URI 

936 primary = ButlerURI(fileLocations[0][0].uri) 

937 

938 else: 

939 for location, storedFileInfo in fileLocations: 

940 if storedFileInfo.component is None: 

941 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

942 components[storedFileInfo.component] = ButlerURI(location.uri) 

943 

944 return primary, components 
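# Illustrative sketch (not part of the original source; the component names are
# assumed): a dataset disassembled on put reports no primary URI and one URI per
# component, while a single-file dataset reports only a primary URI.
#
#     primary, components = datastore.getURIs(ref)
#     # disassembled: primary is None, components == {"image": ..., "mask": ...}
#     # single file:  primary is a ButlerURI, components == {}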

945 

946 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

947 """URI to the Dataset. 

948 

949 Parameters 

950 ---------- 

951 ref : `DatasetRef` 

952 Reference to the required Dataset. 

953 predict : `bool` 

954 If `True`, allow URIs to be returned of datasets that have not 

955 been written. 

956 

957 Returns 

958 ------- 

959 uri : `ButlerURI` 

960 URI pointing to the dataset within the datastore. If the 

961 dataset does not exist in the datastore, and if ``predict`` is 

962 `True`, the URI will be a prediction and will include a URI 

963 fragment "#predicted". 

964 If the datastore does not have entities that relate well 

965 to the concept of a URI the returned URI will be 

966 descriptive. The returned URI is not guaranteed to be obtainable. 

967 

968 Raises 

969 ------ 

970 FileNotFoundError 

971 Raised if a URI has been requested for a dataset that does not 

972 exist and guessing is not allowed. 

973 RuntimeError 

974 Raised if a request is made for a single URI but multiple URIs 

975 are associated with this dataset. 

976 

977 Notes 

978 ----- 

979 When a predicted URI is requested an attempt will be made to form 

980 a reasonable URI based on file templates and the expected formatter. 

981 """ 

982 primary, components = self.getURIs(ref, predict) 

983 if primary is None or components: 

984 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

985 "Use Datastore.getURIs() instead.") 

986 return primary 

987 

988 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

989 """Load an InMemoryDataset from the store. 

990 

991 Parameters 

992 ---------- 

993 ref : `DatasetRef` 

994 Reference to the required Dataset. 

995 parameters : `dict` 

996 `StorageClass`-specific parameters that specify, for example, 

997 a slice of the dataset to be loaded. 

998 

999 Returns 

1000 ------- 

1001 inMemoryDataset : `object` 

1002 Requested dataset or slice thereof as an InMemoryDataset. 

1003 

1004 Raises 

1005 ------ 

1006 FileNotFoundError 

1007 Requested dataset can not be retrieved. 

1008 TypeError 

1009 Return value from formatter has unexpected type. 

1010 ValueError 

1011 Formatter failed to process the dataset. 

1012 """ 

1013 allGetInfo = self._prepare_for_get(ref, parameters) 

1014 refComponent = ref.datasetType.component() 

1015 

1016 # Supplied storage class for the component being read 

1017 refStorageClass = ref.datasetType.storageClass 

1018 

1019 # Create mapping from component name to related info 

1020 allComponents = {i.component: i for i in allGetInfo} 

1021 

1022 # By definition the dataset is disassembled if we have more 

1023 # than one record for it. 

1024 isDisassembled = len(allGetInfo) > 1 

1025 

1026 # Look for the special case where we are disassembled but the 

1027 # component is a read-only component that was not written during 

1028 # disassembly. For this scenario we need to check that the 

1029 # component requested is listed as a read-only component for the 

1030 # composite storage class 

1031 isDisassembledReadOnlyComponent = False 

1032 if isDisassembled and refComponent: 

1033 # The composite storage class should be accessible through 

1034 # the component dataset type 

1035 compositeStorageClass = ref.datasetType.parentStorageClass 

1036 

1037 # In the unlikely scenario where the composite storage 

1038 # class is not known, we can only assume that this is a 

1039 # normal component. If that assumption is wrong then the 

1040 # branch below that reads a persisted component will fail 

1041 # so there is no need to complain here. 

1042 if compositeStorageClass is not None: 

1043 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.readComponents 

1044 

1045 if isDisassembled and not refComponent: 

1046 # This was a disassembled dataset spread over multiple files 

1047 # and we need to put them all back together again. 

1048 # Read into memory and then assemble 

1049 

1050 # Check that the supplied parameters are suitable for the type read 

1051 refStorageClass.validateParameters(parameters) 

1052 

1053 # We want to keep track of all the parameters that were not used 

1054 # by formatters. We assume that if any of the component formatters 

1055 # use a parameter that we do not need to apply it again in the 

1056 # assembler. 

1057 usedParams = set() 

1058 

1059 components: Dict[str, Any] = {} 

1060 for getInfo in allGetInfo: 

1061 # assemblerParams are parameters not understood by the 

1062 # associated formatter. 

1063 usedParams.update(set(getInfo.formatterParams)) 

1064 

1065 component = getInfo.component 

1066 

1067 if component is None: 

1068 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1069 

1070 # We do not want the formatter to think it's reading 

1071 # a component though because it is really reading a 

1072 # standalone dataset -- always tell reader it is not a 

1073 # component. 

1074 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1075 

1076 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components) 

1077 

1078 # Any unused parameters will have to be passed to the assembler 

1079 if parameters: 

1080 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1081 else: 

1082 unusedParams = {} 

1083 

1084 # Process parameters 

1085 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset, 

1086 parameters=unusedParams) 

1087 

1088 elif isDisassembledReadOnlyComponent: 

1089 

1090 compositeStorageClass = ref.datasetType.parentStorageClass 

1091 if compositeStorageClass is None: 

1092 raise RuntimeError(f"Unable to retrieve read-only component '{refComponent}' since " 

1093 "no composite storage class is available.") 

1094 

1095 if refComponent is None: 

1096 # Mainly for mypy 

1097 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1098 

1099 # Assume that every read-only component can be calculated by 

1100 # forwarding the request to a single read/write component. 

1101 # Rather than guessing which rw component is the right one by 

1102 # scanning each for a read-only component of the same name, 

1103 # we ask the composite assembler directly which one is best to 

1104 # use. 

1105 compositeAssembler = compositeStorageClass.assembler() 

1106 forwardedComponent = compositeAssembler.selectResponsibleComponent(refComponent, 

1107 set(allComponents)) 

1108 

1109 # Select the relevant component 

1110 rwInfo = allComponents[forwardedComponent] 

1111 

1112 # For now assume that read parameters are validated against 

1113 # the real component and not the requested component 

1114 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1115 forwardedStorageClass.validateParameters(parameters) 

1116 

1117 # Unfortunately the FileDescriptor inside the formatter will have 

1118 # the wrong write storage class so we need to create a new one 

1119 # given the immutability constraint. 

1120 writeStorageClass = rwInfo.info.storageClass 

1121 

1122 # We may need to put some thought into parameters for read 

1123 # components but for now forward them on as is 

1124 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1125 readStorageClass=refStorageClass, 

1126 storageClass=writeStorageClass, 

1127 parameters=parameters), 

1128 ref.dataId) 

1129 

1130 # The assembler can not receive any parameter requests for a 

1131 # read-only component at this time since the assembler will 

1132 # see the storage class of the read-only component and those 

1133 # parameters will have to be handled by the formatter on the 

1134 # forwarded storage class. 

1135 assemblerParams: Dict[str, Any] = {} 

1136 

1137 # Need to create a new info that specifies the read-only 

1138 # component and associated storage class 

1139 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1140 rwInfo.info, assemblerParams, {}, 

1141 refComponent, refStorageClass) 

1142 

1143 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1144 

1145 else: 

1146 # Single file request or component from that composite file 

1147 for lookup in (refComponent, None): 

1148 if lookup in allComponents: 

1149 getInfo = allComponents[lookup] 

1150 break 

1151 else: 

1152 raise FileNotFoundError(f"Component {refComponent} not found " 

1153 f"for ref {ref} in datastore {self.name}") 

1154 

1155 # Do not need the component itself if already disassembled 

1156 if isDisassembled: 

1157 isComponent = False 

1158 else: 

1159 isComponent = getInfo.component is not None 

1160 

1161 # For a disassembled component we can validate parameters against 

1162 # the component storage class directly 

1163 if isDisassembled: 

1164 refStorageClass.validateParameters(parameters) 

1165 else: 

1166 # For an assembled composite this could be a read-only 

1167 # component derived from a real component. The validity 

1168 # of the parameters is not clear. For now validate against 

1169 # the composite storage class 

1170 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1171 

1172 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 
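# Illustrative sketch (not part of the original source; the "bbox" parameter is
# assumed and only valid for storage classes that define it): parameters passed
# to get() are split between the formatter and the assembler as described above.
#
#     cutout = datastore.get(ref, parameters={"bbox": bbox})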

1173 

1174 @transactional 

1175 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1176 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1177 

1178 Parameters 

1179 ---------- 

1180 inMemoryDataset : `object` 

1181 The dataset to store. 

1182 ref : `DatasetRef` 

1183 Reference to the associated Dataset. 

1184 

1185 Raises 

1186 ------ 

1187 TypeError 

1188 Supplied object and storage class are inconsistent. 

1189 DatasetTypeNotSupportedError 

1190 The associated `DatasetType` is not handled by this datastore. 

1191 

1192 Notes 

1193 ----- 

1194 If the datastore is configured to reject certain dataset types it 

1195 is possible that the put will fail and raise a 

1196 `DatasetTypeNotSupportedError`. The main use case for this is to 

1197 allow `ChainedDatastore` to put to multiple datastores without 

1198 requiring that every datastore accepts the dataset. 

1199 """ 

1200 

1201 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1202 # doDisassembly = True 

1203 

1204 artifacts = [] 

1205 if doDisassembly: 

1206 components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset) 

1207 for component, componentInfo in components.items(): 

1208 # Don't recurse because we want to take advantage of 

1209 # bulk insert -- need a new DatasetRef that refers to the 

1210 # same dataset_id but has the component DatasetType 

1211 # DatasetType does not refer to the types of components 

1212 # So we construct one ourselves. 

1213 compRef = ref.makeComponentRef(component) 

1214 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1215 artifacts.append((compRef, storedInfo)) 

1216 else: 

1217 # Write the entire thing out 

1218 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1219 artifacts.append((ref, storedInfo)) 

1220 

1221 self._register_datasets(artifacts) 
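# Illustrative sketch (not part of the original source): callers do not choose
# disassembly themselves; the composites map decides it, and put() records one
# internal row per artifact actually written.
#
#     datastore.put(inMemoryDataset, ref)  # one file, or one file per component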

1222 

1223 @transactional 

1224 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None: 

1225 """Indicate to the datastore that a dataset can be removed. 

1226 

1227 Parameters 

1228 ---------- 

1229 ref : `DatasetRef` 

1230 Reference to the required Dataset. 

1231 ignore_errors : `bool` 

1232 If `True` return without error even if something went wrong. 

1233 Problems could occur if another process is simultaneously trying 

1234 to delete. 

1235 

1236 Raises 

1237 ------ 

1238 FileNotFoundError 

1239 Attempt to remove a dataset that does not exist. 

1240 """ 

1241 # Get file metadata and internal metadata 

1242 log.debug("Trashing %s in datastore %s", ref, self.name) 

1243 

1244 fileLocations = self._get_dataset_locations_info(ref) 

1245 

1246 if not fileLocations: 

1247 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1248 if ignore_errors: 

1249 log.warning(err_msg) 

1250 return 

1251 else: 

1252 raise FileNotFoundError(err_msg) 

1253 

1254 for location, storedFileInfo in fileLocations: 

1255 if not self._artifact_exists(location): 

1256 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1257 f"associated artifact ({location.uri}) is missing" 

1258 if ignore_errors: 

1259 log.warning(err_msg) 

1260 return 

1261 else: 

1262 raise FileNotFoundError(err_msg) 

1263 

1264 # Mark dataset as trashed 

1265 try: 

1266 self._move_to_trash_in_registry(ref) 

1267 except Exception as e: 

1268 if ignore_errors: 

1269 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1270 f"but encountered an error: {e}") 

1271 pass 

1272 else: 

1273 raise 

1274 

1275 @transactional 

1276 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1277 """Remove all datasets from the trash. 

1278 

1279 Parameters 

1280 ---------- 

1281 ignore_errors : `bool` 

1282 If `True` return without error even if something went wrong. 

1283 Problems could occur if another process is simultaneously trying 

1284 to delete. 

1285 """ 

1286 log.debug("Emptying trash in datastore %s", self.name) 

1287 # Context manager will empty trash iff we finish it without raising. 

1288 with self.bridge.emptyTrash() as trashed: 

1289 for ref in trashed: 

1290 fileLocations = self._get_dataset_locations_info(ref) 

1291 

1292 if not fileLocations: 

1293 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}" 

1294 if ignore_errors: 

1295 log.warning(err_msg) 

1296 continue 

1297 else: 

1298 raise FileNotFoundError(err_msg) 

1299 

1300 for location, _ in fileLocations: 

1301 

1302 if not self._artifact_exists(location): 

1303 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}" 

1304 if ignore_errors: 

1305 log.warning(err_msg) 

1306 continue 

1307 else: 

1308 raise FileNotFoundError(err_msg) 

1309 

1310 # Can only delete the artifact if there are no references 

1311 # to the file from untrashed dataset refs. 

1312 if self._can_remove_dataset_artifact(ref, location): 

1313 # Point of no return for this artifact 

1314 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1315 try: 

1316 self._delete_artifact(location) 

1317 except Exception as e: 

1318 if ignore_errors: 

1319 log.critical("Encountered error removing artifact %s from datastore %s: %s", 

1320 location.uri, self.name, e) 

1321 else: 

1322 raise 

1323 

1324 # Now must remove the entry from the internal registry even if 

1325 # the artifact removal failed and was ignored, 

1326 # otherwise the removal check above will never be true 

1327 try: 

1328 # There may be multiple rows associated with this ref 

1329 # depending on disassembly 

1330 self.removeStoredItemInfo(ref) 

1331 except Exception as e: 

1332 if ignore_errors: 

1333 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s", 

1334 ref.id, location.uri, self.name, e) 

1335 continue 

1336 else: 

1337 raise 
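# Illustrative sketch (not part of the original source): deletion is two-phase.
# trash() only marks the dataset via the registry bridge; a later emptyTrash()
# removes artifacts that no remaining, untrashed ref still points at.
#
#     datastore.trash(ref)
#     datastore.emptyTrash()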

1338 

1339 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1340 logFailures: bool = False) -> None: 

1341 """Validate some of the configuration for this datastore. 

1342 

1343 Parameters 

1344 ---------- 

1345 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1346 Entities to test against this configuration. Can be differing 

1347 types. 

1348 logFailures : `bool`, optional 

1349 If `True`, output a log message for every validation error 

1350 detected. 

1351 

1352 Raises 

1353 ------ 

1354 DatastoreValidationError 

1355 Raised if there is a validation problem with a configuration. 

1356 All the problems are reported in a single exception. 

1357 

1358 Notes 

1359 ----- 

1360 This method checks that all the supplied entities have valid file 

1361 templates and also have formatters defined. 

1362 """ 

1363 

1364 templateFailed = None 

1365 try: 

1366 self.templates.validateTemplates(entities, logFailures=logFailures) 

1367 except FileTemplateValidationError as e: 

1368 templateFailed = str(e) 

1369 

1370 formatterFailed = [] 

1371 for entity in entities: 

1372 try: 

1373 self.formatterFactory.getFormatterClass(entity) 

1374 except KeyError as e: 

1375 formatterFailed.append(str(e)) 

1376 if logFailures: 

1377 log.fatal("Formatter failure: %s", e) 

1378 

1379 if templateFailed or formatterFailed: 

1380 messages = [] 

1381 if templateFailed: 

1382 messages.append(templateFailed) 

1383 if formatterFailed: 

1384 messages.append(",".join(formatterFailed)) 

1385 msg = ";\n".join(messages) 

1386 raise DatastoreValidationError(msg) 

1387 

1388 def getLookupKeys(self) -> Set[LookupKey]: 

1389 # Docstring is inherited from base class 

1390 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1391 self.constraints.getLookupKeys() 

1392 

1393 def validateKey(self, lookupKey: LookupKey, 

1394 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1395 # Docstring is inherited from base class 

1396 # The key can be valid in either formatters or templates so we can 

1397 # only check the template if it exists 

1398 if lookupKey in self.templates: 

1399 try: 

1400 self.templates[lookupKey].validateTemplate(entity) 

1401 except FileTemplateValidationError as e: 

1402 raise DatastoreValidationError(e) from e 

1403 

1404 def export(self, refs: Iterable[DatasetRef], *, 

1405 directory: Optional[Union[ButlerURI, str]] = None, 

1406 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

1407 # Docstring inherited from Datastore.export. 

1408 if transfer is not None and directory is None: 

1409 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

1410 "export directory given") 

1411 

1412 # Force the directory to be a URI object 

1413 directoryUri: Optional[ButlerURI] = None 

1414 if directory is not None: 

1415 directoryUri = ButlerURI(directory, forceDirectory=True) 

1416 

1417 if transfer is not None and directoryUri is not None: 

1418 # mypy needs the second test 

1419 if not directoryUri.exists(): 

1420 raise FileNotFoundError(f"Export location {directory} does not exist") 

1421 

1422 for ref in refs: 

1423 fileLocations = self._get_dataset_locations_info(ref) 

1424 if not fileLocations: 

1425 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

1426 # For now we can not export disassembled datasets 

1427 if len(fileLocations) > 1: 

1428 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

1429 location, storedFileInfo = fileLocations[0] 

1430 if transfer is None: 

1431 # TODO: do we also need to return the readStorageClass somehow? 

1432 # We will use the path in store directly 

1433 pass 

1434 else: 

1435 # mypy needs help 

1436 assert directoryUri is not None, "directoryUri must be defined to get here" 

1437 storeUri = ButlerURI(location.uri) 

1438 exportUri = directoryUri.join(location.pathInStore) 

1439 exportUri.transfer_from(storeUri, transfer=transfer) 

1440 

1441 yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter) 

1442 

1443 @staticmethod 

1444 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

1445 """Compute the checksum of the supplied file. 

1446 

1447 Parameters 

1448 ---------- 

1449 uri : `ButlerURI` 

1450 Name of resource to calculate checksum from. 

1451 algorithm : `str`, optional 

1452 Name of algorithm to use. Must be one of the algorithms supported 

1453 by :py:mod:`hashlib`. 

1454 block_size : `int` 

1455 Number of bytes to read from file at one time. 

1456 

1457 Returns 

1458 ------- 

1459 hexdigest : `str` 

1460 Hex digest of the file. 

1461 

1462 Notes 

1463 ----- 

1464 Currently returns None if the URI is for a remote resource. 

1465 """ 

1466 if algorithm not in hashlib.algorithms_guaranteed: 

1467 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

1468 

1469 if uri.scheme and uri.scheme != "file": 

1470 return None 

1471 

1472 hasher = hashlib.new(algorithm) 

1473 

1474 filename, is_temp = uri.as_local() 

1475 

1476 with open(filename, "rb") as f: 

1477 for chunk in iter(lambda: f.read(block_size), b""): 

1478 hasher.update(chunk) 

1479 

1480 if is_temp: 

1481 os.remove(filename) 

1482 

1483 return hasher.hexdigest()
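# Illustrative sketch (not part of the original source; the path is assumed):
# compute a hex digest for a local file; remote schemes return None as noted in
# the docstring above.
#
#     digest = FileLikeDatastore.computeChecksum(ButlerURI("/repo/raw/file.fits"))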