1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileLikeDatastore", ) 

26 

27import hashlib 

28import logging 

29import os 

30from abc import abstractmethod 

31 

32from sqlalchemy import BigInteger, String 

33 

34from dataclasses import dataclass 

35from typing import ( 

36 TYPE_CHECKING, 

37 Any, 

38 ClassVar, 

39 Dict, 

40 Iterable, 

41 List, 

42 Mapping, 

43 Optional, 

44 Set, 

45 Tuple, 

46 Type, 

47 Union, 

48) 

49 

50from lsst.daf.butler import ( 

51 ButlerURI, 

52 CompositesMap, 

53 Config, 

54 FileDataset, 

55 DatasetRef, 

56 DatasetType, 

57 DatasetTypeNotSupportedError, 

58 Datastore, 

59 DatastoreConfig, 

60 DatastoreValidationError, 

61 FileDescriptor, 

62 FileTemplates, 

63 FileTemplateValidationError, 

64 Formatter, 

65 FormatterFactory, 

66 Location, 

67 LocationFactory, 

68 StorageClass, 

69 StoredFileInfo, 

70) 

71 

72from lsst.daf.butler import ddl 

73from lsst.daf.butler.registry.interfaces import ( 

74 ReadOnlyDatabaseError, 

75 DatastoreRegistryBridge, 

76) 

77 

78from lsst.daf.butler.core.repoRelocation import replaceRoot 

79from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

80from .genericDatastore import GenericBaseDatastore 

81 

82 if TYPE_CHECKING: 

83 from lsst.daf.butler import LookupKey 

84 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

85 

86log = logging.getLogger(__name__) 

87 

88# String to use when a Python None is encountered 

89NULLSTR = "__NULL_STRING__" 

90 

91 

92class _IngestPrepData(Datastore.IngestPrepData): 

93 """Helper class for FileLikeDatastore ingest implementation. 

94 

95 Parameters 

96 ---------- 

97 datasets : `list` of `FileDataset` 

98 Files to be ingested by this datastore. 

99 """ 

100 def __init__(self, datasets: List[FileDataset]): 

101 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

102 self.datasets = datasets 

103 

104 

105@dataclass(frozen=True) 

106class DatastoreFileGetInformation: 

107 """Collection of useful parameters needed to retrieve a file from 

108 a Datastore. 

109 """ 

110 

111 location: Location 

112 """The location from which to read the dataset.""" 

113 

114 formatter: Formatter 

115 """The `Formatter` to use to deserialize the dataset.""" 

116 

117 info: StoredFileInfo 

118 """Stored information about this file and its formatter.""" 

119 

120 assemblerParams: Dict[str, Any] 

121 """Parameters to use for post-processing the retrieved dataset.""" 

122 

123 formatterParams: Dict[str, Any] 

124 """Parameters that were understood by the associated formatter.""" 

125 

126 component: Optional[str] 

127 """The component to be retrieved (can be `None`).""" 

128 

129 readStorageClass: StorageClass 

130 """The `StorageClass` of the dataset being read.""" 

131 

132 

133class FileLikeDatastore(GenericBaseDatastore): 

134 """Generic Datastore for file-based implementations. 

135 

136 Should always be sub-classed since key abstract methods are missing. 

137 

138 Parameters 

139 ---------- 

140 config : `DatastoreConfig` or `str` 

141 Configuration as either a `Config` object or URI to file. 

142 bridgeManager : `DatastoreRegistryBridgeManager` 

143 Object that manages the interface between `Registry` and datastores. 

144 butlerRoot : `str`, optional 

145 New datastore root to use to override the configuration value. 

146 

147 Raises 

148 ------ 

149 ValueError 

150 If root location does not exist and ``create`` is `False` in the 

151 configuration. 
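
Notes
-----
Concrete subclasses provide `_write_in_memory_to_artifact` and
`_read_artifact_into_memory`, which define how artifacts are written to
and read from the backing storage. A minimal sketch (illustrative only;
the class name and method bodies are hypothetical, and real
implementations must also handle formatters, transactions, and error
reporting):

    class MyFileDatastore(FileLikeDatastore):

        def _write_in_memory_to_artifact(self, inMemoryDataset, ref):
            # Serialize the dataset with the formatter selected by
            # _prepare_for_put and return a StoredFileInfo record.
            ...

        def _read_artifact_into_memory(self, getInfo, ref, isComponent=False):
            # Use getInfo.formatter to deserialize the artifact at
            # getInfo.location and return the in-memory object.
            ...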

152 """ 

153 

154 defaultConfigFile: ClassVar[Optional[str]] = None 

155 """Path to configuration defaults. Accessed within the ``config`` resource 

156 or relative to a search path. Can be None if no defaults specified. 

157 """ 

158 

159 root: ButlerURI 

160 """Root directory URI of this `Datastore`.""" 

161 

162 locationFactory: LocationFactory 

163 """Factory for creating locations relative to the datastore root.""" 

164 

165 formatterFactory: FormatterFactory 

166 """Factory for creating instances of formatters.""" 

167 

168 templates: FileTemplates 

169 """File templates that can be used by this `Datastore`.""" 

170 

171 composites: CompositesMap 

172 """Determines whether a dataset should be disassembled on put.""" 

173 

174 @classmethod 

175 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

176 """Set any filesystem-dependent config options for this Datastore to 

177 be appropriate for a new empty repository with the given root. 

178 

179 Parameters 

180 ---------- 

181 root : `str` 

182 URI to the root of the data repository. 

183 config : `Config` 

184 A `Config` to update. Only the subset understood by 

185 this component will be updated. Will not expand 

186 defaults. 

187 full : `Config` 

188 A complete config with all defaults expanded that can be 

189 converted to a `DatastoreConfig`. Read-only and will not be 

190 modified by this method. 

191 Repository-specific options that should not be obtained 

192 from defaults when Butler instances are constructed 

193 should be copied from ``full`` to ``config``. 

194 overwrite : `bool`, optional 

195 If `False`, do not modify a value in ``config`` if the value 

196 already exists. Default is always to overwrite with the provided 

197 ``root``. 

198 

199 Notes 

200 ----- 

201 If a keyword is explicitly defined in the supplied ``config`` it 

202 will not be overridden by this method if ``overwrite`` is `False`. 

203 This allows explicit values set in external configs to be retained. 

204 """ 

205 Config.updateParameters(DatastoreConfig, config, full, 

206 toUpdate={"root": root}, 

207 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

208 

209 @classmethod 

210 def makeTableSpec(cls) -> ddl.TableSpec: 
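"""Define the schema of the opaque table used to record file
information for this datastore.

Returns
-------
spec : `ddl.TableSpec`
    Table specification keyed by ``dataset_id`` and ``component``,
    recording the path within the datastore, the formatter, the
    storage class name, and an optional checksum and file size for
    each stored artifact.
"""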

211 return ddl.TableSpec( 

212 fields=[ 

213 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True), 

214 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

215 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

216 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

217 # Use empty string to indicate no component 

218 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

219 # TODO: should checksum be Base64Bytes instead? 

220 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

221 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

222 ], 

223 unique=frozenset(), 

224 ) 

225 

226 def __init__(self, config: Union[DatastoreConfig, str], 

227 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None): 

228 super().__init__(config, bridgeManager) 

229 if "root" not in self.config: 

230 raise ValueError("No root directory specified in configuration") 

231 

232 # Name ourselves either using an explicit name or a name 

233 # derived from the (unexpanded) root 

234 if "name" in self.config: 

235 self.name = self.config["name"] 

236 else: 

237 # We use the unexpanded root in the name to indicate that this 

238 # datastore can be moved without having to update registry. 

239 self.name = "{}@{}".format(type(self).__name__, 

240 self.config["root"]) 

241 

242 # Support repository relocation in config 

243 # Existence of self.root is checked in subclass 

244 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

245 forceDirectory=True, forceAbsolute=True) 

246 

247 self.locationFactory = LocationFactory(self.root) 

248 self.formatterFactory = FormatterFactory() 

249 

250 # Now associate formatters with storage classes 

251 self.formatterFactory.registerFormatters(self.config["formatters"], 

252 universe=bridgeManager.universe) 

253 

254 # Read the file naming templates 

255 self.templates = FileTemplates(self.config["templates"], 

256 universe=bridgeManager.universe) 

257 

258 # See if composites should be disassembled 

259 self.composites = CompositesMap(self.config["composites"], 

260 universe=bridgeManager.universe) 

261 

262 tableName = self.config["records", "table"] 

263 try: 

264 # Storage of paths and formatters, keyed by dataset_id 

265 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec()) 

266 # Interface to Registry. 

267 self._bridge = bridgeManager.register(self.name) 

268 except ReadOnlyDatabaseError: 

269 # If the database is read only and we just tried and failed to 

270 # create a table, it means someone is trying to create a read-only 

271 # butler client for an empty repo. That should be okay, as long 

272 # as they then try to get any datasets before some other client 

273 # creates the table. Chances are they're just validating 

274 # configuration. 

275 pass 

276 

277 # Determine whether checksums should be used 

278 self.useChecksum = self.config.get("checksum", True) 

279 

280 def __str__(self) -> str: 

281 return str(self.root) 

282 

283 @property 

284 def bridge(self) -> DatastoreRegistryBridge: 

285 return self._bridge 

286 

287 def _artifact_exists(self, location: Location) -> bool: 

288 """Check that an artifact exists in this datastore at the specified 

289 location. 

290 

291 Parameters 

292 ---------- 

293 location : `Location` 

294 Expected location of the artifact associated with this datastore. 

295 

296 Returns 

297 ------- 

298 exists : `bool` 

299 `True` if the location can be found, `False` otherwise. 

300 """ 

301 log.debug("Checking if resource exists: %s", location.uri) 

302 return location.uri.exists() 

303 

304 def _delete_artifact(self, location: Location) -> None: 

305 """Delete the artifact from the datastore. 

306 

307 Parameters 

308 ---------- 

309 location : `Location` 

310 Location of the artifact associated with this datastore. 

311 """ 

312 log.debug("Deleting file: %s", location.uri) 

313 location.uri.remove() 

314 log.debug("Successfully deleted file: %s", location.uri) 

315 

316 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

317 # Docstring inherited from GenericBaseDatastore 

318 records = [] 

319 for ref, info in zip(refs, infos): 

320 # Component should come from ref and fall back on info 

321 component = ref.datasetType.component() 

322 if component is None and info.component is not None: 

323 component = info.component 

324 if component is None: 

325 # Use empty string since we want this to be part of the 

326 # primary key. 

327 component = NULLSTR 

328 records.append( 

329 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path, 

330 storage_class=info.storageClass.name, component=component, 

331 checksum=info.checksum, file_size=info.file_size) 

332 ) 

333 self._table.insert(*records) 

334 

335 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

336 # Docstring inherited from GenericBaseDatastore 

337 

338 # Look for the dataset_id -- there might be multiple matches 

339 # if we have disassembled the dataset. 

340 records = list(self._table.fetch(dataset_id=ref.id)) 

341 

342 results = [] 

343 for record in records: 

344 # Convert name of StorageClass to instance 

345 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

346 component = record["component"] if (record["component"] 

347 and record["component"] != NULLSTR) else None 

348 

349 info = StoredFileInfo(formatter=record["formatter"], 

350 path=record["path"], 

351 storageClass=storageClass, 

352 component=component, 

353 checksum=record["checksum"], 

354 file_size=record["file_size"]) 

355 results.append(info) 

356 

357 return results 

358 

359 def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]: 

360 """Return all dataset refs associated with the supplied path. 

361 

362 Parameters 

363 ---------- 

364 pathInStore : `str` 

365 Path of interest in the data store. 

366 

367 Returns 

368 ------- 

369 ids : `set` of `int` 

370 All `DatasetRef` IDs associated with this path. 

371 """ 

372 records = list(self._table.fetch(path=pathInStore)) 

373 ids = {r["dataset_id"] for r in records} 

374 return ids 

375 

376 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

377 # Docstring inherited from GenericBaseDatastore 

378 self._table.delete(dataset_id=ref.id) 

379 

380 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

381 r"""Find all the `Location`\ s of the requested dataset in the 

382 `Datastore` and the associated stored file information. 

383 

384 Parameters 

385 ---------- 

386 ref : `DatasetRef` 

387 Reference to the required `Dataset`. 

388 

389 Returns 

390 ------- 

391 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

392 Location of the dataset within the datastore and 

393 stored information about each file and its formatter. 

394 """ 

395 # Get the file information (this will fail if no file) 

396 records = self.getStoredItemsInfo(ref) 

397 

398 # Use the path to determine the location 

399 return [(self.locationFactory.fromPath(r.path), r) for r in records] 

400 

401 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

402 """Check that there is only one dataset associated with the 

403 specified artifact. 

404 

405 Parameters 

406 ---------- 

407 ref : `DatasetRef` or `FakeDatasetRef` 

408 Dataset to be removed. 

409 location : `Location` 

410 The location of the artifact to be removed. 

411 

412 Returns 

413 ------- 

414 can_remove : `bool` 

415 True if the artifact can be safely removed. 

416 """ 

417 

418 # Get all entries associated with this path 

419 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

420 if not allRefs: 

421 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

422 

423 # Remove these refs from all the refs and if there is nothing left 

424 # then we can delete 

425 remainingRefs = allRefs - {ref.id} 

426 

427 if remainingRefs: 

428 return False 

429 return True 

430 

431 def _prepare_for_get(self, ref: DatasetRef, 

432 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

433 """Check parameters for ``get`` and obtain formatter and 

434 location. 

435 

436 Parameters 

437 ---------- 

438 ref : `DatasetRef` 

439 Reference to the required Dataset. 

440 parameters : `dict` 

441 `StorageClass`-specific parameters that specify, for example, 

442 a slice of the dataset to be loaded. 

443 

444 Returns 

445 ------- 

446 getInfo : `list` [`DatastoreFileGetInformation`] 

447 Parameters needed to retrieve each file. 

448 """ 

449 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

450 

451 # Get file metadata and internal metadata 

452 fileLocations = self._get_dataset_locations_info(ref) 

453 if not fileLocations: 

454 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

455 

456 # The storage class we want to use eventually 

457 refStorageClass = ref.datasetType.storageClass 

458 

459 if len(fileLocations) > 1: 

460 disassembled = True 

461 else: 

462 disassembled = False 

463 

464 # Is this a component request? 

465 refComponent = ref.datasetType.component() 

466 

467 fileGetInfo = [] 

468 for location, storedFileInfo in fileLocations: 

469 

470 # The storage class used to write the file 

471 writeStorageClass = storedFileInfo.storageClass 

472 

473 # If this has been disassembled we need read to match the write 

474 if disassembled: 

475 readStorageClass = writeStorageClass 

476 else: 

477 readStorageClass = refStorageClass 

478 

479 formatter = getInstanceOf(storedFileInfo.formatter, 

480 FileDescriptor(location, readStorageClass=readStorageClass, 

481 storageClass=writeStorageClass, parameters=parameters), 

482 ref.dataId) 

483 

484 formatterParams, notFormatterParams = formatter.segregateParameters() 

485 

486 # Of the remaining parameters, extract the ones supported by 

487 # this StorageClass (for components not all will be handled) 

488 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

489 

490 # The ref itself could be a component if the dataset was 

491 # disassembled by butler, or we disassembled in datastore and 

492 # components came from the datastore records 

493 component = storedFileInfo.component if storedFileInfo.component else refComponent 

494 

495 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

496 assemblerParams, formatterParams, 

497 component, readStorageClass)) 

498 

499 return fileGetInfo 

500 

501 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

502 """Check the arguments for ``put`` and obtain formatter and 

503 location. 

504 

505 Parameters 

506 ---------- 

507 inMemoryDataset : `object` 

508 The dataset to store. 

509 ref : `DatasetRef` 

510 Reference to the associated Dataset. 

511 

512 Returns 

513 ------- 

514 location : `Location` 

515 The location to write the dataset. 

516 formatter : `Formatter` 

517 The `Formatter` to use to write the dataset. 

518 

519 Raises 

520 ------ 

521 TypeError 

522 Supplied object and storage class are inconsistent. 

523 DatasetTypeNotSupportedError 

524 The associated `DatasetType` is not handled by this datastore. 

525 """ 

526 self._validate_put_parameters(inMemoryDataset, ref) 

527 

528 # Work out output file name 

529 try: 

530 template = self.templates.getTemplate(ref) 

531 except KeyError as e: 

532 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

533 

534 # Validate the template to protect against filenames from different 

535 # dataIds resolving to the same filename and causing overwrite confusion. 

536 template.validateTemplate(ref) 

537 

538 location = self.locationFactory.fromPath(template.format(ref)) 

539 

540 # Get the formatter based on the storage class 

541 storageClass = ref.datasetType.storageClass 

542 try: 

543 formatter = self.formatterFactory.getFormatter(ref, 

544 FileDescriptor(location, 

545 storageClass=storageClass), 

546 ref.dataId) 

547 except KeyError as e: 

548 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

549 f"{self.name}") from e 

550 

551 # Now that we know the formatter, update the location 

552 location = formatter.makeUpdatedLocation(location) 

553 

554 return location, formatter 

555 

556 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

557 # Docstring inherited from base class 

558 if transfer != "auto": 

559 return transfer 

560 

561 # See if the paths are within the datastore or not 

562 inside = [self._pathInStore(d.path) is not None for d in datasets] 

563 

564 if all(inside): 

565 transfer = None 

566 elif not any(inside): 

567 # Allow ButlerURI to use its own knowledge 

568 transfer = "auto" 

569 else: 

570 raise ValueError("Some datasets are inside the datastore and some are outside." 

571 " Please use an explicit transfer mode and not 'auto'.") 

572 

573 return transfer 

574 

575 def _pathInStore(self, path: str) -> Optional[str]: 

576 """Return path relative to datastore root 

577 

578 Parameters 

579 ---------- 

580 path : `str` 

581 Path to dataset. Can be absolute. If relative assumed to 

582 be relative to the datastore. The path is returned relative 

583 to the datastore root, or `None` if it lies outside the root. 

584 

585 Returns 

586 ------- 

587 inStore : `str` 

588 Path relative to datastore root. Returns `None` if the file is 

589 outside the root. 

590 """ 

591 # Relative path will always be relative to datastore 

592 pathUri = ButlerURI(path, forceAbsolute=False) 

593 return pathUri.relative_to(self.root) 

594 

595 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str: 

596 """Standardize the path of a to-be-ingested file. 

597 

598 Parameters 

599 ---------- 

600 path : `str` 

601 Path of a file to be ingested. 

602 transfer : `str`, optional 

603 How (and whether) the dataset should be added to the datastore. 

604 See `ingest` for details of transfer modes. 

605 This implementation is provided only so 

606 `NotImplementedError` can be raised if the mode is not supported; 

607 actual transfers are deferred to `_extractIngestInfo`. 

608 

609 Returns 

610 ------- 

611 path : `str` 

612 New path in the form the datastore considers standard. 

613 

614 Notes 

615 ----- 

616 Subclasses of `FileLikeDatastore` can implement this method instead 

617 of `_prepIngest`. It should not modify the data repository or given 

618 file in any way. 

619 

620 Raises 

621 ------ 

622 NotImplementedError 

623 Raised if the datastore does not support the given transfer mode 

624 (including the case where ingest is not supported at all). 

625 FileNotFoundError 

626 Raised if one of the given files does not exist. 

627 """ 

628 if transfer not in (None,) + self.root.transferModes: 

629 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

630 

631 # A relative URI indicates relative to datastore root 

632 srcUri = ButlerURI(path, forceAbsolute=False) 

633 if not srcUri.isabs(): 

634 srcUri = self.root.join(path) 

635 

636 if not srcUri.exists(): 

637 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

638 f"are assumed to be relative to {self.root} unless they are absolute.") 

639 

640 if transfer is None: 

641 relpath = srcUri.relative_to(self.root) 

642 if not relpath: 

643 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

644 f"within datastore ({self.root})") 

645 

646 # Return the relative path within the datastore for internal 

647 # transfer 

648 path = relpath 

649 

650 return path 

651 

652 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

653 formatter: Union[Formatter, Type[Formatter]], 

654 transfer: Optional[str] = None) -> StoredFileInfo: 

655 """Relocate (if necessary) and extract `StoredFileInfo` from a 

656 to-be-ingested file. 

657 

658 Parameters 

659 ---------- 

660 path : `str` or `ButlerURI` 

661 URI or path of a file to be ingested. 

662 ref : `DatasetRef` 

663 Reference for the dataset being ingested. Guaranteed to have 

664 ``dataset_id is not None``. 

665 formatter : `type` or `Formatter` 

666 `Formatter` subclass to use for this dataset or an instance. 

667 transfer : `str`, optional 

668 How (and whether) the dataset should be added to the datastore. 

669 See `ingest` for details of transfer modes. 

670 

671 Returns 

672 ------- 

673 info : `StoredFileInfo` 

674 Internal datastore record for this file. This will be inserted by 

675 the caller; the `_extractIngestInfo` is only responsible for 

676 creating and populating the struct. 

677 

678 Raises 

679 ------ 

680 FileNotFoundError 

681 Raised if one of the given files does not exist. 

682 FileExistsError 

683 Raised if transfer is not `None` but the (internal) location the 

684 file would be moved to is already occupied. 

685 """ 

686 if self._transaction is None: 

687 raise RuntimeError("Ingest called without transaction enabled") 

688 

689 # Create URI of the source path, do not need to force a relative 

690 # path to absolute. 

691 srcUri = ButlerURI(path, forceAbsolute=False) 

692 

693 # Track whether we have read the size of the source yet 

694 have_sized = False 

695 

696 if transfer is None: 

697 # A relative path is assumed to be relative to the datastore 

698 # in this context 

699 if not srcUri.isabs(): 

700 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

701 else: 

702 # Work out the path in the datastore from an absolute URI 

703 # This is required to be within the datastore. 

704 pathInStore = srcUri.relative_to(self.root) 

705 if pathInStore is None: 

706 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

707 f"not within datastore {self.root}") 

708 tgtLocation = self.locationFactory.fromPath(pathInStore) 

709 else: 

710 # Work out the name we want this ingested file to have 

711 # inside the datastore 

712 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

713 if not tgtLocation.uri.dirname().exists(): 

714 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

715 tgtLocation.uri.dirname().mkdir() 

716 

717 # if we are transferring from a local file to a remote location 

718 # it may be more efficient to get the size and checksum of the 

719 # local file rather than the transferred one 

720 if not srcUri.scheme or srcUri.scheme == "file": 

721 size = srcUri.size() 

722 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

have_sized = True  # size/checksum now computed from the local source 

723 

724 # transfer the resource to the destination 

725 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

726 

727 # the file should exist in the datastore now 

728 if not have_sized: 

729 size = tgtLocation.uri.size() 

730 checksum = self.computeChecksum(tgtLocation.uri) if self.useChecksum else None 

731 

732 return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore, 

733 storageClass=ref.datasetType.storageClass, 

734 component=ref.datasetType.component(), 

735 file_size=size, checksum=checksum) 

736 

737 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

738 # Docstring inherited from Datastore._prepIngest. 

739 filtered = [] 

740 for dataset in datasets: 

741 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

742 if not acceptable: 

743 continue 

744 else: 

745 dataset.refs = acceptable 

746 if dataset.formatter is None: 

747 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

748 else: 

749 assert isinstance(dataset.formatter, (type, str)) 

750 dataset.formatter = getClassOf(dataset.formatter) 

751 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

752 filtered.append(dataset) 

753 return _IngestPrepData(filtered) 

754 

755 @transactional 

756 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

757 # Docstring inherited from Datastore._finishIngest. 

758 refsAndInfos = [] 

759 for dataset in prepData.datasets: 

760 # Do ingest as if the first dataset ref is associated with the file 

761 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

762 transfer=transfer) 

763 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

764 self._register_datasets(refsAndInfos) 

765 

766 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

767 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

768 """Given a source URI and a DatasetRef, determine the name the 

769 dataset will have inside datastore. 

770 

771 Parameters 

772 ---------- 

773 srcUri : `ButlerURI` 

774 URI to the source dataset file. 

775 ref : `DatasetRef` 

776 Ref associated with the newly-ingested dataset artifact. This 

777 is used to determine the name within the datastore. 

778 formatter : `Formatter` or Formatter class. 

779 Formatter to use for validation. Can be a class or an instance. 

780 

781 Returns 

782 ------- 

783 location : `Location` 

784 Target location for the newly-ingested dataset. 

785 """ 

786 # Ingesting a file from outside the datastore. 

787 # This involves a new name. 

788 template = self.templates.getTemplate(ref) 

789 location = self.locationFactory.fromPath(template.format(ref)) 

790 

791 # Get the extension 

792 ext = srcUri.getExtension() 

793 

794 # Update the destination to include that extension 

795 location.updateExtension(ext) 

796 

797 # Ask the formatter to validate this extension 

798 formatter.validateExtension(location) 

799 

800 return location 

801 

802 @abstractmethod 

803 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

804 """Write an in-memory dataset to the datastore. 

805 

806 Parameters 

807 ---------- 

808 inMemoryDataset : `object` 

809 Dataset to write to datastore. 

810 ref : `DatasetRef` 

811 Registry information associated with this dataset. 

812 

813 Returns 

814 ------- 

815 info : `StoredFileInfo` 

816 Information describing the artifact written to the datastore. 

817 """ 

818 raise NotImplementedError() 

819 

820 @abstractmethod 

821 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

822 ref: DatasetRef, isComponent: bool = False) -> Any: 

823 """Read an artifact from the datastore into an in-memory object. 

824 

825 Parameters 

826 ---------- 

827 getInfo : `DatastoreFileGetInformation` 

828 Information about the artifact within the datastore. 

829 ref : `DatasetRef` 

830 The registry information associated with this artifact. 

831 isComponent : `bool` 

832 Flag to indicate if a component is being read from this artifact. 

833 

834 Returns 

835 ------- 

836 inMemoryDataset : `object` 

837 The artifact as a python object. 

838 """ 

839 raise NotImplementedError() 

840 

841 def exists(self, ref: DatasetRef) -> bool: 

842 """Check if the dataset exists in the datastore. 

843 

844 Parameters 

845 ---------- 

846 ref : `DatasetRef` 

847 Reference to the required dataset. 

848 

849 Returns 

850 ------- 

851 exists : `bool` 

852 `True` if the entity exists in the `Datastore`. 

853 """ 

854 fileLocations = self._get_dataset_locations_info(ref) 

855 if not fileLocations: 

856 return False 

857 for location, _ in fileLocations: 

858 if not self._artifact_exists(location): 

859 return False 

860 

861 return True 

862 

863 def getURIs(self, ref: DatasetRef, 

864 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

865 """Return URIs associated with dataset. 

866 

867 Parameters 

868 ---------- 

869 ref : `DatasetRef` 

870 Reference to the required dataset. 

871 predict : `bool`, optional 

872 If the datastore does not know about the dataset, should it 

873 return a predicted URI or not? 

874 

875 Returns 

876 ------- 

877 primary : `ButlerURI` 

878 The URI to the primary artifact associated with this dataset. 

879 If the dataset was disassembled within the datastore this 

880 may be `None`. 

881 components : `dict` 

882 URIs to any components associated with the dataset artifact. 

883 Can be empty if there are no components. 
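
Examples
--------
A sketch of typical use for a disassembled dataset (illustrative only;
the component names are hypothetical and depend on the storage class):

>>> primary, components = datastore.getURIs(ref)  # doctest: +SKIP
>>> primary is None, sorted(components)  # doctest: +SKIP
(True, ['image', 'mask'])

For a dataset stored as a single file, ``primary`` is the artifact URI
and ``components`` is empty.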

884 """ 

885 

886 primary: Optional[ButlerURI] = None 

887 components: Dict[str, ButlerURI] = {} 

888 

889 # if this has never been written then we have to guess 

890 if not self.exists(ref): 

891 if not predict: 

892 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

893 

894 def predictLocation(thisRef: DatasetRef) -> Location: 

895 template = self.templates.getTemplate(thisRef) 

896 location = self.locationFactory.fromPath(template.format(thisRef)) 

897 storageClass = ref.datasetType.storageClass 

898 formatter = self.formatterFactory.getFormatter(thisRef, 

899 FileDescriptor(location, 

900 storageClass=storageClass)) 

901 # Try to use the extension attribute but ignore problems if the 

902 # formatter does not define one. 

903 try: 

904 location = formatter.makeUpdatedLocation(location) 

905 except Exception: 

906 # Use the default extension 

907 pass 

908 return location 

909 

910 doDisassembly = self.composites.shouldBeDisassembled(ref) 

911 

912 if doDisassembly: 

913 

914 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

915 compRef = ref.makeComponentRef(component) 

916 compLocation = predictLocation(compRef) 

917 

918 # Add a URI fragment to indicate this is a guess 

919 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

920 

921 else: 

922 

923 location = predictLocation(ref) 

924 

925 # Add a URI fragment to indicate this is a guess 

926 primary = ButlerURI(location.uri.geturl() + "#predicted") 

927 

928 return primary, components 

929 

930 # If this is a ref that we have written we can get the path. 

931 # Get file metadata and internal metadata 

932 fileLocations = self._get_dataset_locations_info(ref) 

933 

934 if not fileLocations: 

935 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

936 

937 if len(fileLocations) == 1: 

938 # No disassembly so this is the primary URI 

939 primary = ButlerURI(fileLocations[0][0].uri) 

940 

941 else: 

942 for location, storedFileInfo in fileLocations: 

943 if storedFileInfo.component is None: 

944 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

945 components[storedFileInfo.component] = ButlerURI(location.uri) 

946 

947 return primary, components 

948 

949 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

950 """URI to the Dataset. 

951 

952 Parameters 

953 ---------- 

954 ref : `DatasetRef` 

955 Reference to the required Dataset. 

956 predict : `bool` 

957 If `True`, allow URIs to be returned of datasets that have not 

958 been written. 

959 

960 Returns 

961 ------- 

962 uri : `ButlerURI` 

963 URI pointing to the dataset within the datastore. If the 

964 dataset does not exist in the datastore, and if ``predict`` is 

965 `True`, the URI will be a prediction and will include a URI 

966 fragment "#predicted". 

967 If the datastore does not have entities that relate well 

968 to the concept of a URI the returned URI will be 

969 descriptive. The returned URI is not guaranteed to be obtainable. 

970 

971 Raises 

972 ------ 

973 FileNotFoundError 

974 Raised if a URI has been requested for a dataset that does not 

975 exist and guessing is not allowed. 

976 RuntimeError 

977 Raised if a request is made for a single URI but multiple URIs 

978 are associated with this dataset. 

979 

980 Notes 

981 ----- 

982 When a predicted URI is requested an attempt will be made to form 

983 a reasonable URI based on file templates and the expected formatter. 

984 """ 

985 primary, components = self.getURIs(ref, predict) 

986 if primary is None or components: 

987 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

988 "Use Datastore.getURIs() instead.") 

989 return primary 

990 

991 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

992 """Load an InMemoryDataset from the store. 

993 

994 Parameters 

995 ---------- 

996 ref : `DatasetRef` 

997 Reference to the required Dataset. 

998 parameters : `dict` 

999 `StorageClass`-specific parameters that specify, for example, 

1000 a slice of the dataset to be loaded. 

1001 

1002 Returns 

1003 ------- 

1004 inMemoryDataset : `object` 

1005 Requested dataset or slice thereof as an InMemoryDataset. 

1006 

1007 Raises 

1008 ------ 

1009 FileNotFoundError 

1010 Requested dataset can not be retrieved. 

1011 TypeError 

1012 Return value from formatter has unexpected type. 

1013 ValueError 

1014 Formatter failed to process the dataset. 
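
Examples
--------
A sketch of a parameterized read (illustrative only; ``"bbox"`` is a
hypothetical parameter whose availability depends on the storage
class of the dataset being read):

>>> subset = datastore.get(ref, parameters={"bbox": bbox})  # doctest: +SKIP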

1015 """ 

1016 allGetInfo = self._prepare_for_get(ref, parameters) 

1017 refComponent = ref.datasetType.component() 

1018 

1019 # Supplied storage class for the component being read 

1020 refStorageClass = ref.datasetType.storageClass 

1021 

1022 # Create mapping from component name to related info 

1023 allComponents = {i.component: i for i in allGetInfo} 

1024 

1025 # By definition the dataset is disassembled if we have more 

1026 # than one record for it. 

1027 isDisassembled = len(allGetInfo) > 1 

1028 

1029 # Look for the special case where we are disassembled but the 

1030 # component is a read-only component that was not written during 

1031 # disassembly. For this scenario we need to check that the 

1032 # component requested is listed as a read-only component for the 

1033 # composite storage class 

1034 isDisassembledReadOnlyComponent = False 

1035 if isDisassembled and refComponent: 

1036 # The composite storage class should be accessible through 

1037 # the component dataset type 

1038 compositeStorageClass = ref.datasetType.parentStorageClass 

1039 

1040 # In the unlikely scenario where the composite storage 

1041 # class is not known, we can only assume that this is a 

1042 # normal component. If that assumption is wrong then the 

1043 # branch below that reads a persisted component will fail 

1044 # so there is no need to complain here. 

1045 if compositeStorageClass is not None: 

1046 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.readComponents 

1047 

1048 if isDisassembled and not refComponent: 

1049 # This was a disassembled dataset spread over multiple files 

1050 # and we need to put them all back together again. 

1051 # Read into memory and then assemble 

1052 

1053 # Check that the supplied parameters are suitable for the type read 

1054 refStorageClass.validateParameters(parameters) 

1055 

1056 # We want to keep track of all the parameters that were not used 

1057 # by formatters. We assume that if any of the component formatters 

1058 # use a parameter, we do not need to apply it again in the 

1059 # assembler. 

1060 usedParams = set() 

1061 

1062 components: Dict[str, Any] = {} 

1063 for getInfo in allGetInfo: 

1064 # assemblerParams are parameters not understood by the 

1065 # associated formatter. 

1066 usedParams.update(set(getInfo.formatterParams)) 

1067 

1068 component = getInfo.component 

1069 

1070 if component is None: 

1071 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1072 

1073 # We do not want the formatter to think it's reading 

1074 # a component though because it is really reading a 

1075 # standalone dataset -- always tell reader it is not a 

1076 # component. 

1077 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1078 

1079 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components) 

1080 

1081 # Any unused parameters will have to be passed to the assembler 

1082 if parameters: 

1083 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1084 else: 

1085 unusedParams = {} 

1086 

1087 # Process parameters 

1088 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset, 

1089 parameters=unusedParams) 

1090 

1091 elif isDisassembledReadOnlyComponent: 

1092 

1093 compositeStorageClass = ref.datasetType.parentStorageClass 

1094 if compositeStorageClass is None: 

1095 raise RuntimeError(f"Unable to retrieve read-only component '{refComponent}' since" 

1096 " no composite storage class is available.") 

1097 

1098 if refComponent is None: 

1099 # Mainly for mypy 

1100 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1101 

1102 # Assume that every read-only component can be calculated by 

1103 # forwarding the request to a single read/write component. 

1104 # Rather than guessing which rw component is the right one by 

1105 # scanning each for a read-only component of the same name, 

1106 # we ask the composite assembler directly which one is best to 

1107 # use. 

1108 compositeAssembler = compositeStorageClass.assembler() 

1109 forwardedComponent = compositeAssembler.selectResponsibleComponent(refComponent, 

1110 set(allComponents)) 

1111 

1112 # Select the relevant component 

1113 rwInfo = allComponents[forwardedComponent] 

1114 

1115 # For now assume that read parameters are validated against 

1116 # the real component and not the requested component 

1117 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1118 forwardedStorageClass.validateParameters(parameters) 

1119 

1120 # Unfortunately the FileDescriptor inside the formatter will have 

1121 # the wrong write storage class so we need to create a new one 

1122 # given the immutability constraint. 

1123 writeStorageClass = rwInfo.info.storageClass 

1124 

1125 # We may need to put some thought into parameters for read 

1126 # components but for now forward them on as is 

1127 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1128 readStorageClass=refStorageClass, 

1129 storageClass=writeStorageClass, 

1130 parameters=parameters), 

1131 ref.dataId) 

1132 

1133 # The assembler can not receive any parameter requests for a 

1134 # read-only component at this time since the assembler will 

1135 # see the storage class of the read-only component and those 

1136 # parameters will have to be handled by the formatter on the 

1137 # forwarded storage class. 

1138 assemblerParams: Dict[str, Any] = {} 

1139 

1140 # Need to create a new info that specifies the read-only 

1141 # component and associated storage class 

1142 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1143 rwInfo.info, assemblerParams, {}, 

1144 refComponent, refStorageClass) 

1145 

1146 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1147 

1148 else: 

1149 # Single file request or component from that composite file 

1150 for lookup in (refComponent, None): 

1151 if lookup in allComponents: 

1152 getInfo = allComponents[lookup] 

1153 break 

1154 else: 

1155 raise FileNotFoundError(f"Component {refComponent} not found " 

1156 f"for ref {ref} in datastore {self.name}") 

1157 

1158 # Do not need the component itself if already disassembled 

1159 if isDisassembled: 

1160 isComponent = False 

1161 else: 

1162 isComponent = getInfo.component is not None 

1163 

1164 # For a disassembled component we can validate parameters against 

1165 # the component storage class directly 

1166 if isDisassembled: 

1167 refStorageClass.validateParameters(parameters) 

1168 else: 

1169 # For an assembled composite this could be a read-only 

1170 # component derived from a real component. The validity 

1171 # of the parameters is not clear. For now validate against 

1172 # the composite storage class 

1173 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1174 

1175 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 

1176 

1177 @transactional 

1178 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1179 """Write an InMemoryDataset with a given `DatasetRef` to the store. 

1180 

1181 Parameters 

1182 ---------- 

1183 inMemoryDataset : `object` 

1184 The dataset to store. 

1185 ref : `DatasetRef` 

1186 Reference to the associated Dataset. 

1187 

1188 Raises 

1189 ------ 

1190 TypeError 

1191 Supplied object and storage class are inconsistent. 

1192 DatasetTypeNotSupportedError 

1193 The associated `DatasetType` is not handled by this datastore. 

1194 

1195 Notes 

1196 ----- 

1197 If the datastore is configured to reject certain dataset types it 

1198 is possible that the put will fail and raise a 

1199 `DatasetTypeNotSupportedError`. The main use case for this is to 

1200 allow `ChainedDatastore` to put to multiple datastores without 

1201 requiring that every datastore accepts the dataset. 

1202 """ 

1203 

1204 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1205 # doDisassembly = True 

1206 

1207 artifacts = [] 

1208 if doDisassembly: 

1209 components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset) 

1210 for component, componentInfo in components.items(): 

1211 # Don't recurse because we want to take advantage of 

1212 # bulk insert -- need a new DatasetRef that refers to the 

1213 # same dataset_id but has the component DatasetType 

1214 # DatasetType does not refer to the types of components 

1215 # So we construct one ourselves. 

1216 compRef = ref.makeComponentRef(component) 

1217 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1218 artifacts.append((compRef, storedInfo)) 

1219 else: 

1220 # Write the entire thing out 

1221 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1222 artifacts.append((ref, storedInfo)) 

1223 

1224 self._register_datasets(artifacts) 

1225 

1226 @transactional 

1227 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None: 

1228 """Indicate to the datastore that a dataset can be removed. 

1229 

1230 Parameters 

1231 ---------- 

1232 ref : `DatasetRef` 

1233 Reference to the required Dataset. 

1234 ignore_errors : `bool` 

1235 If `True` return without error even if something went wrong. 

1236 Problems could occur if another process is simultaneously trying 

1237 to delete. 

1238 

1239 Raises 

1240 ------ 

1241 FileNotFoundError 

1242 Attempt to remove a dataset that does not exist. 

1243 """ 

1244 # Get file metadata and internal metadata 

1245 log.debug("Trashing %s in datastore %s", ref, self.name) 

1246 

1247 fileLocations = self._get_dataset_locations_info(ref) 

1248 

1249 if not fileLocations: 

1250 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1251 if ignore_errors: 

1252 log.warning(err_msg) 

1253 return 

1254 else: 

1255 raise FileNotFoundError(err_msg) 

1256 

1257 for location, storedFileInfo in fileLocations: 

1258 if not self._artifact_exists(location): 

1259 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1260 f"associated artifact ({location.uri}) is missing" 

1261 if ignore_errors: 

1262 log.warning(err_msg) 

1263 return 

1264 else: 

1265 raise FileNotFoundError(err_msg) 

1266 

1267 # Mark dataset as trashed 

1268 try: 

1269 self._move_to_trash_in_registry(ref) 

1270 except Exception as e: 

1271 if ignore_errors: 

1272 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1273 f"but encountered an error: {e}") 

1274 pass 

1275 else: 

1276 raise 

1277 

1278 @transactional 

1279 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1280 """Remove all datasets from the trash. 

1281 

1282 Parameters 

1283 ---------- 

1284 ignore_errors : `bool` 

1285 If `True` return without error even if something went wrong. 

1286 Problems could occur if another process is simultaneously trying 

1287 to delete. 
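
Notes
-----
Artifact removal is two-phase: `trash` marks datasets for removal and
`emptyTrash` deletes the artifacts and the internal records. A sketch
of the sequence (illustrative only):

>>> datastore.trash(ref)    # doctest: +SKIP
>>> datastore.emptyTrash()  # doctest: +SKIP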

1288 """ 

1289 log.debug("Emptying trash in datastore %s", self.name) 

1290 # Context manager will empty trash iff we finish it without raising. 

1291 with self.bridge.emptyTrash() as trashed: 

1292 for ref in trashed: 

1293 fileLocations = self._get_dataset_locations_info(ref) 

1294 

1295 if not fileLocations: 

1296 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}" 

1297 if ignore_errors: 

1298 log.warning(err_msg) 

1299 continue 

1300 else: 

1301 raise FileNotFoundError(err_msg) 

1302 

1303 for location, _ in fileLocations: 

1304 

1305 if not self._artifact_exists(location): 

1306 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}" 

1307 if ignore_errors: 

1308 log.warning(err_msg) 

1309 continue 

1310 else: 

1311 raise FileNotFoundError(err_msg) 

1312 

1313 # Can only delete the artifact if there are no references 

1314 # to the file from untrashed dataset refs. 

1315 if self._can_remove_dataset_artifact(ref, location): 

1316 # Point of no return for this artifact 

1317 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1318 try: 

1319 self._delete_artifact(location) 

1320 except Exception as e: 

1321 if ignore_errors: 

1322 log.critical("Encountered error removing artifact %s from datastore %s: %s", 

1323 location.uri, self.name, e) 

1324 else: 

1325 raise 

1326 

1327 # Now must remove the entry from the internal registry even if 

1328 # the artifact removal failed and was ignored, 

1329 # otherwise the removal check above will never be true 

1330 try: 

1331 # There may be multiple rows associated with this ref 

1332 # depending on disassembly 

1333 self.removeStoredItemInfo(ref) 

1334 except Exception as e: 

1335 if ignore_errors: 

1336 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s", 

1337 ref.id, location.uri, self.name, e) 

1338 continue 

1339 else: 

1340 raise FileNotFoundError(f"Error removing dataset {ref.id} ({location.uri}) from internal registry of {self.name}") from e 

1341 

1342 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1343 logFailures: bool = False) -> None: 

1344 """Validate some of the configuration for this datastore. 

1345 

1346 Parameters 

1347 ---------- 

1348 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1349 Entities to test against this configuration. Can be differing 

1350 types. 

1351 logFailures : `bool`, optional 

1352 If `True`, output a log message for every validation error 

1353 detected. 

1354 

1355 Raises 

1356 ------ 

1357 DatastoreValidationError 

1358 Raised if there is a validation problem with a configuration. 

1359 All the problems are reported in a single exception. 

1360 

1361 Notes 

1362 ----- 

1363 This method checks that all the supplied entities have valid file 

1364 templates and also have formatters defined. 

1365 """ 

1366 

1367 templateFailed = None 

1368 try: 

1369 self.templates.validateTemplates(entities, logFailures=logFailures) 

1370 except FileTemplateValidationError as e: 

1371 templateFailed = str(e) 

1372 

1373 formatterFailed = [] 

1374 for entity in entities: 

1375 try: 

1376 self.formatterFactory.getFormatterClass(entity) 

1377 except KeyError as e: 

1378 formatterFailed.append(str(e)) 

1379 if logFailures: 

1380 log.fatal("Formatter failure: %s", e) 

1381 

1382 if templateFailed or formatterFailed: 

1383 messages = [] 

1384 if templateFailed: 

1385 messages.append(templateFailed) 

1386 if formatterFailed: 

1387 messages.append(",".join(formatterFailed)) 

1388 msg = ";\n".join(messages) 

1389 raise DatastoreValidationError(msg) 

1390 

1391 def getLookupKeys(self) -> Set[LookupKey]: 

1392 # Docstring is inherited from base class 

1393 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1394 self.constraints.getLookupKeys() 

1395 

1396 def validateKey(self, lookupKey: LookupKey, 

1397 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1398 # Docstring is inherited from base class 

1399 # The key can be valid in either formatters or templates so we can 

1400 # only check the template if it exists 

1401 if lookupKey in self.templates: 

1402 try: 

1403 self.templates[lookupKey].validateTemplate(entity) 

1404 except FileTemplateValidationError as e: 

1405 raise DatastoreValidationError(e) from e 

1406 

1407 def export(self, refs: Iterable[DatasetRef], *, 

1408 directory: Optional[Union[ButlerURI, str]] = None, 

1409 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

1410 # Docstring inherited from Datastore.export. 

1411 if transfer is not None and directory is None: 

1412 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

1413 "export directory given") 

1414 

1415 # Force the directory to be a URI object 

1416 directoryUri: Optional[ButlerURI] = None 

1417 if directory is not None: 

1418 directoryUri = ButlerURI(directory, forceDirectory=True) 

1419 

1420 if transfer is not None and directoryUri is not None: 

1421 # mypy needs the second test 

1422 if not directoryUri.exists(): 

1423 raise FileNotFoundError(f"Export location {directory} does not exist") 

1424 

1425 for ref in refs: 

1426 fileLocations = self._get_dataset_locations_info(ref) 

1427 if not fileLocations: 

1428 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

1429 # For now we can not export disassembled datasets 

1430 if len(fileLocations) > 1: 

1431 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

1432 location, storedFileInfo = fileLocations[0] 

1433 if transfer is None: 

1434 # TODO: do we also need to return the readStorageClass somehow? 

1435 # We will use the path in store directly 

1436 pass 

1437 else: 

1438 # mypy needs help 

1439 assert directoryUri is not None, "directoryUri must be defined to get here" 

1440 storeUri = ButlerURI(location.uri) 

1441 exportUri = directoryUri.join(location.pathInStore) 

1442 exportUri.transfer_from(storeUri, transfer=transfer) 

1443 

1444 yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter) 

1445 

1446 @staticmethod 

1447 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

1448 """Compute the checksum of the supplied file. 

1449 

1450 Parameters 

1451 ---------- 

1452 uri : `ButlerURI` 

1453 Name of resource to calculate checksum from. 

1454 algorithm : `str`, optional 

1455 Name of algorithm to use. Must be one of the algorithms supported 

1456 by the :py:mod:`hashlib` module. 

1457 block_size : `int` 

1458 Number of bytes to read from file at one time. 

1459 

1460 Returns 

1461 ------- 

1462 hexdigest : `str` 

1463 Hex digest of the file. 

1464 

1465 Notes 

1466 ----- 

1467 Currently returns None if the URI is for a remote resource. 
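
A sketch of direct use (illustrative only; the path and digest shown
are hypothetical):

>>> FileLikeDatastore.computeChecksum(ButlerURI("/tmp/file.fits"))  # doctest: +SKIP
'1df2a7...'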

1468 """ 

1469 if algorithm not in hashlib.algorithms_guaranteed: 

1470 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

1471 

1472 if uri.scheme and uri.scheme != "file": 

1473 return None 

1474 

1475 hasher = hashlib.new(algorithm) 

1476 

1477 filename, is_temp = uri.as_local() 

1478 

1479 with open(filename, "rb") as f: 

1480 for chunk in iter(lambda: f.read(block_size), b""): 

1481 hasher.update(chunk) 

1482 

1483 if is_temp: 

1484 os.remove(filename) 

1485 

1486 return hasher.hexdigest()