
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileLikeDatastore", ) 

26 

27import hashlib 

28import logging 

29import os 

30from abc import abstractmethod 

31 

32from sqlalchemy import BigInteger, String 

33 

34from dataclasses import dataclass 

35from typing import ( 

36 TYPE_CHECKING, 

37 Any, 

38 ClassVar, 

39 Dict, 

40 Iterable, 

41 List, 

42 Mapping, 

43 Optional, 

44 Set, 

45 Tuple, 

46 Type, 

47 Union, 

48) 

49 

50from lsst.daf.butler import ( 

51 ButlerURI, 

52 CompositesMap, 

53 Config, 

54 FileDataset, 

55 DatasetRef, 

56 DatasetType, 

57 DatasetTypeNotSupportedError, 

58 Datastore, 

59 DatastoreConfig, 

60 DatastoreValidationError, 

61 FileDescriptor, 

62 FileTemplates, 

63 FileTemplateValidationError, 

64 Formatter, 

65 FormatterFactory, 

66 Location, 

67 LocationFactory, 

68 StorageClass, 

69 StoredFileInfo, 

70) 

71 

72from lsst.daf.butler import ddl 

73from lsst.daf.butler.registry.interfaces import ( 

74 ReadOnlyDatabaseError, 

75 DatastoreRegistryBridge, 

76) 

77 

78from lsst.daf.butler.core.repoRelocation import replaceRoot 

79from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

80from .genericDatastore import GenericBaseDatastore 

81 

82if TYPE_CHECKING:

83 from lsst.daf.butler import LookupKey 

84 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

85 

86log = logging.getLogger(__name__) 

87 

88# String to use when a Python None is encountered 

89NULLSTR = "__NULL_STRING__" 

90 

91 

92class _IngestPrepData(Datastore.IngestPrepData): 

93 """Helper class for FileLikeDatastore ingest implementation. 

94 

95 Parameters 

96 ---------- 

97 datasets : `list` of `FileDataset` 

98 Files to be ingested by this datastore. 

99 """ 

100 def __init__(self, datasets: List[FileDataset]): 

101 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

102 self.datasets = datasets 

103 

104 

105@dataclass(frozen=True) 

106class DatastoreFileGetInformation: 

107 """Collection of useful parameters needed to retrieve a file from 

108 a Datastore. 

109 """ 

110 

111 location: Location 

112 """The location from which to read the dataset.""" 

113 

114 formatter: Formatter 

115 """The `Formatter` to use to deserialize the dataset.""" 

116 

117 info: StoredFileInfo 

118 """Stored information about this file and its formatter.""" 

119 

120 assemblerParams: Dict[str, Any] 

121 """Parameters to use for post-processing the retrieved dataset.""" 

122 

123 formatterParams: Dict[str, Any] 

124 """Parameters that were understood by the associated formatter.""" 

125 

126 component: Optional[str] 

127 """The component to be retrieved (can be `None`).""" 

128 

129 readStorageClass: StorageClass 

130 """The `StorageClass` of the dataset being read.""" 

131 

132 

133class FileLikeDatastore(GenericBaseDatastore): 

134 """Generic Datastore for file-based implementations. 

135 

136 Should always be sub-classed since key abstract methods are missing. 

137 

138 Parameters 

139 ---------- 

140 config : `DatastoreConfig` or `str` 

141 Configuration as either a `Config` object or URI to file. 

142 bridgeManager : `DatastoreRegistryBridgeManager` 

143 Object that manages the interface between `Registry` and datastores. 

144 butlerRoot : `str`, optional 

145 New datastore root to use to override the configuration value. 

146 

147 Raises 

148 ------ 

149 ValueError 

150 If root location does not exist and ``create`` is `False` in the 

151 configuration. 

152 """ 

153 

154 defaultConfigFile: ClassVar[Optional[str]] = None 

155 """Path to configuration defaults. Accessed within the ``config`` resource 

156 or relative to a search path. Can be None if no defaults specified. 

157 """ 

158 

159 root: ButlerURI 

160 """Root directory URI of this `Datastore`.""" 

161 

162 locationFactory: LocationFactory 

163 """Factory for creating locations relative to the datastore root.""" 

164 

165 formatterFactory: FormatterFactory 

166 """Factory for creating instances of formatters.""" 

167 

168 templates: FileTemplates 

169 """File templates that can be used by this `Datastore`.""" 

170 

171 composites: CompositesMap 

172 """Determines whether a dataset should be disassembled on put.""" 

173 

174 @classmethod 

175 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

176 """Set any filesystem-dependent config options for this Datastore to 

177 be appropriate for a new empty repository with the given root. 

178 

179 Parameters 

180 ---------- 

181 root : `str` 

182 URI to the root of the data repository. 

183 config : `Config` 

184 A `Config` to update. Only the subset understood by 

185 this component will be updated. Will not expand 

186 defaults. 

187 full : `Config` 

188 A complete config with all defaults expanded that can be 

189 converted to a `DatastoreConfig`. Read-only and will not be 

190 modified by this method. 

191 Repository-specific options that should not be obtained 

192 from defaults when Butler instances are constructed 

193 should be copied from ``full`` to ``config``. 

194 overwrite : `bool`, optional 

195 If `False`, do not modify a value in ``config`` if the value 

196 already exists. Default is always to overwrite with the provided 

197 ``root``. 

198 

199 Notes 

200 ----- 

201 If a keyword is explicitly defined in the supplied ``config`` it 

202 will not be overridden by this method if ``overwrite`` is `False`. 

203 This allows explicit values set in external configs to be retained. 

204 """ 

205 Config.updateParameters(DatastoreConfig, config, full, 

206 toUpdate={"root": root}, 

207 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

208 

209 @classmethod 

210 def makeTableSpec(cls) -> ddl.TableSpec: 

211 return ddl.TableSpec( 

212 fields=[ 

213 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True), 

214 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

215 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

216 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

217 # Use empty string to indicate no component 

218 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

219 # TODO: should checksum be Base64Bytes instead? 

220 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

221 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

222 ], 

223 unique=frozenset(), 

224 )
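
# Illustrative note (editor's sketch, not part of the original source): a row
# inserted into this table by addStoredItemInfo() looks roughly like the
# following dict, where the dataset_id, path, formatter and storage_class
# values are hypothetical examples:
#     {"dataset_id": 42, "path": "run/dsType/example.json",
#      "formatter": "lsst.daf.butler.formatters.json.JsonFormatter",
#      "storage_class": "StructuredDataDict", "component": "__NULL_STRING__",
#      "checksum": None, "file_size": 1024}
# "__NULL_STRING__" is the NULLSTR sentinel stored when there is no component.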

225 

226 def __init__(self, config: Union[DatastoreConfig, str], 

227 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None):

228 super().__init__(config, bridgeManager) 

229 if "root" not in self.config: 229 ↛ 230line 229 didn't jump to line 230, because the condition on line 229 was never true

230 raise ValueError("No root directory specified in configuration") 

231 

232 # Name ourselves either using an explicit name or a name 

233 # derived from the (unexpanded) root 

234 if "name" in self.config: 

235 self.name = self.config["name"] 

236 else: 

237 # We use the unexpanded root in the name to indicate that this 

238 # datastore can be moved without having to update registry. 

239 self.name = "{}@{}".format(type(self).__name__, 

240 self.config["root"]) 

241 

242 # Support repository relocation in config 

243 # Existence of self.root is checked in subclass 

244 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

245 forceDirectory=True, forceAbsolute=True) 

246 

247 self.locationFactory = LocationFactory(self.root) 

248 self.formatterFactory = FormatterFactory() 

249 

250 # Now associate formatters with storage classes 

251 self.formatterFactory.registerFormatters(self.config["formatters"], 

252 universe=bridgeManager.universe) 

253 

254 # Read the file naming templates 

255 self.templates = FileTemplates(self.config["templates"], 

256 universe=bridgeManager.universe) 

257 

258 # See if composites should be disassembled 

259 self.composites = CompositesMap(self.config["composites"], 

260 universe=bridgeManager.universe) 

261 

262 tableName = self.config["records", "table"] 

263 try: 

264 # Storage of paths and formatters, keyed by dataset_id 

265 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec()) 

266 # Interface to Registry. 

267 self._bridge = bridgeManager.register(self.name) 

268 except ReadOnlyDatabaseError: 

269 # If the database is read only and we just tried and failed to 

270 # create a table, it means someone is trying to create a read-only 

271 # butler client for an empty repo. That should be okay, as long 

272 # as they then try to get any datasets before some other client 

273 # creates the table. Chances are they're just validating

274 # configuration. 

275 pass 

276 

277 # Determine whether checksums should be used - default to False 

278 self.useChecksum = self.config.get("checksum", False) 

279 

280 def __str__(self) -> str: 

281 return str(self.root) 

282 

283 @property 

284 def bridge(self) -> DatastoreRegistryBridge: 

285 return self._bridge 

286 

287 def _artifact_exists(self, location: Location) -> bool: 

288 """Check that an artifact exists in this datastore at the specified 

289 location. 

290 

291 Parameters 

292 ---------- 

293 location : `Location` 

294 Expected location of the artifact associated with this datastore. 

295 

296 Returns 

297 ------- 

298 exists : `bool` 

299 `True` if the location can be found, `False` otherwise.

300 """ 

301 log.debug("Checking if resource exists: %s", location.uri) 

302 return location.uri.exists() 

303 

304 def _delete_artifact(self, location: Location) -> None: 

305 """Delete the artifact from the datastore. 

306 

307 Parameters 

308 ---------- 

309 location : `Location` 

310 Location of the artifact associated with this datastore. 

311 """ 

312 log.debug("Deleting file: %s", location.uri) 

313 location.uri.remove() 

314 log.debug("Successfully deleted file: %s", location.uri) 

315 

316 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

317 # Docstring inherited from GenericBaseDatastore 

318 records = [] 

319 for ref, info in zip(refs, infos): 

320 # Component should come from ref and fall back on info 

321 component = ref.datasetType.component() 

322 if component is None and info.component is not None:

323 component = info.component 

324 if component is None: 

325 # Use empty string since we want this to be part of the 

326 # primary key. 

327 component = NULLSTR 

328 records.append( 

329 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path, 

330 storage_class=info.storageClass.name, component=component, 

331 checksum=info.checksum, file_size=info.file_size) 

332 ) 

333 self._table.insert(*records) 

334 

335 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

336 # Docstring inherited from GenericBaseDatastore 

337 

338 # Look for the dataset_id -- there might be multiple matches 

339 # if we have disassembled the dataset. 

340 records = list(self._table.fetch(dataset_id=ref.id)) 

341 

342 results = [] 

343 for record in records: 

344 # Convert name of StorageClass to instance 

345 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

346 component = record["component"] if (record["component"] 

347 and record["component"] != NULLSTR) else None 

348 

349 info = StoredFileInfo(formatter=record["formatter"], 

350 path=record["path"], 

351 storageClass=storageClass, 

352 component=component, 

353 checksum=record["checksum"], 

354 file_size=record["file_size"]) 

355 results.append(info) 

356 

357 return results 

358 

359 def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]: 

360 """Return all dataset refs associated with the supplied path. 

361 

362 Parameters 

363 ---------- 

364 pathInStore : `str` 

365 Path of interest in the data store. 

366 

367 Returns 

368 ------- 

369 ids : `set` of `int` 

370 All `DatasetRef` IDs associated with this path. 

371 """ 

372 records = list(self._table.fetch(path=pathInStore)) 

373 ids = {r["dataset_id"] for r in records} 

374 return ids 

375 

376 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

377 # Docstring inherited from GenericBaseDatastore 

378 self._table.delete(dataset_id=ref.id) 

379 

380 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

381 r"""Find all the `Location`\ s of the requested dataset in the 

382 `Datastore` and the associated stored file information. 

383 

384 Parameters 

385 ---------- 

386 ref : `DatasetRef` 

387 Reference to the required `Dataset`. 

388 

389 Returns 

390 ------- 

391 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

392 Location of the dataset within the datastore and 

393 stored information about each file and its formatter. 

394 """ 

395 # Get the file information (this will fail if no file) 

396 records = self.getStoredItemsInfo(ref) 

397 

398 # Use the path to determine the location 

399 return [(self.locationFactory.fromPath(r.path), r) for r in records] 

400 

401 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

402 """Check that there is only one dataset associated with the 

403 specified artifact. 

404 

405 Parameters 

406 ---------- 

407 ref : `DatasetRef` or `FakeDatasetRef` 

408 Dataset to be removed. 

409 location : `Location` 

410 The location of the artifact to be removed. 

411 

412 Returns 

413 ------- 

414 can_remove : `bool`

415 `True` if the artifact can be safely removed.

416 """ 

417 

418 # Get all entries associated with this path 

419 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

420 if not allRefs:

421 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

422 

423 # Remove these refs from all the refs and if there is nothing left 

424 # then we can delete 

425 remainingRefs = allRefs - {ref.id} 

426 

427 if remainingRefs: 

428 return False 

429 return True 

430 

431 def _prepare_for_get(self, ref: DatasetRef, 

432 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

433 """Check parameters for ``get`` and obtain formatter and 

434 location. 

435 

436 Parameters 

437 ---------- 

438 ref : `DatasetRef` 

439 Reference to the required Dataset. 

440 parameters : `dict` 

441 `StorageClass`-specific parameters that specify, for example, 

442 a slice of the dataset to be loaded. 

443 

444 Returns 

445 ------- 

446 getInfo : `list` [`DatastoreFileGetInformation`] 

447 Parameters needed to retrieve each file. 

448 """ 

449 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

450 

451 # Get file metadata and internal metadata 

452 fileLocations = self._get_dataset_locations_info(ref) 

453 if not fileLocations: 

454 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

455 

456 # The storage class we want to use eventually 

457 refStorageClass = ref.datasetType.storageClass 

458 

459 if len(fileLocations) > 1: 

460 disassembled = True 

461 else: 

462 disassembled = False 

463 

464 # Is this a component request? 

465 refComponent = ref.datasetType.component() 

466 

467 fileGetInfo = [] 

468 for location, storedFileInfo in fileLocations: 

469 

470 # The storage class used to write the file 

471 writeStorageClass = storedFileInfo.storageClass 

472 

473 # If this has been disassembled we need read to match the write 

474 if disassembled: 

475 readStorageClass = writeStorageClass 

476 else: 

477 readStorageClass = refStorageClass 

478 

479 formatter = getInstanceOf(storedFileInfo.formatter, 

480 FileDescriptor(location, readStorageClass=readStorageClass, 

481 storageClass=writeStorageClass, parameters=parameters), 

482 ref.dataId) 

483 

484 formatterParams, notFormatterParams = formatter.segregateParameters() 

485 

486 # Of the remaining parameters, extract the ones supported by 

487 # this StorageClass (for components not all will be handled) 

488 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

489 

490 # The ref itself could be a component if the dataset was 

491 # disassembled by butler, or we disassembled in datastore and 

492 # components came from the datastore records 

493 component = storedFileInfo.component if storedFileInfo.component else refComponent 

494 

495 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

496 assemblerParams, formatterParams, 

497 component, readStorageClass)) 

498 

499 return fileGetInfo 

500 

501 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

502 """Check the arguments for ``put`` and obtain formatter and 

503 location. 

504 

505 Parameters 

506 ---------- 

507 inMemoryDataset : `object` 

508 The dataset to store. 

509 ref : `DatasetRef` 

510 Reference to the associated Dataset. 

511 

512 Returns 

513 ------- 

514 location : `Location` 

515 The location to write the dataset. 

516 formatter : `Formatter` 

517 The `Formatter` to use to write the dataset. 

518 

519 Raises 

520 ------ 

521 TypeError 

522 Supplied object and storage class are inconsistent. 

523 DatasetTypeNotSupportedError 

524 The associated `DatasetType` is not handled by this datastore. 

525 """ 

526 self._validate_put_parameters(inMemoryDataset, ref) 

527 

528 # Work out output file name 

529 try: 

530 template = self.templates.getTemplate(ref) 

531 except KeyError as e: 

532 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

533 

534 # Validate the template to protect against filenames from different 

535 # dataIds returning the same filename and causing overwrite confusion.

536 template.validateTemplate(ref) 

537 

538 location = self.locationFactory.fromPath(template.format(ref)) 

539 

540 # Get the formatter based on the storage class 

541 storageClass = ref.datasetType.storageClass 

542 try: 

543 formatter = self.formatterFactory.getFormatter(ref, 

544 FileDescriptor(location, 

545 storageClass=storageClass), 

546 ref.dataId) 

547 except KeyError as e: 

548 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

549 f"{self.name}") from e 

550 

551 # Now that we know the formatter, update the location 

552 location = formatter.makeUpdatedLocation(location) 

553 

554 return location, formatter 

555 

556 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

557 # Docstring inherited from base class 

558 if transfer != "auto": 

559 return transfer 

560 

561 # See if the paths are within the datastore or not 

562 inside = [self._pathInStore(d.path) is not None for d in datasets] 

563 

564 if all(inside): 

565 transfer = None 

566 elif not any(inside):

567 # Allow ButlerURI to use its own knowledge 

568 transfer = "auto" 

569 else: 

570 raise ValueError("Some datasets are inside the datastore and some are outside." 

571 " Please use an explicit transfer mode and not 'auto'.") 

572 

573 return transfer 

574 

575 def _pathInStore(self, path: str) -> Optional[str]: 

576 """Return path relative to datastore root 

577 

578 Parameters 

579 ---------- 

580 path : `str` 

581 Path to dataset. Can be absolute. If relative assumed to 

582 be relative to the datastore. Returns the path within the

583 datastore, or `None` if the path is outside the root.

584 

585 Returns 

586 ------- 

587 inStore : `str` 

588 Path relative to datastore root. Returns `None` if the file is 

589 outside the root. 

590 """ 

591 # Relative path will always be relative to datastore 

592 pathUri = ButlerURI(path, forceAbsolute=False) 

593 return pathUri.relative_to(self.root) 
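
# Behaviour sketch (editor's comment, not in the original module), assuming a
# hypothetical datastore root of file:///repo/: a relative path such as
# "a/b.fits" is returned as-is, "file:///repo/a/b.fits" is returned as
# "a/b.fits", and "file:///elsewhere/a/b.fits" yields None because it lies
# outside the root.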

594 

595 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str: 

596 """Standardize the path of a to-be-ingested file. 

597 

598 Parameters 

599 ---------- 

600 path : `str` 

601 Path of a file to be ingested. 

602 transfer : `str`, optional 

603 How (and whether) the dataset should be added to the datastore. 

604 See `ingest` for details of transfer modes. 

605 This implementation is provided only so 

606 `NotImplementedError` can be raised if the mode is not supported; 

607 actual transfers are deferred to `_extractIngestInfo`. 

608 

609 Returns 

610 ------- 

611 path : `str` 

612 New path in what the datastore considers standard form. 

613 

614 Notes 

615 ----- 

616 Subclasses of `FileLikeDatastore` can implement this method instead 

617 of `_prepIngest`. It should not modify the data repository or given 

618 file in any way. 

619 

620 Raises 

621 ------ 

622 NotImplementedError 

623 Raised if the datastore does not support the given transfer mode 

624 (including the case where ingest is not supported at all). 

625 FileNotFoundError 

626 Raised if one of the given files does not exist. 

627 """ 

628 if transfer not in (None,) + self.root.transferModes:

629 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

630 

631 # A relative URI indicates relative to datastore root 

632 srcUri = ButlerURI(path, forceAbsolute=False) 

633 if not srcUri.isabs(): 

634 srcUri = self.root.join(path) 

635 

636 if not srcUri.exists(): 

637 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

638 f"are assumed to be relative to {self.root} unless they are absolute.") 

639 

640 if transfer is None: 

641 relpath = srcUri.relative_to(self.root) 

642 if not relpath: 

643 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

644 f"within datastore ({self.root})") 

645 

646 # Return the relative path within the datastore for internal 

647 # transfer 

648 path = relpath 

649 

650 return path 

651 

652 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

653 formatter: Union[Formatter, Type[Formatter]], 

654 transfer: Optional[str] = None) -> StoredFileInfo: 

655 """Relocate (if necessary) and extract `StoredFileInfo` from a 

656 to-be-ingested file. 

657 

658 Parameters 

659 ---------- 

660 path : `str` or `ButlerURI` 

661 URI or path of a file to be ingested. 

662 ref : `DatasetRef` 

663 Reference for the dataset being ingested. Guaranteed to have 

664 ``dataset_id`` not `None`.

665 formatter : `type` or `Formatter` 

666 `Formatter` subclass to use for this dataset or an instance. 

667 transfer : `str`, optional 

668 How (and whether) the dataset should be added to the datastore. 

669 See `ingest` for details of transfer modes. 

670 

671 Returns 

672 ------- 

673 info : `StoredFileInfo` 

674 Internal datastore record for this file. This will be inserted by 

675 the caller; `_extractIngestInfo` is only responsible for

676 creating and populating the struct. 

677 

678 Raises 

679 ------ 

680 FileNotFoundError 

681 Raised if one of the given files does not exist. 

682 FileExistsError 

683 Raised if transfer is not `None` but the (internal) location the 

684 file would be moved to is already occupied. 

685 """ 

686 if self._transaction is None:

687 raise RuntimeError("Ingest called without transaction enabled") 

688 

689 # Create URI of the source path, do not need to force a relative 

690 # path to absolute. 

691 srcUri = ButlerURI(path, forceAbsolute=False) 

692 

693 # Track whether we have read the size of the source yet 

694 have_sized = False 

695 

696 if transfer is None: 

697 # A relative path is assumed to be relative to the datastore 

698 # in this context 

699 if not srcUri.isabs(): 

700 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

701 else: 

702 # Work out the path in the datastore from an absolute URI 

703 # This is required to be within the datastore. 

704 pathInStore = srcUri.relative_to(self.root) 

705 if pathInStore is None:

706 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

707 f"not within datastore {self.root}") 

708 tgtLocation = self.locationFactory.fromPath(pathInStore) 

709 else: 

710 # Work out the name we want this ingested file to have 

711 # inside the datastore 

712 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

713 if not tgtLocation.uri.dirname().exists(): 

714 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

715 tgtLocation.uri.dirname().mkdir() 

716 

717 # if we are transferring from a local file to a remote location 

718 # it may be more efficient to get the size and checksum of the 

719 # local file rather than the transferred one 

720 if not srcUri.scheme or srcUri.scheme == "file":

721 size = srcUri.size() 

722 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

723 have_sized = True 

724 

725 # transfer the resource to the destination 

726 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

727 

728 # the file should exist in the datastore now 

729 if not have_sized: 

730 size = tgtLocation.uri.size() 

731 checksum = self.computeChecksum(tgtLocation.uri) if self.useChecksum else None 

732 

733 return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore, 

734 storageClass=ref.datasetType.storageClass, 

735 component=ref.datasetType.component(), 

736 file_size=size, checksum=checksum) 

737 

738 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

739 # Docstring inherited from Datastore._prepIngest. 

740 filtered = [] 

741 for dataset in datasets: 

742 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

743 if not acceptable: 

744 continue 

745 else: 

746 dataset.refs = acceptable 

747 if dataset.formatter is None: 

748 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

749 else: 

750 assert isinstance(dataset.formatter, (type, str)) 

751 dataset.formatter = getClassOf(dataset.formatter) 

752 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

753 filtered.append(dataset) 

754 return _IngestPrepData(filtered) 

755 

756 @transactional 

757 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

758 # Docstring inherited from Datastore._finishIngest. 

759 refsAndInfos = [] 

760 for dataset in prepData.datasets: 

761 # Do ingest as if the first dataset ref is associated with the file 

762 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

763 transfer=transfer) 

764 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

765 self._register_datasets(refsAndInfos) 

766 

767 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

768 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

769 """Given a source URI and a DatasetRef, determine the name the 

770 dataset will have inside datastore. 

771 

772 Parameters 

773 ---------- 

774 srcUri : `ButlerURI` 

775 URI to the source dataset file. 

776 ref : `DatasetRef` 

777 Ref associated with the newly-ingested dataset artifact. This 

778 is used to determine the name within the datastore. 

779 formatter : `Formatter` or Formatter class. 

780 Formatter to use for validation. Can be a class or an instance. 

781 

782 Returns 

783 ------- 

784 location : `Location` 

785 Target location for the newly-ingested dataset. 

786 """ 

787 # Ingesting a file from outside the datastore. 

788 # This involves a new name. 

789 template = self.templates.getTemplate(ref) 

790 location = self.locationFactory.fromPath(template.format(ref)) 

791 

792 # Get the extension 

793 ext = srcUri.getExtension() 

794 

795 # Update the destination to include that extension 

796 location.updateExtension(ext) 

797 

798 # Ask the formatter to validate this extension 

799 formatter.validateExtension(location) 

800 

801 return location 

802 

803 @abstractmethod 

804 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

805 """Write out in memory dataset to datastore. 

806 

807 Parameters 

808 ---------- 

809 inMemoryDataset : `object` 

810 Dataset to write to datastore. 

811 ref : `DatasetRef` 

812 Registry information associated with this dataset. 

813 

814 Returns 

815 ------- 

816 info : `StoredFileInfo` 

817 Information describing the artifact written to the datastore.

818 """ 

819 raise NotImplementedError() 

820 

821 @abstractmethod 

822 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

823 ref: DatasetRef, isComponent: bool = False) -> Any: 

824 """Read the artifact from datastore into in memory object. 

825 

826 Parameters 

827 ---------- 

828 getInfo : `DatastoreFileGetInformation` 

829 Information about the artifact within the datastore. 

830 ref : `DatasetRef` 

831 The registry information associated with this artifact. 

832 isComponent : `bool` 

833 Flag to indicate if a component is being read from this artifact. 

834 

835 Returns 

836 ------- 

837 inMemoryDataset : `object` 

838 The artifact as a python object. 

839 """ 

840 raise NotImplementedError() 

841 

842 def exists(self, ref: DatasetRef) -> bool: 

843 """Check if the dataset exists in the datastore. 

844 

845 Parameters 

846 ---------- 

847 ref : `DatasetRef` 

848 Reference to the required dataset. 

849 

850 Returns 

851 ------- 

852 exists : `bool` 

853 `True` if the entity exists in the `Datastore`. 

854 """ 

855 fileLocations = self._get_dataset_locations_info(ref) 

856 if not fileLocations: 

857 return False 

858 for location, _ in fileLocations: 

859 if not self._artifact_exists(location): 

860 return False 

861 

862 return True 

863 

864 def getURIs(self, ref: DatasetRef, 

865 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

866 """Return URIs associated with dataset. 

867 

868 Parameters 

869 ---------- 

870 ref : `DatasetRef` 

871 Reference to the required dataset. 

872 predict : `bool`, optional 

873 If the datastore does not know about the dataset, should it 

874 return a predicted URI or not? 

875 

876 Returns 

877 ------- 

878 primary : `ButlerURI` 

879 The URI to the primary artifact associated with this dataset. 

880 If the dataset was disassembled within the datastore this 

881 may be `None`. 

882 components : `dict` 

883 URIs to any components associated with the dataset artifact. 

884 Can be empty if there are no components. 

885 """ 

886 

887 primary: Optional[ButlerURI] = None 

888 components: Dict[str, ButlerURI] = {} 

889 

890 # if this has never been written then we have to guess 

891 if not self.exists(ref): 

892 if not predict: 

893 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

894 

895 def predictLocation(thisRef: DatasetRef) -> Location: 

896 template = self.templates.getTemplate(thisRef) 

897 location = self.locationFactory.fromPath(template.format(thisRef)) 

898 storageClass = ref.datasetType.storageClass 

899 formatter = self.formatterFactory.getFormatter(thisRef, 

900 FileDescriptor(location, 

901 storageClass=storageClass)) 

902 # Try to use the extension attribute but ignore problems if the 

903 # formatter does not define one. 

904 try: 

905 location = formatter.makeUpdatedLocation(location) 

906 except Exception: 

907 # Use the default extension 

908 pass 

909 return location 

910 

911 doDisassembly = self.composites.shouldBeDisassembled(ref) 

912 

913 if doDisassembly: 

914 

915 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

916 compRef = ref.makeComponentRef(component) 

917 compLocation = predictLocation(compRef) 

918 

919 # Add a URI fragment to indicate this is a guess 

920 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

921 

922 else: 

923 

924 location = predictLocation(ref) 

925 

926 # Add a URI fragment to indicate this is a guess 

927 primary = ButlerURI(location.uri.geturl() + "#predicted") 

928 

929 return primary, components 

930 

931 # If this is a ref that we have written we can get the path. 

932 # Get file metadata and internal metadata 

933 fileLocations = self._get_dataset_locations_info(ref) 

934 

935 if not fileLocations:

936 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

937 

938 if len(fileLocations) == 1: 

939 # No disassembly so this is the primary URI 

940 primary = ButlerURI(fileLocations[0][0].uri) 

941 

942 else: 

943 for location, storedFileInfo in fileLocations: 

944 if storedFileInfo.component is None:

945 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

946 components[storedFileInfo.component] = ButlerURI(location.uri) 

947 

948 return primary, components 
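
# Illustrative return values (editor's sketch; the paths are hypothetical):
# for an assembled dataset getURIs() returns roughly
#     (ButlerURI("file:///repo/run/dsType/dataset.fits"), {})
# whereas for a dataset disassembled within the datastore it returns
#     (None, {"image": ButlerURI("file:///repo/run/dsType/dataset_image.fits"),
#             "mask": ButlerURI("file:///repo/run/dsType/dataset_mask.fits")})
# and predicted locations carry a "#predicted" URI fragment.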

949 

950 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

951 """URI to the Dataset. 

952 

953 Parameters 

954 ---------- 

955 ref : `DatasetRef` 

956 Reference to the required Dataset. 

957 predict : `bool` 

958 If `True`, allow URIs to be returned of datasets that have not 

959 been written. 

960 

961 Returns 

962 ------- 

963 uri : `ButlerURI`

964 URI pointing to the dataset within the datastore. If the 

965 dataset does not exist in the datastore, and if ``predict`` is 

966 `True`, the URI will be a prediction and will include a URI 

967 fragment "#predicted". 

968 If the datastore does not have entities that relate well 

969 to the concept of a URI the returned URI will be 

970 descriptive. The returned URI is not guaranteed to be obtainable. 

971 

972 Raises 

973 ------ 

974 FileNotFoundError 

975 Raised if a URI has been requested for a dataset that does not 

976 exist and guessing is not allowed. 

977 RuntimeError 

978 Raised if a request is made for a single URI but multiple URIs 

979 are associated with this dataset. 

980 

981 Notes 

982 ----- 

983 When a predicted URI is requested an attempt will be made to form 

984 a reasonable URI based on file templates and the expected formatter. 

985 """ 

986 primary, components = self.getURIs(ref, predict) 

987 if primary is None or components:

988 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

989 "Use Dataastore.getURIs() instead.") 

990 return primary 

991 

992 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

993 """Load an InMemoryDataset from the store. 

994 

995 Parameters 

996 ---------- 

997 ref : `DatasetRef` 

998 Reference to the required Dataset. 

999 parameters : `dict` 

1000 `StorageClass`-specific parameters that specify, for example, 

1001 a slice of the dataset to be loaded. 

1002 

1003 Returns 

1004 ------- 

1005 inMemoryDataset : `object` 

1006 Requested dataset or slice thereof as an InMemoryDataset. 

1007 

1008 Raises 

1009 ------ 

1010 FileNotFoundError 

1011 Requested dataset can not be retrieved. 

1012 TypeError 

1013 Return value from formatter has unexpected type. 

1014 ValueError 

1015 Formatter failed to process the dataset. 

1016 """ 

1017 allGetInfo = self._prepare_for_get(ref, parameters) 

1018 refComponent = ref.datasetType.component() 

1019 

1020 # Supplied storage class for the component being read 

1021 refStorageClass = ref.datasetType.storageClass 

1022 

1023 # Create mapping from component name to related info 

1024 allComponents = {i.component: i for i in allGetInfo} 

1025 

1026 # By definition the dataset is disassembled if we have more 

1027 # than one record for it. 

1028 isDisassembled = len(allGetInfo) > 1 

1029 

1030 # Look for the special case where we are disassembled but the 

1031 # component is a derived component that was not written during 

1032 # disassembly. For this scenario we need to check that the 

1033 # component requested is listed as a derived component for the 

1034 # composite storage class 

1035 isDisassembledReadOnlyComponent = False 

1036 if isDisassembled and refComponent: 

1037 # The composite storage class should be accessible through 

1038 # the component dataset type 

1039 compositeStorageClass = ref.datasetType.parentStorageClass 

1040 

1041 # In the unlikely scenario where the composite storage 

1042 # class is not known, we can only assume that this is a 

1043 # normal component. If that assumption is wrong then the 

1044 # branch below that reads a persisted component will fail 

1045 # so there is no need to complain here. 

1046 if compositeStorageClass is not None:

1047 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1048 

1049 if isDisassembled and not refComponent: 

1050 # This was a disassembled dataset spread over multiple files 

1051 # and we need to put them all back together again. 

1052 # Read into memory and then assemble 

1053 

1054 # Check that the supplied parameters are suitable for the type read 

1055 refStorageClass.validateParameters(parameters) 

1056 

1057 # We want to keep track of all the parameters that were not used 

1058 # by formatters. We assume that if any of the component formatters 

1059 # use a parameter that we do not need to apply it again in the 

1060 # assembler. 

1061 usedParams = set() 

1062 

1063 components: Dict[str, Any] = {} 

1064 for getInfo in allGetInfo: 

1065 # assemblerParams are parameters not understood by the 

1066 # associated formatter. 

1067 usedParams.update(set(getInfo.formatterParams)) 

1068 

1069 component = getInfo.component 

1070 

1071 if component is None:

1072 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1073 

1074 # We do not want the formatter to think it's reading 

1075 # a component though because it is really reading a 

1076 # standalone dataset -- always tell reader it is not a 

1077 # component. 

1078 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1079 

1080 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1081 

1082 # Any unused parameters will have to be passed to the assembler 

1083 if parameters: 

1084 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1085 else: 

1086 unusedParams = {} 

1087 

1088 # Process parameters 

1089 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1090 parameters=unusedParams) 

1091 

1092 elif isDisassembledReadOnlyComponent: 

1093 

1094 compositeStorageClass = ref.datasetType.parentStorageClass 

1095 if compositeStorageClass is None:

1096 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since" 

1097 "no composite storage class is available.") 

1098 

1099 if refComponent is None:

1100 # Mainly for mypy 

1101 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1102 

1103 # Assume that every derived component can be calculated by 

1104 # forwarding the request to a single read/write component. 

1105 # Rather than guessing which rw component is the right one by 

1106 # scanning each for a derived component of the same name, 

1107 # we ask the storage class delegate directly which one is best to 

1108 # use. 

1109 compositeDelegate = compositeStorageClass.delegate() 

1110 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1111 set(allComponents)) 

1112 

1113 # Select the relevant component 

1114 rwInfo = allComponents[forwardedComponent] 

1115 

1116 # For now assume that read parameters are validated against 

1117 # the real component and not the requested component 

1118 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1119 forwardedStorageClass.validateParameters(parameters) 

1120 

1121 # Unfortunately the FileDescriptor inside the formatter will have 

1122 # the wrong write storage class so we need to create a new one 

1123 # given the immutability constraint. 

1124 writeStorageClass = rwInfo.info.storageClass 

1125 

1126 # We may need to put some thought into parameters for read 

1127 # components but for now forward them on as is 

1128 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1129 readStorageClass=refStorageClass, 

1130 storageClass=writeStorageClass, 

1131 parameters=parameters), 

1132 ref.dataId) 

1133 

1134 # The assembler can not receive any parameter requests for a 

1135 # derived component at this time since the assembler will 

1136 # see the storage class of the derived component and those 

1137 # parameters will have to be handled by the formatter on the 

1138 # forwarded storage class. 

1139 assemblerParams: Dict[str, Any] = {} 

1140 

1141 # Need to create a new info that specifies the derived

1142 # component and associated storage class 

1143 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1144 rwInfo.info, assemblerParams, {}, 

1145 refComponent, refStorageClass) 

1146 

1147 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1148 

1149 else: 

1150 # Single file request or component from that composite file 

1151 for lookup in (refComponent, None):

1152 if lookup in allComponents:

1153 getInfo = allComponents[lookup] 

1154 break 

1155 else: 

1156 raise FileNotFoundError(f"Component {refComponent} not found " 

1157 f"for ref {ref} in datastore {self.name}") 

1158 

1159 # Do not need the component itself if already disassembled 

1160 if isDisassembled: 

1161 isComponent = False 

1162 else: 

1163 isComponent = getInfo.component is not None 

1164 

1165 # For a disassembled component we can validate parameters against

1166 # the component storage class directly 

1167 if isDisassembled: 

1168 refStorageClass.validateParameters(parameters) 

1169 else: 

1170 # For an assembled composite this could be a derived 

1171 # component derived from a real component. The validity 

1172 # of the parameters is not clear. For now validate against 

1173 # the composite storage class 

1174 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1175 

1176 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 

1177 

1178 @transactional 

1179 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1180 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1181 

1182 Parameters 

1183 ---------- 

1184 inMemoryDataset : `object` 

1185 The dataset to store. 

1186 ref : `DatasetRef` 

1187 Reference to the associated Dataset. 

1188 

1189 Raises 

1190 ------ 

1191 TypeError 

1192 Supplied object and storage class are inconsistent. 

1193 DatasetTypeNotSupportedError 

1194 The associated `DatasetType` is not handled by this datastore. 

1195 

1196 Notes 

1197 ----- 

1198 If the datastore is configured to reject certain dataset types it 

1199 is possible that the put will fail and raise a 

1200 `DatasetTypeNotSupportedError`. The main use case for this is to 

1201 allow `ChainedDatastore` to put to multiple datastores without 

1202 requiring that every datastore accepts the dataset. 

1203 """ 

1204 

1205 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1206 # doDisassembly = True 

1207 

1208 artifacts = [] 

1209 if doDisassembly: 

1210 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1211 for component, componentInfo in components.items(): 

1212 # Don't recurse because we want to take advantage of 

1213 # bulk insert -- need a new DatasetRef that refers to the 

1214 # same dataset_id but has the component DatasetType 

1215 # DatasetType does not refer to the types of components 

1216 # So we construct one ourselves. 

1217 compRef = ref.makeComponentRef(component) 

1218 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1219 artifacts.append((compRef, storedInfo)) 

1220 else: 

1221 # Write the entire thing out 

1222 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1223 artifacts.append((ref, storedInfo)) 

1224 

1225 self._register_datasets(artifacts) 
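
# Worked example (editor's comment; the component names are hypothetical):
# putting a composite whose storage class defines components "image" and
# "mask", with shouldBeDisassembled() returning True, writes one artifact per
# component, each paired with ref.makeComponentRef(name) so that every row
# shares the parent dataset_id; otherwise a single artifact is written for
# the composite as a whole.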

1226 

1227 @transactional 

1228 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None: 

1229 """Indicate to the datastore that a dataset can be removed. 

1230 

1231 Parameters 

1232 ---------- 

1233 ref : `DatasetRef` 

1234 Reference to the required Dataset. 

1235 ignore_errors : `bool` 

1236 If `True` return without error even if something went wrong. 

1237 Problems could occur if another process is simultaneously trying 

1238 to delete. 

1239 

1240 Raises 

1241 ------ 

1242 FileNotFoundError 

1243 Attempt to remove a dataset that does not exist. 

1244 """ 

1245 # Get file metadata and internal metadata 

1246 log.debug("Trashing %s in datastore %s", ref, self.name) 

1247 

1248 fileLocations = self._get_dataset_locations_info(ref) 

1249 

1250 if not fileLocations: 

1251 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1252 if ignore_errors: 

1253 log.warning(err_msg) 

1254 return 

1255 else: 

1256 raise FileNotFoundError(err_msg) 

1257 

1258 for location, storedFileInfo in fileLocations: 

1259 if not self._artifact_exists(location):

1260 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1261 f"associated artifact ({location.uri}) is missing" 

1262 if ignore_errors: 

1263 log.warning(err_msg) 

1264 return 

1265 else: 

1266 raise FileNotFoundError(err_msg) 

1267 

1268 # Mark dataset as trashed 

1269 try: 

1270 self._move_to_trash_in_registry(ref) 

1271 except Exception as e: 

1272 if ignore_errors: 

1273 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1274 f"but encountered an error: {e}") 

1275 pass 

1276 else: 

1277 raise 

1278 

1279 @transactional 

1280 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1281 """Remove all datasets from the trash. 

1282 

1283 Parameters 

1284 ---------- 

1285 ignore_errors : `bool` 

1286 If `True` return without error even if something went wrong. 

1287 Problems could occur if another process is simultaneously trying 

1288 to delete. 

1289 """ 

1290 log.debug("Emptying trash in datastore %s", self.name) 

1291 # Context manager will empty trash iff we finish it without raising. 

1292 with self.bridge.emptyTrash() as trashed: 

1293 for ref in trashed: 

1294 fileLocations = self._get_dataset_locations_info(ref) 

1295 

1296 if not fileLocations:

1297 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}" 

1298 if ignore_errors: 

1299 log.warning(err_msg) 

1300 continue 

1301 else: 

1302 raise FileNotFoundError(err_msg) 

1303 

1304 for location, _ in fileLocations: 

1305 

1306 if not self._artifact_exists(location):

1307 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}" 

1308 if ignore_errors: 

1309 log.warning(err_msg) 

1310 continue 

1311 else: 

1312 raise FileNotFoundError(err_msg) 

1313 

1314 # Can only delete the artifact if there are no references 

1315 # to the file from untrashed dataset refs. 

1316 if self._can_remove_dataset_artifact(ref, location): 

1317 # Point of no return for this artifact 

1318 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1319 try: 

1320 self._delete_artifact(location) 

1321 except Exception as e: 

1322 if ignore_errors: 

1323 log.critical("Encountered error removing artifact %s from datastore %s: %s", 

1324 location.uri, self.name, e) 

1325 else: 

1326 raise 

1327 

1328 # Now must remove the entry from the internal registry even if 

1329 # the artifact removal failed and was ignored, 

1330 # otherwise the removal check above will never be true 

1331 try: 

1332 # There may be multiple rows associated with this ref 

1333 # depending on disassembly 

1334 self.removeStoredItemInfo(ref) 

1335 except Exception as e: 

1336 if ignore_errors: 

1337 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s", 

1338 ref.id, location.uri, self.name, e) 

1339 continue 

1340 else: 

1341 raise FileNotFoundError(f"Error removing dataset {ref.id} ({location.uri}) from internal registry of {self.name}") from e

1342 

1343 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1344 logFailures: bool = False) -> None: 

1345 """Validate some of the configuration for this datastore. 

1346 

1347 Parameters 

1348 ---------- 

1349 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1350 Entities to test against this configuration. Can be differing 

1351 types. 

1352 logFailures : `bool`, optional 

1353 If `True`, output a log message for every validation error 

1354 detected. 

1355 

1356 Raises 

1357 ------ 

1358 DatastoreValidationError 

1359 Raised if there is a validation problem with a configuration. 

1360 All the problems are reported in a single exception. 

1361 

1362 Notes 

1363 ----- 

1364 This method checks that all the supplied entities have valid file 

1365 templates and also have formatters defined. 

1366 """ 

1367 

1368 templateFailed = None 

1369 try: 

1370 self.templates.validateTemplates(entities, logFailures=logFailures) 

1371 except FileTemplateValidationError as e: 

1372 templateFailed = str(e) 

1373 

1374 formatterFailed = [] 

1375 for entity in entities: 

1376 try: 

1377 self.formatterFactory.getFormatterClass(entity) 

1378 except KeyError as e: 

1379 formatterFailed.append(str(e)) 

1380 if logFailures:

1381 log.fatal("Formatter failure: %s", e) 

1382 

1383 if templateFailed or formatterFailed: 

1384 messages = [] 

1385 if templateFailed:

1386 messages.append(templateFailed) 

1387 if formatterFailed:

1388 messages.append(",".join(formatterFailed)) 

1389 msg = ";\n".join(messages) 

1390 raise DatastoreValidationError(msg) 

1391 

1392 def getLookupKeys(self) -> Set[LookupKey]: 

1393 # Docstring is inherited from base class 

1394 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1395 self.constraints.getLookupKeys() 

1396 

1397 def validateKey(self, lookupKey: LookupKey, 

1398 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1399 # Docstring is inherited from base class 

1400 # The key can be valid in either formatters or templates so we can 

1401 # only check the template if it exists 

1402 if lookupKey in self.templates: 

1403 try: 

1404 self.templates[lookupKey].validateTemplate(entity) 

1405 except FileTemplateValidationError as e: 

1406 raise DatastoreValidationError(e) from e 

1407 

1408 def export(self, refs: Iterable[DatasetRef], *, 

1409 directory: Optional[Union[ButlerURI, str]] = None, 

1410 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

1411 # Docstring inherited from Datastore.export. 

1412 if transfer is not None and directory is None:

1413 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

1414 "export directory given") 

1415 

1416 # Force the directory to be a URI object 

1417 directoryUri: Optional[ButlerURI] = None 

1418 if directory is not None:

1419 directoryUri = ButlerURI(directory, forceDirectory=True) 

1420 

1421 if transfer is not None and directoryUri is not None:

1422 # mypy needs the second test 

1423 if not directoryUri.exists():

1424 raise FileNotFoundError(f"Export location {directory} does not exist") 

1425 

1426 for ref in refs: 

1427 fileLocations = self._get_dataset_locations_info(ref) 

1428 if not fileLocations:

1429 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

1430 # For now we can not export disassembled datasets 

1431 if len(fileLocations) > 1: 

1432 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

1433 location, storedFileInfo = fileLocations[0] 

1434 if transfer is None:

1435 # TODO: do we also need to return the readStorageClass somehow? 

1436 # We will use the path in store directly 

1437 pass 

1438 else: 

1439 # mypy needs help 

1440 assert directoryUri is not None, "directoryUri must be defined to get here" 

1441 storeUri = ButlerURI(location.uri) 

1442 exportUri = directoryUri.join(location.pathInStore) 

1443 exportUri.transfer_from(storeUri, transfer=transfer) 

1444 

1445 yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter) 

1446 

1447 @staticmethod 

1448 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

1449 """Compute the checksum of the supplied file. 

1450 

1451 Parameters 

1452 ---------- 

1453 uri : `ButlerURI` 

1454 Name of resource to calculate checksum from. 

1455 algorithm : `str`, optional 

1456 Name of algorithm to use. Must be one of the algorithms supported 

1457 by :py:mod:`hashlib`.

1458 block_size : `int` 

1459 Number of bytes to read from file at one time. 

1460 

1461 Returns 

1462 ------- 

1463 hexdigest : `str` 

1464 Hex digest of the file. 

1465 

1466 Notes 

1467 ----- 

1468 Currently returns None if the URI is for a remote resource. 

1469 """ 

1470 if algorithm not in hashlib.algorithms_guaranteed:

1471 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

1472 

1473 if uri.scheme and uri.scheme != "file":

1474 return None 

1475 

1476 hasher = hashlib.new(algorithm) 

1477 

1478 filename, is_temp = uri.as_local() 

1479 

1480 with open(filename, "rb") as f: 

1481 for chunk in iter(lambda: f.read(block_size), b""): 

1482 hasher.update(chunk) 

1483 

1484 if is_temp:

1485 os.remove(filename) 

1486 

1487 return hasher.hexdigest()
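
# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original module): FileLikeDatastore is
# abstract, so a concrete datastore only has to supply the two artifact
# methods. The class name, config path and serialization details below are
# hypothetical placeholders, not the real PosixDatastore implementation,
# which is why the example is left commented out.
#
# class SketchFileDatastore(FileLikeDatastore):
#     defaultConfigFile = "datastores/sketchFileDatastore.yaml"  # hypothetical
#
#     def _write_in_memory_to_artifact(self, inMemoryDataset, ref):
#         location, formatter = self._prepare_for_put(inMemoryDataset, ref)
#         # ... serialize inMemoryDataset to location.uri using the formatter ...
#         return StoredFileInfo(formatter=formatter, path=location.pathInStore,
#                               storageClass=ref.datasetType.storageClass,
#                               component=ref.datasetType.component(),
#                               file_size=location.uri.size(),
#                               checksum=self.computeChecksum(location.uri)
#                               if self.useChecksum else None)
#
#     def _read_artifact_into_memory(self, getInfo, ref, isComponent=False):
#         # ... read from getInfo.location.uri with getInfo.formatter and
#         # return the resulting in-memory object ...
#         raise NotImplementedError("sketch only")
# ---------------------------------------------------------------------------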