Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore", ) 

26 

27import hashlib 

28import logging 

29import os 

30import tempfile 

31 

32from sqlalchemy import BigInteger, String 

33 

34from dataclasses import dataclass 

35from typing import ( 

36 TYPE_CHECKING, 

37 Any, 

38 ClassVar, 

39 Dict, 

40 Iterable, 

41 List, 

42 Mapping, 

43 Optional, 

44 Set, 

45 Tuple, 

46 Type, 

47 Union, 

48) 

49 

50from lsst.daf.butler import ( 

51 ButlerURI, 

52 CompositesMap, 

53 Config, 

54 FileDataset, 

55 DatasetRef, 

56 DatasetType, 

57 DatasetTypeNotSupportedError, 

58 Datastore, 

59 DatastoreConfig, 

60 DatastoreValidationError, 

61 FileDescriptor, 

62 FileTemplates, 

63 FileTemplateValidationError, 

64 Formatter, 

65 FormatterFactory, 

66 Location, 

67 LocationFactory, 

68 StorageClass, 

69 StoredFileInfo, 

70) 

71 

72from lsst.daf.butler import ddl 

73from lsst.daf.butler.registry.interfaces import ( 

74 ReadOnlyDatabaseError, 

75 DatastoreRegistryBridge, 

76) 

77 

78from lsst.daf.butler.core.repoRelocation import replaceRoot 

79from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

80from .genericDatastore import GenericBaseDatastore 

81 

82if TYPE_CHECKING: 82 ↛ 83line 82 didn't jump to line 83, because the condition on line 82 was never true

83 from lsst.daf.butler import LookupKey 

84 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

85 

86log = logging.getLogger(__name__) 

87 

88# String to use when a Python None is encountered 

89NULLSTR = "__NULL_STRING__" 

90 

91 

92class _IngestPrepData(Datastore.IngestPrepData): 

93 """Helper class for FileDatastore ingest implementation. 

94 

95 Parameters 

96 ---------- 

97 datasets : `list` of `FileDataset` 

98 Files to be ingested by this datastore. 

99 """ 

100 def __init__(self, datasets: List[FileDataset]): 

101 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

102 self.datasets = datasets 

103 

104 

105@dataclass(frozen=True) 

106class DatastoreFileGetInformation: 

107 """Collection of useful parameters needed to retrieve a file from 

108 a Datastore. 

109 """ 

110 

111 location: Location 

112 """The location from which to read the dataset.""" 

113 

114 formatter: Formatter 

115 """The `Formatter` to use to deserialize the dataset.""" 

116 

117 info: StoredFileInfo 

118 """Stored information about this file and its formatter.""" 

119 

120 assemblerParams: Dict[str, Any] 

121 """Parameters to use for post-processing the retrieved dataset.""" 

122 

123 formatterParams: Dict[str, Any] 

124 """Parameters that were understood by the associated formatter.""" 

125 

126 component: Optional[str] 

127 """The component to be retrieved (can be `None`).""" 

128 

129 readStorageClass: StorageClass 

130 """The `StorageClass` of the dataset being read.""" 

131 

132 

133class FileDatastore(GenericBaseDatastore): 

134 """Generic Datastore for file-based implementations. 

135 

136 Should always be sub-classed since key abstract methods are missing. 

137 

138 Parameters 

139 ---------- 

140 config : `DatastoreConfig` or `str` 

141 Configuration as either a `Config` object or URI to file. 

142 bridgeManager : `DatastoreRegistryBridgeManager` 

143 Object that manages the interface between `Registry` and datastores. 

144 butlerRoot : `str`, optional 

145 New datastore root to use to override the configuration value. 

146 

147 Raises 

148 ------ 

149 ValueError 

150 If root location does not exist and ``create`` is `False` in the 

151 configuration. 

152 """ 

153 

154 defaultConfigFile: ClassVar[Optional[str]] = None 

155 """Path to configuration defaults. Accessed within the ``config`` resource 

156 or relative to a search path. Can be None if no defaults specified. 

157 """ 

158 

159 root: ButlerURI 

160 """Root directory URI of this `Datastore`.""" 

161 

162 locationFactory: LocationFactory 

163 """Factory for creating locations relative to the datastore root.""" 

164 

165 formatterFactory: FormatterFactory 

166 """Factory for creating instances of formatters.""" 

167 

168 templates: FileTemplates 

169 """File templates that can be used by this `Datastore`.""" 

170 

171 composites: CompositesMap 

172 """Determines whether a dataset should be disassembled on put.""" 

173 

174 defaultConfigFile = "datastores/fileDatastore.yaml" 

175 """Path to configuration defaults. Accessed within the ``config`` resource 

176 or relative to a search path. Can be None if no defaults specified. 

177 """ 

178 

179 @classmethod 

180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

181 """Set any filesystem-dependent config options for this Datastore to 

182 be appropriate for a new empty repository with the given root. 

183 

184 Parameters 

185 ---------- 

186 root : `str` 

187 URI to the root of the data repository. 

188 config : `Config` 

189 A `Config` to update. Only the subset understood by 

190 this component will be updated. Will not expand 

191 defaults. 

192 full : `Config` 

193 A complete config with all defaults expanded that can be 

194 converted to a `DatastoreConfig`. Read-only and will not be 

195 modified by this method. 

196 Repository-specific options that should not be obtained 

197 from defaults when Butler instances are constructed 

198 should be copied from ``full`` to ``config``. 

199 overwrite : `bool`, optional 

200 If `False`, do not modify a value in ``config`` if the value 

201 already exists. Default is always to overwrite with the provided 

202 ``root``. 

203 

204 Notes 

205 ----- 

206 If a keyword is explicitly defined in the supplied ``config`` it 

207 will not be overridden by this method if ``overwrite`` is `False`. 

208 This allows explicit values set in external configs to be retained. 

209 """ 

210 Config.updateParameters(DatastoreConfig, config, full, 

211 toUpdate={"root": root}, 

212 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

213 

214 @classmethod 

215 def makeTableSpec(cls) -> ddl.TableSpec: 

216 return ddl.TableSpec( 

217 fields=[ 

218 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True), 

219 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

220 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

221 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

222 # Use empty string to indicate no component 

223 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

224 # TODO: should checksum be Base64Bytes instead? 

225 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

226 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

227 ], 

228 unique=frozenset(), 

229 ) 

230 

231 def __init__(self, config: Union[DatastoreConfig, str], 

232 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

233 super().__init__(config, bridgeManager) 

234 if "root" not in self.config: 234 ↛ 235line 234 didn't jump to line 235, because the condition on line 234 was never true

235 raise ValueError("No root directory specified in configuration") 

236 

237 # Name ourselves either using an explicit name or a name 

238 # derived from the (unexpanded) root 

239 if "name" in self.config: 

240 self.name = self.config["name"] 

241 else: 

242 # We use the unexpanded root in the name to indicate that this 

243 # datastore can be moved without having to update registry. 

244 self.name = "{}@{}".format(type(self).__name__, 

245 self.config["root"]) 

246 

247 # Support repository relocation in config 

248 # Existence of self.root is checked in subclass 

249 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

250 forceDirectory=True, forceAbsolute=True) 

251 

252 self.locationFactory = LocationFactory(self.root) 

253 self.formatterFactory = FormatterFactory() 

254 

255 # Now associate formatters with storage classes 

256 self.formatterFactory.registerFormatters(self.config["formatters"], 

257 universe=bridgeManager.universe) 

258 

259 # Read the file naming templates 

260 self.templates = FileTemplates(self.config["templates"], 

261 universe=bridgeManager.universe) 

262 

263 # See if composites should be disassembled 

264 self.composites = CompositesMap(self.config["composites"], 

265 universe=bridgeManager.universe) 

266 

267 tableName = self.config["records", "table"] 

268 try: 

269 # Storage of paths and formatters, keyed by dataset_id 

270 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec()) 

271 # Interface to Registry. 

272 self._bridge = bridgeManager.register(self.name) 

273 except ReadOnlyDatabaseError: 

274 # If the database is read only and we just tried and failed to 

275 # create a table, it means someone is trying to create a read-only 

276 # butler client for an empty repo. That should be okay, as long 

277 # as they then try to get any datasets before some other client 

278 # creates the table. Chances are they'rejust validating 

279 # configuration. 

280 pass 

281 

282 # Determine whether checksums should be used - default to False 

283 self.useChecksum = self.config.get("checksum", False) 

284 

285 # Check existence and create directory structure if necessary 

286 if not self.root.exists(): 

287 if "create" not in self.config or not self.config["create"]: 287 ↛ 288line 287 didn't jump to line 288, because the condition on line 287 was never true

288 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

289 try: 

290 self.root.mkdir() 

291 except Exception as e: 

292 raise ValueError(f"Can not create datastore root '{self.root}', check permissions." 

293 f" Got error: {e}") from e 

294 

295 def __str__(self) -> str: 

296 return str(self.root) 

297 

298 @property 

299 def bridge(self) -> DatastoreRegistryBridge: 

300 return self._bridge 

301 

302 def _artifact_exists(self, location: Location) -> bool: 

303 """Check that an artifact exists in this datastore at the specified 

304 location. 

305 

306 Parameters 

307 ---------- 

308 location : `Location` 

309 Expected location of the artifact associated with this datastore. 

310 

311 Returns 

312 ------- 

313 exists : `bool` 

314 True if the location can be found, false otherwise. 

315 """ 

316 log.debug("Checking if resource exists: %s", location.uri) 

317 return location.uri.exists() 

318 

319 def _delete_artifact(self, location: Location) -> None: 

320 """Delete the artifact from the datastore. 

321 

322 Parameters 

323 ---------- 

324 location : `Location` 

325 Location of the artifact associated with this datastore. 

326 """ 

327 log.debug("Deleting file: %s", location.uri) 

328 location.uri.remove() 

329 log.debug("Successfully deleted file: %s", location.uri) 

330 

331 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

332 # Docstring inherited from GenericBaseDatastore 

333 records = [] 

334 for ref, info in zip(refs, infos): 

335 # Component should come from ref and fall back on info 

336 component = ref.datasetType.component() 

337 if component is None and info.component is not None: 337 ↛ 338line 337 didn't jump to line 338, because the condition on line 337 was never true

338 component = info.component 

339 if component is None: 

340 # Use empty string since we want this to be part of the 

341 # primary key. 

342 component = NULLSTR 

343 records.append( 

344 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path, 

345 storage_class=info.storageClass.name, component=component, 

346 checksum=info.checksum, file_size=info.file_size) 

347 ) 

348 self._table.insert(*records) 

349 

350 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

351 # Docstring inherited from GenericBaseDatastore 

352 

353 # Look for the dataset_id -- there might be multiple matches 

354 # if we have disassembled the dataset. 

355 records = list(self._table.fetch(dataset_id=ref.id)) 

356 

357 results = [] 

358 for record in records: 

359 # Convert name of StorageClass to instance 

360 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

361 component = record["component"] if (record["component"] 

362 and record["component"] != NULLSTR) else None 

363 

364 info = StoredFileInfo(formatter=record["formatter"], 

365 path=record["path"], 

366 storageClass=storageClass, 

367 component=component, 

368 checksum=record["checksum"], 

369 file_size=record["file_size"]) 

370 results.append(info) 

371 

372 return results 

373 

374 def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]: 

375 """Return all dataset refs associated with the supplied path. 

376 

377 Parameters 

378 ---------- 

379 pathInStore : `str` 

380 Path of interest in the data store. 

381 

382 Returns 

383 ------- 

384 ids : `set` of `int` 

385 All `DatasetRef` IDs associated with this path. 

386 """ 

387 records = list(self._table.fetch(path=pathInStore)) 

388 ids = {r["dataset_id"] for r in records} 

389 return ids 

390 

391 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

392 # Docstring inherited from GenericBaseDatastore 

393 self._table.delete(dataset_id=ref.id) 

394 

395 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

396 r"""Find all the `Location`\ s of the requested dataset in the 

397 `Datastore` and the associated stored file information. 

398 

399 Parameters 

400 ---------- 

401 ref : `DatasetRef` 

402 Reference to the required `Dataset`. 

403 

404 Returns 

405 ------- 

406 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

407 Location of the dataset within the datastore and 

408 stored information about each file and its formatter. 

409 """ 

410 # Get the file information (this will fail if no file) 

411 records = self.getStoredItemsInfo(ref) 

412 

413 # Use the path to determine the location 

414 return [(self.locationFactory.fromPath(r.path), r) for r in records] 

415 

416 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

417 """Check that there is only one dataset associated with the 

418 specified artifact. 

419 

420 Parameters 

421 ---------- 

422 ref : `DatasetRef` or `FakeDatasetRef` 

423 Dataset to be removed. 

424 location : `Location` 

425 The location of the artifact to be removed. 

426 

427 Returns 

428 ------- 

429 can_remove : `Bool` 

430 True if the artifact can be safely removed. 

431 """ 

432 

433 # Get all entries associated with this path 

434 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

435 if not allRefs: 435 ↛ 436line 435 didn't jump to line 436, because the condition on line 435 was never true

436 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

437 

438 # Remove these refs from all the refs and if there is nothing left 

439 # then we can delete 

440 remainingRefs = allRefs - {ref.id} 

441 

442 if remainingRefs: 

443 return False 

444 return True 

445 

446 def _prepare_for_get(self, ref: DatasetRef, 

447 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

448 """Check parameters for ``get`` and obtain formatter and 

449 location. 

450 

451 Parameters 

452 ---------- 

453 ref : `DatasetRef` 

454 Reference to the required Dataset. 

455 parameters : `dict` 

456 `StorageClass`-specific parameters that specify, for example, 

457 a slice of the dataset to be loaded. 

458 

459 Returns 

460 ------- 

461 getInfo : `list` [`DatastoreFileGetInformation`] 

462 Parameters needed to retrieve each file. 

463 """ 

464 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

465 

466 # Get file metadata and internal metadata 

467 fileLocations = self._get_dataset_locations_info(ref) 

468 if not fileLocations: 

469 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

470 

471 # The storage class we want to use eventually 

472 refStorageClass = ref.datasetType.storageClass 

473 

474 if len(fileLocations) > 1: 

475 disassembled = True 

476 else: 

477 disassembled = False 

478 

479 # Is this a component request? 

480 refComponent = ref.datasetType.component() 

481 

482 fileGetInfo = [] 

483 for location, storedFileInfo in fileLocations: 

484 

485 # The storage class used to write the file 

486 writeStorageClass = storedFileInfo.storageClass 

487 

488 # If this has been disassembled we need read to match the write 

489 if disassembled: 

490 readStorageClass = writeStorageClass 

491 else: 

492 readStorageClass = refStorageClass 

493 

494 formatter = getInstanceOf(storedFileInfo.formatter, 

495 FileDescriptor(location, readStorageClass=readStorageClass, 

496 storageClass=writeStorageClass, parameters=parameters), 

497 ref.dataId) 

498 

499 formatterParams, notFormatterParams = formatter.segregateParameters() 

500 

501 # Of the remaining parameters, extract the ones supported by 

502 # this StorageClass (for components not all will be handled) 

503 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

504 

505 # The ref itself could be a component if the dataset was 

506 # disassembled by butler, or we disassembled in datastore and 

507 # components came from the datastore records 

508 component = storedFileInfo.component if storedFileInfo.component else refComponent 

509 

510 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

511 assemblerParams, formatterParams, 

512 component, readStorageClass)) 

513 

514 return fileGetInfo 

515 

516 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

517 """Check the arguments for ``put`` and obtain formatter and 

518 location. 

519 

520 Parameters 

521 ---------- 

522 inMemoryDataset : `object` 

523 The dataset to store. 

524 ref : `DatasetRef` 

525 Reference to the associated Dataset. 

526 

527 Returns 

528 ------- 

529 location : `Location` 

530 The location to write the dataset. 

531 formatter : `Formatter` 

532 The `Formatter` to use to write the dataset. 

533 

534 Raises 

535 ------ 

536 TypeError 

537 Supplied object and storage class are inconsistent. 

538 DatasetTypeNotSupportedError 

539 The associated `DatasetType` is not handled by this datastore. 

540 """ 

541 self._validate_put_parameters(inMemoryDataset, ref) 

542 

543 # Work out output file name 

544 try: 

545 template = self.templates.getTemplate(ref) 

546 except KeyError as e: 

547 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

548 

549 # Validate the template to protect against filenames from different 

550 # dataIds returning the same and causing overwrite confusion. 

551 template.validateTemplate(ref) 

552 

553 location = self.locationFactory.fromPath(template.format(ref)) 

554 

555 # Get the formatter based on the storage class 

556 storageClass = ref.datasetType.storageClass 

557 try: 

558 formatter = self.formatterFactory.getFormatter(ref, 

559 FileDescriptor(location, 

560 storageClass=storageClass), 

561 ref.dataId) 

562 except KeyError as e: 

563 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

564 f"{self.name}") from e 

565 

566 # Now that we know the formatter, update the location 

567 location = formatter.makeUpdatedLocation(location) 

568 

569 return location, formatter 

570 

571 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

572 # Docstring inherited from base class 

573 if transfer != "auto": 

574 return transfer 

575 

576 # See if the paths are within the datastore or not 

577 inside = [self._pathInStore(d.path) is not None for d in datasets] 

578 

579 if all(inside): 

580 transfer = None 

581 elif not any(inside): 581 ↛ 585line 581 didn't jump to line 585, because the condition on line 581 was never false

582 # Allow ButlerURI to use its own knowledge 

583 transfer = "auto" 

584 else: 

585 raise ValueError("Some datasets are inside the datastore and some are outside." 

586 " Please use an explicit transfer mode and not 'auto'.") 

587 

588 return transfer 

589 

590 def _pathInStore(self, path: str) -> Optional[str]: 

591 """Return path relative to datastore root 

592 

593 Parameters 

594 ---------- 

595 path : `str` 

596 Path to dataset. Can be absolute. If relative assumed to 

597 be relative to the datastore. Returns path in datastore 

598 or raises an exception if the path it outside. 

599 

600 Returns 

601 ------- 

602 inStore : `str` 

603 Path relative to datastore root. Returns `None` if the file is 

604 outside the root. 

605 """ 

606 # Relative path will always be relative to datastore 

607 pathUri = ButlerURI(path, forceAbsolute=False) 

608 return pathUri.relative_to(self.root) 

609 

610 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str: 

611 """Standardize the path of a to-be-ingested file. 

612 

613 Parameters 

614 ---------- 

615 path : `str` 

616 Path of a file to be ingested. 

617 transfer : `str`, optional 

618 How (and whether) the dataset should be added to the datastore. 

619 See `ingest` for details of transfer modes. 

620 This implementation is provided only so 

621 `NotImplementedError` can be raised if the mode is not supported; 

622 actual transfers are deferred to `_extractIngestInfo`. 

623 

624 Returns 

625 ------- 

626 path : `str` 

627 New path in what the datastore considers standard form. 

628 

629 Notes 

630 ----- 

631 Subclasses of `FileDatastore` can implement this method instead 

632 of `_prepIngest`. It should not modify the data repository or given 

633 file in any way. 

634 

635 Raises 

636 ------ 

637 NotImplementedError 

638 Raised if the datastore does not support the given transfer mode 

639 (including the case where ingest is not supported at all). 

640 FileNotFoundError 

641 Raised if one of the given files does not exist. 

642 """ 

643 if transfer not in (None,) + self.root.transferModes: 643 ↛ 644line 643 didn't jump to line 644, because the condition on line 643 was never true

644 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

645 

646 # A relative URI indicates relative to datastore root 

647 srcUri = ButlerURI(path, forceAbsolute=False) 

648 if not srcUri.isabs(): 

649 srcUri = self.root.join(path) 

650 

651 if not srcUri.exists(): 

652 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

653 f"are assumed to be relative to {self.root} unless they are absolute.") 

654 

655 if transfer is None: 

656 relpath = srcUri.relative_to(self.root) 

657 if not relpath: 

658 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

659 f"within datastore ({self.root})") 

660 

661 # Return the relative path within the datastore for internal 

662 # transfer 

663 path = relpath 

664 

665 return path 

666 

667 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

668 formatter: Union[Formatter, Type[Formatter]], 

669 transfer: Optional[str] = None) -> StoredFileInfo: 

670 """Relocate (if necessary) and extract `StoredFileInfo` from a 

671 to-be-ingested file. 

672 

673 Parameters 

674 ---------- 

675 path : `str` or `ButlerURI` 

676 URI or path of a file to be ingested. 

677 ref : `DatasetRef` 

678 Reference for the dataset being ingested. Guaranteed to have 

679 ``dataset_id not None`. 

680 formatter : `type` or `Formatter` 

681 `Formatter` subclass to use for this dataset or an instance. 

682 transfer : `str`, optional 

683 How (and whether) the dataset should be added to the datastore. 

684 See `ingest` for details of transfer modes. 

685 

686 Returns 

687 ------- 

688 info : `StoredFileInfo` 

689 Internal datastore record for this file. This will be inserted by 

690 the caller; the `_extractIngestInfo` is only resposible for 

691 creating and populating the struct. 

692 

693 Raises 

694 ------ 

695 FileNotFoundError 

696 Raised if one of the given files does not exist. 

697 FileExistsError 

698 Raised if transfer is not `None` but the (internal) location the 

699 file would be moved to is already occupied. 

700 """ 

701 if self._transaction is None: 701 ↛ 702line 701 didn't jump to line 702, because the condition on line 701 was never true

702 raise RuntimeError("Ingest called without transaction enabled") 

703 

704 # Create URI of the source path, do not need to force a relative 

705 # path to absolute. 

706 srcUri = ButlerURI(path, forceAbsolute=False) 

707 

708 # Track whether we have read the size of the source yet 

709 have_sized = False 

710 

711 if transfer is None: 

712 # A relative path is assumed to be relative to the datastore 

713 # in this context 

714 if not srcUri.isabs(): 

715 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

716 else: 

717 # Work out the path in the datastore from an absolute URI 

718 # This is required to be within the datastore. 

719 pathInStore = srcUri.relative_to(self.root) 

720 if pathInStore is None: 720 ↛ 721line 720 didn't jump to line 721, because the condition on line 720 was never true

721 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

722 f"not within datastore {self.root}") 

723 tgtLocation = self.locationFactory.fromPath(pathInStore) 

724 else: 

725 # Work out the name we want this ingested file to have 

726 # inside the datastore 

727 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

728 if not tgtLocation.uri.dirname().exists(): 

729 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

730 tgtLocation.uri.dirname().mkdir() 

731 

732 # if we are transferring from a local file to a remote location 

733 # it may be more efficient to get the size and checksum of the 

734 # local file rather than the transferred one 

735 if not srcUri.scheme or srcUri.scheme == "file": 735 ↛ 741line 735 didn't jump to line 741, because the condition on line 735 was never false

736 size = srcUri.size() 

737 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

738 have_sized = True 

739 

740 # transfer the resource to the destination 

741 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

742 

743 # the file should exist in the datastore now 

744 if not have_sized: 

745 size = tgtLocation.uri.size() 

746 checksum = self.computeChecksum(tgtLocation.uri) if self.useChecksum else None 

747 

748 return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore, 

749 storageClass=ref.datasetType.storageClass, 

750 component=ref.datasetType.component(), 

751 file_size=size, checksum=checksum) 

752 

753 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

754 # Docstring inherited from Datastore._prepIngest. 

755 filtered = [] 

756 for dataset in datasets: 

757 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

758 if not acceptable: 

759 continue 

760 else: 

761 dataset.refs = acceptable 

762 if dataset.formatter is None: 

763 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

764 else: 

765 assert isinstance(dataset.formatter, (type, str)) 

766 dataset.formatter = getClassOf(dataset.formatter) 

767 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

768 filtered.append(dataset) 

769 return _IngestPrepData(filtered) 

770 

771 @transactional 

772 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

773 # Docstring inherited from Datastore._finishIngest. 

774 refsAndInfos = [] 

775 for dataset in prepData.datasets: 

776 # Do ingest as if the first dataset ref is associated with the file 

777 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

778 transfer=transfer) 

779 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

780 self._register_datasets(refsAndInfos) 

781 

782 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

783 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

784 """Given a source URI and a DatasetRef, determine the name the 

785 dataset will have inside datastore. 

786 

787 Parameters 

788 ---------- 

789 srcUri : `ButlerURI` 

790 URI to the source dataset file. 

791 ref : `DatasetRef` 

792 Ref associated with the newly-ingested dataset artifact. This 

793 is used to determine the name within the datastore. 

794 formatter : `Formatter` or Formatter class. 

795 Formatter to use for validation. Can be a class or an instance. 

796 

797 Returns 

798 ------- 

799 location : `Location` 

800 Target location for the newly-ingested dataset. 

801 """ 

802 # Ingesting a file from outside the datastore. 

803 # This involves a new name. 

804 template = self.templates.getTemplate(ref) 

805 location = self.locationFactory.fromPath(template.format(ref)) 

806 

807 # Get the extension 

808 ext = srcUri.getExtension() 

809 

810 # Update the destination to include that extension 

811 location.updateExtension(ext) 

812 

813 # Ask the formatter to validate this extension 

814 formatter.validateExtension(location) 

815 

816 return location 

817 

818 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

819 """Write out in memory dataset to datastore. 

820 

821 Parameters 

822 ---------- 

823 inMemoryDataset : `object` 

824 Dataset to write to datastore. 

825 ref : `DatasetRef` 

826 Registry information associated with this dataset. 

827 

828 Returns 

829 ------- 

830 info : `StoredFileInfo` 

831 Information describin the artifact written to the datastore. 

832 """ 

833 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

834 uri = location.uri 

835 

836 if not uri.dirname().exists(): 

837 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

838 uri.dirname().mkdir() 

839 

840 if self._transaction is None: 840 ↛ 841line 840 didn't jump to line 841, because the condition on line 840 was never true

841 raise RuntimeError("Attempting to write artifact without transaction enabled") 

842 

843 def _removeFileExists(uri: ButlerURI) -> None: 

844 """Remove a file and do not complain if it is not there. 

845 

846 This is important since a formatter might fail before the file 

847 is written and we should not confuse people by writing spurious 

848 error messages to the log. 

849 """ 

850 try: 

851 uri.remove() 

852 except FileNotFoundError: 

853 pass 

854 

855 # Register a callback to try to delete the uploaded data if 

856 # something fails below 

857 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

858 

859 # For a local file, simply use the formatter directly 

860 if uri.isLocal: 

861 path = formatter.write(inMemoryDataset) 

862 assert self.root.join(path) == uri 

863 log.debug("Successfully wrote python object to local file at %s", uri) 

864 else: 

865 # This is a remote URI, so first try bytes and write directly else 

866 # fallback to a temporary file 

867 try: 

868 serializedDataset = formatter.toBytes(inMemoryDataset) 

869 log.debug("Writing bytes directly to %s", uri) 

870 uri.write(serializedDataset, overwrite=True) 

871 log.debug("Successfully wrote bytes directly to %s", uri) 

872 except NotImplementedError: 

873 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile: 

874 # Need to configure the formatter to write to a different 

875 # location and that needs us to overwrite internals 

876 tmpLocation = Location(*os.path.split(tmpFile.name)) 

877 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri) 

878 with formatter._updateLocation(tmpLocation): 

879 formatter.write(inMemoryDataset) 

880 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True) 

881 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

882 

883 # URI is needed to resolve what ingest case are we dealing with 

884 return self._extractIngestInfo(uri, ref, formatter=formatter) 

885 

886 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

887 ref: DatasetRef, isComponent: bool = False) -> Any: 

888 """Read the artifact from datastore into in memory object. 

889 

890 Parameters 

891 ---------- 

892 getInfo : `DatastoreFileGetInformation` 

893 Information about the artifact within the datastore. 

894 ref : `DatasetRef` 

895 The registry information associated with this artifact. 

896 isComponent : `bool` 

897 Flag to indicate if a component is being read from this artifact. 

898 

899 Returns 

900 ------- 

901 inMemoryDataset : `object` 

902 The artifact as a python object. 

903 """ 

904 location = getInfo.location 

905 uri = location.uri 

906 log.debug("Accessing data from %s", uri) 

907 

908 # Cannot recalculate checksum but can compare size as a quick check 

909 recorded_size = getInfo.info.file_size 

910 resource_size = uri.size() 

911 if resource_size != recorded_size: 911 ↛ 912line 911 didn't jump to line 912, because the condition on line 911 was never true

912 raise RuntimeError("Integrity failure in Datastore. " 

913 f"Size of file {uri} ({resource_size}) " 

914 f"does not match size recorded in registry of {recorded_size}") 

915 

916 # For the general case we have choices for how to proceed. 

917 # 1. Always use a local file (downloading the remote resource to a 

918 # temporary file if needed). 

919 # 2. Use a threshold size and read into memory and use bytes. 

920 # Use both for now with an arbitrary hand off size. 

921 # This allows small datasets to be downloaded from remote object 

922 # stores without requiring a temporary file. 

923 

924 formatter = getInfo.formatter 

925 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

926 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

927 serializedDataset = uri.read() 

928 log.debug("Deserializing %s from %d bytes from location %s with formatter %s", 

929 f"component {getInfo.component}" if isComponent else "", 

930 len(serializedDataset), uri, formatter.name()) 

931 try: 

932 result = formatter.fromBytes(serializedDataset, 

933 component=getInfo.component if isComponent else None) 

934 except Exception as e: 

935 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

936 f" ({ref.datasetType.name} from {uri}): {e}") from e 

937 else: 

938 # Read from file 

939 with uri.as_local() as local_uri: 

940 # Have to update the Location associated with the formatter 

941 # because formatter.read does not allow an override. 

942 # This could be improved. 

943 msg = "" 

944 newLocation = None 

945 if uri != local_uri: 

946 newLocation = Location(*local_uri.split()) 

947 msg = "(via download to local file)" 

948 

949 log.debug("Reading %s from location %s %s with formatter %s", 

950 f"component {getInfo.component}" if isComponent else "", 

951 uri, msg, formatter.name()) 

952 try: 

953 with formatter._updateLocation(newLocation): 

954 result = formatter.read(component=getInfo.component if isComponent else None) 

955 except Exception as e: 

956 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

957 f" ({ref.datasetType.name} from {uri}): {e}") from e 

958 

959 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

960 isComponent=isComponent) 

961 

962 def exists(self, ref: DatasetRef) -> bool: 

963 """Check if the dataset exists in the datastore. 

964 

965 Parameters 

966 ---------- 

967 ref : `DatasetRef` 

968 Reference to the required dataset. 

969 

970 Returns 

971 ------- 

972 exists : `bool` 

973 `True` if the entity exists in the `Datastore`. 

974 """ 

975 fileLocations = self._get_dataset_locations_info(ref) 

976 if not fileLocations: 

977 return False 

978 for location, _ in fileLocations: 

979 if not self._artifact_exists(location): 

980 return False 

981 

982 return True 

983 

984 def getURIs(self, ref: DatasetRef, 

985 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

986 """Return URIs associated with dataset. 

987 

988 Parameters 

989 ---------- 

990 ref : `DatasetRef` 

991 Reference to the required dataset. 

992 predict : `bool`, optional 

993 If the datastore does not know about the dataset, should it 

994 return a predicted URI or not? 

995 

996 Returns 

997 ------- 

998 primary : `ButlerURI` 

999 The URI to the primary artifact associated with this dataset. 

1000 If the dataset was disassembled within the datastore this 

1001 may be `None`. 

1002 components : `dict` 

1003 URIs to any components associated with the dataset artifact. 

1004 Can be empty if there are no components. 

1005 """ 

1006 

1007 primary: Optional[ButlerURI] = None 

1008 components: Dict[str, ButlerURI] = {} 

1009 

1010 # if this has never been written then we have to guess 

1011 if not self.exists(ref): 

1012 if not predict: 

1013 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1014 

1015 def predictLocation(thisRef: DatasetRef) -> Location: 

1016 template = self.templates.getTemplate(thisRef) 

1017 location = self.locationFactory.fromPath(template.format(thisRef)) 

1018 storageClass = ref.datasetType.storageClass 

1019 formatter = self.formatterFactory.getFormatter(thisRef, 

1020 FileDescriptor(location, 

1021 storageClass=storageClass)) 

1022 # Try to use the extension attribute but ignore problems if the 

1023 # formatter does not define one. 

1024 try: 

1025 location = formatter.makeUpdatedLocation(location) 

1026 except Exception: 

1027 # Use the default extension 

1028 pass 

1029 return location 

1030 

1031 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1032 

1033 if doDisassembly: 

1034 

1035 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1036 compRef = ref.makeComponentRef(component) 

1037 compLocation = predictLocation(compRef) 

1038 

1039 # Add a URI fragment to indicate this is a guess 

1040 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1041 

1042 else: 

1043 

1044 location = predictLocation(ref) 

1045 

1046 # Add a URI fragment to indicate this is a guess 

1047 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1048 

1049 return primary, components 

1050 

1051 # If this is a ref that we have written we can get the path. 

1052 # Get file metadata and internal metadata 

1053 fileLocations = self._get_dataset_locations_info(ref) 

1054 

1055 if not fileLocations: 1055 ↛ 1056line 1055 didn't jump to line 1056, because the condition on line 1055 was never true

1056 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1057 

1058 if len(fileLocations) == 1: 

1059 # No disassembly so this is the primary URI 

1060 primary = ButlerURI(fileLocations[0][0].uri) 

1061 

1062 else: 

1063 for location, storedFileInfo in fileLocations: 

1064 if storedFileInfo.component is None: 1064 ↛ 1065line 1064 didn't jump to line 1065, because the condition on line 1064 was never true

1065 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1066 components[storedFileInfo.component] = ButlerURI(location.uri) 

1067 

1068 return primary, components 

1069 

1070 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1071 """URI to the Dataset. 

1072 

1073 Parameters 

1074 ---------- 

1075 ref : `DatasetRef` 

1076 Reference to the required Dataset. 

1077 predict : `bool` 

1078 If `True`, allow URIs to be returned of datasets that have not 

1079 been written. 

1080 

1081 Returns 

1082 ------- 

1083 uri : `str` 

1084 URI pointing to the dataset within the datastore. If the 

1085 dataset does not exist in the datastore, and if ``predict`` is 

1086 `True`, the URI will be a prediction and will include a URI 

1087 fragment "#predicted". 

1088 If the datastore does not have entities that relate well 

1089 to the concept of a URI the returned URI will be 

1090 descriptive. The returned URI is not guaranteed to be obtainable. 

1091 

1092 Raises 

1093 ------ 

1094 FileNotFoundError 

1095 Raised if a URI has been requested for a dataset that does not 

1096 exist and guessing is not allowed. 

1097 RuntimeError 

1098 Raised if a request is made for a single URI but multiple URIs 

1099 are associated with this dataset. 

1100 

1101 Notes 

1102 ----- 

1103 When a predicted URI is requested an attempt will be made to form 

1104 a reasonable URI based on file templates and the expected formatter. 

1105 """ 

1106 primary, components = self.getURIs(ref, predict) 

1107 if primary is None or components: 1107 ↛ 1108line 1107 didn't jump to line 1108, because the condition on line 1107 was never true

1108 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

1109 "Use Dataastore.getURIs() instead.") 

1110 return primary 

1111 

1112 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1113 """Load an InMemoryDataset from the store. 

1114 

1115 Parameters 

1116 ---------- 

1117 ref : `DatasetRef` 

1118 Reference to the required Dataset. 

1119 parameters : `dict` 

1120 `StorageClass`-specific parameters that specify, for example, 

1121 a slice of the dataset to be loaded. 

1122 

1123 Returns 

1124 ------- 

1125 inMemoryDataset : `object` 

1126 Requested dataset or slice thereof as an InMemoryDataset. 

1127 

1128 Raises 

1129 ------ 

1130 FileNotFoundError 

1131 Requested dataset can not be retrieved. 

1132 TypeError 

1133 Return value from formatter has unexpected type. 

1134 ValueError 

1135 Formatter failed to process the dataset. 

1136 """ 

1137 allGetInfo = self._prepare_for_get(ref, parameters) 

1138 refComponent = ref.datasetType.component() 

1139 

1140 # Supplied storage class for the component being read 

1141 refStorageClass = ref.datasetType.storageClass 

1142 

1143 # Create mapping from component name to related info 

1144 allComponents = {i.component: i for i in allGetInfo} 

1145 

1146 # By definition the dataset is disassembled if we have more 

1147 # than one record for it. 

1148 isDisassembled = len(allGetInfo) > 1 

1149 

1150 # Look for the special case where we are disassembled but the 

1151 # component is a derived component that was not written during 

1152 # disassembly. For this scenario we need to check that the 

1153 # component requested is listed as a derived component for the 

1154 # composite storage class 

1155 isDisassembledReadOnlyComponent = False 

1156 if isDisassembled and refComponent: 

1157 # The composite storage class should be accessible through 

1158 # the component dataset type 

1159 compositeStorageClass = ref.datasetType.parentStorageClass 

1160 

1161 # In the unlikely scenario where the composite storage 

1162 # class is not known, we can only assume that this is a 

1163 # normal component. If that assumption is wrong then the 

1164 # branch below that reads a persisted component will fail 

1165 # so there is no need to complain here. 

1166 if compositeStorageClass is not None: 1166 ↛ 1169line 1166 didn't jump to line 1169, because the condition on line 1166 was never false

1167 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1168 

1169 if isDisassembled and not refComponent: 

1170 # This was a disassembled dataset spread over multiple files 

1171 # and we need to put them all back together again. 

1172 # Read into memory and then assemble 

1173 

1174 # Check that the supplied parameters are suitable for the type read 

1175 refStorageClass.validateParameters(parameters) 

1176 

1177 # We want to keep track of all the parameters that were not used 

1178 # by formatters. We assume that if any of the component formatters 

1179 # use a parameter that we do not need to apply it again in the 

1180 # assembler. 

1181 usedParams = set() 

1182 

1183 components: Dict[str, Any] = {} 

1184 for getInfo in allGetInfo: 

1185 # assemblerParams are parameters not understood by the 

1186 # associated formatter. 

1187 usedParams.update(set(getInfo.formatterParams)) 

1188 

1189 component = getInfo.component 

1190 

1191 if component is None: 1191 ↛ 1192line 1191 didn't jump to line 1192, because the condition on line 1191 was never true

1192 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1193 

1194 # We do not want the formatter to think it's reading 

1195 # a component though because it is really reading a 

1196 # standalone dataset -- always tell reader it is not a 

1197 # component. 

1198 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1199 

1200 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1201 

1202 # Any unused parameters will have to be passed to the assembler 

1203 if parameters: 

1204 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1205 else: 

1206 unusedParams = {} 

1207 

1208 # Process parameters 

1209 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1210 parameters=unusedParams) 

1211 

1212 elif isDisassembledReadOnlyComponent: 

1213 

1214 compositeStorageClass = ref.datasetType.parentStorageClass 

1215 if compositeStorageClass is None: 1215 ↛ 1216line 1215 didn't jump to line 1216, because the condition on line 1215 was never true

1216 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since" 

1217 "no composite storage class is available.") 

1218 

1219 if refComponent is None: 1219 ↛ 1221line 1219 didn't jump to line 1221, because the condition on line 1219 was never true

1220 # Mainly for mypy 

1221 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1222 

1223 # Assume that every derived component can be calculated by 

1224 # forwarding the request to a single read/write component. 

1225 # Rather than guessing which rw component is the right one by 

1226 # scanning each for a derived component of the same name, 

1227 # we ask the storage class delegate directly which one is best to 

1228 # use. 

1229 compositeDelegate = compositeStorageClass.delegate() 

1230 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1231 set(allComponents)) 

1232 

1233 # Select the relevant component 

1234 rwInfo = allComponents[forwardedComponent] 

1235 

1236 # For now assume that read parameters are validated against 

1237 # the real component and not the requested component 

1238 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1239 forwardedStorageClass.validateParameters(parameters) 

1240 

1241 # Unfortunately the FileDescriptor inside the formatter will have 

1242 # the wrong write storage class so we need to create a new one 

1243 # given the immutability constraint. 

1244 writeStorageClass = rwInfo.info.storageClass 

1245 

1246 # We may need to put some thought into parameters for read 

1247 # components but for now forward them on as is 

1248 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1249 readStorageClass=refStorageClass, 

1250 storageClass=writeStorageClass, 

1251 parameters=parameters), 

1252 ref.dataId) 

1253 

1254 # The assembler can not receive any parameter requests for a 

1255 # derived component at this time since the assembler will 

1256 # see the storage class of the derived component and those 

1257 # parameters will have to be handled by the formatter on the 

1258 # forwarded storage class. 

1259 assemblerParams: Dict[str, Any] = {} 

1260 

1261 # Need to created a new info that specifies the derived 

1262 # component and associated storage class 

1263 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1264 rwInfo.info, assemblerParams, {}, 

1265 refComponent, refStorageClass) 

1266 

1267 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1268 

1269 else: 

1270 # Single file request or component from that composite file 

1271 for lookup in (refComponent, None): 1271 ↛ 1276line 1271 didn't jump to line 1276, because the loop on line 1271 didn't complete

1272 if lookup in allComponents: 1272 ↛ 1271line 1272 didn't jump to line 1271, because the condition on line 1272 was never false

1273 getInfo = allComponents[lookup] 

1274 break 

1275 else: 

1276 raise FileNotFoundError(f"Component {refComponent} not found " 

1277 f"for ref {ref} in datastore {self.name}") 

1278 

1279 # Do not need the component itself if already disassembled 

1280 if isDisassembled: 

1281 isComponent = False 

1282 else: 

1283 isComponent = getInfo.component is not None 

1284 

1285 # For a disassembled component we can validate parametersagainst 

1286 # the component storage class directly 

1287 if isDisassembled: 

1288 refStorageClass.validateParameters(parameters) 

1289 else: 

1290 # For an assembled composite this could be a derived 

1291 # component derived from a real component. The validity 

1292 # of the parameters is not clear. For now validate against 

1293 # the composite storage class 

1294 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1295 

1296 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 

1297 

1298 @transactional 

1299 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1300 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1301 

1302 Parameters 

1303 ---------- 

1304 inMemoryDataset : `object` 

1305 The dataset to store. 

1306 ref : `DatasetRef` 

1307 Reference to the associated Dataset. 

1308 

1309 Raises 

1310 ------ 

1311 TypeError 

1312 Supplied object and storage class are inconsistent. 

1313 DatasetTypeNotSupportedError 

1314 The associated `DatasetType` is not handled by this datastore. 

1315 

1316 Notes 

1317 ----- 

1318 If the datastore is configured to reject certain dataset types it 

1319 is possible that the put will fail and raise a 

1320 `DatasetTypeNotSupportedError`. The main use case for this is to 

1321 allow `ChainedDatastore` to put to multiple datastores without 

1322 requiring that every datastore accepts the dataset. 

1323 """ 

1324 

1325 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1326 # doDisassembly = True 

1327 

1328 artifacts = [] 

1329 if doDisassembly: 

1330 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1331 for component, componentInfo in components.items(): 

1332 # Don't recurse because we want to take advantage of 

1333 # bulk insert -- need a new DatasetRef that refers to the 

1334 # same dataset_id but has the component DatasetType 

1335 # DatasetType does not refer to the types of components 

1336 # So we construct one ourselves. 

1337 compRef = ref.makeComponentRef(component) 

1338 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1339 artifacts.append((compRef, storedInfo)) 

1340 else: 

1341 # Write the entire thing out 

1342 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1343 artifacts.append((ref, storedInfo)) 

1344 

1345 self._register_datasets(artifacts) 

1346 

1347 @transactional 

1348 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None: 

1349 """Indicate to the datastore that a dataset can be removed. 

1350 

1351 Parameters 

1352 ---------- 

1353 ref : `DatasetRef` 

1354 Reference to the required Dataset. 

1355 ignore_errors : `bool` 

1356 If `True` return without error even if something went wrong. 

1357 Problems could occur if another process is simultaneously trying 

1358 to delete. 

1359 

1360 Raises 

1361 ------ 

1362 FileNotFoundError 

1363 Attempt to remove a dataset that does not exist. 

1364 """ 

1365 # Get file metadata and internal metadata 

1366 log.debug("Trashing %s in datastore %s", ref, self.name) 

1367 

1368 fileLocations = self._get_dataset_locations_info(ref) 

1369 

1370 if not fileLocations: 

1371 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1372 if ignore_errors: 

1373 log.warning(err_msg) 

1374 return 

1375 else: 

1376 raise FileNotFoundError(err_msg) 

1377 

1378 for location, storedFileInfo in fileLocations: 

1379 if not self._artifact_exists(location): 1379 ↛ 1380line 1379 didn't jump to line 1380, because the condition on line 1379 was never true

1380 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1381 f"associated artifact ({location.uri}) is missing" 

1382 if ignore_errors: 

1383 log.warning(err_msg) 

1384 return 

1385 else: 

1386 raise FileNotFoundError(err_msg) 

1387 

1388 # Mark dataset as trashed 

1389 try: 

1390 self._move_to_trash_in_registry(ref) 

1391 except Exception as e: 

1392 if ignore_errors: 

1393 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1394 f"but encountered an error: {e}") 

1395 pass 

1396 else: 

1397 raise 

1398 

1399 @transactional 

1400 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1401 """Remove all datasets from the trash. 

1402 

1403 Parameters 

1404 ---------- 

1405 ignore_errors : `bool` 

1406 If `True` return without error even if something went wrong. 

1407 Problems could occur if another process is simultaneously trying 

1408 to delete. 

1409 """ 

1410 log.debug("Emptying trash in datastore %s", self.name) 

1411 # Context manager will empty trash iff we finish it without raising. 

1412 with self.bridge.emptyTrash() as trashed: 

1413 for ref in trashed: 

1414 fileLocations = self._get_dataset_locations_info(ref) 

1415 

1416 if not fileLocations: 1416 ↛ 1417line 1416 didn't jump to line 1417, because the condition on line 1416 was never true

1417 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}" 

1418 if ignore_errors: 

1419 log.warning(err_msg) 

1420 continue 

1421 else: 

1422 raise FileNotFoundError(err_msg) 

1423 

1424 for location, _ in fileLocations: 

1425 

1426 if not self._artifact_exists(location): 1426 ↛ 1427line 1426 didn't jump to line 1427, because the condition on line 1426 was never true

1427 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}" 

1428 if ignore_errors: 

1429 log.warning(err_msg) 

1430 continue 

1431 else: 

1432 raise FileNotFoundError(err_msg) 

1433 

1434 # Can only delete the artifact if there are no references 

1435 # to the file from untrashed dataset refs. 

1436 if self._can_remove_dataset_artifact(ref, location): 

1437 # Point of no return for this artifact 

1438 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1439 try: 

1440 self._delete_artifact(location) 

1441 except Exception as e: 

1442 if ignore_errors: 

1443 log.critical("Encountered error removing artifact %s from datastore %s: %s", 

1444 location.uri, self.name, e) 

1445 else: 

1446 raise 

1447 

1448 # Now must remove the entry from the internal registry even if 

1449 # the artifact removal failed and was ignored, 

1450 # otherwise the removal check above will never be true 

1451 try: 

1452 # There may be multiple rows associated with this ref 

1453 # depending on disassembly 

1454 self.removeStoredItemInfo(ref) 

1455 except Exception as e: 

1456 if ignore_errors: 

1457 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s", 

1458 ref.id, location.uri, self.name, e) 

1459 continue 

1460 else: 

1461 raise FileNotFoundError(err_msg) 

1462 

1463 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1464 logFailures: bool = False) -> None: 

1465 """Validate some of the configuration for this datastore. 

1466 

1467 Parameters 

1468 ---------- 

1469 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1470 Entities to test against this configuration. Can be differing 

1471 types. 

1472 logFailures : `bool`, optional 

1473 If `True`, output a log message for every validation error 

1474 detected. 

1475 

1476 Raises 

1477 ------ 

1478 DatastoreValidationError 

1479 Raised if there is a validation problem with a configuration. 

1480 All the problems are reported in a single exception. 

1481 

1482 Notes 

1483 ----- 

1484 This method checks that all the supplied entities have valid file 

1485 templates and also have formatters defined. 

1486 """ 

1487 

1488 templateFailed = None 

1489 try: 

1490 self.templates.validateTemplates(entities, logFailures=logFailures) 

1491 except FileTemplateValidationError as e: 

1492 templateFailed = str(e) 

1493 

1494 formatterFailed = [] 

1495 for entity in entities: 

1496 try: 

1497 self.formatterFactory.getFormatterClass(entity) 

1498 except KeyError as e: 

1499 formatterFailed.append(str(e)) 

1500 if logFailures: 1500 ↛ 1495line 1500 didn't jump to line 1495, because the condition on line 1500 was never false

1501 log.fatal("Formatter failure: %s", e) 

1502 

1503 if templateFailed or formatterFailed: 

1504 messages = [] 

1505 if templateFailed: 1505 ↛ 1506line 1505 didn't jump to line 1506, because the condition on line 1505 was never true

1506 messages.append(templateFailed) 

1507 if formatterFailed: 1507 ↛ 1509line 1507 didn't jump to line 1509, because the condition on line 1507 was never false

1508 messages.append(",".join(formatterFailed)) 

1509 msg = ";\n".join(messages) 

1510 raise DatastoreValidationError(msg) 

1511 

1512 def getLookupKeys(self) -> Set[LookupKey]: 

1513 # Docstring is inherited from base class 

1514 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1515 self.constraints.getLookupKeys() 

1516 

1517 def validateKey(self, lookupKey: LookupKey, 

1518 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1519 # Docstring is inherited from base class 

1520 # The key can be valid in either formatters or templates so we can 

1521 # only check the template if it exists 

1522 if lookupKey in self.templates: 

1523 try: 

1524 self.templates[lookupKey].validateTemplate(entity) 

1525 except FileTemplateValidationError as e: 

1526 raise DatastoreValidationError(e) from e 

1527 

1528 def export(self, refs: Iterable[DatasetRef], *, 

1529 directory: Optional[Union[ButlerURI, str]] = None, 

1530 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

1531 # Docstring inherited from Datastore.export. 

1532 if transfer is not None and directory is None: 1532 ↛ 1533line 1532 didn't jump to line 1533, because the condition on line 1532 was never true

1533 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

1534 "export directory given") 

1535 

1536 # Force the directory to be a URI object 

1537 directoryUri: Optional[ButlerURI] = None 

1538 if directory is not None: 1538 ↛ 1541line 1538 didn't jump to line 1541, because the condition on line 1538 was never false

1539 directoryUri = ButlerURI(directory, forceDirectory=True) 

1540 

1541 if transfer is not None and directoryUri is not None: 1541 ↛ 1546line 1541 didn't jump to line 1546, because the condition on line 1541 was never false

1542 # mypy needs the second test 

1543 if not directoryUri.exists(): 1543 ↛ 1544line 1543 didn't jump to line 1544, because the condition on line 1543 was never true

1544 raise FileNotFoundError(f"Export location {directory} does not exist") 

1545 

1546 for ref in refs: 

1547 fileLocations = self._get_dataset_locations_info(ref) 

1548 if not fileLocations: 1548 ↛ 1549line 1548 didn't jump to line 1549, because the condition on line 1548 was never true

1549 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

1550 # For now we can not export disassembled datasets 

1551 if len(fileLocations) > 1: 

1552 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

1553 location, storedFileInfo = fileLocations[0] 

1554 if transfer is None: 1554 ↛ 1557line 1554 didn't jump to line 1557, because the condition on line 1554 was never true

1555 # TODO: do we also need to return the readStorageClass somehow? 

1556 # We will use the path in store directly 

1557 pass 

1558 else: 

1559 # mypy needs help 

1560 assert directoryUri is not None, "directoryUri must be defined to get here" 

1561 storeUri = ButlerURI(location.uri) 

1562 exportUri = directoryUri.join(location.pathInStore) 

1563 exportUri.transfer_from(storeUri, transfer=transfer) 

1564 

1565 yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter) 

1566 

1567 @staticmethod 

1568 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

1569 """Compute the checksum of the supplied file. 

1570 

1571 Parameters 

1572 ---------- 

1573 uri : `ButlerURI` 

1574 Name of resource to calculate checksum from. 

1575 algorithm : `str`, optional 

1576 Name of algorithm to use. Must be one of the algorithms supported 

1577 by :py:class`hashlib`. 

1578 block_size : `int` 

1579 Number of bytes to read from file at one time. 

1580 

1581 Returns 

1582 ------- 

1583 hexdigest : `str` 

1584 Hex digest of the file. 

1585 

1586 Notes 

1587 ----- 

1588 Currently returns None if the URI is for a remote resource. 

1589 """ 

1590 if algorithm not in hashlib.algorithms_guaranteed: 1590 ↛ 1591line 1590 didn't jump to line 1591, because the condition on line 1590 was never true

1591 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

1592 

1593 if not uri.isLocal: 1593 ↛ 1594line 1593 didn't jump to line 1594, because the condition on line 1593 was never true

1594 return None 

1595 

1596 hasher = hashlib.new(algorithm) 

1597 

1598 with uri.as_local() as local_uri: 

1599 with open(local_uri.ospath, "rb") as f: 

1600 for chunk in iter(lambda: f.read(block_size), b""): 

1601 hasher.update(chunk) 

1602 

1603 return hasher.hexdigest()