
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore", ) 

26 

27import hashlib 

28import logging 

29import os 

30import tempfile 

31 

32from sqlalchemy import BigInteger, String 

33 

34from dataclasses import dataclass 

35from typing import ( 

36 TYPE_CHECKING, 

37 Any, 

38 ClassVar, 

39 Dict, 

40 Iterable, 

41 List, 

42 Mapping, 

43 Optional, 

44 Set, 

45 Tuple, 

46 Type, 

47 Union, 

48) 

49 

50from lsst.daf.butler import ( 

51 ButlerURI, 

52 CompositesMap, 

53 Config, 

54 FileDataset, 

55 DatasetRef, 

56 DatasetType, 

57 DatasetTypeNotSupportedError, 

58 Datastore, 

59 DatastoreConfig, 

60 DatastoreValidationError, 

61 FileDescriptor, 

62 FileTemplates, 

63 FileTemplateValidationError, 

64 Formatter, 

65 FormatterFactory, 

66 Location, 

67 LocationFactory, 

68 StorageClass, 

69 StoredFileInfo, 

70) 

71 

72from lsst.daf.butler import ddl 

73from lsst.daf.butler.registry.interfaces import ( 

74 ReadOnlyDatabaseError, 

75 DatastoreRegistryBridge, 

76) 

77 

78from lsst.daf.butler.core.repoRelocation import replaceRoot 

79from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

80from .genericDatastore import GenericBaseDatastore 

81 

82if TYPE_CHECKING: 

83 from lsst.daf.butler import LookupKey 

84 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

85 

86log = logging.getLogger(__name__) 

87 

88# String to use when a Python None is encountered 

89NULLSTR = "__NULL_STRING__" 
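# The sentinel is needed because ``component`` is part of the primary key of
# the records table defined in makeTableSpec() below, and a primary key column
# cannot hold NULL, so composite-level entries store this placeholder string
# instead of an empty component name.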

90 

91 

92class _IngestPrepData(Datastore.IngestPrepData): 

93 """Helper class for FileDatastore ingest implementation. 

94 

95 Parameters 

96 ---------- 

97 datasets : `list` of `FileDataset` 

98 Files to be ingested by this datastore. 

99 """ 

100 def __init__(self, datasets: List[FileDataset]): 

101 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

102 self.datasets = datasets 

103 

104 

105@dataclass(frozen=True) 

106class DatastoreFileGetInformation: 

107 """Collection of useful parameters needed to retrieve a file from 

108 a Datastore. 

109 """ 

110 

111 location: Location 

112 """The location from which to read the dataset.""" 

113 

114 formatter: Formatter 

115 """The `Formatter` to use to deserialize the dataset.""" 

116 

117 info: StoredFileInfo 

118 """Stored information about this file and its formatter.""" 

119 

120 assemblerParams: Dict[str, Any] 

121 """Parameters to use for post-processing the retrieved dataset.""" 

122 

123 formatterParams: Dict[str, Any] 

124 """Parameters that were understood by the associated formatter.""" 

125 

126 component: Optional[str] 

127 """The component to be retrieved (can be `None`).""" 

128 

129 readStorageClass: StorageClass 

130 """The `StorageClass` of the dataset being read.""" 

131 

132 

133class FileDatastore(GenericBaseDatastore): 

134 """Generic Datastore for file-based implementations. 

135 

136 Should always be sub-classed since key abstract methods are missing. 

137 

138 Parameters 

139 ---------- 

140 config : `DatastoreConfig` or `str` 

141 Configuration as either a `Config` object or URI to file. 

142 bridgeManager : `DatastoreRegistryBridgeManager` 

143 Object that manages the interface between `Registry` and datastores. 

144 butlerRoot : `str`, optional 

145 New datastore root to use to override the configuration value. 

146 

147 Raises 

148 ------ 

149 ValueError 

150 If root location does not exist and ``create`` is `False` in the 

151 configuration. 

152 """ 

153 

154 defaultConfigFile: ClassVar[Optional[str]] = None 

155 """Path to configuration defaults. Accessed within the ``config`` resource 

156 or relative to a search path. Can be None if no defaults specified. 

157 """ 

158 

159 root: ButlerURI 

160 """Root directory URI of this `Datastore`.""" 

161 

162 locationFactory: LocationFactory 

163 """Factory for creating locations relative to the datastore root.""" 

164 

165 formatterFactory: FormatterFactory 

166 """Factory for creating instances of formatters.""" 

167 

168 templates: FileTemplates 

169 """File templates that can be used by this `Datastore`.""" 

170 

171 composites: CompositesMap 

172 """Determines whether a dataset should be disassembled on put.""" 

173 

174 defaultConfigFile = "datastores/fileDatastore.yaml" 

175 """Path to configuration defaults. Accessed within the ``config`` resource 

176 or relative to a search path. Can be None if no defaults specified. 

177 """ 

178 

179 @classmethod 

180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

181 """Set any filesystem-dependent config options for this Datastore to 

182 be appropriate for a new empty repository with the given root. 

183 

184 Parameters 

185 ---------- 

186 root : `str` 

187 URI to the root of the data repository. 

188 config : `Config` 

189 A `Config` to update. Only the subset understood by 

190 this component will be updated. Will not expand 

191 defaults. 

192 full : `Config` 

193 A complete config with all defaults expanded that can be 

194 converted to a `DatastoreConfig`. Read-only and will not be 

195 modified by this method. 

196 Repository-specific options that should not be obtained 

197 from defaults when Butler instances are constructed 

198 should be copied from ``full`` to ``config``. 

199 overwrite : `bool`, optional 

200 If `False`, do not modify a value in ``config`` if the value 

201 already exists. Default is always to overwrite with the provided 

202 ``root``. 

203 

204 Notes 

205 ----- 

206 If a keyword is explicitly defined in the supplied ``config`` it 

207 will not be overridden by this method if ``overwrite`` is `False`. 

208 This allows explicit values set in external configs to be retained. 

209 """ 

210 Config.updateParameters(DatastoreConfig, config, full, 

211 toUpdate={"root": root}, 

212 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

213 

214 @classmethod 

215 def makeTableSpec(cls) -> ddl.TableSpec: 

216 return ddl.TableSpec( 

217 fields=[ 

218 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True), 

219 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

220 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

221 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

222 # Use empty string to indicate no component 

223 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

224 # TODO: should checksum be Base64Bytes instead? 

225 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

226 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

227 ], 

228 unique=frozenset(), 

229 ) 
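# For orientation, a row in this table (all values hypothetical) mirrors the
# dictionaries assembled in addStoredItemInfo() below, e.g.:
#   {"dataset_id": 42, "path": "datasetType/run/dataset.fits",
#    "formatter": "mypackage.formatters.MyFormatter",
#    "storage_class": "StructuredDataDict", "component": "__NULL_STRING__",
#    "checksum": None, "file_size": 1024}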

230 

231 def __init__(self, config: Union[DatastoreConfig, str], 

232 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

233 super().__init__(config, bridgeManager) 

234 if "root" not in self.config: 234 ↛ 235line 234 didn't jump to line 235, because the condition on line 234 was never true

235 raise ValueError("No root directory specified in configuration") 

236 

237 # Name ourselves either using an explicit name or a name 

238 # derived from the (unexpanded) root 

239 if "name" in self.config: 

240 self.name = self.config["name"] 

241 else: 

242 # We use the unexpanded root in the name to indicate that this 

243 # datastore can be moved without having to update registry. 

244 self.name = "{}@{}".format(type(self).__name__, 

245 self.config["root"]) 

246 

247 # Support repository relocation in config 

248 # Existence of self.root is checked in subclass 

249 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

250 forceDirectory=True, forceAbsolute=True) 

251 

252 self.locationFactory = LocationFactory(self.root) 

253 self.formatterFactory = FormatterFactory() 

254 

255 # Now associate formatters with storage classes 

256 self.formatterFactory.registerFormatters(self.config["formatters"], 

257 universe=bridgeManager.universe) 

258 

259 # Read the file naming templates 

260 self.templates = FileTemplates(self.config["templates"], 

261 universe=bridgeManager.universe) 

262 

263 # See if composites should be disassembled 

264 self.composites = CompositesMap(self.config["composites"], 

265 universe=bridgeManager.universe) 

266 

267 tableName = self.config["records", "table"] 

268 try: 

269 # Storage of paths and formatters, keyed by dataset_id 

270 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec()) 

271 # Interface to Registry. 

272 self._bridge = bridgeManager.register(self.name) 

273 except ReadOnlyDatabaseError: 

274 # If the database is read only and we just tried and failed to 

275 # create a table, it means someone is trying to create a read-only 

276 # butler client for an empty repo. That should be okay, as long 

277 # as they then try to get any datasets before some other client 

278 # creates the table. Chances are they're just validating 

279 # configuration. 

280 pass 

281 

282 # Determine whether checksums should be used - default to False 

283 self.useChecksum = self.config.get("checksum", False) 

284 

285 # Determine whether we can fall back to configuration if a 

286 # requested dataset is not known to registry 

287 self.trustGetRequest = self.config.get("trust_get_request", False) 

288 

289 # Check existence and create directory structure if necessary 

290 if not self.root.exists(): 

291 if "create" not in self.config or not self.config["create"]: 291 ↛ 292line 291 didn't jump to line 292, because the condition on line 291 was never true

292 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

293 try: 

294 self.root.mkdir() 

295 except Exception as e: 

296 raise ValueError(f"Can not create datastore root '{self.root}', check permissions." 

297 f" Got error: {e}") from e 

298 

299 def __str__(self) -> str: 

300 return str(self.root) 

301 

302 @property 

303 def bridge(self) -> DatastoreRegistryBridge: 

304 return self._bridge 

305 

306 def _artifact_exists(self, location: Location) -> bool: 

307 """Check that an artifact exists in this datastore at the specified 

308 location. 

309 

310 Parameters 

311 ---------- 

312 location : `Location` 

313 Expected location of the artifact associated with this datastore. 

314 

315 Returns 

316 ------- 

317 exists : `bool` 

318 `True` if the location can be found, `False` otherwise. 

319 """ 

320 log.debug("Checking if resource exists: %s", location.uri) 

321 return location.uri.exists() 

322 

323 def _delete_artifact(self, location: Location) -> None: 

324 """Delete the artifact from the datastore. 

325 

326 Parameters 

327 ---------- 

328 location : `Location` 

329 Location of the artifact associated with this datastore. 

330 """ 

331 if location.pathInStore.isabs(): 

332 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

333 log.debug("Deleting file: %s", location.uri) 

334 location.uri.remove() 

335 log.debug("Successfully deleted file: %s", location.uri) 

336 

337 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

338 # Docstring inherited from GenericBaseDatastore 

339 records = [] 

340 for ref, info in zip(refs, infos): 

341 # Component should come from ref and fall back on info 

342 component = ref.datasetType.component() 

343 if component is None and info.component is not None: 

344 component = info.component 

345 if component is None: 

346 # Use empty string since we want this to be part of the 

347 # primary key. 

348 component = NULLSTR 

349 records.append( 

350 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path, 

351 storage_class=info.storageClass.name, component=component, 

352 checksum=info.checksum, file_size=info.file_size) 

353 ) 

354 self._table.insert(*records) 

355 

356 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

357 # Docstring inherited from GenericBaseDatastore 

358 

359 # Look for the dataset_id -- there might be multiple matches 

360 # if we have disassembled the dataset. 

361 records = list(self._table.fetch(dataset_id=ref.id)) 

362 

363 results = [] 

364 for record in records: 

365 # Convert name of StorageClass to instance 

366 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

367 component = record["component"] if (record["component"] 

368 and record["component"] != NULLSTR) else None 

369 

370 info = StoredFileInfo(formatter=record["formatter"], 

371 path=record["path"], 

372 storageClass=storageClass, 

373 component=component, 

374 checksum=record["checksum"], 

375 file_size=record["file_size"]) 

376 results.append(info) 

377 

378 return results 

379 

380 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[int]: 

381 """Return all dataset refs associated with the supplied path. 

382 

383 Parameters 

384 ---------- 

385 pathInStore : `ButlerURI` 

386 Path of interest in the data store. 

387 

388 Returns 

389 ------- 

390 ids : `set` of `int` 

391 All `DatasetRef` IDs associated with this path. 

392 """ 

393 records = list(self._table.fetch(path=str(pathInStore))) 

394 ids = {r["dataset_id"] for r in records} 

395 return ids 

396 

397 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

398 # Docstring inherited from GenericBaseDatastore 

399 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

400 

401 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

402 r"""Find all the `Location`\ s of the requested dataset in the 

403 `Datastore` and the associated stored file information. 

404 

405 Parameters 

406 ---------- 

407 ref : `DatasetRef` 

408 Reference to the required `Dataset`. 

409 

410 Returns 

411 ------- 

412 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

413 Location of the dataset within the datastore and 

414 stored information about each file and its formatter. 

415 """ 

416 # Get the file information (this will fail if no file) 

417 records = self.getStoredItemsInfo(ref) 

418 

419 # Use the path to determine the location -- we need to take 

420 # into account absolute URIs in the datastore record 

421 locations: List[Tuple[Location, StoredFileInfo]] = [] 

422 for r in records: 

423 uriInStore = ButlerURI(r.path, forceAbsolute=False) 

424 if uriInStore.isabs(): 

425 location = Location(None, uriInStore) 

426 else: 

427 location = self.locationFactory.fromPath(r.path) 

428 locations.append((location, r)) 

429 return locations 

430 

431 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

432 """Check that there is only one dataset associated with the 

433 specified artifact. 

434 

435 Parameters 

436 ---------- 

437 ref : `DatasetRef` or `FakeDatasetRef` 

438 Dataset to be removed. 

439 location : `Location` 

440 The location of the artifact to be removed. 

441 

442 Returns 

443 ------- 

444 can_remove : `bool` 

445 `True` if the artifact can be safely removed. 

446 """ 

447 # Can't ever delete absolute URIs. 

448 if location.pathInStore.isabs(): 

449 return False 

450 

451 # Get all entries associated with this path 

452 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

453 if not allRefs: 

454 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

455 

456 # Remove these refs from all the refs and if there is nothing left 

457 # then we can delete 

458 remainingRefs = allRefs - {ref.id} 

459 

460 if remainingRefs: 

461 return False 

462 return True 
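# For example, if two refs were ingested against the same artifact, removing
# the first still leaves an entry in the records table, so this returns False;
# only when the ref being removed is the last one referencing the path does it
# return True and allow the file to be deleted.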

463 

464 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, 

465 StoredFileInfo]]: 

466 """Predict the location and related file information of the requested 

467 dataset in this datastore. 

468 

469 Parameters 

470 ---------- 

471 ref : `DatasetRef` 

472 Reference to the required `Dataset`. 

473 

474 Returns 

475 ------- 

476 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

477 Expected Location of the dataset within the datastore and 

478 placeholder information about each file and its formatter. 

479 

480 Notes 

481 ----- 

482 Uses the current configuration to determine how we would expect the 

483 datastore files to have been written if we couldn't ask registry. 

484 This is safe so long as there has been no change to datastore 

485 configuration between writing the dataset and wanting to read it. 

486 Will not work for files that have been ingested without using the 

487 standard file template or default formatter. 

488 """ 

489 

490 # If we have a component ref we always need to ask the questions 

491 # of the composite. If the composite is disassembled this routine 

492 # should return all components. If the composite was not 

493 # disassembled the composite is what is stored regardless of 

494 # component request. Note that if the caller has disassembled 

495 # a composite there is no way for this guess to know that 

496 # without trying both the composite and component ref and seeing 

497 # if there is something at the component Location even without 

498 # disassembly being enabled. 

499 if ref.datasetType.isComponent(): 

500 ref = ref.makeCompositeRef() 

501 

502 # See if the ref is a composite that should be disassembled 

503 doDisassembly = self.composites.shouldBeDisassembled(ref) 

504 

505 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

506 

507 if doDisassembly: 

508 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

509 compRef = ref.makeComponentRef(component) 

510 location, formatter = self._determine_put_formatter_location(compRef) 

511 all_info.append((location, formatter, componentStorage, component)) 

512 

513 else: 

514 # Always use the composite ref if no disassembly 

515 location, formatter = self._determine_put_formatter_location(ref) 

516 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

517 

518 # Convert the list of tuples to have StoredFileInfo as second element 

519 return [(location, StoredFileInfo(formatter=formatter, 

520 path=location.pathInStore.path, 

521 storageClass=storageClass, 

522 component=component, 

523 checksum=None, 

524 file_size=-1)) 

525 for location, formatter, storageClass, component in all_info] 

526 

527 def _prepare_for_get(self, ref: DatasetRef, 

528 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

529 """Check parameters for ``get`` and obtain formatter and 

530 location. 

531 

532 Parameters 

533 ---------- 

534 ref : `DatasetRef` 

535 Reference to the required Dataset. 

536 parameters : `dict` 

537 `StorageClass`-specific parameters that specify, for example, 

538 a slice of the dataset to be loaded. 

539 

540 Returns 

541 ------- 

542 getInfo : `list` [`DatastoreFileGetInformation`] 

543 Parameters needed to retrieve each file. 

544 """ 

545 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

546 

547 # Get file metadata and internal metadata 

548 fileLocations = self._get_dataset_locations_info(ref) 

549 if not fileLocations: 

550 if not self.trustGetRequest: 

551 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

552 # Assume the dataset is where we think it should be 

553 fileLocations = self._get_expected_dataset_locations_info(ref) 

554 

555 # The storage class we want to use eventually 

556 refStorageClass = ref.datasetType.storageClass 

557 

558 if len(fileLocations) > 1: 

559 disassembled = True 

560 else: 

561 disassembled = False 

562 

563 # Is this a component request? 

564 refComponent = ref.datasetType.component() 

565 

566 fileGetInfo = [] 

567 for location, storedFileInfo in fileLocations: 

568 

569 # The storage class used to write the file 

570 writeStorageClass = storedFileInfo.storageClass 

571 

572 # If this has been disassembled we need read to match the write 

573 if disassembled: 

574 readStorageClass = writeStorageClass 

575 else: 

576 readStorageClass = refStorageClass 

577 

578 formatter = getInstanceOf(storedFileInfo.formatter, 

579 FileDescriptor(location, readStorageClass=readStorageClass, 

580 storageClass=writeStorageClass, parameters=parameters), 

581 ref.dataId) 

582 

583 formatterParams, notFormatterParams = formatter.segregateParameters() 

584 

585 # Of the remaining parameters, extract the ones supported by 

586 # this StorageClass (for components not all will be handled) 

587 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

588 

589 # The ref itself could be a component if the dataset was 

590 # disassembled by butler, or we disassembled in datastore and 

591 # components came from the datastore records 

592 component = storedFileInfo.component if storedFileInfo.component else refComponent 

593 

594 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

595 assemblerParams, formatterParams, 

596 component, readStorageClass)) 

597 

598 return fileGetInfo 

599 

600 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

601 """Check the arguments for ``put`` and obtain formatter and 

602 location. 

603 

604 Parameters 

605 ---------- 

606 inMemoryDataset : `object` 

607 The dataset to store. 

608 ref : `DatasetRef` 

609 Reference to the associated Dataset. 

610 

611 Returns 

612 ------- 

613 location : `Location` 

614 The location to write the dataset. 

615 formatter : `Formatter` 

616 The `Formatter` to use to write the dataset. 

617 

618 Raises 

619 ------ 

620 TypeError 

621 Supplied object and storage class are inconsistent. 

622 DatasetTypeNotSupportedError 

623 The associated `DatasetType` is not handled by this datastore. 

624 """ 

625 self._validate_put_parameters(inMemoryDataset, ref) 

626 return self._determine_put_formatter_location(ref) 

627 

628 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

629 """Calculate the formatter and output location to use for put. 

630 

631 Parameters 

632 ---------- 

633 ref : `DatasetRef` 

634 Reference to the associated Dataset. 

635 

636 Returns 

637 ------- 

638 location : `Location` 

639 The location to write the dataset. 

640 formatter : `Formatter` 

641 The `Formatter` to use to write the dataset. 

642 """ 

643 # Work out output file name 

644 try: 

645 template = self.templates.getTemplate(ref) 

646 except KeyError as e: 

647 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

648 

649 # Validate the template to protect against filenames from different 

650 # dataIds returning the same and causing overwrite confusion. 

651 template.validateTemplate(ref) 

652 

653 location = self.locationFactory.fromPath(template.format(ref)) 

654 

655 # Get the formatter based on the storage class 

656 storageClass = ref.datasetType.storageClass 

657 try: 

658 formatter = self.formatterFactory.getFormatter(ref, 

659 FileDescriptor(location, 

660 storageClass=storageClass), 

661 ref.dataId) 

662 except KeyError as e: 

663 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

664 f"{self.name}") from e 

665 

666 # Now that we know the formatter, update the location 

667 location = formatter.makeUpdatedLocation(location) 

668 

669 return location, formatter 

670 

671 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

672 # Docstring inherited from base class 

673 if transfer != "auto": 

674 return transfer 

675 

676 # See if the paths are within the datastore or not 

677 inside = [self._pathInStore(d.path) is not None for d in datasets] 

678 

679 if all(inside): 

680 transfer = None 

681 elif not any(inside): 

682 # Allow ButlerURI to use its own knowledge 

683 transfer = "auto" 

684 else: 

685 raise ValueError("Some datasets are inside the datastore and some are outside." 

686 " Please use an explicit transfer mode and not 'auto'.") 

687 

688 return transfer 
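# In other words: if every FileDataset path already lives under self.root the
# "auto" mode collapses to an in-place ingest (transfer=None); if every path is
# external, "auto" is kept and ButlerURI chooses a concrete mechanism; a
# mixture of the two is rejected above.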

689 

690 def _pathInStore(self, path: str) -> Optional[str]: 

691 """Return path relative to datastore root 

692 

693 Parameters 

694 ---------- 

695 path : `str` 

696 Path to dataset. Can be absolute. If relative, it is assumed to 

697 be relative to the datastore root. The path within the datastore 

698 is returned, or `None` if the path is outside it. 

699 

700 Returns 

701 ------- 

702 inStore : `str` 

703 Path relative to datastore root. Returns `None` if the file is 

704 outside the root. 

705 """ 

706 # Relative path will always be relative to datastore 

707 pathUri = ButlerURI(path, forceAbsolute=False) 

708 return pathUri.relative_to(self.root) 
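# As a rough sketch (paths hypothetical): with the datastore rooted at
# file:///repo/, a relative path such as "a/b.fits" resolves to a path inside
# the store, while "/elsewhere/b.fits" lies outside the root and yields `None`
# from ButlerURI.relative_to().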

709 

710 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str: 

711 """Standardize the path of a to-be-ingested file. 

712 

713 Parameters 

714 ---------- 

715 path : `str` 

716 Path of a file to be ingested. 

717 transfer : `str`, optional 

718 How (and whether) the dataset should be added to the datastore. 

719 See `ingest` for details of transfer modes. 

720 This implementation is provided only so 

721 `NotImplementedError` can be raised if the mode is not supported; 

722 actual transfers are deferred to `_extractIngestInfo`. 

723 

724 Returns 

725 ------- 

726 path : `str` 

727 New path in what the datastore considers standard form. 

728 

729 Notes 

730 ----- 

731 Subclasses of `FileDatastore` can implement this method instead 

732 of `_prepIngest`. It should not modify the data repository or given 

733 file in any way. 

734 

735 Raises 

736 ------ 

737 NotImplementedError 

738 Raised if the datastore does not support the given transfer mode 

739 (including the case where ingest is not supported at all). 

740 FileNotFoundError 

741 Raised if one of the given files does not exist. 

742 """ 

743 if transfer not in (None, "direct") + self.root.transferModes: 

744 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

745 

746 # A relative URI indicates relative to datastore root 

747 srcUri = ButlerURI(path, forceAbsolute=False) 

748 if not srcUri.isabs(): 

749 srcUri = self.root.join(path) 

750 

751 if not srcUri.exists(): 

752 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

753 f"are assumed to be relative to {self.root} unless they are absolute.") 

754 

755 if transfer is None: 

756 relpath = srcUri.relative_to(self.root) 

757 if not relpath: 

758 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

759 f"within datastore ({self.root})") 

760 

761 # Return the relative path within the datastore for internal 

762 # transfer 

763 path = relpath 

764 

765 return path 
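# The net effect is that transfer=None only accepts files already inside
# self.root (given either relative to the root or as an absolute path under
# it), while copy/move/link style transfers merely require the source to exist
# here and defer the actual relocation to _extractIngestInfo().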

766 

767 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

768 formatter: Union[Formatter, Type[Formatter]], 

769 transfer: Optional[str] = None) -> StoredFileInfo: 

770 """Relocate (if necessary) and extract `StoredFileInfo` from a 

771 to-be-ingested file. 

772 

773 Parameters 

774 ---------- 

775 path : `str` or `ButlerURI` 

776 URI or path of a file to be ingested. 

777 ref : `DatasetRef` 

778 Reference for the dataset being ingested. Guaranteed to have 

779 a ``dataset_id`` that is not `None`. 

780 formatter : `type` or `Formatter` 

781 `Formatter` subclass to use for this dataset or an instance. 

782 transfer : `str`, optional 

783 How (and whether) the dataset should be added to the datastore. 

784 See `ingest` for details of transfer modes. 

785 

786 Returns 

787 ------- 

788 info : `StoredFileInfo` 

789 Internal datastore record for this file. This will be inserted by 

790 the caller; `_extractIngestInfo` is only responsible for 

791 creating and populating the struct. 

792 

793 Raises 

794 ------ 

795 FileNotFoundError 

796 Raised if one of the given files does not exist. 

797 FileExistsError 

798 Raised if transfer is not `None` but the (internal) location the 

799 file would be moved to is already occupied. 

800 """ 

801 if self._transaction is None: 

802 raise RuntimeError("Ingest called without transaction enabled") 

803 

804 # Create URI of the source path, do not need to force a relative 

805 # path to absolute. 

806 srcUri = ButlerURI(path, forceAbsolute=False) 

807 

808 # Track whether we have read the size of the source yet 

809 have_sized = False 

810 

811 tgtLocation: Optional[Location] 

812 if transfer is None: 

813 # A relative path is assumed to be relative to the datastore 

814 # in this context 

815 if not srcUri.isabs(): 

816 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

817 else: 

818 # Work out the path in the datastore from an absolute URI 

819 # This is required to be within the datastore. 

820 pathInStore = srcUri.relative_to(self.root) 

821 if pathInStore is None: 

822 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

823 f"not within datastore {self.root}") 

824 tgtLocation = self.locationFactory.fromPath(pathInStore) 

825 elif transfer == "direct": 825 ↛ 830line 825 didn't jump to line 830, because the condition on line 825 was never true

826 # Want to store the full URI to the resource directly in 

827 # datastore. This is useful for referring to permanent archive 

828 # storage for raw data. 

829 # Trust that people know what they are doing. 

830 tgtLocation = None 

831 else: 

832 # Work out the name we want this ingested file to have 

833 # inside the datastore 

834 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

835 if not tgtLocation.uri.dirname().exists(): 

836 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

837 tgtLocation.uri.dirname().mkdir() 

838 

839 # if we are transferring from a local file to a remote location 

840 # it may be more efficient to get the size and checksum of the 

841 # local file rather than the transferred one 

842 if not srcUri.scheme or srcUri.scheme == "file": 

843 size = srcUri.size() 

844 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

845 have_sized = True 

846 

847 # transfer the resource to the destination 

848 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

849 

850 if tgtLocation is None: 

851 # This means we are using direct mode 

852 targetUri = srcUri 

853 targetPath = str(srcUri) 

854 else: 

855 targetUri = tgtLocation.uri 

856 targetPath = tgtLocation.pathInStore.path 

857 

858 # the file should exist in the datastore now 

859 if not have_sized: 

860 size = targetUri.size() 

861 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

862 

863 return StoredFileInfo(formatter=formatter, path=targetPath, 

864 storageClass=ref.datasetType.storageClass, 

865 component=ref.datasetType.component(), 

866 file_size=size, checksum=checksum) 

867 

868 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

869 # Docstring inherited from Datastore._prepIngest. 

870 filtered = [] 

871 for dataset in datasets: 

872 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

873 if not acceptable: 

874 continue 

875 else: 

876 dataset.refs = acceptable 

877 if dataset.formatter is None: 

878 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

879 else: 

880 assert isinstance(dataset.formatter, (type, str)) 

881 dataset.formatter = getClassOf(dataset.formatter) 

882 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

883 filtered.append(dataset) 

884 return _IngestPrepData(filtered) 

885 

886 @transactional 

887 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

888 # Docstring inherited from Datastore._finishIngest. 

889 refsAndInfos = [] 

890 for dataset in prepData.datasets: 

891 # Do ingest as if the first dataset ref is associated with the file 

892 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

893 transfer=transfer) 

894 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

895 self._register_datasets(refsAndInfos) 

896 

897 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

898 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

899 """Given a source URI and a DatasetRef, determine the name the 

900 dataset will have inside datastore. 

901 

902 Parameters 

903 ---------- 

904 srcUri : `ButlerURI` 

905 URI to the source dataset file. 

906 ref : `DatasetRef` 

907 Ref associated with the newly-ingested dataset artifact. This 

908 is used to determine the name within the datastore. 

909 formatter : `Formatter` or Formatter class. 

910 Formatter to use for validation. Can be a class or an instance. 

911 

912 Returns 

913 ------- 

914 location : `Location` 

915 Target location for the newly-ingested dataset. 

916 """ 

917 # Ingesting a file from outside the datastore. 

918 # This involves a new name. 

919 template = self.templates.getTemplate(ref) 

920 location = self.locationFactory.fromPath(template.format(ref)) 

921 

922 # Get the extension 

923 ext = srcUri.getExtension() 

924 

925 # Update the destination to include that extension 

926 location.updateExtension(ext) 

927 

928 # Ask the formatter to validate this extension 

929 formatter.validateExtension(location) 

930 

931 return location 

932 

933 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

934 """Write out in memory dataset to datastore. 

935 

936 Parameters 

937 ---------- 

938 inMemoryDataset : `object` 

939 Dataset to write to datastore. 

940 ref : `DatasetRef` 

941 Registry information associated with this dataset. 

942 

943 Returns 

944 ------- 

945 info : `StoredFileInfo` 

946 Information describing the artifact written to the datastore. 

947 """ 

948 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

949 uri = location.uri 

950 

951 if not uri.dirname().exists(): 

952 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

953 uri.dirname().mkdir() 

954 

955 if self._transaction is None: 

956 raise RuntimeError("Attempting to write artifact without transaction enabled") 

957 

958 def _removeFileExists(uri: ButlerURI) -> None: 

959 """Remove a file and do not complain if it is not there. 

960 

961 This is important since a formatter might fail before the file 

962 is written and we should not confuse people by writing spurious 

963 error messages to the log. 

964 """ 

965 try: 

966 uri.remove() 

967 except FileNotFoundError: 

968 pass 

969 

970 # Register a callback to try to delete the uploaded data if 

971 # something fails below 

972 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

973 

974 # For a local file, simply use the formatter directly 

975 if uri.isLocal: 

976 formatter.write(inMemoryDataset) 

977 log.debug("Successfully wrote python object to local file at %s", uri) 

978 else: 

979 # This is a remote URI, so first try bytes and write directly else 

980 # fallback to a temporary file 

981 try: 

982 serializedDataset = formatter.toBytes(inMemoryDataset) 

983 log.debug("Writing bytes directly to %s", uri) 

984 uri.write(serializedDataset, overwrite=True) 

985 log.debug("Successfully wrote bytes directly to %s", uri) 

986 except NotImplementedError: 

987 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile: 

988 # Need to configure the formatter to write to a different 

989 # location and that needs us to overwrite internals 

990 tmpLocation = Location(*os.path.split(tmpFile.name)) 

991 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri) 

992 with formatter._updateLocation(tmpLocation): 

993 formatter.write(inMemoryDataset) 

994 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True) 

995 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

996 

997 # URI is needed to resolve which ingest case we are dealing with 

998 return self._extractIngestInfo(uri, ref, formatter=formatter) 
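# Reusing _extractIngestInfo() here (with its default transfer of None, since
# the artifact is already at its final location) keeps the size and checksum
# bookkeeping for put() and ingest() in one place.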

999 

1000 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

1001 ref: DatasetRef, isComponent: bool = False) -> Any: 

1002 """Read the artifact from datastore into in memory object. 

1003 

1004 Parameters 

1005 ---------- 

1006 getInfo : `DatastoreFileGetInformation` 

1007 Information about the artifact within the datastore. 

1008 ref : `DatasetRef` 

1009 The registry information associated with this artifact. 

1010 isComponent : `bool` 

1011 Flag to indicate if a component is being read from this artifact. 

1012 

1013 Returns 

1014 ------- 

1015 inMemoryDataset : `object` 

1016 The artifact as a python object. 

1017 """ 

1018 location = getInfo.location 

1019 uri = location.uri 

1020 log.debug("Accessing data from %s", uri) 

1021 

1022 # Cannot recalculate checksum but can compare size as a quick check 

1023 # Do not do this if the size is negative since that indicates 

1024 # we do not know. 

1025 recorded_size = getInfo.info.file_size 

1026 resource_size = uri.size() 

1027 if recorded_size >= 0 and resource_size != recorded_size: 

1028 raise RuntimeError("Integrity failure in Datastore. " 

1029 f"Size of file {uri} ({resource_size}) " 

1030 f"does not match size recorded in registry of {recorded_size}") 

1031 

1032 # For the general case we have choices for how to proceed. 

1033 # 1. Always use a local file (downloading the remote resource to a 

1034 # temporary file if needed). 

1035 # 2. Use a threshold size and read into memory and use bytes. 

1036 # Use both for now with an arbitrary hand off size. 

1037 # This allows small datasets to be downloaded from remote object 

1038 # stores without requiring a temporary file. 

1039 

1040 formatter = getInfo.formatter 

1041 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1042 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1043 serializedDataset = uri.read() 

1044 log.debug("Deserializing %s from %d bytes from location %s with formatter %s", 

1045 f"component {getInfo.component}" if isComponent else "", 

1046 len(serializedDataset), uri, formatter.name()) 

1047 try: 

1048 result = formatter.fromBytes(serializedDataset, 

1049 component=getInfo.component if isComponent else None) 

1050 except Exception as e: 

1051 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1052 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1053 else: 

1054 # Read from file 

1055 with uri.as_local() as local_uri: 

1056 # Have to update the Location associated with the formatter 

1057 # because formatter.read does not allow an override. 

1058 # This could be improved. 

1059 msg = "" 

1060 newLocation = None 

1061 if uri != local_uri: 

1062 newLocation = Location(*local_uri.split()) 

1063 msg = "(via download to local file)" 

1064 

1065 log.debug("Reading %s from location %s %s with formatter %s", 

1066 f"component {getInfo.component}" if isComponent else "", 

1067 uri, msg, formatter.name()) 

1068 try: 

1069 with formatter._updateLocation(newLocation): 

1070 result = formatter.read(component=getInfo.component if isComponent else None) 

1071 except Exception as e: 

1072 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1073 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1074 

1075 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

1076 isComponent=isComponent) 

1077 

1078 def exists(self, ref: DatasetRef) -> bool: 

1079 """Check if the dataset exists in the datastore. 

1080 

1081 Parameters 

1082 ---------- 

1083 ref : `DatasetRef` 

1084 Reference to the required dataset. 

1085 

1086 Returns 

1087 ------- 

1088 exists : `bool` 

1089 `True` if the entity exists in the `Datastore`. 

1090 """ 

1091 fileLocations = self._get_dataset_locations_info(ref) 

1092 

1093 # if we are being asked to trust that registry might not be correct 

1094 # we ask for the expected locations and check them explicitly 

1095 if not fileLocations: 

1096 if not self.trustGetRequest: 

1097 return False 

1098 fileLocations = self._get_expected_dataset_locations_info(ref) 

1099 for location, _ in fileLocations: 

1100 if not self._artifact_exists(location): 

1101 return False 

1102 

1103 return True 
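# With trustGetRequest enabled this check degrades gracefully: every predicted
# artifact location must exist for the dataset to be reported present, which
# mirrors the fallback used by _prepare_for_get() and getURIs().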

1104 

1105 def getURIs(self, ref: DatasetRef, 

1106 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1107 """Return URIs associated with dataset. 

1108 

1109 Parameters 

1110 ---------- 

1111 ref : `DatasetRef` 

1112 Reference to the required dataset. 

1113 predict : `bool`, optional 

1114 If the datastore does not know about the dataset, should it 

1115 return a predicted URI or not? 

1116 

1117 Returns 

1118 ------- 

1119 primary : `ButlerURI` 

1120 The URI to the primary artifact associated with this dataset. 

1121 If the dataset was disassembled within the datastore this 

1122 may be `None`. 

1123 components : `dict` 

1124 URIs to any components associated with the dataset artifact. 

1125 Can be empty if there are no components. 

1126 """ 

1127 

1128 primary: Optional[ButlerURI] = None 

1129 components: Dict[str, ButlerURI] = {} 

1130 

1131 # if this has never been written then we have to guess 

1132 if not self.exists(ref): 

1133 if not predict: 

1134 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1135 

1136 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1137 

1138 if doDisassembly: 

1139 

1140 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1141 compRef = ref.makeComponentRef(component) 

1142 compLocation, _ = self._determine_put_formatter_location(compRef) 

1143 

1144 # Add a URI fragment to indicate this is a guess 

1145 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1146 

1147 else: 

1148 

1149 location, _ = self._determine_put_formatter_location(ref) 

1150 

1151 # Add a URI fragment to indicate this is a guess 

1152 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1153 

1154 return primary, components 

1155 

1156 # If this is a ref that we have written we can get the path. 

1157 # Get file metadata and internal metadata 

1158 fileLocations = self._get_dataset_locations_info(ref) 

1159 

1160 guessing = False 

1161 if not fileLocations: 

1162 if not self.trustGetRequest: 

1163 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1164 fileLocations = self._get_expected_dataset_locations_info(ref) 

1165 guessing = True 

1166 

1167 if len(fileLocations) == 1: 

1168 # No disassembly so this is the primary URI 

1169 uri = fileLocations[0][0].uri 

1170 if guessing and not uri.exists(): 

1171 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1172 primary = uri 

1173 

1174 else: 

1175 for location, storedFileInfo in fileLocations: 

1176 if storedFileInfo.component is None: 

1177 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1178 uri = location.uri 

1179 if guessing and not uri.exists(): 

1180 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1181 components[storedFileInfo.component] = uri 

1182 

1183 return primary, components 

1184 

1185 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1186 """URI to the Dataset. 

1187 

1188 Parameters 

1189 ---------- 

1190 ref : `DatasetRef` 

1191 Reference to the required Dataset. 

1192 predict : `bool` 

1193 If `True`, allow URIs to be returned of datasets that have not 

1194 been written. 

1195 

1196 Returns 

1197 ------- 

1198 uri : `ButlerURI` 

1199 URI pointing to the dataset within the datastore. If the 

1200 dataset does not exist in the datastore, and if ``predict`` is 

1201 `True`, the URI will be a prediction and will include a URI 

1202 fragment "#predicted". 

1203 If the datastore does not have entities that relate well 

1204 to the concept of a URI the returned URI will be 

1205 descriptive. The returned URI is not guaranteed to be obtainable. 

1206 

1207 Raises 

1208 ------ 

1209 FileNotFoundError 

1210 Raised if a URI has been requested for a dataset that does not 

1211 exist and guessing is not allowed. 

1212 RuntimeError 

1213 Raised if a request is made for a single URI but multiple URIs 

1214 are associated with this dataset. 

1215 

1216 Notes 

1217 ----- 

1218 When a predicted URI is requested an attempt will be made to form 

1219 a reasonable URI based on file templates and the expected formatter. 

1220 """ 

1221 primary, components = self.getURIs(ref, predict) 

1222 if primary is None or components: 

1223 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

1224 "Use Dataastore.getURIs() instead.") 

1225 return primary 
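# A usage sketch (names hypothetical): for a dataset that has not been written,
#   uri = datastore.getURI(ref, predict=True)
# returns a ButlerURI built from the file template and carrying a "#predicted"
# fragment, while predict=False raises FileNotFoundError via getURIs().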

1226 

1227 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1228 """Load an InMemoryDataset from the store. 

1229 

1230 Parameters 

1231 ---------- 

1232 ref : `DatasetRef` 

1233 Reference to the required Dataset. 

1234 parameters : `dict` 

1235 `StorageClass`-specific parameters that specify, for example, 

1236 a slice of the dataset to be loaded. 

1237 

1238 Returns 

1239 ------- 

1240 inMemoryDataset : `object` 

1241 Requested dataset or slice thereof as an InMemoryDataset. 

1242 

1243 Raises 

1244 ------ 

1245 FileNotFoundError 

1246 Requested dataset can not be retrieved. 

1247 TypeError 

1248 Return value from formatter has unexpected type. 

1249 ValueError 

1250 Formatter failed to process the dataset. 

1251 """ 

1252 allGetInfo = self._prepare_for_get(ref, parameters) 

1253 refComponent = ref.datasetType.component() 

1254 

1255 # Supplied storage class for the component being read 

1256 refStorageClass = ref.datasetType.storageClass 

1257 

1258 # Create mapping from component name to related info 

1259 allComponents = {i.component: i for i in allGetInfo} 

1260 

1261 # By definition the dataset is disassembled if we have more 

1262 # than one record for it. 

1263 isDisassembled = len(allGetInfo) > 1 

1264 

1265 # Look for the special case where we are disassembled but the 

1266 # component is a derived component that was not written during 

1267 # disassembly. For this scenario we need to check that the 

1268 # component requested is listed as a derived component for the 

1269 # composite storage class 

1270 isDisassembledReadOnlyComponent = False 

1271 if isDisassembled and refComponent: 

1272 # The composite storage class should be accessible through 

1273 # the component dataset type 

1274 compositeStorageClass = ref.datasetType.parentStorageClass 

1275 

1276 # In the unlikely scenario where the composite storage 

1277 # class is not known, we can only assume that this is a 

1278 # normal component. If that assumption is wrong then the 

1279 # branch below that reads a persisted component will fail 

1280 # so there is no need to complain here. 

1281 if compositeStorageClass is not None: 

1282 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1283 

1284 if isDisassembled and not refComponent: 

1285 # This was a disassembled dataset spread over multiple files 

1286 # and we need to put them all back together again. 

1287 # Read into memory and then assemble 

1288 

1289 # Check that the supplied parameters are suitable for the type read 

1290 refStorageClass.validateParameters(parameters) 

1291 

1292 # We want to keep track of all the parameters that were not used 

1293 # by formatters. We assume that if any of the component formatters 

1294 # use a parameter that we do not need to apply it again in the 

1295 # assembler. 

1296 usedParams = set() 

1297 

1298 components: Dict[str, Any] = {} 

1299 for getInfo in allGetInfo: 

1300 # assemblerParams are parameters not understood by the 

1301 # associated formatter. 

1302 usedParams.update(set(getInfo.formatterParams)) 

1303 

1304 component = getInfo.component 

1305 

1306 if component is None: 

1307 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1308 

1309 # We do not want the formatter to think it's reading 

1310 # a component though because it is really reading a 

1311 # standalone dataset -- always tell reader it is not a 

1312 # component. 

1313 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1314 

1315 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1316 

1317 # Any unused parameters will have to be passed to the assembler 

1318 if parameters: 

1319 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1320 else: 

1321 unusedParams = {} 

1322 

1323 # Process parameters 

1324 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1325 parameters=unusedParams) 

1326 

1327 elif isDisassembledReadOnlyComponent: 

1328 

1329 compositeStorageClass = ref.datasetType.parentStorageClass 

1330 if compositeStorageClass is None: 

1331 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since" 

1332 "no composite storage class is available.") 

1333 

1334 if refComponent is None: 

1335 # Mainly for mypy 

1336 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1337 

1338 # Assume that every derived component can be calculated by 

1339 # forwarding the request to a single read/write component. 

1340 # Rather than guessing which rw component is the right one by 

1341 # scanning each for a derived component of the same name, 

1342 # we ask the storage class delegate directly which one is best to 

1343 # use. 

1344 compositeDelegate = compositeStorageClass.delegate() 

1345 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1346 set(allComponents)) 

1347 

1348 # Select the relevant component 

1349 rwInfo = allComponents[forwardedComponent] 

1350 

1351 # For now assume that read parameters are validated against 

1352 # the real component and not the requested component 

1353 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1354 forwardedStorageClass.validateParameters(parameters) 

1355 

1356 # Unfortunately the FileDescriptor inside the formatter will have 

1357 # the wrong write storage class so we need to create a new one 

1358 # given the immutability constraint. 

1359 writeStorageClass = rwInfo.info.storageClass 

1360 

1361 # We may need to put some thought into parameters for read 

1362 # components but for now forward them on as is 

1363 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1364 readStorageClass=refStorageClass, 

1365 storageClass=writeStorageClass, 

1366 parameters=parameters), 

1367 ref.dataId) 

1368 

1369 # The assembler can not receive any parameter requests for a 

1370 # derived component at this time since the assembler will 

1371 # see the storage class of the derived component and those 

1372 # parameters will have to be handled by the formatter on the 

1373 # forwarded storage class. 

1374 assemblerParams: Dict[str, Any] = {} 

1375 

1376 # Need to create a new info that specifies the derived 

1377 # component and associated storage class 

1378 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1379 rwInfo.info, assemblerParams, {}, 

1380 refComponent, refStorageClass) 

1381 

1382 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1383 

1384 else: 

1385 # Single file request or component from that composite file 

1386 for lookup in (refComponent, None): 

1387 if lookup in allComponents: 

1388 getInfo = allComponents[lookup] 

1389 break 

1390 else: 

1391 raise FileNotFoundError(f"Component {refComponent} not found " 

1392 f"for ref {ref} in datastore {self.name}") 

1393 

1394 # Do not need the component itself if already disassembled 

1395 if isDisassembled: 

1396 isComponent = False 

1397 else: 

1398 isComponent = getInfo.component is not None 

1399 

1400 # For a disassembled component we can validate parameters against 

1401 # the component storage class directly 

1402 if isDisassembled: 

1403 refStorageClass.validateParameters(parameters) 

1404 else: 

1405 # For an assembled composite this could be a derived 

1406 # component derived from a real component. The validity 

1407 # of the parameters is not clear. For now validate against 

1408 # the composite storage class 

1409 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1410 

1411 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 

1412 

1413 @transactional 

1414 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1415 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1416 

1417 Parameters 

1418 ---------- 

1419 inMemoryDataset : `object` 

1420 The dataset to store. 

1421 ref : `DatasetRef` 

1422 Reference to the associated Dataset. 

1423 

1424 Raises 

1425 ------ 

1426 TypeError 

1427 Supplied object and storage class are inconsistent. 

1428 DatasetTypeNotSupportedError 

1429 The associated `DatasetType` is not handled by this datastore. 

1430 

1431 Notes 

1432 ----- 

1433 If the datastore is configured to reject certain dataset types it 

1434 is possible that the put will fail and raise a 

1435 `DatasetTypeNotSupportedError`. The main use case for this is to 

1436 allow `ChainedDatastore` to put to multiple datastores without 

1437 requiring that every datastore accepts the dataset. 

1438 """ 

1439 

1440 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1441 # doDisassembly = True 

1442 

1443 artifacts = [] 

1444 if doDisassembly: 

1445 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1446 for component, componentInfo in components.items(): 

1447 # Don't recurse because we want to take advantage of 

1448 # bulk insert -- need a new DatasetRef that refers to the 

1449 # same dataset_id but has the component DatasetType.

1450 # DatasetType does not refer to the types of components,

1451 # so we construct one ourselves.

1452 compRef = ref.makeComponentRef(component) 

1453 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1454 artifacts.append((compRef, storedInfo)) 

1455 else: 

1456 # Write the entire thing out 

1457 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1458 artifacts.append((ref, storedInfo)) 

1459 

1460 self._register_datasets(artifacts) 

1461 
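# A minimal usage sketch (``datastore``, ``exposure`` and ``ref`` are
# illustrative names, not defined in this module):
#
#     datastore.put(exposure, ref)
#
# When ``self.composites.shouldBeDisassembled(ref)`` is true, each component
# returned by the storage class delegate's ``disassemble()`` is written as its
# own artifact and registered against ``ref.makeComponentRef(component)``;
# otherwise a single artifact is written for ``ref`` itself.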

1462 @transactional 

1463 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None: 

1464 """Indicate to the datastore that a dataset can be removed. 

1465 

1466 Parameters 

1467 ---------- 

1468 ref : `DatasetRef` 

1469 Reference to the required Dataset. 

1470 ignore_errors : `bool` 

1471 If `True` return without error even if something went wrong. 

1472 Problems could occur if another process is simultaneously trying 

1473 to delete. 

1474 

1475 Raises 

1476 ------ 

1477 FileNotFoundError 

1478 Attempt to remove a dataset that does not exist. 

1479 """ 

1480 # Get file metadata and internal metadata 

1481 log.debug("Trashing %s in datastore %s", ref, self.name) 

1482 

1483 fileLocations = self._get_dataset_locations_info(ref) 

1484 

1485 if not fileLocations: 

1486 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1487 if ignore_errors: 

1488 log.warning(err_msg) 

1489 return 

1490 else: 

1491 raise FileNotFoundError(err_msg) 

1492 

1493 for location, storedFileInfo in fileLocations: 

1494 if not self._artifact_exists(location): 1494 ↛ 1495line 1494 didn't jump to line 1495, because the condition on line 1494 was never true

1495 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1496 f"associated artifact ({location.uri}) is missing" 

1497 if ignore_errors: 

1498 log.warning(err_msg) 

1499 return 

1500 else: 

1501 raise FileNotFoundError(err_msg) 

1502 

1503 # Mark dataset as trashed 

1504 try: 

1505 self._move_to_trash_in_registry(ref) 

1506 except Exception as e: 

1507 if ignore_errors: 

1508 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1509 f"but encountered an error: {e}") 

1510 pass 

1511 else: 

1512 raise 

1513 

1514 @transactional 

1515 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1516 """Remove all datasets from the trash. 

1517 

1518 Parameters 

1519 ---------- 

1520 ignore_errors : `bool` 

1521 If `True` return without error even if something went wrong. 

1522 Problems could occur if another process is simultaneously trying 

1523 to delete. 

1524 """ 

1525 log.debug("Emptying trash in datastore %s", self.name) 

1526 # Context manager will empty trash iff we finish it without raising. 

1527 with self.bridge.emptyTrash() as trashed: 

1528 for ref in trashed: 

1529 fileLocations = self._get_dataset_locations_info(ref) 

1530 

1531 if not fileLocations: 1531 ↛ 1532line 1531 didn't jump to line 1532, because the condition on line 1531 was never true

1532 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}" 

1533 if ignore_errors: 

1534 log.warning(err_msg) 

1535 continue 

1536 else: 

1537 raise FileNotFoundError(err_msg) 

1538 

1539 for location, _ in fileLocations: 

1540 

1541 if not self._artifact_exists(location): 1541 ↛ 1542line 1541 didn't jump to line 1542, because the condition on line 1541 was never true

1542 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}" 

1543 if ignore_errors: 

1544 log.warning(err_msg) 

1545 continue 

1546 else: 

1547 raise FileNotFoundError(err_msg) 

1548 

1549 # Can only delete the artifact if there are no references 

1550 # to the file from untrashed dataset refs. 

1551 if self._can_remove_dataset_artifact(ref, location): 

1552 # Point of no return for this artifact 

1553 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1554 try: 

1555 self._delete_artifact(location) 

1556 except Exception as e: 

1557 if ignore_errors: 

1558 log.critical("Encountered error removing artifact %s from datastore %s: %s", 

1559 location.uri, self.name, e) 

1560 else: 

1561 raise 

1562 

1563 # Now must remove the entry from the internal registry even if 

1564 # the artifact removal failed and was ignored;

1565 # otherwise the removal check above will never be true.

1566 try: 

1567 # There may be multiple rows associated with this ref 

1568 # depending on disassembly 

1569 self.removeStoredItemInfo(ref) 

1570 except Exception as e: 

1571 if ignore_errors: 

1572 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s", 

1573 ref.id, location.uri, self.name, e) 

1574 continue 

1575 else: 

1576 raise FileNotFoundError( 

1577 f"Error removing dataset {ref.id} ({location.uri}) from internal registry " 

1578 f"of {self.name}" 

1579 ) from e 

1580 
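# A minimal deletion sketch (illustrative names): removal is two-phase.
#
#     datastore.trash(ref)        # mark the dataset as trashed in the registry
#     datastore.emptyTrash()      # delete artifacts with no untrashed refs left
#
# An artifact is only deleted once no untrashed dataset ref still points at the
# same file (the _can_remove_dataset_artifact check above).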

1581 @transactional 

1582 def forget(self, refs: Iterable[DatasetRef]) -> None: 

1583 # Docstring inherited. 

1584 refs = list(refs) 

1585 self.bridge.forget(refs) 

1586 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

1587 
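# Note that, unlike trash()/emptyTrash(), forget() only removes the datastore's
# knowledge of the refs (the bridge records and the rows in the internal
# file-info table); the artifacts themselves are left in place.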

1588 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1589 logFailures: bool = False) -> None: 

1590 """Validate some of the configuration for this datastore. 

1591 

1592 Parameters 

1593 ---------- 

1594 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1595 Entities to test against this configuration. Can be differing 

1596 types. 

1597 logFailures : `bool`, optional 

1598 If `True`, output a log message for every validation error 

1599 detected. 

1600 

1601 Raises 

1602 ------ 

1603 DatastoreValidationError 

1604 Raised if there is a validation problem with a configuration. 

1605 All the problems are reported in a single exception. 

1606 

1607 Notes 

1608 ----- 

1609 This method checks that all the supplied entities have valid file 

1610 templates and also have formatters defined. 

1611 """ 

1612 

1613 templateFailed = None 

1614 try: 

1615 self.templates.validateTemplates(entities, logFailures=logFailures) 

1616 except FileTemplateValidationError as e: 

1617 templateFailed = str(e) 

1618 

1619 formatterFailed = [] 

1620 for entity in entities: 

1621 try: 

1622 self.formatterFactory.getFormatterClass(entity) 

1623 except KeyError as e: 

1624 formatterFailed.append(str(e)) 

1625 if logFailures: 1625 ↛ 1620line 1625 didn't jump to line 1620, because the condition on line 1625 was never false

1626 log.critical("Formatter failure: %s", e) 

1627 

1628 if templateFailed or formatterFailed: 

1629 messages = [] 

1630 if templateFailed: 1630 ↛ 1631line 1630 didn't jump to line 1631, because the condition on line 1630 was never true

1631 messages.append(templateFailed) 

1632 if formatterFailed: 1632 ↛ 1634line 1632 didn't jump to line 1634, because the condition on line 1632 was never false

1633 messages.append(",".join(formatterFailed)) 

1634 msg = ";\n".join(messages) 

1635 raise DatastoreValidationError(msg) 

1636 
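# A minimal validation sketch (``datasetType`` is an illustrative name):
#
#     datastore.validateConfiguration([datasetType], logFailures=True)
#
# Entities without a usable file template or without a registered formatter are
# collected and reported together in a single DatastoreValidationError.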

1637 def getLookupKeys(self) -> Set[LookupKey]: 

1638 # Docstring is inherited from base class 

1639 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1640 self.constraints.getLookupKeys() 

1641 

1642 def validateKey(self, lookupKey: LookupKey, 

1643 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1644 # Docstring is inherited from base class 

1645 # The key can be valid in either formatters or templates so we can 

1646 # only check the template if it exists 

1647 if lookupKey in self.templates: 

1648 try: 

1649 self.templates[lookupKey].validateTemplate(entity) 

1650 except FileTemplateValidationError as e: 

1651 raise DatastoreValidationError(e) from e 

1652 

1653 def export(self, refs: Iterable[DatasetRef], *, 

1654 directory: Optional[Union[ButlerURI, str]] = None, 

1655 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

1656 # Docstring inherited from Datastore.export. 

1657 if transfer is not None and directory is None: 1657 ↛ 1658line 1657 didn't jump to line 1658, because the condition on line 1657 was never true

1658 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

1659 "export directory given") 

1660 

1661 # Force the directory to be a URI object 

1662 directoryUri: Optional[ButlerURI] = None 

1663 if directory is not None: 1663 ↛ 1666line 1663 didn't jump to line 1666, because the condition on line 1663 was never false

1664 directoryUri = ButlerURI(directory, forceDirectory=True) 

1665 

1666 if transfer is not None and directoryUri is not None: 1666 ↛ 1671line 1666 didn't jump to line 1671, because the condition on line 1666 was never false

1667 # mypy needs the second test 

1668 if not directoryUri.exists(): 1668 ↛ 1669line 1668 didn't jump to line 1669, because the condition on line 1668 was never true

1669 raise FileNotFoundError(f"Export location {directory} does not exist") 

1670 

1671 for ref in refs: 

1672 fileLocations = self._get_dataset_locations_info(ref) 

1673 if not fileLocations: 1673 ↛ 1674line 1673 didn't jump to line 1674, because the condition on line 1673 was never true

1674 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

1675 # For now we can not export disassembled datasets 

1676 if len(fileLocations) > 1: 

1677 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

1678 location, storedFileInfo = fileLocations[0] 

1679 

1680 pathInStore = location.pathInStore.path 

1681 if transfer is None: 1681 ↛ 1684line 1681 didn't jump to line 1684, because the condition on line 1681 was never true

1682 # TODO: do we also need to return the readStorageClass somehow? 

1683 # We will use the path in store directly 

1684 pass 

1685 elif transfer == "direct": 1685 ↛ 1687line 1685 didn't jump to line 1687, because the condition on line 1685 was never true

1686 # Use full URIs to the remote store in the export 

1687 pathInStore = str(location.uri) 

1688 else: 

1689 # mypy needs help 

1690 assert directoryUri is not None, "directoryUri must be defined to get here" 

1691 storeUri = ButlerURI(location.uri) 

1692 

1693 # if the datastore has an absolute URI to a resource, we 

1694 # have two options: 

1695 # 1. Keep the absolute URI in the exported YAML 

1696 # 2. Allocate a new name in the local datastore and transfer 

1697 # it. 

1698 # For now go with option 2 

1699 if location.pathInStore.isabs(): 1699 ↛ 1700line 1699 didn't jump to line 1700, because the condition on line 1699 was never true

1700 template = self.templates.getTemplate(ref) 

1701 pathInStore = template.format(ref) 

1702 

1703 exportUri = directoryUri.join(pathInStore) 

1704 exportUri.transfer_from(storeUri, transfer=transfer) 

1705 

1706 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

1707 
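# A minimal export sketch (the directory path is illustrative):
#
#     for dataset in datastore.export(refs, directory="export/", transfer="copy"):
#         ...  # each FileDataset records the path written beneath "export/"
#
# With transfer="direct" the absolute datastore URI is recorded and no files are
# copied; with transfer=None the existing path within the store is recorded.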

1708 @staticmethod 

1709 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

1710 """Compute the checksum of the supplied file. 

1711 

1712 Parameters 

1713 ---------- 

1714 uri : `ButlerURI` 

1715 Name of resource to calculate checksum from. 

1716 algorithm : `str`, optional 

1717 Name of algorithm to use. Must be one of the algorithms supported 

1718 by :py:class:`hashlib`.

1719 block_size : `int` 

1720 Number of bytes to read from file at one time. 

1721 

1722 Returns 

1723 ------- 

1724 hexdigest : `str` or `None`

1725 Hex digest of the file. 

1726 

1727 Notes 

1728 ----- 

1729 Currently returns None if the URI is for a remote resource. 

1730 """ 

1731 if algorithm not in hashlib.algorithms_guaranteed: 1731 ↛ 1732line 1731 didn't jump to line 1732, because the condition on line 1731 was never true

1732 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

1733 

1734 if not uri.isLocal: 1734 ↛ 1735line 1734 didn't jump to line 1735, because the condition on line 1734 was never true

1735 return None 

1736 

1737 hasher = hashlib.new(algorithm) 

1738 

1739 with uri.as_local() as local_uri: 

1740 with open(local_uri.ospath, "rb") as f: 

1741 for chunk in iter(lambda: f.read(block_size), b""): 

1742 hasher.update(chunk) 

1743 

1744 return hasher.hexdigest()
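# A minimal checksum sketch (the file path is illustrative):
#
#     digest = FileDatastore.computeChecksum(ButlerURI("file:///tmp/data.fits"))
#
# Non-local URIs currently return None rather than being fetched just to
# compute a checksum, and an algorithm name not guaranteed by hashlib raises
# NameError.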