Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore", ) 

26 

27import hashlib 

28import logging 

29import os 

30import tempfile 

31 

32from sqlalchemy import BigInteger, String 

33 

34from collections import defaultdict 

35from dataclasses import dataclass 

36from typing import ( 

37 TYPE_CHECKING, 

38 Any, 

39 ClassVar, 

40 Dict, 

41 Iterable, 

42 List, 

43 Mapping, 

44 Optional, 

45 Set, 

46 Tuple, 

47 Type, 

48 Union, 

49) 

50 

51from lsst.daf.butler import ( 

52 ButlerURI, 

53 CompositesMap, 

54 Config, 

55 FileDataset, 

56 DatasetId, 

57 DatasetRef, 

58 DatasetType, 

59 DatasetTypeNotSupportedError, 

60 Datastore, 

61 DatastoreCacheManager, 

62 DatastoreDisabledCacheManager, 

63 DatastoreConfig, 

64 DatastoreValidationError, 

65 FileDescriptor, 

66 FileTemplates, 

67 FileTemplateValidationError, 

68 Formatter, 

69 FormatterFactory, 

70 Location, 

71 LocationFactory, 

72 Progress, 

73 StorageClass, 

74 StoredFileInfo, 

75) 

76 

77from lsst.daf.butler import ddl 

78from lsst.daf.butler.registry.interfaces import ( 

79 ReadOnlyDatabaseError, 

80 DatastoreRegistryBridge, 

81) 

82 

83from lsst.daf.butler.core.repoRelocation import replaceRoot 

84from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

85from .genericDatastore import GenericBaseDatastore 

86 

87if TYPE_CHECKING: 87 ↛ 88line 87 didn't jump to line 88, because the condition on line 87 was never true

88 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager 

89 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

90 

91log = logging.getLogger(__name__) 

92 

93 

94class _IngestPrepData(Datastore.IngestPrepData): 

95 """Helper class for FileDatastore ingest implementation. 

96 

97 Parameters 

98 ---------- 

99 datasets : `list` of `FileDataset` 

100 Files to be ingested by this datastore. 

101 """ 

102 def __init__(self, datasets: List[FileDataset]): 

103 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

104 self.datasets = datasets 

105 

106 

107@dataclass(frozen=True) 

108class DatastoreFileGetInformation: 

109 """Collection of useful parameters needed to retrieve a file from 

110 a Datastore. 

111 """ 

112 

113 location: Location 

114 """The location from which to read the dataset.""" 

115 

116 formatter: Formatter 

117 """The `Formatter` to use to deserialize the dataset.""" 

118 

119 info: StoredFileInfo 

120 """Stored information about this file and its formatter.""" 

121 

122 assemblerParams: Dict[str, Any] 

123 """Parameters to use for post-processing the retrieved dataset.""" 

124 

125 formatterParams: Dict[str, Any] 

126 """Parameters that were understood by the associated formatter.""" 

127 

128 component: Optional[str] 

129 """The component to be retrieved (can be `None`).""" 

130 

131 readStorageClass: StorageClass 

132 """The `StorageClass` of the dataset being read.""" 

133 

134 

135class FileDatastore(GenericBaseDatastore): 

136 """Generic Datastore for file-based implementations. 

137 

138 Should always be sub-classed since key abstract methods are missing. 

139 

140 Parameters 

141 ---------- 

142 config : `DatastoreConfig` or `str` 

143 Configuration as either a `Config` object or URI to file. 

144 bridgeManager : `DatastoreRegistryBridgeManager` 

145 Object that manages the interface between `Registry` and datastores. 

146 butlerRoot : `str`, optional 

147 New datastore root to use to override the configuration value. 

148 

149 Raises 

150 ------ 

151 ValueError 

152 If root location does not exist and ``create`` is `False` in the 

153 configuration. 

154 """ 

155 

156 defaultConfigFile: ClassVar[Optional[str]] = None 

157 """Path to configuration defaults. Accessed within the ``config`` resource 

158 or relative to a search path. Can be None if no defaults specified. 

159 """ 

160 

161 root: ButlerURI 

162 """Root directory URI of this `Datastore`.""" 

163 

164 locationFactory: LocationFactory 

165 """Factory for creating locations relative to the datastore root.""" 

166 

167 formatterFactory: FormatterFactory 

168 """Factory for creating instances of formatters.""" 

169 

170 templates: FileTemplates 

171 """File templates that can be used by this `Datastore`.""" 

172 

173 composites: CompositesMap 

174 """Determines whether a dataset should be disassembled on put.""" 

175 

176 defaultConfigFile = "datastores/fileDatastore.yaml" 

177 """Path to configuration defaults. Accessed within the ``config`` resource 

178 or relative to a search path. Can be None if no defaults specified. 

179 """ 

180 

181 @classmethod 

182 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

183 """Set any filesystem-dependent config options for this Datastore to 

184 be appropriate for a new empty repository with the given root. 

185 

186 Parameters 

187 ---------- 

188 root : `str` 

189 URI to the root of the data repository. 

190 config : `Config` 

191 A `Config` to update. Only the subset understood by 

192 this component will be updated. Will not expand 

193 defaults. 

194 full : `Config` 

195 A complete config with all defaults expanded that can be 

196 converted to a `DatastoreConfig`. Read-only and will not be 

197 modified by this method. 

198 Repository-specific options that should not be obtained 

199 from defaults when Butler instances are constructed 

200 should be copied from ``full`` to ``config``. 

201 overwrite : `bool`, optional 

202 If `False`, do not modify a value in ``config`` if the value 

203 already exists. Default is always to overwrite with the provided 

204 ``root``. 

205 

206 Notes 

207 ----- 

208 If a keyword is explicitly defined in the supplied ``config`` it 

209 will not be overridden by this method if ``overwrite`` is `False`. 

210 This allows explicit values set in external configs to be retained. 

211 """ 

212 Config.updateParameters(DatastoreConfig, config, full, 

213 toUpdate={"root": root}, 

214 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

215 

216 @classmethod 

217 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

218 return ddl.TableSpec( 

219 fields=[ 

220 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

221 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

222 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

223 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

224 # Use empty string to indicate no component 

225 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

226 # TODO: should checksum be Base64Bytes instead? 

227 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

228 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

229 ], 

230 unique=frozenset(), 

231 indexes=[tuple(["path"])], 

232 ) 

233 

234 def __init__(self, config: Union[DatastoreConfig, str], 

235 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

236 super().__init__(config, bridgeManager) 

237 if "root" not in self.config: 237 ↛ 238line 237 didn't jump to line 238, because the condition on line 237 was never true

238 raise ValueError("No root directory specified in configuration") 

239 

240 # Name ourselves either using an explicit name or a name 

241 # derived from the (unexpanded) root 

242 if "name" in self.config: 

243 self.name = self.config["name"] 

244 else: 

245 # We use the unexpanded root in the name to indicate that this 

246 # datastore can be moved without having to update registry. 

247 self.name = "{}@{}".format(type(self).__name__, 

248 self.config["root"]) 

249 

250 # Support repository relocation in config 

251 # Existence of self.root is checked in subclass 

252 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

253 forceDirectory=True, forceAbsolute=True) 

254 

255 self.locationFactory = LocationFactory(self.root) 

256 self.formatterFactory = FormatterFactory() 

257 

258 # Now associate formatters with storage classes 

259 self.formatterFactory.registerFormatters(self.config["formatters"], 

260 universe=bridgeManager.universe) 

261 

262 # Read the file naming templates 

263 self.templates = FileTemplates(self.config["templates"], 

264 universe=bridgeManager.universe) 

265 

266 # See if composites should be disassembled 

267 self.composites = CompositesMap(self.config["composites"], 

268 universe=bridgeManager.universe) 

269 

270 tableName = self.config["records", "table"] 

271 try: 

272 # Storage of paths and formatters, keyed by dataset_id 

273 self._table = bridgeManager.opaque.register( 

274 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)) 

275 # Interface to Registry. 

276 self._bridge = bridgeManager.register(self.name) 

277 except ReadOnlyDatabaseError: 

278 # If the database is read only and we just tried and failed to 

279 # create a table, it means someone is trying to create a read-only 

280 # butler client for an empty repo. That should be okay, as long 

281 # as they then try to get any datasets before some other client 

282 # creates the table. Chances are they'rejust validating 

283 # configuration. 

284 pass 

285 

286 # Determine whether checksums should be used - default to False 

287 self.useChecksum = self.config.get("checksum", False) 

288 

289 # Determine whether we can fall back to configuration if a 

290 # requested dataset is not known to registry 

291 self.trustGetRequest = self.config.get("trust_get_request", False) 

292 

293 # Create a cache manager 

294 self.cacheManager: AbstractDatastoreCacheManager 

295 if "cached" in self.config: 295 ↛ 299line 295 didn't jump to line 299, because the condition on line 295 was never false

296 self.cacheManager = DatastoreCacheManager(self.config["cached"], 

297 universe=bridgeManager.universe) 

298 else: 

299 self.cacheManager = DatastoreDisabledCacheManager("", 

300 universe=bridgeManager.universe) 

301 

302 # Check existence and create directory structure if necessary 

303 if not self.root.exists(): 

304 if "create" not in self.config or not self.config["create"]: 304 ↛ 305line 304 didn't jump to line 305, because the condition on line 304 was never true

305 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

306 try: 

307 self.root.mkdir() 

308 except Exception as e: 

309 raise ValueError(f"Can not create datastore root '{self.root}', check permissions." 

310 f" Got error: {e}") from e 

311 

312 def __str__(self) -> str: 

313 return str(self.root) 

314 

315 @property 

316 def bridge(self) -> DatastoreRegistryBridge: 

317 return self._bridge 

318 

319 def _artifact_exists(self, location: Location) -> bool: 

320 """Check that an artifact exists in this datastore at the specified 

321 location. 

322 

323 Parameters 

324 ---------- 

325 location : `Location` 

326 Expected location of the artifact associated with this datastore. 

327 

328 Returns 

329 ------- 

330 exists : `bool` 

331 True if the location can be found, false otherwise. 

332 """ 

333 log.debug("Checking if resource exists: %s", location.uri) 

334 return location.uri.exists() 

335 

336 def _delete_artifact(self, location: Location) -> None: 

337 """Delete the artifact from the datastore. 

338 

339 Parameters 

340 ---------- 

341 location : `Location` 

342 Location of the artifact associated with this datastore. 

343 """ 

344 if location.pathInStore.isabs(): 344 ↛ 345line 344 didn't jump to line 345, because the condition on line 344 was never true

345 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

346 

347 try: 

348 location.uri.remove() 

349 except FileNotFoundError: 

350 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

351 raise 

352 except Exception as e: 

353 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

354 raise 

355 log.debug("Successfully deleted file: %s", location.uri) 

356 

357 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

358 # Docstring inherited from GenericBaseDatastore 

359 records = [info.to_record(ref) for ref, info in zip(refs, infos)] 

360 self._table.insert(*records) 

361 

362 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

363 # Docstring inherited from GenericBaseDatastore 

364 

365 # Look for the dataset_id -- there might be multiple matches 

366 # if we have disassembled the dataset. 

367 records = list(self._table.fetch(dataset_id=ref.id)) 

368 return [StoredFileInfo.from_record(record) for record in records] 

369 

370 def _refs_associated_with_artifacts(self, paths: List[Union[str, ButlerURI]]) -> Dict[str, 

371 Set[DatasetId]]: 

372 """Return paths and associated dataset refs. 

373 

374 Parameters 

375 ---------- 

376 paths : `list` of `str` or `ButlerURI` 

377 All the paths to include in search. 

378 

379 Returns 

380 ------- 

381 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

382 Mapping of each path to a set of associated database IDs. 

383 """ 

384 records = list(self._table.fetch(path=[str(path) for path in paths])) 

385 result = defaultdict(set) 

386 for row in records: 

387 result[row["path"]].add(row["dataset_id"]) 

388 return result 

389 

390 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]: 

391 """Return all dataset refs associated with the supplied path. 

392 

393 Parameters 

394 ---------- 

395 pathInStore : `ButlerURI` 

396 Path of interest in the data store. 

397 

398 Returns 

399 ------- 

400 ids : `set` of `int` 

401 All `DatasetRef` IDs associated with this path. 

402 """ 

403 records = list(self._table.fetch(path=str(pathInStore))) 

404 ids = {r["dataset_id"] for r in records} 

405 return ids 

406 

407 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

408 # Docstring inherited from GenericBaseDatastore 

409 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

410 

411 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

412 r"""Find all the `Location`\ s of the requested dataset in the 

413 `Datastore` and the associated stored file information. 

414 

415 Parameters 

416 ---------- 

417 ref : `DatasetRef` 

418 Reference to the required `Dataset`. 

419 

420 Returns 

421 ------- 

422 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

423 Location of the dataset within the datastore and 

424 stored information about each file and its formatter. 

425 """ 

426 # Get the file information (this will fail if no file) 

427 records = self.getStoredItemsInfo(ref) 

428 

429 # Use the path to determine the location -- we need to take 

430 # into account absolute URIs in the datastore record 

431 return [(r.file_location(self.locationFactory), r) for r in records] 

432 

433 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

434 """Check that there is only one dataset associated with the 

435 specified artifact. 

436 

437 Parameters 

438 ---------- 

439 ref : `DatasetRef` or `FakeDatasetRef` 

440 Dataset to be removed. 

441 location : `Location` 

442 The location of the artifact to be removed. 

443 

444 Returns 

445 ------- 

446 can_remove : `Bool` 

447 True if the artifact can be safely removed. 

448 """ 

449 # Can't ever delete absolute URIs. 

450 if location.pathInStore.isabs(): 

451 return False 

452 

453 # Get all entries associated with this path 

454 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

455 if not allRefs: 

456 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

457 

458 # Remove these refs from all the refs and if there is nothing left 

459 # then we can delete 

460 remainingRefs = allRefs - {ref.id} 

461 

462 if remainingRefs: 

463 return False 

464 return True 

465 

466 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, 

467 StoredFileInfo]]: 

468 """Predict the location and related file information of the requested 

469 dataset in this datastore. 

470 

471 Parameters 

472 ---------- 

473 ref : `DatasetRef` 

474 Reference to the required `Dataset`. 

475 

476 Returns 

477 ------- 

478 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

479 Expected Location of the dataset within the datastore and 

480 placeholder information about each file and its formatter. 

481 

482 Notes 

483 ----- 

484 Uses the current configuration to determine how we would expect the 

485 datastore files to have been written if we couldn't ask registry. 

486 This is safe so long as there has been no change to datastore 

487 configuration between writing the dataset and wanting to read it. 

488 Will not work for files that have been ingested without using the 

489 standard file template or default formatter. 

490 """ 

491 

492 # If we have a component ref we always need to ask the questions 

493 # of the composite. If the composite is disassembled this routine 

494 # should return all components. If the composite was not 

495 # disassembled the composite is what is stored regardless of 

496 # component request. Note that if the caller has disassembled 

497 # a composite there is no way for this guess to know that 

498 # without trying both the composite and component ref and seeing 

499 # if there is something at the component Location even without 

500 # disassembly being enabled. 

501 if ref.datasetType.isComponent(): 

502 ref = ref.makeCompositeRef() 

503 

504 # See if the ref is a composite that should be disassembled 

505 doDisassembly = self.composites.shouldBeDisassembled(ref) 

506 

507 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

508 

509 if doDisassembly: 

510 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

511 compRef = ref.makeComponentRef(component) 

512 location, formatter = self._determine_put_formatter_location(compRef) 

513 all_info.append((location, formatter, componentStorage, component)) 

514 

515 else: 

516 # Always use the composite ref if no disassembly 

517 location, formatter = self._determine_put_formatter_location(ref) 

518 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

519 

520 # Convert the list of tuples to have StoredFileInfo as second element 

521 return [(location, StoredFileInfo(formatter=formatter, 

522 path=location.pathInStore.path, 

523 storageClass=storageClass, 

524 component=component, 

525 checksum=None, 

526 file_size=-1)) 

527 for location, formatter, storageClass, component in all_info] 

528 

529 def _prepare_for_get(self, ref: DatasetRef, 

530 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

531 """Check parameters for ``get`` and obtain formatter and 

532 location. 

533 

534 Parameters 

535 ---------- 

536 ref : `DatasetRef` 

537 Reference to the required Dataset. 

538 parameters : `dict` 

539 `StorageClass`-specific parameters that specify, for example, 

540 a slice of the dataset to be loaded. 

541 

542 Returns 

543 ------- 

544 getInfo : `list` [`DatastoreFileGetInformation`] 

545 Parameters needed to retrieve each file. 

546 """ 

547 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

548 

549 # Get file metadata and internal metadata 

550 fileLocations = self._get_dataset_locations_info(ref) 

551 if not fileLocations: 

552 if not self.trustGetRequest: 

553 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

554 # Assume the dataset is where we think it should be 

555 fileLocations = self._get_expected_dataset_locations_info(ref) 

556 

557 # The storage class we want to use eventually 

558 refStorageClass = ref.datasetType.storageClass 

559 

560 if len(fileLocations) > 1: 

561 disassembled = True 

562 else: 

563 disassembled = False 

564 

565 # Is this a component request? 

566 refComponent = ref.datasetType.component() 

567 

568 fileGetInfo = [] 

569 for location, storedFileInfo in fileLocations: 

570 

571 # The storage class used to write the file 

572 writeStorageClass = storedFileInfo.storageClass 

573 

574 # If this has been disassembled we need read to match the write 

575 if disassembled: 

576 readStorageClass = writeStorageClass 

577 else: 

578 readStorageClass = refStorageClass 

579 

580 formatter = getInstanceOf(storedFileInfo.formatter, 

581 FileDescriptor(location, readStorageClass=readStorageClass, 

582 storageClass=writeStorageClass, parameters=parameters), 

583 ref.dataId) 

584 

585 formatterParams, notFormatterParams = formatter.segregateParameters() 

586 

587 # Of the remaining parameters, extract the ones supported by 

588 # this StorageClass (for components not all will be handled) 

589 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

590 

591 # The ref itself could be a component if the dataset was 

592 # disassembled by butler, or we disassembled in datastore and 

593 # components came from the datastore records 

594 component = storedFileInfo.component if storedFileInfo.component else refComponent 

595 

596 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

597 assemblerParams, formatterParams, 

598 component, readStorageClass)) 

599 

600 return fileGetInfo 

601 

602 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

603 """Check the arguments for ``put`` and obtain formatter and 

604 location. 

605 

606 Parameters 

607 ---------- 

608 inMemoryDataset : `object` 

609 The dataset to store. 

610 ref : `DatasetRef` 

611 Reference to the associated Dataset. 

612 

613 Returns 

614 ------- 

615 location : `Location` 

616 The location to write the dataset. 

617 formatter : `Formatter` 

618 The `Formatter` to use to write the dataset. 

619 

620 Raises 

621 ------ 

622 TypeError 

623 Supplied object and storage class are inconsistent. 

624 DatasetTypeNotSupportedError 

625 The associated `DatasetType` is not handled by this datastore. 

626 """ 

627 self._validate_put_parameters(inMemoryDataset, ref) 

628 return self._determine_put_formatter_location(ref) 

629 

630 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

631 """Calculate the formatter and output location to use for put. 

632 

633 Parameters 

634 ---------- 

635 ref : `DatasetRef` 

636 Reference to the associated Dataset. 

637 

638 Returns 

639 ------- 

640 location : `Location` 

641 The location to write the dataset. 

642 formatter : `Formatter` 

643 The `Formatter` to use to write the dataset. 

644 """ 

645 # Work out output file name 

646 try: 

647 template = self.templates.getTemplate(ref) 

648 except KeyError as e: 

649 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

650 

651 # Validate the template to protect against filenames from different 

652 # dataIds returning the same and causing overwrite confusion. 

653 template.validateTemplate(ref) 

654 

655 location = self.locationFactory.fromPath(template.format(ref)) 

656 

657 # Get the formatter based on the storage class 

658 storageClass = ref.datasetType.storageClass 

659 try: 

660 formatter = self.formatterFactory.getFormatter(ref, 

661 FileDescriptor(location, 

662 storageClass=storageClass), 

663 ref.dataId) 

664 except KeyError as e: 

665 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

666 f"{self.name}") from e 

667 

668 # Now that we know the formatter, update the location 

669 location = formatter.makeUpdatedLocation(location) 

670 

671 return location, formatter 

672 

673 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

674 # Docstring inherited from base class 

675 if transfer != "auto": 

676 return transfer 

677 

678 # See if the paths are within the datastore or not 

679 inside = [self._pathInStore(d.path) is not None for d in datasets] 

680 

681 if all(inside): 

682 transfer = None 

683 elif not any(inside): 683 ↛ 687line 683 didn't jump to line 687, because the condition on line 683 was never false

684 # Allow ButlerURI to use its own knowledge 

685 transfer = "auto" 

686 else: 

687 raise ValueError("Some datasets are inside the datastore and some are outside." 

688 " Please use an explicit transfer mode and not 'auto'.") 

689 

690 return transfer 

691 

692 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]: 

693 """Return path relative to datastore root 

694 

695 Parameters 

696 ---------- 

697 path : `str` or `ButlerURI` 

698 Path to dataset. Can be absolute URI. If relative assumed to 

699 be relative to the datastore. Returns path in datastore 

700 or raises an exception if the path it outside. 

701 

702 Returns 

703 ------- 

704 inStore : `str` 

705 Path relative to datastore root. Returns `None` if the file is 

706 outside the root. 

707 """ 

708 # Relative path will always be relative to datastore 

709 pathUri = ButlerURI(path, forceAbsolute=False) 

710 return pathUri.relative_to(self.root) 

711 

712 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *, 

713 transfer: Optional[str] = None) -> Union[str, ButlerURI]: 

714 """Standardize the path of a to-be-ingested file. 

715 

716 Parameters 

717 ---------- 

718 path : `str` or `ButlerURI` 

719 Path of a file to be ingested. 

720 transfer : `str`, optional 

721 How (and whether) the dataset should be added to the datastore. 

722 See `ingest` for details of transfer modes. 

723 This implementation is provided only so 

724 `NotImplementedError` can be raised if the mode is not supported; 

725 actual transfers are deferred to `_extractIngestInfo`. 

726 

727 Returns 

728 ------- 

729 path : `str` or `ButlerURI` 

730 New path in what the datastore considers standard form. If an 

731 absolute URI was given that will be returned unchanged. 

732 

733 Notes 

734 ----- 

735 Subclasses of `FileDatastore` can implement this method instead 

736 of `_prepIngest`. It should not modify the data repository or given 

737 file in any way. 

738 

739 Raises 

740 ------ 

741 NotImplementedError 

742 Raised if the datastore does not support the given transfer mode 

743 (including the case where ingest is not supported at all). 

744 FileNotFoundError 

745 Raised if one of the given files does not exist. 

746 """ 

747 if transfer not in (None, "direct") + self.root.transferModes: 747 ↛ 748line 747 didn't jump to line 748, because the condition on line 747 was never true

748 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

749 

750 # A relative URI indicates relative to datastore root 

751 srcUri = ButlerURI(path, forceAbsolute=False) 

752 if not srcUri.isabs(): 

753 srcUri = self.root.join(path) 

754 

755 if not srcUri.exists(): 

756 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

757 f"are assumed to be relative to {self.root} unless they are absolute.") 

758 

759 if transfer is None: 

760 relpath = srcUri.relative_to(self.root) 

761 if not relpath: 

762 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

763 f"within datastore ({self.root})") 

764 

765 # Return the relative path within the datastore for internal 

766 # transfer 

767 path = relpath 

768 

769 return path 

770 

771 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

772 formatter: Union[Formatter, Type[Formatter]], 

773 transfer: Optional[str] = None) -> StoredFileInfo: 

774 """Relocate (if necessary) and extract `StoredFileInfo` from a 

775 to-be-ingested file. 

776 

777 Parameters 

778 ---------- 

779 path : `str` or `ButlerURI` 

780 URI or path of a file to be ingested. 

781 ref : `DatasetRef` 

782 Reference for the dataset being ingested. Guaranteed to have 

783 ``dataset_id not None`. 

784 formatter : `type` or `Formatter` 

785 `Formatter` subclass to use for this dataset or an instance. 

786 transfer : `str`, optional 

787 How (and whether) the dataset should be added to the datastore. 

788 See `ingest` for details of transfer modes. 

789 

790 Returns 

791 ------- 

792 info : `StoredFileInfo` 

793 Internal datastore record for this file. This will be inserted by 

794 the caller; the `_extractIngestInfo` is only resposible for 

795 creating and populating the struct. 

796 

797 Raises 

798 ------ 

799 FileNotFoundError 

800 Raised if one of the given files does not exist. 

801 FileExistsError 

802 Raised if transfer is not `None` but the (internal) location the 

803 file would be moved to is already occupied. 

804 """ 

805 if self._transaction is None: 805 ↛ 806line 805 didn't jump to line 806, because the condition on line 805 was never true

806 raise RuntimeError("Ingest called without transaction enabled") 

807 

808 # Create URI of the source path, do not need to force a relative 

809 # path to absolute. 

810 srcUri = ButlerURI(path, forceAbsolute=False) 

811 

812 # Track whether we have read the size of the source yet 

813 have_sized = False 

814 

815 tgtLocation: Optional[Location] 

816 if transfer is None: 

817 # A relative path is assumed to be relative to the datastore 

818 # in this context 

819 if not srcUri.isabs(): 

820 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

821 else: 

822 # Work out the path in the datastore from an absolute URI 

823 # This is required to be within the datastore. 

824 pathInStore = srcUri.relative_to(self.root) 

825 if pathInStore is None: 825 ↛ 826line 825 didn't jump to line 826, because the condition on line 825 was never true

826 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

827 f"not within datastore {self.root}") 

828 tgtLocation = self.locationFactory.fromPath(pathInStore) 

829 elif transfer == "direct": 829 ↛ 834line 829 didn't jump to line 834, because the condition on line 829 was never true

830 # Want to store the full URI to the resource directly in 

831 # datastore. This is useful for referring to permanent archive 

832 # storage for raw data. 

833 # Trust that people know what they are doing. 

834 tgtLocation = None 

835 else: 

836 # Work out the name we want this ingested file to have 

837 # inside the datastore 

838 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

839 if not tgtLocation.uri.dirname().exists(): 

840 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

841 tgtLocation.uri.dirname().mkdir() 

842 

843 # if we are transferring from a local file to a remote location 

844 # it may be more efficient to get the size and checksum of the 

845 # local file rather than the transferred one 

846 if not srcUri.scheme or srcUri.scheme == "file": 846 ↛ 852line 846 didn't jump to line 852, because the condition on line 846 was never false

847 size = srcUri.size() 

848 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

849 have_sized = True 

850 

851 # transfer the resource to the destination 

852 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

853 

854 if tgtLocation is None: 854 ↛ 856line 854 didn't jump to line 856, because the condition on line 854 was never true

855 # This means we are using direct mode 

856 targetUri = srcUri 

857 targetPath = str(srcUri) 

858 else: 

859 targetUri = tgtLocation.uri 

860 targetPath = tgtLocation.pathInStore.path 

861 

862 # the file should exist in the datastore now 

863 if not have_sized: 

864 size = targetUri.size() 

865 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

866 

867 return StoredFileInfo(formatter=formatter, path=targetPath, 

868 storageClass=ref.datasetType.storageClass, 

869 component=ref.datasetType.component(), 

870 file_size=size, checksum=checksum) 

871 

872 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

873 # Docstring inherited from Datastore._prepIngest. 

874 filtered = [] 

875 for dataset in datasets: 

876 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

877 if not acceptable: 

878 continue 

879 else: 

880 dataset.refs = acceptable 

881 if dataset.formatter is None: 

882 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

883 else: 

884 assert isinstance(dataset.formatter, (type, str)) 

885 dataset.formatter = getClassOf(dataset.formatter) 

886 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

887 filtered.append(dataset) 

888 return _IngestPrepData(filtered) 

889 

890 @transactional 

891 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

892 # Docstring inherited from Datastore._finishIngest. 

893 refsAndInfos = [] 

894 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

895 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

896 # Do ingest as if the first dataset ref is associated with the file 

897 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

898 transfer=transfer) 

899 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

900 self._register_datasets(refsAndInfos) 

901 

902 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

903 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

904 """Given a source URI and a DatasetRef, determine the name the 

905 dataset will have inside datastore. 

906 

907 Parameters 

908 ---------- 

909 srcUri : `ButlerURI` 

910 URI to the source dataset file. 

911 ref : `DatasetRef` 

912 Ref associated with the newly-ingested dataset artifact. This 

913 is used to determine the name within the datastore. 

914 formatter : `Formatter` or Formatter class. 

915 Formatter to use for validation. Can be a class or an instance. 

916 

917 Returns 

918 ------- 

919 location : `Location` 

920 Target location for the newly-ingested dataset. 

921 """ 

922 # Ingesting a file from outside the datastore. 

923 # This involves a new name. 

924 template = self.templates.getTemplate(ref) 

925 location = self.locationFactory.fromPath(template.format(ref)) 

926 

927 # Get the extension 

928 ext = srcUri.getExtension() 

929 

930 # Update the destination to include that extension 

931 location.updateExtension(ext) 

932 

933 # Ask the formatter to validate this extension 

934 formatter.validateExtension(location) 

935 

936 return location 

937 

938 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

939 """Write out in memory dataset to datastore. 

940 

941 Parameters 

942 ---------- 

943 inMemoryDataset : `object` 

944 Dataset to write to datastore. 

945 ref : `DatasetRef` 

946 Registry information associated with this dataset. 

947 

948 Returns 

949 ------- 

950 info : `StoredFileInfo` 

951 Information describin the artifact written to the datastore. 

952 """ 

953 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

954 uri = location.uri 

955 

956 if not uri.dirname().exists(): 

957 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

958 uri.dirname().mkdir() 

959 

960 if self._transaction is None: 960 ↛ 961line 960 didn't jump to line 961, because the condition on line 960 was never true

961 raise RuntimeError("Attempting to write artifact without transaction enabled") 

962 

963 def _removeFileExists(uri: ButlerURI) -> None: 

964 """Remove a file and do not complain if it is not there. 

965 

966 This is important since a formatter might fail before the file 

967 is written and we should not confuse people by writing spurious 

968 error messages to the log. 

969 """ 

970 try: 

971 uri.remove() 

972 except FileNotFoundError: 

973 pass 

974 

975 # Register a callback to try to delete the uploaded data if 

976 # something fails below 

977 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

978 

979 # For a local file, simply use the formatter directly 

980 if uri.isLocal: 

981 try: 

982 formatter.write(inMemoryDataset) 

983 except Exception as e: 

984 raise RuntimeError(f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} " 

985 f"to location {uri}") from e 

986 log.debug("Successfully wrote python object to local file at %s", uri) 

987 else: 

988 # This is a remote URI, so first try bytes and write directly else 

989 # fallback to a temporary file 

990 try: 

991 serializedDataset = formatter.toBytes(inMemoryDataset) 

992 except NotImplementedError: 992 ↛ 1011line 992 didn't jump to line 1011

993 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile: 

994 # Need to configure the formatter to write to a different 

995 # location and that needs us to overwrite internals 

996 tmpLocation = Location(*os.path.split(tmpFile.name)) 

997 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri) 

998 with formatter._updateLocation(tmpLocation): 

999 try: 

1000 formatter.write(inMemoryDataset) 

1001 except Exception as e: 

1002 raise RuntimeError(f"Failed to serialize dataset {ref} of type" 

1003 f" {type(inMemoryDataset)} to " 

1004 f"temporary location {tmpLocation.uri}") from e 

1005 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True) 

1006 

1007 # Cache if required 

1008 self.cacheManager.move_to_cache(tmpLocation.uri, ref) 

1009 

1010 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1011 except Exception as e: 

1012 raise RuntimeError(f"Failed to serialize dataset {ref} to bytes.") from e 

1013 else: 

1014 log.debug("Writing bytes directly to %s", uri) 

1015 uri.write(serializedDataset, overwrite=True) 

1016 log.debug("Successfully wrote bytes directly to %s", uri) 

1017 

1018 # URI is needed to resolve what ingest case are we dealing with 

1019 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1020 

1021 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

1022 ref: DatasetRef, isComponent: bool = False) -> Any: 

1023 """Read the artifact from datastore into in memory object. 

1024 

1025 Parameters 

1026 ---------- 

1027 getInfo : `DatastoreFileGetInformation` 

1028 Information about the artifact within the datastore. 

1029 ref : `DatasetRef` 

1030 The registry information associated with this artifact. 

1031 isComponent : `bool` 

1032 Flag to indicate if a component is being read from this artifact. 

1033 

1034 Returns 

1035 ------- 

1036 inMemoryDataset : `object` 

1037 The artifact as a python object. 

1038 """ 

1039 location = getInfo.location 

1040 uri = location.uri 

1041 log.debug("Accessing data from %s", uri) 

1042 

1043 # Cannot recalculate checksum but can compare size as a quick check 

1044 # Do not do this if the size is negative since that indicates 

1045 # we do not know. 

1046 recorded_size = getInfo.info.file_size 

1047 resource_size = uri.size() 

1048 if recorded_size >= 0 and resource_size != recorded_size: 1048 ↛ 1049line 1048 didn't jump to line 1049, because the condition on line 1048 was never true

1049 raise RuntimeError("Integrity failure in Datastore. " 

1050 f"Size of file {uri} ({resource_size}) " 

1051 f"does not match size recorded in registry of {recorded_size}") 

1052 

1053 # For the general case we have choices for how to proceed. 

1054 # 1. Always use a local file (downloading the remote resource to a 

1055 # temporary file if needed). 

1056 # 2. Use a threshold size and read into memory and use bytes. 

1057 # Use both for now with an arbitrary hand off size. 

1058 # This allows small datasets to be downloaded from remote object 

1059 # stores without requiring a temporary file. 

1060 

1061 formatter = getInfo.formatter 

1062 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1063 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1064 serializedDataset = uri.read() 

1065 log.debug("Deserializing %s from %d bytes from location %s with formatter %s", 

1066 f"component {getInfo.component}" if isComponent else "", 

1067 len(serializedDataset), uri, formatter.name()) 

1068 try: 

1069 result = formatter.fromBytes(serializedDataset, 

1070 component=getInfo.component if isComponent else None) 

1071 except Exception as e: 

1072 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1073 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1074 else: 

1075 # Read from file. 

1076 

1077 # Have to update the Location associated with the formatter 

1078 # because formatter.read does not allow an override. 

1079 # This could be improved. 

1080 location_updated = False 

1081 msg = "" 

1082 

1083 # First check in cache for local version. 

1084 # The cache will only be relevant for remote resources. 

1085 if not uri.isLocal: 

1086 cached_file = self.cacheManager.find_in_cache(ref, uri.getExtension()) 

1087 if cached_file is not None: 1087 ↛ 1088line 1087 didn't jump to line 1088, because the condition on line 1087 was never true

1088 msg = f"(via cache read of remote file {uri})" 

1089 uri = cached_file 

1090 location_updated = True 

1091 

1092 with uri.as_local() as local_uri: 

1093 

1094 # URI was remote and file was downloaded 

1095 if uri != local_uri: 

1096 cache_msg = "" 

1097 location_updated = True 

1098 

1099 # Cache the downloaded file if needed. 

1100 cached_uri = self.cacheManager.move_to_cache(local_uri, ref) 

1101 if cached_uri is not None: 1101 ↛ 1102line 1101 didn't jump to line 1102, because the condition on line 1101 was never true

1102 local_uri = cached_uri 

1103 cache_msg = " and cached" 

1104 

1105 msg = f"(via download to local file{cache_msg})" 

1106 

1107 # Calculate the (possibly) new location for the formatter 

1108 # to use. 

1109 newLocation = Location(*local_uri.split()) if location_updated else None 

1110 

1111 log.debug("Reading%s from location %s %s with formatter %s", 

1112 f" component {getInfo.component}" if isComponent else "", 

1113 uri, msg, formatter.name()) 

1114 try: 

1115 with formatter._updateLocation(newLocation): 

1116 result = formatter.read(component=getInfo.component if isComponent else None) 

1117 except Exception as e: 

1118 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1119 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1120 

1121 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

1122 isComponent=isComponent) 

1123 

1124 def exists(self, ref: DatasetRef) -> bool: 

1125 """Check if the dataset exists in the datastore. 

1126 

1127 Parameters 

1128 ---------- 

1129 ref : `DatasetRef` 

1130 Reference to the required dataset. 

1131 

1132 Returns 

1133 ------- 

1134 exists : `bool` 

1135 `True` if the entity exists in the `Datastore`. 

1136 """ 

1137 fileLocations = self._get_dataset_locations_info(ref) 

1138 

1139 # if we are being asked to trust that registry might not be correct 

1140 # we ask for the expected locations and check them explicitly 

1141 if not fileLocations: 

1142 if not self.trustGetRequest: 

1143 return False 

1144 fileLocations = self._get_expected_dataset_locations_info(ref) 

1145 for location, _ in fileLocations: 

1146 if not self._artifact_exists(location): 

1147 return False 

1148 

1149 return True 

1150 

1151 def getURIs(self, ref: DatasetRef, 

1152 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1153 """Return URIs associated with dataset. 

1154 

1155 Parameters 

1156 ---------- 

1157 ref : `DatasetRef` 

1158 Reference to the required dataset. 

1159 predict : `bool`, optional 

1160 If the datastore does not know about the dataset, should it 

1161 return a predicted URI or not? 

1162 

1163 Returns 

1164 ------- 

1165 primary : `ButlerURI` 

1166 The URI to the primary artifact associated with this dataset. 

1167 If the dataset was disassembled within the datastore this 

1168 may be `None`. 

1169 components : `dict` 

1170 URIs to any components associated with the dataset artifact. 

1171 Can be empty if there are no components. 

1172 """ 

1173 

1174 primary: Optional[ButlerURI] = None 

1175 components: Dict[str, ButlerURI] = {} 

1176 

1177 # if this has never been written then we have to guess 

1178 if not self.exists(ref): 

1179 if not predict: 

1180 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1181 

1182 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1183 

1184 if doDisassembly: 

1185 

1186 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1187 compRef = ref.makeComponentRef(component) 

1188 compLocation, _ = self._determine_put_formatter_location(compRef) 

1189 

1190 # Add a URI fragment to indicate this is a guess 

1191 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1192 

1193 else: 

1194 

1195 location, _ = self._determine_put_formatter_location(ref) 

1196 

1197 # Add a URI fragment to indicate this is a guess 

1198 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1199 

1200 return primary, components 

1201 

1202 # If this is a ref that we have written we can get the path. 

1203 # Get file metadata and internal metadata 

1204 fileLocations = self._get_dataset_locations_info(ref) 

1205 

1206 guessing = False 

1207 if not fileLocations: 

1208 if not self.trustGetRequest: 1208 ↛ 1209line 1208 didn't jump to line 1209, because the condition on line 1208 was never true

1209 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1210 fileLocations = self._get_expected_dataset_locations_info(ref) 

1211 guessing = True 

1212 

1213 if len(fileLocations) == 1: 

1214 # No disassembly so this is the primary URI 

1215 uri = fileLocations[0][0].uri 

1216 if guessing and not uri.exists(): 1216 ↛ 1217line 1216 didn't jump to line 1217, because the condition on line 1216 was never true

1217 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1218 primary = uri 

1219 

1220 else: 

1221 for location, storedFileInfo in fileLocations: 

1222 if storedFileInfo.component is None: 1222 ↛ 1223line 1222 didn't jump to line 1223, because the condition on line 1222 was never true

1223 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1224 uri = location.uri 

1225 if guessing and not uri.exists(): 1225 ↛ 1226line 1225 didn't jump to line 1226, because the condition on line 1225 was never true

1226 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1227 components[storedFileInfo.component] = uri 

1228 

1229 return primary, components 

1230 

1231 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1232 """URI to the Dataset. 

1233 

1234 Parameters 

1235 ---------- 

1236 ref : `DatasetRef` 

1237 Reference to the required Dataset. 

1238 predict : `bool` 

1239 If `True`, allow URIs to be returned of datasets that have not 

1240 been written. 

1241 

1242 Returns 

1243 ------- 

1244 uri : `str` 

1245 URI pointing to the dataset within the datastore. If the 

1246 dataset does not exist in the datastore, and if ``predict`` is 

1247 `True`, the URI will be a prediction and will include a URI 

1248 fragment "#predicted". 

1249 If the datastore does not have entities that relate well 

1250 to the concept of a URI the returned URI will be 

1251 descriptive. The returned URI is not guaranteed to be obtainable. 

1252 

1253 Raises 

1254 ------ 

1255 FileNotFoundError 

1256 Raised if a URI has been requested for a dataset that does not 

1257 exist and guessing is not allowed. 

1258 RuntimeError 

1259 Raised if a request is made for a single URI but multiple URIs 

1260 are associated with this dataset. 

1261 

1262 Notes 

1263 ----- 

1264 When a predicted URI is requested an attempt will be made to form 

1265 a reasonable URI based on file templates and the expected formatter. 

1266 """ 

1267 primary, components = self.getURIs(ref, predict) 

1268 if primary is None or components: 1268 ↛ 1269line 1268 didn't jump to line 1269, because the condition on line 1268 was never true

1269 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

1270 "Use Dataastore.getURIs() instead.") 

1271 return primary 

1272 

1273 def retrieveArtifacts(self, refs: Iterable[DatasetRef], 

1274 destination: ButlerURI, transfer: str = "auto", 

1275 preserve_path: bool = True, 

1276 overwrite: bool = False) -> List[ButlerURI]: 

1277 """Retrieve the file artifacts associated with the supplied refs. 

1278 

1279 Parameters 

1280 ---------- 

1281 refs : iterable of `DatasetRef` 

1282 The datasets for which file artifacts are to be retrieved. 

1283 A single ref can result in multiple files. The refs must 

1284 be resolved. 

1285 destination : `ButlerURI` 

1286 Location to write the file artifacts. 

1287 transfer : `str`, optional 

1288 Method to use to transfer the artifacts. Must be one of the options 

1289 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

1290 preserve_path : `bool`, optional 

1291 If `True` the full path of the file artifact within the datastore 

1292 is preserved. If `False` the final file component of the path 

1293 is used. 

1294 overwrite : `bool`, optional 

1295 If `True` allow transfers to overwrite existing files at the 

1296 destination. 

1297 

1298 Returns 

1299 ------- 

1300 targets : `list` of `ButlerURI` 

1301 URIs of file artifacts in destination location. Order is not 

1302 preserved. 

1303 """ 

1304 if not destination.isdir(): 1304 ↛ 1305line 1304 didn't jump to line 1305, because the condition on line 1304 was never true

1305 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1306 

1307 if transfer == "move": 

1308 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1309 

1310 # Source -> Destination 

1311 # This also helps filter out duplicate DatasetRef in the request 

1312 # that will map to the same underlying file transfer. 

1313 to_transfer: Dict[ButlerURI, ButlerURI] = {} 

1314 

1315 for ref in refs: 

1316 locations = self._get_dataset_locations_info(ref) 

1317 for location, _ in locations: 

1318 source_uri = location.uri 

1319 target_path: Union[str, ButlerURI] 

1320 if preserve_path: 

1321 target_path = location.pathInStore 

1322 if target_path.isabs(): 1322 ↛ 1325line 1322 didn't jump to line 1325, because the condition on line 1322 was never true

1323 # This is an absolute path to an external file. 

1324 # Use the full path. 

1325 target_path = target_path.relativeToPathRoot 

1326 else: 

1327 target_path = source_uri.basename() 

1328 target_uri = destination.join(target_path) 

1329 to_transfer[source_uri] = target_uri 

1330 

1331 # In theory can now parallelize the transfer 

1332 log.debug("Number of artifacts to transfer to %s: %d", 

1333 str(destination), len(to_transfer)) 

1334 for source_uri, target_uri in to_transfer.items(): 

1335 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1336 

1337 return list(to_transfer.values()) 

1338 

1339 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1340 """Load an InMemoryDataset from the store. 

1341 

1342 Parameters 

1343 ---------- 

1344 ref : `DatasetRef` 

1345 Reference to the required Dataset. 

1346 parameters : `dict` 

1347 `StorageClass`-specific parameters that specify, for example, 

1348 a slice of the dataset to be loaded. 

1349 

1350 Returns 

1351 ------- 

1352 inMemoryDataset : `object` 

1353 Requested dataset or slice thereof as an InMemoryDataset. 

1354 

1355 Raises 

1356 ------ 

1357 FileNotFoundError 

1358 Requested dataset can not be retrieved. 

1359 TypeError 

1360 Return value from formatter has unexpected type. 

1361 ValueError 

1362 Formatter failed to process the dataset. 

1363 """ 

1364 allGetInfo = self._prepare_for_get(ref, parameters) 

1365 refComponent = ref.datasetType.component() 

1366 

1367 # Supplied storage class for the component being read 

1368 refStorageClass = ref.datasetType.storageClass 

1369 

1370 # Create mapping from component name to related info 

1371 allComponents = {i.component: i for i in allGetInfo} 

1372 

1373 # By definition the dataset is disassembled if we have more 

1374 # than one record for it. 

1375 isDisassembled = len(allGetInfo) > 1 

1376 

1377 # Look for the special case where we are disassembled but the 

1378 # component is a derived component that was not written during 

1379 # disassembly. For this scenario we need to check that the 

1380 # component requested is listed as a derived component for the 

1381 # composite storage class 

1382 isDisassembledReadOnlyComponent = False 

1383 if isDisassembled and refComponent: 

1384 # The composite storage class should be accessible through 

1385 # the component dataset type 

1386 compositeStorageClass = ref.datasetType.parentStorageClass 

1387 

1388 # In the unlikely scenario where the composite storage 

1389 # class is not known, we can only assume that this is a 

1390 # normal component. If that assumption is wrong then the 

1391 # branch below that reads a persisted component will fail 

1392 # so there is no need to complain here. 

1393 if compositeStorageClass is not None: 1393 ↛ 1396line 1393 didn't jump to line 1396, because the condition on line 1393 was never false

1394 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1395 

1396 if isDisassembled and not refComponent: 

1397 # This was a disassembled dataset spread over multiple files 

1398 # and we need to put them all back together again. 

1399 # Read into memory and then assemble 

1400 

1401 # Check that the supplied parameters are suitable for the type read 

1402 refStorageClass.validateParameters(parameters) 

1403 

1404 # We want to keep track of all the parameters that were not used 

1405 # by formatters. We assume that if any of the component formatters 

1406 # use a parameter that we do not need to apply it again in the 

1407 # assembler. 

1408 usedParams = set() 

1409 

1410 components: Dict[str, Any] = {} 

1411 for getInfo in allGetInfo: 

1412 # assemblerParams are parameters not understood by the 

1413 # associated formatter. 

1414 usedParams.update(set(getInfo.formatterParams)) 

1415 

1416 component = getInfo.component 

1417 

1418 if component is None: 1418 ↛ 1419line 1418 didn't jump to line 1419, because the condition on line 1418 was never true

1419 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1420 

1421 # We do not want the formatter to think it's reading 

1422 # a component though because it is really reading a 

1423 # standalone dataset -- always tell reader it is not a 

1424 # component. 

1425 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1426 

1427 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1428 

1429 # Any unused parameters will have to be passed to the assembler 

1430 if parameters: 

1431 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1432 else: 

1433 unusedParams = {} 

1434 

1435 # Process parameters 

1436 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1437 parameters=unusedParams) 

1438 

1439 elif isDisassembledReadOnlyComponent: 

1440 

1441 compositeStorageClass = ref.datasetType.parentStorageClass 

1442 if compositeStorageClass is None: 1442 ↛ 1443line 1442 didn't jump to line 1443, because the condition on line 1442 was never true

1443 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since" 

1444 "no composite storage class is available.") 

1445 

1446 if refComponent is None: 1446 ↛ 1448line 1446 didn't jump to line 1448, because the condition on line 1446 was never true

1447 # Mainly for mypy 

1448 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1449 

1450 # Assume that every derived component can be calculated by 

1451 # forwarding the request to a single read/write component. 

1452 # Rather than guessing which rw component is the right one by 

1453 # scanning each for a derived component of the same name, 

1454 # we ask the storage class delegate directly which one is best to 

1455 # use. 

1456 compositeDelegate = compositeStorageClass.delegate() 

1457 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1458 set(allComponents)) 

1459 

1460 # Select the relevant component 

1461 rwInfo = allComponents[forwardedComponent] 

1462 

1463 # For now assume that read parameters are validated against 

1464 # the real component and not the requested component 

1465 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1466 forwardedStorageClass.validateParameters(parameters) 

1467 

1468 # Unfortunately the FileDescriptor inside the formatter will have 

1469 # the wrong write storage class so we need to create a new one 

1470 # given the immutability constraint. 

1471 writeStorageClass = rwInfo.info.storageClass 

1472 

1473 # We may need to put some thought into parameters for read 

1474 # components but for now forward them on as is 

1475 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1476 readStorageClass=refStorageClass, 

1477 storageClass=writeStorageClass, 

1478 parameters=parameters), 

1479 ref.dataId) 

1480 

1481 # The assembler can not receive any parameter requests for a 

1482 # derived component at this time since the assembler will 

1483 # see the storage class of the derived component and those 

1484 # parameters will have to be handled by the formatter on the 

1485 # forwarded storage class. 

1486 assemblerParams: Dict[str, Any] = {} 

1487 

1488 # Need to created a new info that specifies the derived 

1489 # component and associated storage class 

1490 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1491 rwInfo.info, assemblerParams, {}, 

1492 refComponent, refStorageClass) 

1493 

1494 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1495 

1496 else: 

1497 # Single file request or component from that composite file 

1498 for lookup in (refComponent, None): 1498 ↛ 1503line 1498 didn't jump to line 1503, because the loop on line 1498 didn't complete

1499 if lookup in allComponents: 1499 ↛ 1498line 1499 didn't jump to line 1498, because the condition on line 1499 was never false

1500 getInfo = allComponents[lookup] 

1501 break 

1502 else: 

1503 raise FileNotFoundError(f"Component {refComponent} not found " 

1504 f"for ref {ref} in datastore {self.name}") 

1505 

1506 # Do not need the component itself if already disassembled 

1507 if isDisassembled: 

1508 isComponent = False 

1509 else: 

1510 isComponent = getInfo.component is not None 

1511 

1512 # For a disassembled component we can validate parametersagainst 

1513 # the component storage class directly 

1514 if isDisassembled: 

1515 refStorageClass.validateParameters(parameters) 

1516 else: 

1517 # For an assembled composite this could be a derived 

1518 # component derived from a real component. The validity 

1519 # of the parameters is not clear. For now validate against 

1520 # the composite storage class 

1521 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1522 

1523 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 

1524 

1525 @transactional 

1526 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1527 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1528 

1529 Parameters 

1530 ---------- 

1531 inMemoryDataset : `object` 

1532 The dataset to store. 

1533 ref : `DatasetRef` 

1534 Reference to the associated Dataset. 

1535 

1536 Raises 

1537 ------ 

1538 TypeError 

1539 Supplied object and storage class are inconsistent. 

1540 DatasetTypeNotSupportedError 

1541 The associated `DatasetType` is not handled by this datastore. 

1542 

1543 Notes 

1544 ----- 

1545 If the datastore is configured to reject certain dataset types it 

1546 is possible that the put will fail and raise a 

1547 `DatasetTypeNotSupportedError`. The main use case for this is to 

1548 allow `ChainedDatastore` to put to multiple datastores without 

1549 requiring that every datastore accepts the dataset. 

1550 """ 

1551 

1552 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1553 # doDisassembly = True 

1554 

1555 artifacts = [] 

1556 if doDisassembly: 

1557 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1558 for component, componentInfo in components.items(): 

1559 # Don't recurse because we want to take advantage of 

1560 # bulk insert -- need a new DatasetRef that refers to the 

1561 # same dataset_id but has the component DatasetType 

1562 # DatasetType does not refer to the types of components 

1563 # So we construct one ourselves. 

1564 compRef = ref.makeComponentRef(component) 

1565 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1566 artifacts.append((compRef, storedInfo)) 

1567 else: 

1568 # Write the entire thing out 

1569 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1570 artifacts.append((ref, storedInfo)) 

1571 

1572 self._register_datasets(artifacts) 

1573 

1574 @transactional 

1575 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

1576 # Get file metadata and internal metadata 

1577 if not isinstance(ref, DatasetRef): 

1578 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

1579 # Assumed to be an iterable of refs so bulk mode enabled. 

1580 try: 

1581 self.bridge.moveToTrash(ref) 

1582 except Exception as e: 

1583 if ignore_errors: 

1584 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

1585 else: 

1586 raise 

1587 return 

1588 

1589 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

1590 

1591 fileLocations = self._get_dataset_locations_info(ref) 

1592 

1593 if not fileLocations: 

1594 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1595 if ignore_errors: 1595 ↛ 1596line 1595 didn't jump to line 1596, because the condition on line 1595 was never true

1596 log.warning(err_msg) 

1597 return 

1598 else: 

1599 raise FileNotFoundError(err_msg) 

1600 

1601 for location, storedFileInfo in fileLocations: 

1602 if not self._artifact_exists(location): 1602 ↛ 1603line 1602 didn't jump to line 1603, because the condition on line 1602 was never true

1603 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1604 f"associated artifact ({location.uri}) is missing" 

1605 if ignore_errors: 

1606 log.warning(err_msg) 

1607 return 

1608 else: 

1609 raise FileNotFoundError(err_msg) 

1610 

1611 # Mark dataset as trashed 

1612 try: 

1613 self.bridge.moveToTrash([ref]) 

1614 except Exception as e: 

1615 if ignore_errors: 

1616 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1617 f"but encountered an error: {e}") 

1618 pass 

1619 else: 

1620 raise 

1621 

1622 @transactional 

1623 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1624 """Remove all datasets from the trash. 

1625 

1626 Parameters 

1627 ---------- 

1628 ignore_errors : `bool` 

1629 If `True` return without error even if something went wrong. 

1630 Problems could occur if another process is simultaneously trying 

1631 to delete. 

1632 """ 

1633 log.debug("Emptying trash in datastore %s", self.name) 

1634 

1635 # Context manager will empty trash iff we finish it without raising. 

1636 # It will also automatically delete the relevant rows from the 

1637 # trash table and the records table. 

1638 with self.bridge.emptyTrash(self._table, record_class=StoredFileInfo, 

1639 record_column="path") as trash_data: 

1640 # Removing the artifacts themselves requires that the files are 

1641 # not also associated with refs that are not to be trashed. 

1642 # Therefore need to do a query with the file paths themselves 

1643 # and return all the refs associated with them. Can only delete 

1644 # a file if the refs to be trashed are the only refs associated 

1645 # with the file. 

1646 # This requires multiple copies of the trashed items 

1647 trashed, artifacts_to_keep = trash_data 

1648 

1649 if artifacts_to_keep is None: 

1650 # The bridge is not helping us so have to work it out 

1651 # ourselves. This is not going to be as efficient. 

1652 trashed = list(trashed) 

1653 

1654 # The instance check is for mypy since up to this point it 

1655 # does not know the type of info. 

1656 path_map = self._refs_associated_with_artifacts([info.path for _, info in trashed 

1657 if isinstance(info, StoredFileInfo)]) 

1658 

1659 for ref, info in trashed: 

1660 

1661 # Mypy needs to know this is not the base class 

1662 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

1663 

1664 # Check for mypy 

1665 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

1666 

1667 path_map[info.path].remove(ref.id) 

1668 if not path_map[info.path]: 1668 ↛ 1659line 1668 didn't jump to line 1659, because the condition on line 1668 was never false

1669 del path_map[info.path] 

1670 

1671 artifacts_to_keep = set(path_map) 

1672 

1673 for ref, info in trashed: 

1674 

1675 # Should not happen for this implementation but need 

1676 # to keep mypy happy. 

1677 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

1678 

1679 # Mypy needs to know this is not the base class 

1680 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

1681 

1682 # Check for mypy 

1683 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

1684 

1685 if info.path in artifacts_to_keep: 

1686 # This is a multi-dataset artifact and we are not 

1687 # removing all associated refs. 

1688 continue 

1689 

1690 # Only trashed refs still known to datastore will be returned. 

1691 location = info.file_location(self.locationFactory) 

1692 

1693 # Point of no return for this artifact 

1694 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1695 try: 

1696 self._delete_artifact(location) 

1697 except FileNotFoundError: 

1698 # If the file itself has been deleted there is nothing 

1699 # we can do about it. It is possible that trash has 

1700 # been run in parallel in another process or someone 

1701 # decided to delete the file. It is unlikely to come 

1702 # back and so we should still continue with the removal 

1703 # of the entry from the trash table. It is also possible 

1704 # we removed it in a previous iteration if it was 

1705 # a multi-dataset artifact. The delete artifact method 

1706 # will log a debug message in this scenario. 

1707 # Distinguishing file missing before trash started and 

1708 # file already removed previously as part of this trash 

1709 # is not worth the distinction with regards to potential 

1710 # memory cost. 

1711 pass 

1712 except Exception as e: 

1713 if ignore_errors: 

1714 # Use a debug message here even though it's not 

1715 # a good situation. In some cases this can be 

1716 # caused by a race between user A and user B 

1717 # and neither of them has permissions for the 

1718 # other's files. Butler does not know about users 

1719 # and trash has no idea what collections these 

1720 # files were in (without guessing from a path). 

1721 log.debug("Encountered error removing artifact %s from datastore %s: %s", 

1722 location.uri, self.name, e) 

1723 else: 

1724 raise 

1725 

1726 @transactional 

1727 def forget(self, refs: Iterable[DatasetRef]) -> None: 

1728 # Docstring inherited. 

1729 refs = list(refs) 

1730 self.bridge.forget(refs) 

1731 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

1732 

1733 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1734 logFailures: bool = False) -> None: 

1735 """Validate some of the configuration for this datastore. 

1736 

1737 Parameters 

1738 ---------- 

1739 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1740 Entities to test against this configuration. Can be differing 

1741 types. 

1742 logFailures : `bool`, optional 

1743 If `True`, output a log message for every validation error 

1744 detected. 

1745 

1746 Raises 

1747 ------ 

1748 DatastoreValidationError 

1749 Raised if there is a validation problem with a configuration. 

1750 All the problems are reported in a single exception. 

1751 

1752 Notes 

1753 ----- 

1754 This method checks that all the supplied entities have valid file 

1755 templates and also have formatters defined. 

1756 """ 

1757 

1758 templateFailed = None 

1759 try: 

1760 self.templates.validateTemplates(entities, logFailures=logFailures) 

1761 except FileTemplateValidationError as e: 

1762 templateFailed = str(e) 

1763 

1764 formatterFailed = [] 

1765 for entity in entities: 

1766 try: 

1767 self.formatterFactory.getFormatterClass(entity) 

1768 except KeyError as e: 

1769 formatterFailed.append(str(e)) 

1770 if logFailures: 1770 ↛ 1765line 1770 didn't jump to line 1765, because the condition on line 1770 was never false

1771 log.critical("Formatter failure: %s", e) 

1772 

1773 if templateFailed or formatterFailed: 

1774 messages = [] 

1775 if templateFailed: 1775 ↛ 1776line 1775 didn't jump to line 1776, because the condition on line 1775 was never true

1776 messages.append(templateFailed) 

1777 if formatterFailed: 1777 ↛ 1779line 1777 didn't jump to line 1779, because the condition on line 1777 was never false

1778 messages.append(",".join(formatterFailed)) 

1779 msg = ";\n".join(messages) 

1780 raise DatastoreValidationError(msg) 

1781 

1782 def getLookupKeys(self) -> Set[LookupKey]: 

1783 # Docstring is inherited from base class 

1784 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1785 self.constraints.getLookupKeys() 

1786 

1787 def validateKey(self, lookupKey: LookupKey, 

1788 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1789 # Docstring is inherited from base class 

1790 # The key can be valid in either formatters or templates so we can 

1791 # only check the template if it exists 

1792 if lookupKey in self.templates: 

1793 try: 

1794 self.templates[lookupKey].validateTemplate(entity) 

1795 except FileTemplateValidationError as e: 

1796 raise DatastoreValidationError(e) from e 

1797 

1798 def export(self, refs: Iterable[DatasetRef], *, 

1799 directory: Optional[Union[ButlerURI, str]] = None, 

1800 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

1801 # Docstring inherited from Datastore.export. 

1802 if transfer is not None and directory is None: 1802 ↛ 1803line 1802 didn't jump to line 1803, because the condition on line 1802 was never true

1803 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

1804 "export directory given") 

1805 

1806 # Force the directory to be a URI object 

1807 directoryUri: Optional[ButlerURI] = None 

1808 if directory is not None: 1808 ↛ 1811line 1808 didn't jump to line 1811, because the condition on line 1808 was never false

1809 directoryUri = ButlerURI(directory, forceDirectory=True) 

1810 

1811 if transfer is not None and directoryUri is not None: 1811 ↛ 1816line 1811 didn't jump to line 1816, because the condition on line 1811 was never false

1812 # mypy needs the second test 

1813 if not directoryUri.exists(): 1813 ↛ 1814line 1813 didn't jump to line 1814, because the condition on line 1813 was never true

1814 raise FileNotFoundError(f"Export location {directory} does not exist") 

1815 

1816 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

1817 for ref in progress.wrap(refs, "Exporting dataset files"): 

1818 fileLocations = self._get_dataset_locations_info(ref) 

1819 if not fileLocations: 1819 ↛ 1820line 1819 didn't jump to line 1820, because the condition on line 1819 was never true

1820 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

1821 # For now we can not export disassembled datasets 

1822 if len(fileLocations) > 1: 1822 ↛ 1823line 1822 didn't jump to line 1823, because the condition on line 1822 was never true

1823 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

1824 location, storedFileInfo = fileLocations[0] 

1825 

1826 pathInStore = location.pathInStore.path 

1827 if transfer is None: 1827 ↛ 1830line 1827 didn't jump to line 1830, because the condition on line 1827 was never true

1828 # TODO: do we also need to return the readStorageClass somehow? 

1829 # We will use the path in store directly 

1830 pass 

1831 elif transfer == "direct": 1831 ↛ 1833line 1831 didn't jump to line 1833, because the condition on line 1831 was never true

1832 # Use full URIs to the remote store in the export 

1833 pathInStore = str(location.uri) 

1834 else: 

1835 # mypy needs help 

1836 assert directoryUri is not None, "directoryUri must be defined to get here" 

1837 storeUri = ButlerURI(location.uri) 

1838 

1839 # if the datastore has an absolute URI to a resource, we 

1840 # have two options: 

1841 # 1. Keep the absolute URI in the exported YAML 

1842 # 2. Allocate a new name in the local datastore and transfer 

1843 # it. 

1844 # For now go with option 2 

1845 if location.pathInStore.isabs(): 1845 ↛ 1846line 1845 didn't jump to line 1846, because the condition on line 1845 was never true

1846 template = self.templates.getTemplate(ref) 

1847 newURI = ButlerURI(template.format(ref), forceAbsolute=False) 

1848 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

1849 

1850 exportUri = directoryUri.join(pathInStore) 

1851 exportUri.transfer_from(storeUri, transfer=transfer) 

1852 

1853 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

1854 

1855 @staticmethod 

1856 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

1857 """Compute the checksum of the supplied file. 

1858 

1859 Parameters 

1860 ---------- 

1861 uri : `ButlerURI` 

1862 Name of resource to calculate checksum from. 

1863 algorithm : `str`, optional 

1864 Name of algorithm to use. Must be one of the algorithms supported 

1865 by :py:class`hashlib`. 

1866 block_size : `int` 

1867 Number of bytes to read from file at one time. 

1868 

1869 Returns 

1870 ------- 

1871 hexdigest : `str` 

1872 Hex digest of the file. 

1873 

1874 Notes 

1875 ----- 

1876 Currently returns None if the URI is for a remote resource. 

1877 """ 

1878 if algorithm not in hashlib.algorithms_guaranteed: 1878 ↛ 1879line 1878 didn't jump to line 1879, because the condition on line 1878 was never true

1879 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

1880 

1881 if not uri.isLocal: 1881 ↛ 1882line 1881 didn't jump to line 1882, because the condition on line 1881 was never true

1882 return None 

1883 

1884 hasher = hashlib.new(algorithm) 

1885 

1886 with uri.as_local() as local_uri: 

1887 with open(local_uri.ospath, "rb") as f: 

1888 for chunk in iter(lambda: f.read(block_size), b""): 

1889 hasher.update(chunk) 

1890 

1891 return hasher.hexdigest() 

1892 

1893 def needs_expanded_data_ids( 

1894 self, 

1895 transfer: Optional[str], 

1896 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

1897 ) -> bool: 

1898 # Docstring inherited. 

1899 # This _could_ also use entity to inspect whether the filename template 

1900 # involves placeholders other than the required dimensions for its 

1901 # dataset type, but that's not necessary for correctness; it just 

1902 # enables more optimizations (perhaps only in theory). 

1903 return transfer not in ("direct", None)