Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore", ) 

26 

27import hashlib 

28import logging 

29import os 

30import tempfile 

31 

32from sqlalchemy import BigInteger, String 

33 

34from dataclasses import dataclass 

35from typing import ( 

36 TYPE_CHECKING, 

37 Any, 

38 ClassVar, 

39 Dict, 

40 Iterable, 

41 List, 

42 Mapping, 

43 Optional, 

44 Set, 

45 Tuple, 

46 Type, 

47 Union, 

48) 

49 

50from lsst.daf.butler import ( 

51 ButlerURI, 

52 CompositesMap, 

53 Config, 

54 FileDataset, 

55 DatasetId, 

56 DatasetRef, 

57 DatasetType, 

58 DatasetTypeNotSupportedError, 

59 Datastore, 

60 DatastoreCacheManager, 

61 DatastoreDisabledCacheManager, 

62 DatastoreConfig, 

63 DatastoreValidationError, 

64 FileDescriptor, 

65 FileTemplates, 

66 FileTemplateValidationError, 

67 Formatter, 

68 FormatterFactory, 

69 Location, 

70 LocationFactory, 

71 Progress, 

72 StorageClass, 

73 StoredFileInfo, 

74) 

75 

76from lsst.daf.butler import ddl 

77from lsst.daf.butler.registry.interfaces import ( 

78 ReadOnlyDatabaseError, 

79 DatastoreRegistryBridge, 

80) 

81 

82from lsst.daf.butler.core.repoRelocation import replaceRoot 

83from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

84from .genericDatastore import GenericBaseDatastore 

85 

86if TYPE_CHECKING: 86 ↛ 87line 86 didn't jump to line 87, because the condition on line 86 was never true

87 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager 

88 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

89 

90log = logging.getLogger(__name__) 

91 

92# String to use when a Python None is encountered 

93NULLSTR = "__NULL_STRING__" 

94 

95 

96class _IngestPrepData(Datastore.IngestPrepData): 

97 """Helper class for FileDatastore ingest implementation. 

98 

99 Parameters 

100 ---------- 

101 datasets : `list` of `FileDataset` 

102 Files to be ingested by this datastore. 

103 """ 

104 def __init__(self, datasets: List[FileDataset]): 

105 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

106 self.datasets = datasets 

107 

108 

109@dataclass(frozen=True) 

110class DatastoreFileGetInformation: 

111 """Collection of useful parameters needed to retrieve a file from 

112 a Datastore. 

113 """ 

114 

115 location: Location 

116 """The location from which to read the dataset.""" 

117 

118 formatter: Formatter 

119 """The `Formatter` to use to deserialize the dataset.""" 

120 

121 info: StoredFileInfo 

122 """Stored information about this file and its formatter.""" 

123 

124 assemblerParams: Dict[str, Any] 

125 """Parameters to use for post-processing the retrieved dataset.""" 

126 

127 formatterParams: Dict[str, Any] 

128 """Parameters that were understood by the associated formatter.""" 

129 

130 component: Optional[str] 

131 """The component to be retrieved (can be `None`).""" 

132 

133 readStorageClass: StorageClass 

134 """The `StorageClass` of the dataset being read.""" 

135 

136 

137class FileDatastore(GenericBaseDatastore): 

138 """Generic Datastore for file-based implementations. 

139 

140 Should always be sub-classed since key abstract methods are missing. 

141 

142 Parameters 

143 ---------- 

144 config : `DatastoreConfig` or `str` 

145 Configuration as either a `Config` object or URI to file. 

146 bridgeManager : `DatastoreRegistryBridgeManager` 

147 Object that manages the interface between `Registry` and datastores. 

148 butlerRoot : `str`, optional 

149 New datastore root to use to override the configuration value. 

150 

151 Raises 

152 ------ 

153 ValueError 

154 If root location does not exist and ``create`` is `False` in the 

155 configuration. 

156 """ 

157 

158 defaultConfigFile: ClassVar[Optional[str]] = None 

159 """Path to configuration defaults. Accessed within the ``config`` resource 

160 or relative to a search path. Can be None if no defaults specified. 

161 """ 

162 

163 root: ButlerURI 

164 """Root directory URI of this `Datastore`.""" 

165 

166 locationFactory: LocationFactory 

167 """Factory for creating locations relative to the datastore root.""" 

168 

169 formatterFactory: FormatterFactory 

170 """Factory for creating instances of formatters.""" 

171 

172 templates: FileTemplates 

173 """File templates that can be used by this `Datastore`.""" 

174 

175 composites: CompositesMap 

176 """Determines whether a dataset should be disassembled on put.""" 

177 

178 defaultConfigFile = "datastores/fileDatastore.yaml" 

179 """Path to configuration defaults. Accessed within the ``config`` resource 

180 or relative to a search path. Can be None if no defaults specified. 

181 """ 

182 

183 @classmethod 

184 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

185 """Set any filesystem-dependent config options for this Datastore to 

186 be appropriate for a new empty repository with the given root. 

187 

188 Parameters 

189 ---------- 

190 root : `str` 

191 URI to the root of the data repository. 

192 config : `Config` 

193 A `Config` to update. Only the subset understood by 

194 this component will be updated. Will not expand 

195 defaults. 

196 full : `Config` 

197 A complete config with all defaults expanded that can be 

198 converted to a `DatastoreConfig`. Read-only and will not be 

199 modified by this method. 

200 Repository-specific options that should not be obtained 

201 from defaults when Butler instances are constructed 

202 should be copied from ``full`` to ``config``. 

203 overwrite : `bool`, optional 

204 If `False`, do not modify a value in ``config`` if the value 

205 already exists. Default is always to overwrite with the provided 

206 ``root``. 

207 

208 Notes 

209 ----- 

210 If a keyword is explicitly defined in the supplied ``config`` it 

211 will not be overridden by this method if ``overwrite`` is `False`. 

212 This allows explicit values set in external configs to be retained. 

213 """ 

214 Config.updateParameters(DatastoreConfig, config, full, 

215 toUpdate={"root": root}, 

216 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

217 

218 @classmethod 

219 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

220 return ddl.TableSpec( 

221 fields=[ 

222 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

223 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

224 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

225 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

226 # Use empty string to indicate no component 

227 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

228 # TODO: should checksum be Base64Bytes instead? 

229 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

230 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

231 ], 

232 unique=frozenset(), 

233 ) 

234 

235 def __init__(self, config: Union[DatastoreConfig, str], 

236 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

237 super().__init__(config, bridgeManager) 

238 if "root" not in self.config: 238 ↛ 239line 238 didn't jump to line 239, because the condition on line 238 was never true

239 raise ValueError("No root directory specified in configuration") 

240 

241 # Name ourselves either using an explicit name or a name 

242 # derived from the (unexpanded) root 

243 if "name" in self.config: 

244 self.name = self.config["name"] 

245 else: 

246 # We use the unexpanded root in the name to indicate that this 

247 # datastore can be moved without having to update registry. 

248 self.name = "{}@{}".format(type(self).__name__, 

249 self.config["root"]) 

250 

251 # Support repository relocation in config 

252 # Existence of self.root is checked in subclass 

253 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

254 forceDirectory=True, forceAbsolute=True) 

255 

256 self.locationFactory = LocationFactory(self.root) 

257 self.formatterFactory = FormatterFactory() 

258 

259 # Now associate formatters with storage classes 

260 self.formatterFactory.registerFormatters(self.config["formatters"], 

261 universe=bridgeManager.universe) 

262 

263 # Read the file naming templates 

264 self.templates = FileTemplates(self.config["templates"], 

265 universe=bridgeManager.universe) 

266 

267 # See if composites should be disassembled 

268 self.composites = CompositesMap(self.config["composites"], 

269 universe=bridgeManager.universe) 

270 

271 tableName = self.config["records", "table"] 

272 try: 

273 # Storage of paths and formatters, keyed by dataset_id 

274 self._table = bridgeManager.opaque.register( 

275 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)) 

276 # Interface to Registry. 

277 self._bridge = bridgeManager.register(self.name) 

278 except ReadOnlyDatabaseError: 

279 # If the database is read only and we just tried and failed to 

280 # create a table, it means someone is trying to create a read-only 

281 # butler client for an empty repo. That should be okay, as long 

282 # as they then try to get any datasets before some other client 

283 # creates the table. Chances are they'rejust validating 

284 # configuration. 

285 pass 

286 

287 # Determine whether checksums should be used - default to False 

288 self.useChecksum = self.config.get("checksum", False) 

289 

290 # Determine whether we can fall back to configuration if a 

291 # requested dataset is not known to registry 

292 self.trustGetRequest = self.config.get("trust_get_request", False) 

293 

294 # Create a cache manager 

295 self.cacheManager: AbstractDatastoreCacheManager 

296 if "cached" in self.config: 296 ↛ 300line 296 didn't jump to line 300, because the condition on line 296 was never false

297 self.cacheManager = DatastoreCacheManager(self.config["cached"], 

298 universe=bridgeManager.universe) 

299 else: 

300 self.cacheManager = DatastoreDisabledCacheManager("", 

301 universe=bridgeManager.universe) 

302 

303 # Check existence and create directory structure if necessary 

304 if not self.root.exists(): 

305 if "create" not in self.config or not self.config["create"]: 305 ↛ 306line 305 didn't jump to line 306, because the condition on line 305 was never true

306 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

307 try: 

308 self.root.mkdir() 

309 except Exception as e: 

310 raise ValueError(f"Can not create datastore root '{self.root}', check permissions." 

311 f" Got error: {e}") from e 

312 

313 def __str__(self) -> str: 

314 return str(self.root) 

315 

316 @property 

317 def bridge(self) -> DatastoreRegistryBridge: 

318 return self._bridge 

319 

320 def _artifact_exists(self, location: Location) -> bool: 

321 """Check that an artifact exists in this datastore at the specified 

322 location. 

323 

324 Parameters 

325 ---------- 

326 location : `Location` 

327 Expected location of the artifact associated with this datastore. 

328 

329 Returns 

330 ------- 

331 exists : `bool` 

332 True if the location can be found, false otherwise. 

333 """ 

334 log.debug("Checking if resource exists: %s", location.uri) 

335 return location.uri.exists() 

336 

337 def _delete_artifact(self, location: Location) -> None: 

338 """Delete the artifact from the datastore. 

339 

340 Parameters 

341 ---------- 

342 location : `Location` 

343 Location of the artifact associated with this datastore. 

344 """ 

345 if location.pathInStore.isabs(): 345 ↛ 346line 345 didn't jump to line 346, because the condition on line 345 was never true

346 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

347 log.debug("Deleting file: %s", location.uri) 

348 location.uri.remove() 

349 log.debug("Successfully deleted file: %s", location.uri) 

350 

351 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

352 # Docstring inherited from GenericBaseDatastore 

353 records = [] 

354 for ref, info in zip(refs, infos): 

355 # Component should come from ref and fall back on info 

356 component = ref.datasetType.component() 

357 if component is None and info.component is not None: 357 ↛ 358line 357 didn't jump to line 358, because the condition on line 357 was never true

358 component = info.component 

359 if component is None: 

360 # Use empty string since we want this to be part of the 

361 # primary key. 

362 component = NULLSTR 

363 records.append( 

364 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path, 

365 storage_class=info.storageClass.name, component=component, 

366 checksum=info.checksum, file_size=info.file_size) 

367 ) 

368 self._table.insert(*records) 

369 

370 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

371 # Docstring inherited from GenericBaseDatastore 

372 

373 # Look for the dataset_id -- there might be multiple matches 

374 # if we have disassembled the dataset. 

375 records = list(self._table.fetch(dataset_id=ref.id)) 

376 

377 results = [] 

378 for record in records: 

379 # Convert name of StorageClass to instance 

380 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

381 component = record["component"] if (record["component"] 

382 and record["component"] != NULLSTR) else None 

383 

384 info = StoredFileInfo(formatter=record["formatter"], 

385 path=record["path"], 

386 storageClass=storageClass, 

387 component=component, 

388 checksum=record["checksum"], 

389 file_size=record["file_size"]) 

390 results.append(info) 

391 

392 return results 

393 

394 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]: 

395 """Return all dataset refs associated with the supplied path. 

396 

397 Parameters 

398 ---------- 

399 pathInStore : `ButlerURI` 

400 Path of interest in the data store. 

401 

402 Returns 

403 ------- 

404 ids : `set` of `int` 

405 All `DatasetRef` IDs associated with this path. 

406 """ 

407 records = list(self._table.fetch(path=str(pathInStore))) 

408 ids = {r["dataset_id"] for r in records} 

409 return ids 

410 

411 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

412 # Docstring inherited from GenericBaseDatastore 

413 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

414 

415 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

416 r"""Find all the `Location`\ s of the requested dataset in the 

417 `Datastore` and the associated stored file information. 

418 

419 Parameters 

420 ---------- 

421 ref : `DatasetRef` 

422 Reference to the required `Dataset`. 

423 

424 Returns 

425 ------- 

426 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

427 Location of the dataset within the datastore and 

428 stored information about each file and its formatter. 

429 """ 

430 # Get the file information (this will fail if no file) 

431 records = self.getStoredItemsInfo(ref) 

432 

433 # Use the path to determine the location -- we need to take 

434 # into account absolute URIs in the datastore record 

435 locations: List[Tuple[Location, StoredFileInfo]] = [] 

436 for r in records: 

437 uriInStore = ButlerURI(r.path, forceAbsolute=False) 

438 if uriInStore.isabs(): 438 ↛ 439line 438 didn't jump to line 439, because the condition on line 438 was never true

439 location = Location(None, uriInStore) 

440 else: 

441 location = self.locationFactory.fromPath(r.path) 

442 locations.append((location, r)) 

443 return locations 

444 

445 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

446 """Check that there is only one dataset associated with the 

447 specified artifact. 

448 

449 Parameters 

450 ---------- 

451 ref : `DatasetRef` or `FakeDatasetRef` 

452 Dataset to be removed. 

453 location : `Location` 

454 The location of the artifact to be removed. 

455 

456 Returns 

457 ------- 

458 can_remove : `Bool` 

459 True if the artifact can be safely removed. 

460 """ 

461 # Can't ever delete absolute URIs. 

462 if location.pathInStore.isabs(): 462 ↛ 463line 462 didn't jump to line 463, because the condition on line 462 was never true

463 return False 

464 

465 # Get all entries associated with this path 

466 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

467 if not allRefs: 467 ↛ 468line 467 didn't jump to line 468, because the condition on line 467 was never true

468 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

469 

470 # Remove these refs from all the refs and if there is nothing left 

471 # then we can delete 

472 remainingRefs = allRefs - {ref.id} 

473 

474 if remainingRefs: 

475 return False 

476 return True 

477 

478 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, 

479 StoredFileInfo]]: 

480 """Predict the location and related file information of the requested 

481 dataset in this datastore. 

482 

483 Parameters 

484 ---------- 

485 ref : `DatasetRef` 

486 Reference to the required `Dataset`. 

487 

488 Returns 

489 ------- 

490 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

491 Expected Location of the dataset within the datastore and 

492 placeholder information about each file and its formatter. 

493 

494 Notes 

495 ----- 

496 Uses the current configuration to determine how we would expect the 

497 datastore files to have been written if we couldn't ask registry. 

498 This is safe so long as there has been no change to datastore 

499 configuration between writing the dataset and wanting to read it. 

500 Will not work for files that have been ingested without using the 

501 standard file template or default formatter. 

502 """ 

503 

504 # If we have a component ref we always need to ask the questions 

505 # of the composite. If the composite is disassembled this routine 

506 # should return all components. If the composite was not 

507 # disassembled the composite is what is stored regardless of 

508 # component request. Note that if the caller has disassembled 

509 # a composite there is no way for this guess to know that 

510 # without trying both the composite and component ref and seeing 

511 # if there is something at the component Location even without 

512 # disassembly being enabled. 

513 if ref.datasetType.isComponent(): 

514 ref = ref.makeCompositeRef() 

515 

516 # See if the ref is a composite that should be disassembled 

517 doDisassembly = self.composites.shouldBeDisassembled(ref) 

518 

519 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

520 

521 if doDisassembly: 

522 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

523 compRef = ref.makeComponentRef(component) 

524 location, formatter = self._determine_put_formatter_location(compRef) 

525 all_info.append((location, formatter, componentStorage, component)) 

526 

527 else: 

528 # Always use the composite ref if no disassembly 

529 location, formatter = self._determine_put_formatter_location(ref) 

530 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

531 

532 # Convert the list of tuples to have StoredFileInfo as second element 

533 return [(location, StoredFileInfo(formatter=formatter, 

534 path=location.pathInStore.path, 

535 storageClass=storageClass, 

536 component=component, 

537 checksum=None, 

538 file_size=-1)) 

539 for location, formatter, storageClass, component in all_info] 

540 

541 def _prepare_for_get(self, ref: DatasetRef, 

542 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

543 """Check parameters for ``get`` and obtain formatter and 

544 location. 

545 

546 Parameters 

547 ---------- 

548 ref : `DatasetRef` 

549 Reference to the required Dataset. 

550 parameters : `dict` 

551 `StorageClass`-specific parameters that specify, for example, 

552 a slice of the dataset to be loaded. 

553 

554 Returns 

555 ------- 

556 getInfo : `list` [`DatastoreFileGetInformation`] 

557 Parameters needed to retrieve each file. 

558 """ 

559 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

560 

561 # Get file metadata and internal metadata 

562 fileLocations = self._get_dataset_locations_info(ref) 

563 if not fileLocations: 

564 if not self.trustGetRequest: 

565 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

566 # Assume the dataset is where we think it should be 

567 fileLocations = self._get_expected_dataset_locations_info(ref) 

568 

569 # The storage class we want to use eventually 

570 refStorageClass = ref.datasetType.storageClass 

571 

572 if len(fileLocations) > 1: 

573 disassembled = True 

574 else: 

575 disassembled = False 

576 

577 # Is this a component request? 

578 refComponent = ref.datasetType.component() 

579 

580 fileGetInfo = [] 

581 for location, storedFileInfo in fileLocations: 

582 

583 # The storage class used to write the file 

584 writeStorageClass = storedFileInfo.storageClass 

585 

586 # If this has been disassembled we need read to match the write 

587 if disassembled: 

588 readStorageClass = writeStorageClass 

589 else: 

590 readStorageClass = refStorageClass 

591 

592 formatter = getInstanceOf(storedFileInfo.formatter, 

593 FileDescriptor(location, readStorageClass=readStorageClass, 

594 storageClass=writeStorageClass, parameters=parameters), 

595 ref.dataId) 

596 

597 formatterParams, notFormatterParams = formatter.segregateParameters() 

598 

599 # Of the remaining parameters, extract the ones supported by 

600 # this StorageClass (for components not all will be handled) 

601 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

602 

603 # The ref itself could be a component if the dataset was 

604 # disassembled by butler, or we disassembled in datastore and 

605 # components came from the datastore records 

606 component = storedFileInfo.component if storedFileInfo.component else refComponent 

607 

608 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

609 assemblerParams, formatterParams, 

610 component, readStorageClass)) 

611 

612 return fileGetInfo 

613 

614 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

615 """Check the arguments for ``put`` and obtain formatter and 

616 location. 

617 

618 Parameters 

619 ---------- 

620 inMemoryDataset : `object` 

621 The dataset to store. 

622 ref : `DatasetRef` 

623 Reference to the associated Dataset. 

624 

625 Returns 

626 ------- 

627 location : `Location` 

628 The location to write the dataset. 

629 formatter : `Formatter` 

630 The `Formatter` to use to write the dataset. 

631 

632 Raises 

633 ------ 

634 TypeError 

635 Supplied object and storage class are inconsistent. 

636 DatasetTypeNotSupportedError 

637 The associated `DatasetType` is not handled by this datastore. 

638 """ 

639 self._validate_put_parameters(inMemoryDataset, ref) 

640 return self._determine_put_formatter_location(ref) 

641 

642 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

643 """Calculate the formatter and output location to use for put. 

644 

645 Parameters 

646 ---------- 

647 ref : `DatasetRef` 

648 Reference to the associated Dataset. 

649 

650 Returns 

651 ------- 

652 location : `Location` 

653 The location to write the dataset. 

654 formatter : `Formatter` 

655 The `Formatter` to use to write the dataset. 

656 """ 

657 # Work out output file name 

658 try: 

659 template = self.templates.getTemplate(ref) 

660 except KeyError as e: 

661 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

662 

663 # Validate the template to protect against filenames from different 

664 # dataIds returning the same and causing overwrite confusion. 

665 template.validateTemplate(ref) 

666 

667 location = self.locationFactory.fromPath(template.format(ref)) 

668 

669 # Get the formatter based on the storage class 

670 storageClass = ref.datasetType.storageClass 

671 try: 

672 formatter = self.formatterFactory.getFormatter(ref, 

673 FileDescriptor(location, 

674 storageClass=storageClass), 

675 ref.dataId) 

676 except KeyError as e: 

677 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

678 f"{self.name}") from e 

679 

680 # Now that we know the formatter, update the location 

681 location = formatter.makeUpdatedLocation(location) 

682 

683 return location, formatter 

684 

685 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

686 # Docstring inherited from base class 

687 if transfer != "auto": 

688 return transfer 

689 

690 # See if the paths are within the datastore or not 

691 inside = [self._pathInStore(d.path) is not None for d in datasets] 

692 

693 if all(inside): 

694 transfer = None 

695 elif not any(inside): 695 ↛ 699line 695 didn't jump to line 699, because the condition on line 695 was never false

696 # Allow ButlerURI to use its own knowledge 

697 transfer = "auto" 

698 else: 

699 raise ValueError("Some datasets are inside the datastore and some are outside." 

700 " Please use an explicit transfer mode and not 'auto'.") 

701 

702 return transfer 

703 

704 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]: 

705 """Return path relative to datastore root 

706 

707 Parameters 

708 ---------- 

709 path : `str` or `ButlerURI` 

710 Path to dataset. Can be absolute URI. If relative assumed to 

711 be relative to the datastore. Returns path in datastore 

712 or raises an exception if the path it outside. 

713 

714 Returns 

715 ------- 

716 inStore : `str` 

717 Path relative to datastore root. Returns `None` if the file is 

718 outside the root. 

719 """ 

720 # Relative path will always be relative to datastore 

721 pathUri = ButlerURI(path, forceAbsolute=False) 

722 return pathUri.relative_to(self.root) 

723 

724 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *, 

725 transfer: Optional[str] = None) -> Union[str, ButlerURI]: 

726 """Standardize the path of a to-be-ingested file. 

727 

728 Parameters 

729 ---------- 

730 path : `str` or `ButlerURI` 

731 Path of a file to be ingested. 

732 transfer : `str`, optional 

733 How (and whether) the dataset should be added to the datastore. 

734 See `ingest` for details of transfer modes. 

735 This implementation is provided only so 

736 `NotImplementedError` can be raised if the mode is not supported; 

737 actual transfers are deferred to `_extractIngestInfo`. 

738 

739 Returns 

740 ------- 

741 path : `str` or `ButlerURI` 

742 New path in what the datastore considers standard form. If an 

743 absolute URI was given that will be returned unchanged. 

744 

745 Notes 

746 ----- 

747 Subclasses of `FileDatastore` can implement this method instead 

748 of `_prepIngest`. It should not modify the data repository or given 

749 file in any way. 

750 

751 Raises 

752 ------ 

753 NotImplementedError 

754 Raised if the datastore does not support the given transfer mode 

755 (including the case where ingest is not supported at all). 

756 FileNotFoundError 

757 Raised if one of the given files does not exist. 

758 """ 

759 if transfer not in (None, "direct") + self.root.transferModes: 759 ↛ 760line 759 didn't jump to line 760, because the condition on line 759 was never true

760 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

761 

762 # A relative URI indicates relative to datastore root 

763 srcUri = ButlerURI(path, forceAbsolute=False) 

764 if not srcUri.isabs(): 

765 srcUri = self.root.join(path) 

766 

767 if not srcUri.exists(): 

768 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

769 f"are assumed to be relative to {self.root} unless they are absolute.") 

770 

771 if transfer is None: 

772 relpath = srcUri.relative_to(self.root) 

773 if not relpath: 

774 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

775 f"within datastore ({self.root})") 

776 

777 # Return the relative path within the datastore for internal 

778 # transfer 

779 path = relpath 

780 

781 return path 

782 

783 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

784 formatter: Union[Formatter, Type[Formatter]], 

785 transfer: Optional[str] = None) -> StoredFileInfo: 

786 """Relocate (if necessary) and extract `StoredFileInfo` from a 

787 to-be-ingested file. 

788 

789 Parameters 

790 ---------- 

791 path : `str` or `ButlerURI` 

792 URI or path of a file to be ingested. 

793 ref : `DatasetRef` 

794 Reference for the dataset being ingested. Guaranteed to have 

795 ``dataset_id not None`. 

796 formatter : `type` or `Formatter` 

797 `Formatter` subclass to use for this dataset or an instance. 

798 transfer : `str`, optional 

799 How (and whether) the dataset should be added to the datastore. 

800 See `ingest` for details of transfer modes. 

801 

802 Returns 

803 ------- 

804 info : `StoredFileInfo` 

805 Internal datastore record for this file. This will be inserted by 

806 the caller; the `_extractIngestInfo` is only resposible for 

807 creating and populating the struct. 

808 

809 Raises 

810 ------ 

811 FileNotFoundError 

812 Raised if one of the given files does not exist. 

813 FileExistsError 

814 Raised if transfer is not `None` but the (internal) location the 

815 file would be moved to is already occupied. 

816 """ 

817 if self._transaction is None: 817 ↛ 818line 817 didn't jump to line 818, because the condition on line 817 was never true

818 raise RuntimeError("Ingest called without transaction enabled") 

819 

820 # Create URI of the source path, do not need to force a relative 

821 # path to absolute. 

822 srcUri = ButlerURI(path, forceAbsolute=False) 

823 

824 # Track whether we have read the size of the source yet 

825 have_sized = False 

826 

827 tgtLocation: Optional[Location] 

828 if transfer is None: 

829 # A relative path is assumed to be relative to the datastore 

830 # in this context 

831 if not srcUri.isabs(): 

832 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

833 else: 

834 # Work out the path in the datastore from an absolute URI 

835 # This is required to be within the datastore. 

836 pathInStore = srcUri.relative_to(self.root) 

837 if pathInStore is None: 837 ↛ 838line 837 didn't jump to line 838, because the condition on line 837 was never true

838 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

839 f"not within datastore {self.root}") 

840 tgtLocation = self.locationFactory.fromPath(pathInStore) 

841 elif transfer == "direct": 841 ↛ 846line 841 didn't jump to line 846, because the condition on line 841 was never true

842 # Want to store the full URI to the resource directly in 

843 # datastore. This is useful for referring to permanent archive 

844 # storage for raw data. 

845 # Trust that people know what they are doing. 

846 tgtLocation = None 

847 else: 

848 # Work out the name we want this ingested file to have 

849 # inside the datastore 

850 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

851 if not tgtLocation.uri.dirname().exists(): 

852 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

853 tgtLocation.uri.dirname().mkdir() 

854 

855 # if we are transferring from a local file to a remote location 

856 # it may be more efficient to get the size and checksum of the 

857 # local file rather than the transferred one 

858 if not srcUri.scheme or srcUri.scheme == "file": 858 ↛ 864line 858 didn't jump to line 864, because the condition on line 858 was never false

859 size = srcUri.size() 

860 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

861 have_sized = True 

862 

863 # transfer the resource to the destination 

864 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

865 

866 if tgtLocation is None: 866 ↛ 868line 866 didn't jump to line 868, because the condition on line 866 was never true

867 # This means we are using direct mode 

868 targetUri = srcUri 

869 targetPath = str(srcUri) 

870 else: 

871 targetUri = tgtLocation.uri 

872 targetPath = tgtLocation.pathInStore.path 

873 

874 # the file should exist in the datastore now 

875 if not have_sized: 

876 size = targetUri.size() 

877 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

878 

879 return StoredFileInfo(formatter=formatter, path=targetPath, 

880 storageClass=ref.datasetType.storageClass, 

881 component=ref.datasetType.component(), 

882 file_size=size, checksum=checksum) 

883 

884 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

885 # Docstring inherited from Datastore._prepIngest. 

886 filtered = [] 

887 for dataset in datasets: 

888 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

889 if not acceptable: 

890 continue 

891 else: 

892 dataset.refs = acceptable 

893 if dataset.formatter is None: 

894 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

895 else: 

896 assert isinstance(dataset.formatter, (type, str)) 

897 dataset.formatter = getClassOf(dataset.formatter) 

898 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

899 filtered.append(dataset) 

900 return _IngestPrepData(filtered) 

901 

902 @transactional 

903 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

904 # Docstring inherited from Datastore._finishIngest. 

905 refsAndInfos = [] 

906 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

907 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

908 # Do ingest as if the first dataset ref is associated with the file 

909 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

910 transfer=transfer) 

911 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

912 self._register_datasets(refsAndInfos) 

913 

914 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

915 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

916 """Given a source URI and a DatasetRef, determine the name the 

917 dataset will have inside datastore. 

918 

919 Parameters 

920 ---------- 

921 srcUri : `ButlerURI` 

922 URI to the source dataset file. 

923 ref : `DatasetRef` 

924 Ref associated with the newly-ingested dataset artifact. This 

925 is used to determine the name within the datastore. 

926 formatter : `Formatter` or Formatter class. 

927 Formatter to use for validation. Can be a class or an instance. 

928 

929 Returns 

930 ------- 

931 location : `Location` 

932 Target location for the newly-ingested dataset. 

933 """ 

934 # Ingesting a file from outside the datastore. 

935 # This involves a new name. 

936 template = self.templates.getTemplate(ref) 

937 location = self.locationFactory.fromPath(template.format(ref)) 

938 

939 # Get the extension 

940 ext = srcUri.getExtension() 

941 

942 # Update the destination to include that extension 

943 location.updateExtension(ext) 

944 

945 # Ask the formatter to validate this extension 

946 formatter.validateExtension(location) 

947 

948 return location 

949 

950 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

951 """Write out in memory dataset to datastore. 

952 

953 Parameters 

954 ---------- 

955 inMemoryDataset : `object` 

956 Dataset to write to datastore. 

957 ref : `DatasetRef` 

958 Registry information associated with this dataset. 

959 

960 Returns 

961 ------- 

962 info : `StoredFileInfo` 

963 Information describin the artifact written to the datastore. 

964 """ 

965 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

966 uri = location.uri 

967 

968 if not uri.dirname().exists(): 

969 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

970 uri.dirname().mkdir() 

971 

972 if self._transaction is None: 972 ↛ 973line 972 didn't jump to line 973, because the condition on line 972 was never true

973 raise RuntimeError("Attempting to write artifact without transaction enabled") 

974 

975 def _removeFileExists(uri: ButlerURI) -> None: 

976 """Remove a file and do not complain if it is not there. 

977 

978 This is important since a formatter might fail before the file 

979 is written and we should not confuse people by writing spurious 

980 error messages to the log. 

981 """ 

982 try: 

983 uri.remove() 

984 except FileNotFoundError: 

985 pass 

986 

987 # Register a callback to try to delete the uploaded data if 

988 # something fails below 

989 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

990 

991 # For a local file, simply use the formatter directly 

992 if uri.isLocal: 

993 try: 

994 formatter.write(inMemoryDataset) 

995 except Exception as e: 

996 raise RuntimeError(f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} " 

997 f"to location {uri}") from e 

998 log.debug("Successfully wrote python object to local file at %s", uri) 

999 else: 

1000 # This is a remote URI, so first try bytes and write directly else 

1001 # fallback to a temporary file 

1002 try: 

1003 serializedDataset = formatter.toBytes(inMemoryDataset) 

1004 except NotImplementedError: 1004 ↛ 1023line 1004 didn't jump to line 1023

1005 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile: 

1006 # Need to configure the formatter to write to a different 

1007 # location and that needs us to overwrite internals 

1008 tmpLocation = Location(*os.path.split(tmpFile.name)) 

1009 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri) 

1010 with formatter._updateLocation(tmpLocation): 

1011 try: 

1012 formatter.write(inMemoryDataset) 

1013 except Exception as e: 

1014 raise RuntimeError(f"Failed to serialize dataset {ref} of type" 

1015 f" {type(inMemoryDataset)} to " 

1016 f"temporary location {tmpLocation.uri}") from e 

1017 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True) 

1018 

1019 # Cache if required 

1020 self.cacheManager.move_to_cache(tmpLocation.uri, ref) 

1021 

1022 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1023 except Exception as e: 

1024 raise RuntimeError(f"Failed to serialize dataset {ref} to bytes.") from e 

1025 else: 

1026 log.debug("Writing bytes directly to %s", uri) 

1027 uri.write(serializedDataset, overwrite=True) 

1028 log.debug("Successfully wrote bytes directly to %s", uri) 

1029 

1030 # URI is needed to resolve what ingest case are we dealing with 

1031 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1032 

1033 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

1034 ref: DatasetRef, isComponent: bool = False) -> Any: 

1035 """Read the artifact from datastore into in memory object. 

1036 

1037 Parameters 

1038 ---------- 

1039 getInfo : `DatastoreFileGetInformation` 

1040 Information about the artifact within the datastore. 

1041 ref : `DatasetRef` 

1042 The registry information associated with this artifact. 

1043 isComponent : `bool` 

1044 Flag to indicate if a component is being read from this artifact. 

1045 

1046 Returns 

1047 ------- 

1048 inMemoryDataset : `object` 

1049 The artifact as a python object. 

1050 """ 

1051 location = getInfo.location 

1052 uri = location.uri 

1053 log.debug("Accessing data from %s", uri) 

1054 

1055 # Cannot recalculate checksum but can compare size as a quick check 

1056 # Do not do this if the size is negative since that indicates 

1057 # we do not know. 

1058 recorded_size = getInfo.info.file_size 

1059 resource_size = uri.size() 

1060 if recorded_size >= 0 and resource_size != recorded_size: 1060 ↛ 1061line 1060 didn't jump to line 1061, because the condition on line 1060 was never true

1061 raise RuntimeError("Integrity failure in Datastore. " 

1062 f"Size of file {uri} ({resource_size}) " 

1063 f"does not match size recorded in registry of {recorded_size}") 

1064 

1065 # For the general case we have choices for how to proceed. 

1066 # 1. Always use a local file (downloading the remote resource to a 

1067 # temporary file if needed). 

1068 # 2. Use a threshold size and read into memory and use bytes. 

1069 # Use both for now with an arbitrary hand off size. 

1070 # This allows small datasets to be downloaded from remote object 

1071 # stores without requiring a temporary file. 

1072 

1073 formatter = getInfo.formatter 

1074 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1075 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1076 serializedDataset = uri.read() 

1077 log.debug("Deserializing %s from %d bytes from location %s with formatter %s", 

1078 f"component {getInfo.component}" if isComponent else "", 

1079 len(serializedDataset), uri, formatter.name()) 

1080 try: 

1081 result = formatter.fromBytes(serializedDataset, 

1082 component=getInfo.component if isComponent else None) 

1083 except Exception as e: 

1084 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1085 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1086 else: 

1087 # Read from file. 

1088 

1089 # Have to update the Location associated with the formatter 

1090 # because formatter.read does not allow an override. 

1091 # This could be improved. 

1092 location_updated = False 

1093 msg = "" 

1094 

1095 # First check in cache for local version. 

1096 # The cache will only be relevant for remote resources. 

1097 if not uri.isLocal: 

1098 cached_file = self.cacheManager.find_in_cache(ref, uri.getExtension()) 

1099 if cached_file is not None: 1099 ↛ 1100line 1099 didn't jump to line 1100, because the condition on line 1099 was never true

1100 msg = f"(via cache read of remote file {uri})" 

1101 uri = cached_file 

1102 location_updated = True 

1103 

1104 with uri.as_local() as local_uri: 

1105 

1106 # URI was remote and file was downloaded 

1107 if uri != local_uri: 

1108 cache_msg = "" 

1109 location_updated = True 

1110 

1111 # Cache the downloaded file if needed. 

1112 cached_uri = self.cacheManager.move_to_cache(local_uri, ref) 

1113 if cached_uri is not None: 1113 ↛ 1114line 1113 didn't jump to line 1114, because the condition on line 1113 was never true

1114 local_uri = cached_uri 

1115 cache_msg = " and cached" 

1116 

1117 msg = f"(via download to local file{cache_msg})" 

1118 

1119 # Calculate the (possibly) new location for the formatter 

1120 # to use. 

1121 newLocation = Location(*local_uri.split()) if location_updated else None 

1122 

1123 log.debug("Reading%s from location %s %s with formatter %s", 

1124 f" component {getInfo.component}" if isComponent else "", 

1125 uri, msg, formatter.name()) 

1126 try: 

1127 with formatter._updateLocation(newLocation): 

1128 result = formatter.read(component=getInfo.component if isComponent else None) 

1129 except Exception as e: 

1130 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1131 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1132 

1133 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

1134 isComponent=isComponent) 

1135 

1136 def exists(self, ref: DatasetRef) -> bool: 

1137 """Check if the dataset exists in the datastore. 

1138 

1139 Parameters 

1140 ---------- 

1141 ref : `DatasetRef` 

1142 Reference to the required dataset. 

1143 

1144 Returns 

1145 ------- 

1146 exists : `bool` 

1147 `True` if the entity exists in the `Datastore`. 

1148 """ 

1149 fileLocations = self._get_dataset_locations_info(ref) 

1150 

1151 # if we are being asked to trust that registry might not be correct 

1152 # we ask for the expected locations and check them explicitly 

1153 if not fileLocations: 

1154 if not self.trustGetRequest: 

1155 return False 

1156 fileLocations = self._get_expected_dataset_locations_info(ref) 

1157 for location, _ in fileLocations: 

1158 if not self._artifact_exists(location): 

1159 return False 

1160 

1161 return True 

1162 

1163 def getURIs(self, ref: DatasetRef, 

1164 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1165 """Return URIs associated with dataset. 

1166 

1167 Parameters 

1168 ---------- 

1169 ref : `DatasetRef` 

1170 Reference to the required dataset. 

1171 predict : `bool`, optional 

1172 If the datastore does not know about the dataset, should it 

1173 return a predicted URI or not? 

1174 

1175 Returns 

1176 ------- 

1177 primary : `ButlerURI` 

1178 The URI to the primary artifact associated with this dataset. 

1179 If the dataset was disassembled within the datastore this 

1180 may be `None`. 

1181 components : `dict` 

1182 URIs to any components associated with the dataset artifact. 

1183 Can be empty if there are no components. 

1184 """ 

1185 

1186 primary: Optional[ButlerURI] = None 

1187 components: Dict[str, ButlerURI] = {} 

1188 

1189 # if this has never been written then we have to guess 

1190 if not self.exists(ref): 

1191 if not predict: 

1192 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1193 

1194 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1195 

1196 if doDisassembly: 

1197 

1198 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1199 compRef = ref.makeComponentRef(component) 

1200 compLocation, _ = self._determine_put_formatter_location(compRef) 

1201 

1202 # Add a URI fragment to indicate this is a guess 

1203 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1204 

1205 else: 

1206 

1207 location, _ = self._determine_put_formatter_location(ref) 

1208 

1209 # Add a URI fragment to indicate this is a guess 

1210 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1211 

1212 return primary, components 

1213 

1214 # If this is a ref that we have written we can get the path. 

1215 # Get file metadata and internal metadata 

1216 fileLocations = self._get_dataset_locations_info(ref) 

1217 

1218 guessing = False 

1219 if not fileLocations: 

1220 if not self.trustGetRequest: 1220 ↛ 1221line 1220 didn't jump to line 1221, because the condition on line 1220 was never true

1221 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1222 fileLocations = self._get_expected_dataset_locations_info(ref) 

1223 guessing = True 

1224 

1225 if len(fileLocations) == 1: 

1226 # No disassembly so this is the primary URI 

1227 uri = fileLocations[0][0].uri 

1228 if guessing and not uri.exists(): 1228 ↛ 1229line 1228 didn't jump to line 1229, because the condition on line 1228 was never true

1229 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1230 primary = uri 

1231 

1232 else: 

1233 for location, storedFileInfo in fileLocations: 

1234 if storedFileInfo.component is None: 1234 ↛ 1235line 1234 didn't jump to line 1235, because the condition on line 1234 was never true

1235 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1236 uri = location.uri 

1237 if guessing and not uri.exists(): 1237 ↛ 1238line 1237 didn't jump to line 1238, because the condition on line 1237 was never true

1238 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1239 components[storedFileInfo.component] = uri 

1240 

1241 return primary, components 

1242 

1243 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1244 """URI to the Dataset. 

1245 

1246 Parameters 

1247 ---------- 

1248 ref : `DatasetRef` 

1249 Reference to the required Dataset. 

1250 predict : `bool` 

1251 If `True`, allow URIs to be returned of datasets that have not 

1252 been written. 

1253 

1254 Returns 

1255 ------- 

1256 uri : `str` 

1257 URI pointing to the dataset within the datastore. If the 

1258 dataset does not exist in the datastore, and if ``predict`` is 

1259 `True`, the URI will be a prediction and will include a URI 

1260 fragment "#predicted". 

1261 If the datastore does not have entities that relate well 

1262 to the concept of a URI the returned URI will be 

1263 descriptive. The returned URI is not guaranteed to be obtainable. 

1264 

1265 Raises 

1266 ------ 

1267 FileNotFoundError 

1268 Raised if a URI has been requested for a dataset that does not 

1269 exist and guessing is not allowed. 

1270 RuntimeError 

1271 Raised if a request is made for a single URI but multiple URIs 

1272 are associated with this dataset. 

1273 

1274 Notes 

1275 ----- 

1276 When a predicted URI is requested an attempt will be made to form 

1277 a reasonable URI based on file templates and the expected formatter. 

1278 """ 

1279 primary, components = self.getURIs(ref, predict) 

1280 if primary is None or components: 1280 ↛ 1281line 1280 didn't jump to line 1281, because the condition on line 1280 was never true

1281 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

1282 "Use Dataastore.getURIs() instead.") 

1283 return primary 

1284 

1285 def retrieveArtifacts(self, refs: Iterable[DatasetRef], 

1286 destination: ButlerURI, transfer: str = "auto", 

1287 preserve_path: bool = True, 

1288 overwrite: bool = False) -> List[ButlerURI]: 

1289 """Retrieve the file artifacts associated with the supplied refs. 

1290 

1291 Parameters 

1292 ---------- 

1293 refs : iterable of `DatasetRef` 

1294 The datasets for which file artifacts are to be retrieved. 

1295 A single ref can result in multiple files. The refs must 

1296 be resolved. 

1297 destination : `ButlerURI` 

1298 Location to write the file artifacts. 

1299 transfer : `str`, optional 

1300 Method to use to transfer the artifacts. Must be one of the options 

1301 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

1302 preserve_path : `bool`, optional 

1303 If `True` the full path of the file artifact within the datastore 

1304 is preserved. If `False` the final file component of the path 

1305 is used. 

1306 overwrite : `bool`, optional 

1307 If `True` allow transfers to overwrite existing files at the 

1308 destination. 

1309 

1310 Returns 

1311 ------- 

1312 targets : `list` of `ButlerURI` 

1313 URIs of file artifacts in destination location. Order is not 

1314 preserved. 

1315 """ 

1316 if not destination.isdir(): 1316 ↛ 1317line 1316 didn't jump to line 1317, because the condition on line 1316 was never true

1317 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1318 

1319 if transfer == "move": 

1320 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1321 

1322 # Source -> Destination 

1323 # This also helps filter out duplicate DatasetRef in the request 

1324 # that will map to the same underlying file transfer. 

1325 to_transfer: Dict[ButlerURI, ButlerURI] = {} 

1326 

1327 for ref in refs: 

1328 locations = self._get_dataset_locations_info(ref) 

1329 for location, _ in locations: 

1330 source_uri = location.uri 

1331 target_path: Union[str, ButlerURI] 

1332 if preserve_path: 

1333 target_path = location.pathInStore 

1334 if target_path.isabs(): 1334 ↛ 1337line 1334 didn't jump to line 1337, because the condition on line 1334 was never true

1335 # This is an absolute path to an external file. 

1336 # Use the full path. 

1337 target_path = target_path.relativeToPathRoot 

1338 else: 

1339 target_path = source_uri.basename() 

1340 target_uri = destination.join(target_path) 

1341 to_transfer[source_uri] = target_uri 

1342 

1343 # In theory can now parallelize the transfer 

1344 log.debug("Number of artifacts to transfer to %s: %d", 

1345 str(destination), len(to_transfer)) 

1346 for source_uri, target_uri in to_transfer.items(): 

1347 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1348 

1349 return list(to_transfer.values()) 

1350 

1351 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1352 """Load an InMemoryDataset from the store. 

1353 

1354 Parameters 

1355 ---------- 

1356 ref : `DatasetRef` 

1357 Reference to the required Dataset. 

1358 parameters : `dict` 

1359 `StorageClass`-specific parameters that specify, for example, 

1360 a slice of the dataset to be loaded. 

1361 

1362 Returns 

1363 ------- 

1364 inMemoryDataset : `object` 

1365 Requested dataset or slice thereof as an InMemoryDataset. 

1366 

1367 Raises 

1368 ------ 

1369 FileNotFoundError 

1370 Requested dataset can not be retrieved. 

1371 TypeError 

1372 Return value from formatter has unexpected type. 

1373 ValueError 

1374 Formatter failed to process the dataset. 

1375 """ 

1376 allGetInfo = self._prepare_for_get(ref, parameters) 

1377 refComponent = ref.datasetType.component() 

1378 

1379 # Supplied storage class for the component being read 

1380 refStorageClass = ref.datasetType.storageClass 

1381 

1382 # Create mapping from component name to related info 

1383 allComponents = {i.component: i for i in allGetInfo} 

1384 

1385 # By definition the dataset is disassembled if we have more 

1386 # than one record for it. 

1387 isDisassembled = len(allGetInfo) > 1 

1388 

1389 # Look for the special case where we are disassembled but the 

1390 # component is a derived component that was not written during 

1391 # disassembly. For this scenario we need to check that the 

1392 # component requested is listed as a derived component for the 

1393 # composite storage class 

1394 isDisassembledReadOnlyComponent = False 

1395 if isDisassembled and refComponent: 

1396 # The composite storage class should be accessible through 

1397 # the component dataset type 

1398 compositeStorageClass = ref.datasetType.parentStorageClass 

1399 

1400 # In the unlikely scenario where the composite storage 

1401 # class is not known, we can only assume that this is a 

1402 # normal component. If that assumption is wrong then the 

1403 # branch below that reads a persisted component will fail 

1404 # so there is no need to complain here. 

1405 if compositeStorageClass is not None: 1405 ↛ 1408line 1405 didn't jump to line 1408, because the condition on line 1405 was never false

1406 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1407 

1408 if isDisassembled and not refComponent: 

1409 # This was a disassembled dataset spread over multiple files 

1410 # and we need to put them all back together again. 

1411 # Read into memory and then assemble 

1412 

1413 # Check that the supplied parameters are suitable for the type read 

1414 refStorageClass.validateParameters(parameters) 

1415 

1416 # We want to keep track of all the parameters that were not used 

1417 # by formatters. We assume that if any of the component formatters 

1418 # use a parameter that we do not need to apply it again in the 

1419 # assembler. 

1420 usedParams = set() 

1421 

1422 components: Dict[str, Any] = {} 

1423 for getInfo in allGetInfo: 

1424 # assemblerParams are parameters not understood by the 

1425 # associated formatter. 

1426 usedParams.update(set(getInfo.formatterParams)) 

1427 

1428 component = getInfo.component 

1429 

1430 if component is None: 1430 ↛ 1431line 1430 didn't jump to line 1431, because the condition on line 1430 was never true

1431 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1432 

1433 # We do not want the formatter to think it's reading 

1434 # a component though because it is really reading a 

1435 # standalone dataset -- always tell reader it is not a 

1436 # component. 

1437 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1438 

1439 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1440 

1441 # Any unused parameters will have to be passed to the assembler 

1442 if parameters: 

1443 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1444 else: 

1445 unusedParams = {} 

1446 

1447 # Process parameters 

1448 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1449 parameters=unusedParams) 

1450 

1451 elif isDisassembledReadOnlyComponent: 

1452 

1453 compositeStorageClass = ref.datasetType.parentStorageClass 

1454 if compositeStorageClass is None: 1454 ↛ 1455line 1454 didn't jump to line 1455, because the condition on line 1454 was never true

1455 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since" 

1456 "no composite storage class is available.") 

1457 

1458 if refComponent is None: 1458 ↛ 1460line 1458 didn't jump to line 1460, because the condition on line 1458 was never true

1459 # Mainly for mypy 

1460 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1461 

1462 # Assume that every derived component can be calculated by 

1463 # forwarding the request to a single read/write component. 

1464 # Rather than guessing which rw component is the right one by 

1465 # scanning each for a derived component of the same name, 

1466 # we ask the storage class delegate directly which one is best to 

1467 # use. 

1468 compositeDelegate = compositeStorageClass.delegate() 

1469 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1470 set(allComponents)) 

1471 

1472 # Select the relevant component 

1473 rwInfo = allComponents[forwardedComponent] 

1474 

1475 # For now assume that read parameters are validated against 

1476 # the real component and not the requested component 

1477 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1478 forwardedStorageClass.validateParameters(parameters) 

1479 

1480 # Unfortunately the FileDescriptor inside the formatter will have 

1481 # the wrong write storage class so we need to create a new one 

1482 # given the immutability constraint. 

1483 writeStorageClass = rwInfo.info.storageClass 

1484 

1485 # We may need to put some thought into parameters for read 

1486 # components but for now forward them on as is 

1487 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1488 readStorageClass=refStorageClass, 

1489 storageClass=writeStorageClass, 

1490 parameters=parameters), 

1491 ref.dataId) 

1492 

1493 # The assembler can not receive any parameter requests for a 

1494 # derived component at this time since the assembler will 

1495 # see the storage class of the derived component and those 

1496 # parameters will have to be handled by the formatter on the 

1497 # forwarded storage class. 

1498 assemblerParams: Dict[str, Any] = {} 

1499 

1500 # Need to created a new info that specifies the derived 

1501 # component and associated storage class 

1502 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1503 rwInfo.info, assemblerParams, {}, 

1504 refComponent, refStorageClass) 

1505 

1506 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1507 

1508 else: 

1509 # Single file request or component from that composite file 

1510 for lookup in (refComponent, None): 1510 ↛ 1515line 1510 didn't jump to line 1515, because the loop on line 1510 didn't complete

1511 if lookup in allComponents: 1511 ↛ 1510line 1511 didn't jump to line 1510, because the condition on line 1511 was never false

1512 getInfo = allComponents[lookup] 

1513 break 

1514 else: 

1515 raise FileNotFoundError(f"Component {refComponent} not found " 

1516 f"for ref {ref} in datastore {self.name}") 

1517 

1518 # Do not need the component itself if already disassembled 

1519 if isDisassembled: 

1520 isComponent = False 

1521 else: 

1522 isComponent = getInfo.component is not None 

1523 

1524 # For a disassembled component we can validate parametersagainst 

1525 # the component storage class directly 

1526 if isDisassembled: 

1527 refStorageClass.validateParameters(parameters) 

1528 else: 

1529 # For an assembled composite this could be a derived 

1530 # component derived from a real component. The validity 

1531 # of the parameters is not clear. For now validate against 

1532 # the composite storage class 

1533 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1534 

1535 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 

1536 

1537 @transactional 

1538 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1539 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1540 

1541 Parameters 

1542 ---------- 

1543 inMemoryDataset : `object` 

1544 The dataset to store. 

1545 ref : `DatasetRef` 

1546 Reference to the associated Dataset. 

1547 

1548 Raises 

1549 ------ 

1550 TypeError 

1551 Supplied object and storage class are inconsistent. 

1552 DatasetTypeNotSupportedError 

1553 The associated `DatasetType` is not handled by this datastore. 

1554 

1555 Notes 

1556 ----- 

1557 If the datastore is configured to reject certain dataset types it 

1558 is possible that the put will fail and raise a 

1559 `DatasetTypeNotSupportedError`. The main use case for this is to 

1560 allow `ChainedDatastore` to put to multiple datastores without 

1561 requiring that every datastore accepts the dataset. 

1562 """ 

1563 

1564 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1565 # doDisassembly = True 

1566 

1567 artifacts = [] 

1568 if doDisassembly: 

1569 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1570 for component, componentInfo in components.items(): 

1571 # Don't recurse because we want to take advantage of 

1572 # bulk insert -- need a new DatasetRef that refers to the 

1573 # same dataset_id but has the component DatasetType 

1574 # DatasetType does not refer to the types of components 

1575 # So we construct one ourselves. 

1576 compRef = ref.makeComponentRef(component) 

1577 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1578 artifacts.append((compRef, storedInfo)) 

1579 else: 

1580 # Write the entire thing out 

1581 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1582 artifacts.append((ref, storedInfo)) 

1583 

1584 self._register_datasets(artifacts) 

1585 

1586 @transactional 

1587 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None: 

1588 """Indicate to the datastore that a dataset can be removed. 

1589 

1590 Parameters 

1591 ---------- 

1592 ref : `DatasetRef` 

1593 Reference to the required Dataset. 

1594 ignore_errors : `bool` 

1595 If `True` return without error even if something went wrong. 

1596 Problems could occur if another process is simultaneously trying 

1597 to delete. 

1598 

1599 Raises 

1600 ------ 

1601 FileNotFoundError 

1602 Attempt to remove a dataset that does not exist. 

1603 """ 

1604 # Get file metadata and internal metadata 

1605 log.debug("Trashing %s in datastore %s", ref, self.name) 

1606 

1607 fileLocations = self._get_dataset_locations_info(ref) 

1608 

1609 if not fileLocations: 

1610 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1611 if ignore_errors: 

1612 log.warning(err_msg) 

1613 return 

1614 else: 

1615 raise FileNotFoundError(err_msg) 

1616 

1617 for location, storedFileInfo in fileLocations: 

1618 if not self._artifact_exists(location): 1618 ↛ 1619line 1618 didn't jump to line 1619, because the condition on line 1618 was never true

1619 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1620 f"associated artifact ({location.uri}) is missing" 

1621 if ignore_errors: 

1622 log.warning(err_msg) 

1623 return 

1624 else: 

1625 raise FileNotFoundError(err_msg) 

1626 

1627 # Mark dataset as trashed 

1628 try: 

1629 self._move_to_trash_in_registry(ref) 

1630 except Exception as e: 

1631 if ignore_errors: 

1632 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1633 f"but encountered an error: {e}") 

1634 pass 

1635 else: 

1636 raise 

1637 

1638 @transactional 

1639 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1640 """Remove all datasets from the trash. 

1641 

1642 Parameters 

1643 ---------- 

1644 ignore_errors : `bool` 

1645 If `True` return without error even if something went wrong. 

1646 Problems could occur if another process is simultaneously trying 

1647 to delete. 

1648 """ 

1649 log.debug("Emptying trash in datastore %s", self.name) 

1650 # Context manager will empty trash iff we finish it without raising. 

1651 with self.bridge.emptyTrash() as trashed: 

1652 for ref in trashed: 

1653 fileLocations = self._get_dataset_locations_info(ref) 

1654 

1655 if not fileLocations: 1655 ↛ 1656line 1655 didn't jump to line 1656, because the condition on line 1655 was never true

1656 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}" 

1657 if ignore_errors: 

1658 log.warning(err_msg) 

1659 continue 

1660 else: 

1661 raise FileNotFoundError(err_msg) 

1662 

1663 for location, _ in fileLocations: 

1664 

1665 if not self._artifact_exists(location): 1665 ↛ 1666line 1665 didn't jump to line 1666, because the condition on line 1665 was never true

1666 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}" 

1667 if ignore_errors: 

1668 log.warning(err_msg) 

1669 continue 

1670 else: 

1671 raise FileNotFoundError(err_msg) 

1672 

1673 # Can only delete the artifact if there are no references 

1674 # to the file from untrashed dataset refs. 

1675 if self._can_remove_dataset_artifact(ref, location): 

1676 # Point of no return for this artifact 

1677 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1678 try: 

1679 self._delete_artifact(location) 

1680 except Exception as e: 

1681 if ignore_errors: 

1682 log.critical("Encountered error removing artifact %s from datastore %s: %s", 

1683 location.uri, self.name, e) 

1684 else: 

1685 raise 

1686 

1687 # Now must remove the entry from the internal registry even if 

1688 # the artifact removal failed and was ignored, 

1689 # otherwise the removal check above will never be true 

1690 try: 

1691 # There may be multiple rows associated with this ref 

1692 # depending on disassembly 

1693 self.removeStoredItemInfo(ref) 

1694 except Exception as e: 

1695 if ignore_errors: 

1696 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s", 

1697 ref.id, location.uri, self.name, e) 

1698 continue 

1699 else: 

1700 raise FileNotFoundError( 

1701 f"Error removing dataset {ref.id} ({location.uri}) from internal registry " 

1702 f"of {self.name}" 

1703 ) from e 

1704 

1705 @transactional 

1706 def forget(self, refs: Iterable[DatasetRef]) -> None: 

1707 # Docstring inherited. 

1708 refs = list(refs) 

1709 self.bridge.forget(refs) 

1710 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

1711 

1712 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1713 logFailures: bool = False) -> None: 

1714 """Validate some of the configuration for this datastore. 

1715 

1716 Parameters 

1717 ---------- 

1718 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1719 Entities to test against this configuration. Can be differing 

1720 types. 

1721 logFailures : `bool`, optional 

1722 If `True`, output a log message for every validation error 

1723 detected. 

1724 

1725 Raises 

1726 ------ 

1727 DatastoreValidationError 

1728 Raised if there is a validation problem with a configuration. 

1729 All the problems are reported in a single exception. 

1730 

1731 Notes 

1732 ----- 

1733 This method checks that all the supplied entities have valid file 

1734 templates and also have formatters defined. 

1735 """ 

1736 

1737 templateFailed = None 

1738 try: 

1739 self.templates.validateTemplates(entities, logFailures=logFailures) 

1740 except FileTemplateValidationError as e: 

1741 templateFailed = str(e) 

1742 

1743 formatterFailed = [] 

1744 for entity in entities: 

1745 try: 

1746 self.formatterFactory.getFormatterClass(entity) 

1747 except KeyError as e: 

1748 formatterFailed.append(str(e)) 

1749 if logFailures: 1749 ↛ 1744line 1749 didn't jump to line 1744, because the condition on line 1749 was never false

1750 log.critical("Formatter failure: %s", e) 

1751 

1752 if templateFailed or formatterFailed: 

1753 messages = [] 

1754 if templateFailed: 1754 ↛ 1755line 1754 didn't jump to line 1755, because the condition on line 1754 was never true

1755 messages.append(templateFailed) 

1756 if formatterFailed: 1756 ↛ 1758line 1756 didn't jump to line 1758, because the condition on line 1756 was never false

1757 messages.append(",".join(formatterFailed)) 

1758 msg = ";\n".join(messages) 

1759 raise DatastoreValidationError(msg) 

1760 

1761 def getLookupKeys(self) -> Set[LookupKey]: 

1762 # Docstring is inherited from base class 

1763 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1764 self.constraints.getLookupKeys() 

1765 

1766 def validateKey(self, lookupKey: LookupKey, 

1767 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1768 # Docstring is inherited from base class 

1769 # The key can be valid in either formatters or templates so we can 

1770 # only check the template if it exists 

1771 if lookupKey in self.templates: 

1772 try: 

1773 self.templates[lookupKey].validateTemplate(entity) 

1774 except FileTemplateValidationError as e: 

1775 raise DatastoreValidationError(e) from e 

1776 

1777 def export(self, refs: Iterable[DatasetRef], *, 

1778 directory: Optional[Union[ButlerURI, str]] = None, 

1779 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

1780 # Docstring inherited from Datastore.export. 

1781 if transfer is not None and directory is None: 1781 ↛ 1782line 1781 didn't jump to line 1782, because the condition on line 1781 was never true

1782 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

1783 "export directory given") 

1784 

1785 # Force the directory to be a URI object 

1786 directoryUri: Optional[ButlerURI] = None 

1787 if directory is not None: 1787 ↛ 1790line 1787 didn't jump to line 1790, because the condition on line 1787 was never false

1788 directoryUri = ButlerURI(directory, forceDirectory=True) 

1789 

1790 if transfer is not None and directoryUri is not None: 1790 ↛ 1795line 1790 didn't jump to line 1795, because the condition on line 1790 was never false

1791 # mypy needs the second test 

1792 if not directoryUri.exists(): 1792 ↛ 1793line 1792 didn't jump to line 1793, because the condition on line 1792 was never true

1793 raise FileNotFoundError(f"Export location {directory} does not exist") 

1794 

1795 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

1796 for ref in progress.wrap(refs, "Exporting dataset files"): 

1797 fileLocations = self._get_dataset_locations_info(ref) 

1798 if not fileLocations: 1798 ↛ 1799line 1798 didn't jump to line 1799, because the condition on line 1798 was never true

1799 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

1800 # For now we can not export disassembled datasets 

1801 if len(fileLocations) > 1: 1801 ↛ 1802line 1801 didn't jump to line 1802, because the condition on line 1801 was never true

1802 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

1803 location, storedFileInfo = fileLocations[0] 

1804 

1805 pathInStore = location.pathInStore.path 

1806 if transfer is None: 1806 ↛ 1809line 1806 didn't jump to line 1809, because the condition on line 1806 was never true

1807 # TODO: do we also need to return the readStorageClass somehow? 

1808 # We will use the path in store directly 

1809 pass 

1810 elif transfer == "direct": 1810 ↛ 1812line 1810 didn't jump to line 1812, because the condition on line 1810 was never true

1811 # Use full URIs to the remote store in the export 

1812 pathInStore = str(location.uri) 

1813 else: 

1814 # mypy needs help 

1815 assert directoryUri is not None, "directoryUri must be defined to get here" 

1816 storeUri = ButlerURI(location.uri) 

1817 

1818 # if the datastore has an absolute URI to a resource, we 

1819 # have two options: 

1820 # 1. Keep the absolute URI in the exported YAML 

1821 # 2. Allocate a new name in the local datastore and transfer 

1822 # it. 

1823 # For now go with option 2 

1824 if location.pathInStore.isabs(): 1824 ↛ 1825line 1824 didn't jump to line 1825, because the condition on line 1824 was never true

1825 template = self.templates.getTemplate(ref) 

1826 newURI = ButlerURI(template.format(ref), forceAbsolute=False) 

1827 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

1828 

1829 exportUri = directoryUri.join(pathInStore) 

1830 exportUri.transfer_from(storeUri, transfer=transfer) 

1831 

1832 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

1833 

1834 @staticmethod 

1835 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

1836 """Compute the checksum of the supplied file. 

1837 

1838 Parameters 

1839 ---------- 

1840 uri : `ButlerURI` 

1841 Name of resource to calculate checksum from. 

1842 algorithm : `str`, optional 

1843 Name of algorithm to use. Must be one of the algorithms supported 

1844 by :py:class`hashlib`. 

1845 block_size : `int` 

1846 Number of bytes to read from file at one time. 

1847 

1848 Returns 

1849 ------- 

1850 hexdigest : `str` 

1851 Hex digest of the file. 

1852 

1853 Notes 

1854 ----- 

1855 Currently returns None if the URI is for a remote resource. 

1856 """ 

1857 if algorithm not in hashlib.algorithms_guaranteed: 1857 ↛ 1858line 1857 didn't jump to line 1858, because the condition on line 1857 was never true

1858 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

1859 

1860 if not uri.isLocal: 1860 ↛ 1861line 1860 didn't jump to line 1861, because the condition on line 1860 was never true

1861 return None 

1862 

1863 hasher = hashlib.new(algorithm) 

1864 

1865 with uri.as_local() as local_uri: 

1866 with open(local_uri.ospath, "rb") as f: 

1867 for chunk in iter(lambda: f.read(block_size), b""): 

1868 hasher.update(chunk) 

1869 

1870 return hasher.hexdigest() 

1871 

1872 def needs_expanded_data_ids( 

1873 self, 

1874 transfer: Optional[str], 

1875 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

1876 ) -> bool: 

1877 # Docstring inherited. 

1878 # This _could_ also use entity to inspect whether the filename template 

1879 # involves placeholders other than the required dimensions for its 

1880 # dataset type, but that's not necessary for correctness; it just 

1881 # enables more optimizations (perhaps only in theory). 

1882 return transfer not in ("direct", None)