Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 83%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

833 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore", ) 

26 

27import hashlib 

28import logging 

29 

30from sqlalchemy import BigInteger, String 

31 

32from collections import defaultdict 

33from dataclasses import dataclass 

34from typing import ( 

35 TYPE_CHECKING, 

36 Any, 

37 ClassVar, 

38 Dict, 

39 Iterable, 

40 List, 

41 Mapping, 

42 Optional, 

43 Set, 

44 Tuple, 

45 Type, 

46 Union, 

47) 

48 

49from lsst.daf.butler import ( 

50 ButlerURI, 

51 CompositesMap, 

52 Config, 

53 FileDataset, 

54 DatasetId, 

55 DatasetRef, 

56 DatasetType, 

57 DatasetTypeNotSupportedError, 

58 Datastore, 

59 DatastoreCacheManager, 

60 DatastoreDisabledCacheManager, 

61 DatastoreConfig, 

62 DatastoreValidationError, 

63 FileDescriptor, 

64 FileTemplates, 

65 FileTemplateValidationError, 

66 Formatter, 

67 FormatterFactory, 

68 Location, 

69 LocationFactory, 

70 Progress, 

71 StorageClass, 

72 StoredFileInfo, 

73 VERBOSE, 

74) 

75 

76from lsst.daf.butler import ddl 

77from lsst.daf.butler.registry.interfaces import ( 

78 ReadOnlyDatabaseError, 

79 DatastoreRegistryBridge, 

80) 

81 

82from lsst.daf.butler.core.repoRelocation import replaceRoot 

83from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional, time_this, chunk_iterable 

84from .genericDatastore import GenericBaseDatastore 

85 

86if TYPE_CHECKING: 86 ↛ 87line 86 didn't jump to line 87, because the condition on line 86 was never true

87 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager 

88 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

89 

90log = logging.getLogger(__name__) 

91 

92 

93class _IngestPrepData(Datastore.IngestPrepData): 

94 """Helper class for FileDatastore ingest implementation. 

95 

96 Parameters 

97 ---------- 

98 datasets : `list` of `FileDataset` 

99 Files to be ingested by this datastore. 

100 """ 

101 def __init__(self, datasets: List[FileDataset]): 

102 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

103 self.datasets = datasets 

104 

105 

106@dataclass(frozen=True) 

107class DatastoreFileGetInformation: 

108 """Collection of useful parameters needed to retrieve a file from 

109 a Datastore. 

110 """ 

111 

112 location: Location 

113 """The location from which to read the dataset.""" 

114 

115 formatter: Formatter 

116 """The `Formatter` to use to deserialize the dataset.""" 

117 

118 info: StoredFileInfo 

119 """Stored information about this file and its formatter.""" 

120 

121 assemblerParams: Dict[str, Any] 

122 """Parameters to use for post-processing the retrieved dataset.""" 

123 

124 formatterParams: Dict[str, Any] 

125 """Parameters that were understood by the associated formatter.""" 

126 

127 component: Optional[str] 

128 """The component to be retrieved (can be `None`).""" 

129 

130 readStorageClass: StorageClass 

131 """The `StorageClass` of the dataset being read.""" 

132 

133 

134class FileDatastore(GenericBaseDatastore): 

135 """Generic Datastore for file-based implementations. 

136 

137 Should always be sub-classed since key abstract methods are missing. 

138 

139 Parameters 

140 ---------- 

141 config : `DatastoreConfig` or `str` 

142 Configuration as either a `Config` object or URI to file. 

143 bridgeManager : `DatastoreRegistryBridgeManager` 

144 Object that manages the interface between `Registry` and datastores. 

145 butlerRoot : `str`, optional 

146 New datastore root to use to override the configuration value. 

147 

148 Raises 

149 ------ 

150 ValueError 

151 If root location does not exist and ``create`` is `False` in the 

152 configuration. 

153 """ 

154 

155 defaultConfigFile: ClassVar[Optional[str]] = None 

156 """Path to configuration defaults. Accessed within the ``config`` resource 

157 or relative to a search path. Can be None if no defaults specified. 

158 """ 

159 

160 root: ButlerURI 

161 """Root directory URI of this `Datastore`.""" 

162 

163 locationFactory: LocationFactory 

164 """Factory for creating locations relative to the datastore root.""" 

165 

166 formatterFactory: FormatterFactory 

167 """Factory for creating instances of formatters.""" 

168 

169 templates: FileTemplates 

170 """File templates that can be used by this `Datastore`.""" 

171 

172 composites: CompositesMap 

173 """Determines whether a dataset should be disassembled on put.""" 

174 

175 defaultConfigFile = "datastores/fileDatastore.yaml" 

176 """Path to configuration defaults. Accessed within the ``config`` resource 

177 or relative to a search path. Can be None if no defaults specified. 

178 """ 

179 

180 @classmethod 

181 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

182 """Set any filesystem-dependent config options for this Datastore to 

183 be appropriate for a new empty repository with the given root. 

184 

185 Parameters 

186 ---------- 

187 root : `str` 

188 URI to the root of the data repository. 

189 config : `Config` 

190 A `Config` to update. Only the subset understood by 

191 this component will be updated. Will not expand 

192 defaults. 

193 full : `Config` 

194 A complete config with all defaults expanded that can be 

195 converted to a `DatastoreConfig`. Read-only and will not be 

196 modified by this method. 

197 Repository-specific options that should not be obtained 

198 from defaults when Butler instances are constructed 

199 should be copied from ``full`` to ``config``. 

200 overwrite : `bool`, optional 

201 If `False`, do not modify a value in ``config`` if the value 

202 already exists. Default is always to overwrite with the provided 

203 ``root``. 

204 

205 Notes 

206 ----- 

207 If a keyword is explicitly defined in the supplied ``config`` it 

208 will not be overridden by this method if ``overwrite`` is `False`. 

209 This allows explicit values set in external configs to be retained. 

210 """ 

211 Config.updateParameters(DatastoreConfig, config, full, 

212 toUpdate={"root": root}, 

213 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

214 

215 @classmethod 

216 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

217 return ddl.TableSpec( 

218 fields=[ 

219 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

220 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

221 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

222 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

223 # Use empty string to indicate no component 

224 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

225 # TODO: should checksum be Base64Bytes instead? 

226 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

227 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

228 ], 

229 unique=frozenset(), 

230 indexes=[tuple(["path"])], 

231 ) 

232 

233 def __init__(self, config: Union[DatastoreConfig, str], 

234 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

235 super().__init__(config, bridgeManager) 

236 if "root" not in self.config: 236 ↛ 237line 236 didn't jump to line 237, because the condition on line 236 was never true

237 raise ValueError("No root directory specified in configuration") 

238 

239 # Name ourselves either using an explicit name or a name 

240 # derived from the (unexpanded) root 

241 if "name" in self.config: 

242 self.name = self.config["name"] 

243 else: 

244 # We use the unexpanded root in the name to indicate that this 

245 # datastore can be moved without having to update registry. 

246 self.name = "{}@{}".format(type(self).__name__, 

247 self.config["root"]) 

248 

249 # Support repository relocation in config 

250 # Existence of self.root is checked in subclass 

251 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

252 forceDirectory=True, forceAbsolute=True) 

253 

254 self.locationFactory = LocationFactory(self.root) 

255 self.formatterFactory = FormatterFactory() 

256 

257 # Now associate formatters with storage classes 

258 self.formatterFactory.registerFormatters(self.config["formatters"], 

259 universe=bridgeManager.universe) 

260 

261 # Read the file naming templates 

262 self.templates = FileTemplates(self.config["templates"], 

263 universe=bridgeManager.universe) 

264 

265 # See if composites should be disassembled 

266 self.composites = CompositesMap(self.config["composites"], 

267 universe=bridgeManager.universe) 

268 

269 tableName = self.config["records", "table"] 

270 try: 

271 # Storage of paths and formatters, keyed by dataset_id 

272 self._table = bridgeManager.opaque.register( 

273 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)) 

274 # Interface to Registry. 

275 self._bridge = bridgeManager.register(self.name) 

276 except ReadOnlyDatabaseError: 

277 # If the database is read only and we just tried and failed to 

278 # create a table, it means someone is trying to create a read-only 

279 # butler client for an empty repo. That should be okay, as long 

280 # as they then try to get any datasets before some other client 

281 # creates the table. Chances are they'rejust validating 

282 # configuration. 

283 pass 

284 

285 # Determine whether checksums should be used - default to False 

286 self.useChecksum = self.config.get("checksum", False) 

287 

288 # Determine whether we can fall back to configuration if a 

289 # requested dataset is not known to registry 

290 self.trustGetRequest = self.config.get("trust_get_request", False) 

291 

292 # Create a cache manager 

293 self.cacheManager: AbstractDatastoreCacheManager 

294 if "cached" in self.config: 294 ↛ 298line 294 didn't jump to line 298, because the condition on line 294 was never false

295 self.cacheManager = DatastoreCacheManager(self.config["cached"], 

296 universe=bridgeManager.universe) 

297 else: 

298 self.cacheManager = DatastoreDisabledCacheManager("", 

299 universe=bridgeManager.universe) 

300 

301 # Check existence and create directory structure if necessary 

302 if not self.root.exists(): 

303 if "create" not in self.config or not self.config["create"]: 303 ↛ 304line 303 didn't jump to line 304, because the condition on line 303 was never true

304 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

305 try: 

306 self.root.mkdir() 

307 except Exception as e: 

308 raise ValueError(f"Can not create datastore root '{self.root}', check permissions." 

309 f" Got error: {e}") from e 

310 

311 def __str__(self) -> str: 

312 return str(self.root) 

313 

314 @property 

315 def bridge(self) -> DatastoreRegistryBridge: 

316 return self._bridge 

317 

318 def _artifact_exists(self, location: Location) -> bool: 

319 """Check that an artifact exists in this datastore at the specified 

320 location. 

321 

322 Parameters 

323 ---------- 

324 location : `Location` 

325 Expected location of the artifact associated with this datastore. 

326 

327 Returns 

328 ------- 

329 exists : `bool` 

330 True if the location can be found, false otherwise. 

331 """ 

332 log.debug("Checking if resource exists: %s", location.uri) 

333 return location.uri.exists() 

334 

335 def _delete_artifact(self, location: Location) -> None: 

336 """Delete the artifact from the datastore. 

337 

338 Parameters 

339 ---------- 

340 location : `Location` 

341 Location of the artifact associated with this datastore. 

342 """ 

343 if location.pathInStore.isabs(): 343 ↛ 344line 343 didn't jump to line 344, because the condition on line 343 was never true

344 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

345 

346 try: 

347 location.uri.remove() 

348 except FileNotFoundError: 

349 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

350 raise 

351 except Exception as e: 

352 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

353 raise 

354 log.debug("Successfully deleted file: %s", location.uri) 

355 

356 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

357 # Docstring inherited from GenericBaseDatastore 

358 records = [info.to_record(ref) for ref, info in zip(refs, infos)] 

359 self._table.insert(*records) 

360 

361 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

362 # Docstring inherited from GenericBaseDatastore 

363 

364 # Look for the dataset_id -- there might be multiple matches 

365 # if we have disassembled the dataset. 

366 records = self._table.fetch(dataset_id=ref.id) 

367 return [StoredFileInfo.from_record(record) for record in records] 

368 

369 def _get_stored_records_associated_with_refs(self, 

370 refs: Iterable[DatasetIdRef] 

371 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

372 """Retrieve all records associated with the provided refs. 

373 

374 Parameters 

375 ---------- 

376 refs : iterable of `DatasetIdRef` 

377 The refs for which records are to be retrieved. 

378 

379 Returns 

380 ------- 

381 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

382 The matching records indexed by the ref ID. The number of entries 

383 in the dict can be smaller than the number of requested refs. 

384 """ 

385 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

386 

387 # Uniqueness is dataset_id + component so can have multiple records 

388 # per ref. 

389 records_by_ref = defaultdict(list) 

390 for record in records: 

391 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

392 return records_by_ref 

393 

394 def _refs_associated_with_artifacts(self, paths: List[Union[str, ButlerURI]]) -> Dict[str, 

395 Set[DatasetId]]: 

396 """Return paths and associated dataset refs. 

397 

398 Parameters 

399 ---------- 

400 paths : `list` of `str` or `ButlerURI` 

401 All the paths to include in search. 

402 

403 Returns 

404 ------- 

405 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

406 Mapping of each path to a set of associated database IDs. 

407 """ 

408 records = self._table.fetch(path=[str(path) for path in paths]) 

409 result = defaultdict(set) 

410 for row in records: 

411 result[row["path"]].add(row["dataset_id"]) 

412 return result 

413 

414 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]: 

415 """Return all dataset refs associated with the supplied path. 

416 

417 Parameters 

418 ---------- 

419 pathInStore : `ButlerURI` 

420 Path of interest in the data store. 

421 

422 Returns 

423 ------- 

424 ids : `set` of `int` 

425 All `DatasetRef` IDs associated with this path. 

426 """ 

427 records = list(self._table.fetch(path=str(pathInStore))) 

428 ids = {r["dataset_id"] for r in records} 

429 return ids 

430 

431 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

432 # Docstring inherited from GenericBaseDatastore 

433 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

434 

435 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

436 r"""Find all the `Location`\ s of the requested dataset in the 

437 `Datastore` and the associated stored file information. 

438 

439 Parameters 

440 ---------- 

441 ref : `DatasetRef` 

442 Reference to the required `Dataset`. 

443 

444 Returns 

445 ------- 

446 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

447 Location of the dataset within the datastore and 

448 stored information about each file and its formatter. 

449 """ 

450 # Get the file information (this will fail if no file) 

451 records = self.getStoredItemsInfo(ref) 

452 

453 # Use the path to determine the location -- we need to take 

454 # into account absolute URIs in the datastore record 

455 return [(r.file_location(self.locationFactory), r) for r in records] 

456 

457 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

458 """Check that there is only one dataset associated with the 

459 specified artifact. 

460 

461 Parameters 

462 ---------- 

463 ref : `DatasetRef` or `FakeDatasetRef` 

464 Dataset to be removed. 

465 location : `Location` 

466 The location of the artifact to be removed. 

467 

468 Returns 

469 ------- 

470 can_remove : `Bool` 

471 True if the artifact can be safely removed. 

472 """ 

473 # Can't ever delete absolute URIs. 

474 if location.pathInStore.isabs(): 

475 return False 

476 

477 # Get all entries associated with this path 

478 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

479 if not allRefs: 

480 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

481 

482 # Remove these refs from all the refs and if there is nothing left 

483 # then we can delete 

484 remainingRefs = allRefs - {ref.id} 

485 

486 if remainingRefs: 

487 return False 

488 return True 

489 

490 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, 

491 StoredFileInfo]]: 

492 """Predict the location and related file information of the requested 

493 dataset in this datastore. 

494 

495 Parameters 

496 ---------- 

497 ref : `DatasetRef` 

498 Reference to the required `Dataset`. 

499 

500 Returns 

501 ------- 

502 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

503 Expected Location of the dataset within the datastore and 

504 placeholder information about each file and its formatter. 

505 

506 Notes 

507 ----- 

508 Uses the current configuration to determine how we would expect the 

509 datastore files to have been written if we couldn't ask registry. 

510 This is safe so long as there has been no change to datastore 

511 configuration between writing the dataset and wanting to read it. 

512 Will not work for files that have been ingested without using the 

513 standard file template or default formatter. 

514 """ 

515 

516 # If we have a component ref we always need to ask the questions 

517 # of the composite. If the composite is disassembled this routine 

518 # should return all components. If the composite was not 

519 # disassembled the composite is what is stored regardless of 

520 # component request. Note that if the caller has disassembled 

521 # a composite there is no way for this guess to know that 

522 # without trying both the composite and component ref and seeing 

523 # if there is something at the component Location even without 

524 # disassembly being enabled. 

525 if ref.datasetType.isComponent(): 

526 ref = ref.makeCompositeRef() 

527 

528 # See if the ref is a composite that should be disassembled 

529 doDisassembly = self.composites.shouldBeDisassembled(ref) 

530 

531 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

532 

533 if doDisassembly: 

534 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

535 compRef = ref.makeComponentRef(component) 

536 location, formatter = self._determine_put_formatter_location(compRef) 

537 all_info.append((location, formatter, componentStorage, component)) 

538 

539 else: 

540 # Always use the composite ref if no disassembly 

541 location, formatter = self._determine_put_formatter_location(ref) 

542 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

543 

544 # Convert the list of tuples to have StoredFileInfo as second element 

545 return [(location, StoredFileInfo(formatter=formatter, 

546 path=location.pathInStore.path, 

547 storageClass=storageClass, 

548 component=component, 

549 checksum=None, 

550 file_size=-1)) 

551 for location, formatter, storageClass, component in all_info] 

552 

553 def _prepare_for_get(self, ref: DatasetRef, 

554 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

555 """Check parameters for ``get`` and obtain formatter and 

556 location. 

557 

558 Parameters 

559 ---------- 

560 ref : `DatasetRef` 

561 Reference to the required Dataset. 

562 parameters : `dict` 

563 `StorageClass`-specific parameters that specify, for example, 

564 a slice of the dataset to be loaded. 

565 

566 Returns 

567 ------- 

568 getInfo : `list` [`DatastoreFileGetInformation`] 

569 Parameters needed to retrieve each file. 

570 """ 

571 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

572 

573 # Get file metadata and internal metadata 

574 fileLocations = self._get_dataset_locations_info(ref) 

575 if not fileLocations: 

576 if not self.trustGetRequest: 

577 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

578 # Assume the dataset is where we think it should be 

579 fileLocations = self._get_expected_dataset_locations_info(ref) 

580 

581 # The storage class we want to use eventually 

582 refStorageClass = ref.datasetType.storageClass 

583 

584 if len(fileLocations) > 1: 

585 disassembled = True 

586 

587 # If trust is involved it is possible that there will be 

588 # components listed here that do not exist in the datastore. 

589 # Explicitly check for file artifact existence and filter out any 

590 # that are missing. 

591 if self.trustGetRequest: 

592 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

593 

594 # For now complain only if we have no components at all. One 

595 # component is probably a problem but we can punt that to the 

596 # assembler. 

597 if not fileLocations: 597 ↛ 598line 597 didn't jump to line 598, because the condition on line 597 was never true

598 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

599 

600 else: 

601 disassembled = False 

602 

603 # Is this a component request? 

604 refComponent = ref.datasetType.component() 

605 

606 fileGetInfo = [] 

607 for location, storedFileInfo in fileLocations: 

608 

609 # The storage class used to write the file 

610 writeStorageClass = storedFileInfo.storageClass 

611 

612 # If this has been disassembled we need read to match the write 

613 if disassembled: 

614 readStorageClass = writeStorageClass 

615 else: 

616 readStorageClass = refStorageClass 

617 

618 formatter = getInstanceOf(storedFileInfo.formatter, 

619 FileDescriptor(location, readStorageClass=readStorageClass, 

620 storageClass=writeStorageClass, parameters=parameters), 

621 ref.dataId) 

622 

623 formatterParams, notFormatterParams = formatter.segregateParameters() 

624 

625 # Of the remaining parameters, extract the ones supported by 

626 # this StorageClass (for components not all will be handled) 

627 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

628 

629 # The ref itself could be a component if the dataset was 

630 # disassembled by butler, or we disassembled in datastore and 

631 # components came from the datastore records 

632 component = storedFileInfo.component if storedFileInfo.component else refComponent 

633 

634 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

635 assemblerParams, formatterParams, 

636 component, readStorageClass)) 

637 

638 return fileGetInfo 

639 

640 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

641 """Check the arguments for ``put`` and obtain formatter and 

642 location. 

643 

644 Parameters 

645 ---------- 

646 inMemoryDataset : `object` 

647 The dataset to store. 

648 ref : `DatasetRef` 

649 Reference to the associated Dataset. 

650 

651 Returns 

652 ------- 

653 location : `Location` 

654 The location to write the dataset. 

655 formatter : `Formatter` 

656 The `Formatter` to use to write the dataset. 

657 

658 Raises 

659 ------ 

660 TypeError 

661 Supplied object and storage class are inconsistent. 

662 DatasetTypeNotSupportedError 

663 The associated `DatasetType` is not handled by this datastore. 

664 """ 

665 self._validate_put_parameters(inMemoryDataset, ref) 

666 return self._determine_put_formatter_location(ref) 

667 

668 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

669 """Calculate the formatter and output location to use for put. 

670 

671 Parameters 

672 ---------- 

673 ref : `DatasetRef` 

674 Reference to the associated Dataset. 

675 

676 Returns 

677 ------- 

678 location : `Location` 

679 The location to write the dataset. 

680 formatter : `Formatter` 

681 The `Formatter` to use to write the dataset. 

682 """ 

683 # Work out output file name 

684 try: 

685 template = self.templates.getTemplate(ref) 

686 except KeyError as e: 

687 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

688 

689 # Validate the template to protect against filenames from different 

690 # dataIds returning the same and causing overwrite confusion. 

691 template.validateTemplate(ref) 

692 

693 location = self.locationFactory.fromPath(template.format(ref)) 

694 

695 # Get the formatter based on the storage class 

696 storageClass = ref.datasetType.storageClass 

697 try: 

698 formatter = self.formatterFactory.getFormatter(ref, 

699 FileDescriptor(location, 

700 storageClass=storageClass), 

701 ref.dataId) 

702 except KeyError as e: 

703 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

704 f"{self.name}") from e 

705 

706 # Now that we know the formatter, update the location 

707 location = formatter.makeUpdatedLocation(location) 

708 

709 return location, formatter 

710 

711 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

712 # Docstring inherited from base class 

713 if transfer != "auto": 

714 return transfer 

715 

716 # See if the paths are within the datastore or not 

717 inside = [self._pathInStore(d.path) is not None for d in datasets] 

718 

719 if all(inside): 

720 transfer = None 

721 elif not any(inside): 721 ↛ 730line 721 didn't jump to line 730, because the condition on line 721 was never false

722 # Allow ButlerURI to use its own knowledge 

723 transfer = "auto" 

724 else: 

725 # This can happen when importing from a datastore that 

726 # has had some datasets ingested using "direct" mode. 

727 # Also allow ButlerURI to sort it out but warn about it. 

728 # This can happen if you are importing from a datastore 

729 # that had some direct transfer datasets. 

730 log.warning("Some datasets are inside the datastore and some are outside. Using 'split' " 

731 "transfer mode. This assumes that the files outside the datastore are " 

732 "still accessible to the new butler since they will not be copied into " 

733 "the target datastore.") 

734 transfer = "split" 

735 

736 return transfer 

737 

738 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]: 

739 """Return path relative to datastore root 

740 

741 Parameters 

742 ---------- 

743 path : `str` or `ButlerURI` 

744 Path to dataset. Can be absolute URI. If relative assumed to 

745 be relative to the datastore. Returns path in datastore 

746 or raises an exception if the path it outside. 

747 

748 Returns 

749 ------- 

750 inStore : `str` 

751 Path relative to datastore root. Returns `None` if the file is 

752 outside the root. 

753 """ 

754 # Relative path will always be relative to datastore 

755 pathUri = ButlerURI(path, forceAbsolute=False) 

756 return pathUri.relative_to(self.root) 

757 

758 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *, 

759 transfer: Optional[str] = None) -> Union[str, ButlerURI]: 

760 """Standardize the path of a to-be-ingested file. 

761 

762 Parameters 

763 ---------- 

764 path : `str` or `ButlerURI` 

765 Path of a file to be ingested. 

766 transfer : `str`, optional 

767 How (and whether) the dataset should be added to the datastore. 

768 See `ingest` for details of transfer modes. 

769 This implementation is provided only so 

770 `NotImplementedError` can be raised if the mode is not supported; 

771 actual transfers are deferred to `_extractIngestInfo`. 

772 

773 Returns 

774 ------- 

775 path : `str` or `ButlerURI` 

776 New path in what the datastore considers standard form. If an 

777 absolute URI was given that will be returned unchanged. 

778 

779 Notes 

780 ----- 

781 Subclasses of `FileDatastore` can implement this method instead 

782 of `_prepIngest`. It should not modify the data repository or given 

783 file in any way. 

784 

785 Raises 

786 ------ 

787 NotImplementedError 

788 Raised if the datastore does not support the given transfer mode 

789 (including the case where ingest is not supported at all). 

790 FileNotFoundError 

791 Raised if one of the given files does not exist. 

792 """ 

793 if transfer not in (None, "direct", "split") + self.root.transferModes: 793 ↛ 794line 793 didn't jump to line 794, because the condition on line 793 was never true

794 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

795 

796 # A relative URI indicates relative to datastore root 

797 srcUri = ButlerURI(path, forceAbsolute=False) 

798 if not srcUri.isabs(): 

799 srcUri = self.root.join(path) 

800 

801 if not srcUri.exists(): 

802 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

803 f"are assumed to be relative to {self.root} unless they are absolute.") 

804 

805 if transfer is None: 

806 relpath = srcUri.relative_to(self.root) 

807 if not relpath: 

808 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

809 f"within datastore ({self.root})") 

810 

811 # Return the relative path within the datastore for internal 

812 # transfer 

813 path = relpath 

814 

815 return path 

816 

817 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

818 formatter: Union[Formatter, Type[Formatter]], 

819 transfer: Optional[str] = None) -> StoredFileInfo: 

820 """Relocate (if necessary) and extract `StoredFileInfo` from a 

821 to-be-ingested file. 

822 

823 Parameters 

824 ---------- 

825 path : `str` or `ButlerURI` 

826 URI or path of a file to be ingested. 

827 ref : `DatasetRef` 

828 Reference for the dataset being ingested. Guaranteed to have 

829 ``dataset_id not None`. 

830 formatter : `type` or `Formatter` 

831 `Formatter` subclass to use for this dataset or an instance. 

832 transfer : `str`, optional 

833 How (and whether) the dataset should be added to the datastore. 

834 See `ingest` for details of transfer modes. 

835 

836 Returns 

837 ------- 

838 info : `StoredFileInfo` 

839 Internal datastore record for this file. This will be inserted by 

840 the caller; the `_extractIngestInfo` is only resposible for 

841 creating and populating the struct. 

842 

843 Raises 

844 ------ 

845 FileNotFoundError 

846 Raised if one of the given files does not exist. 

847 FileExistsError 

848 Raised if transfer is not `None` but the (internal) location the 

849 file would be moved to is already occupied. 

850 """ 

851 if self._transaction is None: 851 ↛ 852line 851 didn't jump to line 852, because the condition on line 851 was never true

852 raise RuntimeError("Ingest called without transaction enabled") 

853 

854 # Create URI of the source path, do not need to force a relative 

855 # path to absolute. 

856 srcUri = ButlerURI(path, forceAbsolute=False) 

857 

858 # Track whether we have read the size of the source yet 

859 have_sized = False 

860 

861 tgtLocation: Optional[Location] 

862 if transfer is None or transfer == "split": 

863 # A relative path is assumed to be relative to the datastore 

864 # in this context 

865 if not srcUri.isabs(): 

866 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

867 else: 

868 # Work out the path in the datastore from an absolute URI 

869 # This is required to be within the datastore. 

870 pathInStore = srcUri.relative_to(self.root) 

871 if pathInStore is None and transfer is None: 871 ↛ 872line 871 didn't jump to line 872, because the condition on line 871 was never true

872 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

873 f"not within datastore {self.root}") 

874 if pathInStore: 874 ↛ 876line 874 didn't jump to line 876, because the condition on line 874 was never false

875 tgtLocation = self.locationFactory.fromPath(pathInStore) 

876 elif transfer == "split": 

877 # Outside the datastore but treat that as a direct ingest 

878 # instead. 

879 tgtLocation = None 

880 else: 

881 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for" 

882 f" URI {srcUri}") 

883 elif transfer == "direct": 883 ↛ 888line 883 didn't jump to line 888, because the condition on line 883 was never true

884 # Want to store the full URI to the resource directly in 

885 # datastore. This is useful for referring to permanent archive 

886 # storage for raw data. 

887 # Trust that people know what they are doing. 

888 tgtLocation = None 

889 else: 

890 # Work out the name we want this ingested file to have 

891 # inside the datastore 

892 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

893 if not tgtLocation.uri.dirname().exists(): 

894 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

895 tgtLocation.uri.dirname().mkdir() 

896 

897 # if we are transferring from a local file to a remote location 

898 # it may be more efficient to get the size and checksum of the 

899 # local file rather than the transferred one 

900 if not srcUri.scheme or srcUri.scheme == "file": 900 ↛ 910line 900 didn't jump to line 910, because the condition on line 900 was never false

901 size = srcUri.size() 

902 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

903 have_sized = True 

904 

905 # Transfer the resource to the destination. 

906 # Allow overwrite of an existing file. This matches the behavior 

907 # of datastore.put() in that it trusts that registry would not 

908 # be asking to overwrite unless registry thought that the 

909 # overwrite was allowed. 

910 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction, 

911 overwrite=True) 

912 

913 if tgtLocation is None: 913 ↛ 915line 913 didn't jump to line 915, because the condition on line 913 was never true

914 # This means we are using direct mode 

915 targetUri = srcUri 

916 targetPath = str(srcUri) 

917 else: 

918 targetUri = tgtLocation.uri 

919 targetPath = tgtLocation.pathInStore.path 

920 

921 # the file should exist in the datastore now 

922 if not have_sized: 

923 size = targetUri.size() 

924 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

925 

926 return StoredFileInfo(formatter=formatter, path=targetPath, 

927 storageClass=ref.datasetType.storageClass, 

928 component=ref.datasetType.component(), 

929 file_size=size, checksum=checksum) 

930 

931 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

932 # Docstring inherited from Datastore._prepIngest. 

933 filtered = [] 

934 for dataset in datasets: 

935 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

936 if not acceptable: 

937 continue 

938 else: 

939 dataset.refs = acceptable 

940 if dataset.formatter is None: 

941 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

942 else: 

943 assert isinstance(dataset.formatter, (type, str)) 

944 dataset.formatter = getClassOf(dataset.formatter) 

945 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

946 filtered.append(dataset) 

947 return _IngestPrepData(filtered) 

948 

949 @transactional 

950 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

951 # Docstring inherited from Datastore._finishIngest. 

952 refsAndInfos = [] 

953 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

954 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

955 # Do ingest as if the first dataset ref is associated with the file 

956 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

957 transfer=transfer) 

958 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

959 self._register_datasets(refsAndInfos) 

960 

961 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

962 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

963 """Given a source URI and a DatasetRef, determine the name the 

964 dataset will have inside datastore. 

965 

966 Parameters 

967 ---------- 

968 srcUri : `ButlerURI` 

969 URI to the source dataset file. 

970 ref : `DatasetRef` 

971 Ref associated with the newly-ingested dataset artifact. This 

972 is used to determine the name within the datastore. 

973 formatter : `Formatter` or Formatter class. 

974 Formatter to use for validation. Can be a class or an instance. 

975 

976 Returns 

977 ------- 

978 location : `Location` 

979 Target location for the newly-ingested dataset. 

980 """ 

981 # Ingesting a file from outside the datastore. 

982 # This involves a new name. 

983 template = self.templates.getTemplate(ref) 

984 location = self.locationFactory.fromPath(template.format(ref)) 

985 

986 # Get the extension 

987 ext = srcUri.getExtension() 

988 

989 # Update the destination to include that extension 

990 location.updateExtension(ext) 

991 

992 # Ask the formatter to validate this extension 

993 formatter.validateExtension(location) 

994 

995 return location 

996 

997 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

998 """Write out in memory dataset to datastore. 

999 

1000 Parameters 

1001 ---------- 

1002 inMemoryDataset : `object` 

1003 Dataset to write to datastore. 

1004 ref : `DatasetRef` 

1005 Registry information associated with this dataset. 

1006 

1007 Returns 

1008 ------- 

1009 info : `StoredFileInfo` 

1010 Information describin the artifact written to the datastore. 

1011 """ 

1012 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1013 uri = location.uri 

1014 

1015 if not uri.dirname().exists(): 

1016 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1017 uri.dirname().mkdir() 

1018 

1019 if self._transaction is None: 1019 ↛ 1020line 1019 didn't jump to line 1020, because the condition on line 1019 was never true

1020 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1021 

1022 def _removeFileExists(uri: ButlerURI) -> None: 

1023 """Remove a file and do not complain if it is not there. 

1024 

1025 This is important since a formatter might fail before the file 

1026 is written and we should not confuse people by writing spurious 

1027 error messages to the log. 

1028 """ 

1029 try: 

1030 uri.remove() 

1031 except FileNotFoundError: 

1032 pass 

1033 

1034 # Register a callback to try to delete the uploaded data if 

1035 # something fails below 

1036 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1037 

1038 # For a local file, simply use the formatter directly 

1039 if uri.isLocal: 

1040 try: 

1041 formatter.write(inMemoryDataset) 

1042 except Exception as e: 

1043 raise RuntimeError(f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} " 

1044 f"to location {uri}") from e 

1045 log.debug("Successfully wrote python object to local file at %s", uri) 

1046 else: 

1047 # This is a remote URI. Some datasets can be serialized directly 

1048 # to bytes and sent to the remote datastore without writing a 

1049 # file. If the dataset is intended to be saved to the cache 

1050 # a file is always written and direct write to the remote 

1051 # datastore is bypassed. 

1052 data_written = False 

1053 if not self.cacheManager.should_be_cached(ref): 

1054 try: 

1055 serializedDataset = formatter.toBytes(inMemoryDataset) 

1056 except NotImplementedError: 

1057 # Fallback to the file writing option. 

1058 pass 

1059 except Exception as e: 

1060 raise RuntimeError(f"Failed to serialize dataset {ref} " 

1061 f"of type {type(inMemoryDataset)} to bytes.") from e 

1062 else: 

1063 log.debug("Writing bytes directly to %s", uri) 

1064 uri.write(serializedDataset, overwrite=True) 

1065 log.debug("Successfully wrote bytes directly to %s", uri) 

1066 data_written = True 

1067 

1068 if not data_written: 

1069 # Did not write the bytes directly to object store so instead 

1070 # write to temporary file. 

1071 with ButlerURI.temporary_uri(suffix=uri.getExtension()) as temporary_uri: 

1072 # Need to configure the formatter to write to a different 

1073 # location and that needs us to overwrite internals 

1074 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1075 with formatter._updateLocation(Location(None, temporary_uri)): 

1076 try: 

1077 formatter.write(inMemoryDataset) 

1078 except Exception as e: 

1079 raise RuntimeError(f"Failed to serialize dataset {ref} of type" 

1080 f" {type(inMemoryDataset)} to " 

1081 f"temporary location {temporary_uri}") from e 

1082 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True) 

1083 

1084 # Cache if required 

1085 self.cacheManager.move_to_cache(temporary_uri, ref) 

1086 

1087 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1088 

1089 # URI is needed to resolve what ingest case are we dealing with 

1090 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1091 

1092 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

1093 ref: DatasetRef, isComponent: bool = False, 

1094 cache_ref: Optional[DatasetRef] = None) -> Any: 

1095 """Read the artifact from datastore into in memory object. 

1096 

1097 Parameters 

1098 ---------- 

1099 getInfo : `DatastoreFileGetInformation` 

1100 Information about the artifact within the datastore. 

1101 ref : `DatasetRef` 

1102 The registry information associated with this artifact. 

1103 isComponent : `bool` 

1104 Flag to indicate if a component is being read from this artifact. 

1105 cache_ref : `DatasetRef`, optional 

1106 The DatasetRef to use when looking up the file in the cache. 

1107 This ref must have the same ID as the supplied ref but can 

1108 be a parent ref or component ref to indicate to the cache whether 

1109 a composite file is being requested from the cache or a component 

1110 file. Without this the cache will default to the supplied ref but 

1111 it can get confused with read-only derived components for 

1112 disassembled composites. 

1113 

1114 Returns 

1115 ------- 

1116 inMemoryDataset : `object` 

1117 The artifact as a python object. 

1118 """ 

1119 location = getInfo.location 

1120 uri = location.uri 

1121 log.debug("Accessing data from %s", uri) 

1122 

1123 if cache_ref is None: 

1124 cache_ref = ref 

1125 if cache_ref.id != ref.id: 1125 ↛ 1126line 1125 didn't jump to line 1126, because the condition on line 1125 was never true

1126 raise ValueError("The supplied cache dataset ref refers to a different dataset than expected:" 

1127 f" {ref.id} != {cache_ref.id}") 

1128 

1129 # Cannot recalculate checksum but can compare size as a quick check 

1130 # Do not do this if the size is negative since that indicates 

1131 # we do not know. 

1132 recorded_size = getInfo.info.file_size 

1133 resource_size = uri.size() 

1134 if recorded_size >= 0 and resource_size != recorded_size: 1134 ↛ 1135line 1134 didn't jump to line 1135, because the condition on line 1134 was never true

1135 raise RuntimeError("Integrity failure in Datastore. " 

1136 f"Size of file {uri} ({resource_size}) " 

1137 f"does not match size recorded in registry of {recorded_size}") 

1138 

1139 # For the general case we have choices for how to proceed. 

1140 # 1. Always use a local file (downloading the remote resource to a 

1141 # temporary file if needed). 

1142 # 2. Use a threshold size and read into memory and use bytes. 

1143 # Use both for now with an arbitrary hand off size. 

1144 # This allows small datasets to be downloaded from remote object 

1145 # stores without requiring a temporary file. 

1146 

1147 formatter = getInfo.formatter 

1148 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1149 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1150 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1151 if cached_file is not None: 

1152 desired_uri = cached_file 

1153 msg = f" (cached version of {uri})" 

1154 else: 

1155 desired_uri = uri 

1156 msg = "" 

1157 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1158 serializedDataset = desired_uri.read() 

1159 log.debug("Deserializing %s from %d bytes from location %s with formatter %s", 

1160 f"component {getInfo.component}" if isComponent else "", 

1161 len(serializedDataset), uri, formatter.name()) 

1162 try: 

1163 result = formatter.fromBytes(serializedDataset, 

1164 component=getInfo.component if isComponent else None) 

1165 except Exception as e: 

1166 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1167 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1168 else: 

1169 # Read from file. 

1170 

1171 # Have to update the Location associated with the formatter 

1172 # because formatter.read does not allow an override. 

1173 # This could be improved. 

1174 location_updated = False 

1175 msg = "" 

1176 

1177 # First check in cache for local version. 

1178 # The cache will only be relevant for remote resources but 

1179 # no harm in always asking. Context manager ensures that cache 

1180 # file is not deleted during cache expiration. 

1181 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1182 if cached_file is not None: 

1183 msg = f"(via cache read of remote file {uri})" 

1184 uri = cached_file 

1185 location_updated = True 

1186 

1187 with uri.as_local() as local_uri: 

1188 

1189 can_be_cached = False 

1190 if uri != local_uri: 1190 ↛ 1192line 1190 didn't jump to line 1192, because the condition on line 1190 was never true

1191 # URI was remote and file was downloaded 

1192 cache_msg = "" 

1193 location_updated = True 

1194 

1195 if self.cacheManager.should_be_cached(cache_ref): 

1196 # In this scenario we want to ask if the downloaded 

1197 # file should be cached but we should not cache 

1198 # it until after we've used it (to ensure it can't 

1199 # be expired whilst we are using it). 

1200 can_be_cached = True 

1201 

1202 # Say that it is "likely" to be cached because 

1203 # if the formatter read fails we will not be 

1204 # caching this file. 

1205 cache_msg = " and likely cached" 

1206 

1207 msg = f"(via download to local file{cache_msg})" 

1208 

1209 # Calculate the (possibly) new location for the formatter 

1210 # to use. 

1211 newLocation = Location(*local_uri.split()) if location_updated else None 

1212 

1213 log.debug("Reading%s from location %s %s with formatter %s", 

1214 f" component {getInfo.component}" if isComponent else "", 

1215 uri, msg, formatter.name()) 

1216 try: 

1217 with formatter._updateLocation(newLocation): 

1218 with time_this(log, msg="Reading%s from location %s %s with formatter %s", 

1219 args=(f" component {getInfo.component}" if isComponent else "", 

1220 uri, msg, formatter.name())): 

1221 result = formatter.read(component=getInfo.component if isComponent else None) 

1222 except Exception as e: 

1223 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1224 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1225 

1226 # File was read successfully so can move to cache 

1227 if can_be_cached: 1227 ↛ 1228line 1227 didn't jump to line 1228, because the condition on line 1227 was never true

1228 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1229 

1230 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

1231 isComponent=isComponent) 

1232 

1233 def knows(self, ref: DatasetRef) -> bool: 

1234 """Check if the dataset is known to the datastore. 

1235 

1236 Does not check for existence of any artifact. 

1237 

1238 Parameters 

1239 ---------- 

1240 ref : `DatasetRef` 

1241 Reference to the required dataset. 

1242 

1243 Returns 

1244 ------- 

1245 exists : `bool` 

1246 `True` if the dataset is known to the datastore. 

1247 """ 

1248 fileLocations = self._get_dataset_locations_info(ref) 

1249 if fileLocations: 

1250 return True 

1251 return False 

1252 

1253 def _process_mexists_records(self, id_to_ref: Dict[DatasetId, DatasetRef], 

1254 records: Dict[DatasetId, List[StoredFileInfo]], 

1255 all_required: bool, 

1256 artifact_existence: Optional[Dict[ButlerURI, 

1257 bool]] = None) -> Dict[DatasetRef, bool]: 

1258 """Helper function for mexists that checks the given records. 

1259 

1260 Parameters 

1261 ---------- 

1262 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1263 Mapping of the dataset ID to the dataset ref itself. 

1264 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1265 Records as generally returned by 

1266 ``_get_stored_records_associated_with_refs``. 

1267 all_required : `bool` 

1268 Flag to indicate whether existence requires all artifacts 

1269 associated with a dataset ID to exist or not for existence. 

1270 artifact_existence : `dict` of [`ButlerURI`, `bool`], optional 

1271 Mapping of datastore artifact to existence. Updated by this 

1272 method with details of all artifacts tested. Can be `None` 

1273 if the caller is not interested. 

1274 

1275 Returns 

1276 ------- 

1277 existence : `dict` of [`DatasetRef`, `bool`] 

1278 Mapping from dataset to boolean indicating existence. 

1279 """ 

1280 # The URIs to be checked and a mapping of those URIs to 

1281 # the dataset ID. 

1282 uris_to_check: List[ButlerURI] = [] 

1283 location_map: Dict[ButlerURI, DatasetId] = {} 

1284 

1285 location_factory = self.locationFactory 

1286 

1287 for ref_id, info in records.items(): 

1288 # Key is the dataId, value is list of StoredItemInfo 

1289 uris = [info.file_location(location_factory).uri for info in info] 

1290 uris_to_check.extend(uris) 

1291 location_map.update({uri: ref_id for uri in uris}) 

1292 

1293 uri_existence: Dict[ButlerURI, bool] = {} 

1294 if artifact_existence is not None: 

1295 # If a URI has already been checked remove it from the list 

1296 # and immediately add the status to the output dict. 

1297 filtered_uris_to_check = [] 

1298 for uri in uris_to_check: 

1299 if uri in artifact_existence: 

1300 uri_existence[uri] = artifact_existence[uri] 

1301 else: 

1302 filtered_uris_to_check.append(uri) 

1303 uris_to_check = filtered_uris_to_check 

1304 

1305 # Results. 

1306 dataset_existence: Dict[DatasetRef, bool] = {} 

1307 

1308 uri_existence.update(ButlerURI.mexists(uris_to_check)) 

1309 for uri, exists in uri_existence.items(): 

1310 dataset_id = location_map[uri] 

1311 ref = id_to_ref[dataset_id] 

1312 

1313 # Disassembled composite needs to check all locations. 

1314 # all_required indicates whether all need to exist or not. 

1315 if ref in dataset_existence: 

1316 if all_required: 

1317 exists = dataset_existence[ref] and exists 

1318 else: 

1319 exists = dataset_existence[ref] or exists 

1320 dataset_existence[ref] = exists 

1321 

1322 if artifact_existence is not None: 

1323 artifact_existence.update(uri_existence) 

1324 

1325 return dataset_existence 

1326 

1327 def mexists(self, refs: Iterable[DatasetRef], 

1328 artifact_existence: Optional[Dict[ButlerURI, bool]] = None) -> Dict[DatasetRef, bool]: 

1329 """Check the existence of multiple datasets at once. 

1330 

1331 Parameters 

1332 ---------- 

1333 refs : iterable of `DatasetRef` 

1334 The datasets to be checked. 

1335 artifact_existence : `dict` of [`ButlerURI`, `bool`], optional 

1336 Mapping of datastore artifact to existence. Updated by this 

1337 method with details of all artifacts tested. Can be `None` 

1338 if the caller is not interested. 

1339 

1340 Returns 

1341 ------- 

1342 existence : `dict` of [`DatasetRef`, `bool`] 

1343 Mapping from dataset to boolean indicating existence. 

1344 """ 

1345 chunk_size = 10_000 

1346 dataset_existence: Dict[DatasetRef, bool] = {} 

1347 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", 

1348 chunk_size) 

1349 n_found_total = 0 

1350 n_checked = 0 

1351 n_chunks = 0 

1352 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1353 chunk_result = self._mexists(chunk, artifact_existence) 

1354 if log.isEnabledFor(VERBOSE): 

1355 n_results = len(chunk_result) 

1356 n_checked += n_results 

1357 # Can treat the booleans as 0, 1 integers and sum them. 

1358 n_found = sum(chunk_result.values()) 

1359 n_found_total += n_found 

1360 log.log(VERBOSE, "Number of datasets found in datastore for chunk %d = %d/%d" 

1361 " (running total: %d/%d)", 

1362 n_chunks, n_found, n_results, n_found_total, n_checked) 

1363 dataset_existence.update(chunk_result) 

1364 n_chunks += 1 

1365 

1366 return dataset_existence 

1367 

1368 def _mexists(self, refs: Iterable[DatasetRef], 

1369 artifact_existence: Optional[Dict[ButlerURI, bool]] = None) -> Dict[DatasetRef, bool]: 

1370 """Check the existence of multiple datasets at once. 

1371 

1372 Parameters 

1373 ---------- 

1374 refs : iterable of `DatasetRef` 

1375 The datasets to be checked. 

1376 

1377 Returns 

1378 ------- 

1379 existence : `dict` of [`DatasetRef`, `bool`] 

1380 Mapping from dataset to boolean indicating existence. 

1381 """ 

1382 # Need a mapping of dataset_id to dataset ref since the API 

1383 # works with dataset_id 

1384 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

1385 

1386 # Set of all IDs we are checking for. 

1387 requested_ids = set(id_to_ref.keys()) 

1388 

1389 # The records themselves. Could be missing some entries. 

1390 records = self._get_stored_records_associated_with_refs(refs) 

1391 

1392 dataset_existence = self._process_mexists_records(id_to_ref, records, True, 

1393 artifact_existence=artifact_existence) 

1394 

1395 # Set of IDs that have been handled. 

1396 handled_ids = {ref.id for ref in dataset_existence.keys()} 

1397 

1398 missing_ids = requested_ids - handled_ids 

1399 if missing_ids: 

1400 if not self.trustGetRequest: 

1401 # Must assume these do not exist 

1402 for missing in missing_ids: 

1403 dataset_existence[id_to_ref[missing]] = False 

1404 else: 

1405 log.debug("%d out of %d datasets were not known to datastore during initial existence check.", 

1406 len(missing_ids), len(requested_ids)) 

1407 

1408 # Construct data structure identical to that returned 

1409 # by _get_stored_records_associated_with_refs() but using 

1410 # guessed names. 

1411 records = {} 

1412 for missing in missing_ids: 

1413 expected = self._get_expected_dataset_locations_info(id_to_ref[missing]) 

1414 records[missing] = [info for _, info in expected] 

1415 

1416 dataset_existence.update(self._process_mexists_records(id_to_ref, records, False, 

1417 artifact_existence=artifact_existence)) 

1418 

1419 return dataset_existence 

1420 

1421 def exists(self, ref: DatasetRef) -> bool: 

1422 """Check if the dataset exists in the datastore. 

1423 

1424 Parameters 

1425 ---------- 

1426 ref : `DatasetRef` 

1427 Reference to the required dataset. 

1428 

1429 Returns 

1430 ------- 

1431 exists : `bool` 

1432 `True` if the entity exists in the `Datastore`. 

1433 """ 

1434 fileLocations = self._get_dataset_locations_info(ref) 

1435 

1436 # if we are being asked to trust that registry might not be correct 

1437 # we ask for the expected locations and check them explicitly 

1438 if not fileLocations: 

1439 if not self.trustGetRequest: 

1440 return False 

1441 

1442 # When we are guessing a dataset location we can not check 

1443 # for the existence of every component since we can not 

1444 # know if every component was written. Instead we check 

1445 # for the existence of any of the expected locations. 

1446 for location, _ in self._get_expected_dataset_locations_info(ref): 1446 ↛ 1449line 1446 didn't jump to line 1449, because the loop on line 1446 didn't complete

1447 if self._artifact_exists(location): 1447 ↛ 1446line 1447 didn't jump to line 1446, because the condition on line 1447 was never false

1448 return True 

1449 return False 

1450 

1451 # All listed artifacts must exist. 

1452 for location, _ in fileLocations: 

1453 if not self._artifact_exists(location): 

1454 return False 

1455 

1456 return True 

1457 

1458 def getURIs(self, ref: DatasetRef, 

1459 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1460 """Return URIs associated with dataset. 

1461 

1462 Parameters 

1463 ---------- 

1464 ref : `DatasetRef` 

1465 Reference to the required dataset. 

1466 predict : `bool`, optional 

1467 If the datastore does not know about the dataset, should it 

1468 return a predicted URI or not? 

1469 

1470 Returns 

1471 ------- 

1472 primary : `ButlerURI` 

1473 The URI to the primary artifact associated with this dataset. 

1474 If the dataset was disassembled within the datastore this 

1475 may be `None`. 

1476 components : `dict` 

1477 URIs to any components associated with the dataset artifact. 

1478 Can be empty if there are no components. 

1479 """ 

1480 

1481 primary: Optional[ButlerURI] = None 

1482 components: Dict[str, ButlerURI] = {} 

1483 

1484 # if this has never been written then we have to guess 

1485 if not self.exists(ref): 

1486 if not predict: 

1487 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1488 

1489 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1490 

1491 if doDisassembly: 

1492 

1493 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1494 compRef = ref.makeComponentRef(component) 

1495 compLocation, _ = self._determine_put_formatter_location(compRef) 

1496 

1497 # Add a URI fragment to indicate this is a guess 

1498 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1499 

1500 else: 

1501 

1502 location, _ = self._determine_put_formatter_location(ref) 

1503 

1504 # Add a URI fragment to indicate this is a guess 

1505 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1506 

1507 return primary, components 

1508 

1509 # If this is a ref that we have written we can get the path. 

1510 # Get file metadata and internal metadata 

1511 fileLocations = self._get_dataset_locations_info(ref) 

1512 

1513 guessing = False 

1514 if not fileLocations: 

1515 if not self.trustGetRequest: 1515 ↛ 1516line 1515 didn't jump to line 1516, because the condition on line 1515 was never true

1516 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1517 fileLocations = self._get_expected_dataset_locations_info(ref) 

1518 guessing = True 

1519 

1520 if len(fileLocations) == 1: 

1521 # No disassembly so this is the primary URI 

1522 uri = fileLocations[0][0].uri 

1523 if guessing and not uri.exists(): 1523 ↛ 1524line 1523 didn't jump to line 1524, because the condition on line 1523 was never true

1524 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1525 primary = uri 

1526 

1527 else: 

1528 for location, storedFileInfo in fileLocations: 

1529 if storedFileInfo.component is None: 1529 ↛ 1530line 1529 didn't jump to line 1530, because the condition on line 1529 was never true

1530 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1531 uri = location.uri 

1532 if guessing and not uri.exists(): 1532 ↛ 1536line 1532 didn't jump to line 1536, because the condition on line 1532 was never true

1533 # If we are trusting then it is entirely possible for 

1534 # some components to be missing. In that case we skip 

1535 # to the next component. 

1536 if self.trustGetRequest: 

1537 continue 

1538 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1539 components[storedFileInfo.component] = uri 

1540 

1541 return primary, components 

1542 

1543 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1544 """URI to the Dataset. 

1545 

1546 Parameters 

1547 ---------- 

1548 ref : `DatasetRef` 

1549 Reference to the required Dataset. 

1550 predict : `bool` 

1551 If `True`, allow URIs to be returned of datasets that have not 

1552 been written. 

1553 

1554 Returns 

1555 ------- 

1556 uri : `str` 

1557 URI pointing to the dataset within the datastore. If the 

1558 dataset does not exist in the datastore, and if ``predict`` is 

1559 `True`, the URI will be a prediction and will include a URI 

1560 fragment "#predicted". 

1561 If the datastore does not have entities that relate well 

1562 to the concept of a URI the returned URI will be 

1563 descriptive. The returned URI is not guaranteed to be obtainable. 

1564 

1565 Raises 

1566 ------ 

1567 FileNotFoundError 

1568 Raised if a URI has been requested for a dataset that does not 

1569 exist and guessing is not allowed. 

1570 RuntimeError 

1571 Raised if a request is made for a single URI but multiple URIs 

1572 are associated with this dataset. 

1573 

1574 Notes 

1575 ----- 

1576 When a predicted URI is requested an attempt will be made to form 

1577 a reasonable URI based on file templates and the expected formatter. 

1578 """ 

1579 primary, components = self.getURIs(ref, predict) 

1580 if primary is None or components: 1580 ↛ 1581line 1580 didn't jump to line 1581, because the condition on line 1580 was never true

1581 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

1582 "Use Dataastore.getURIs() instead.") 

1583 return primary 

1584 

1585 def retrieveArtifacts(self, refs: Iterable[DatasetRef], 

1586 destination: ButlerURI, transfer: str = "auto", 

1587 preserve_path: bool = True, 

1588 overwrite: bool = False) -> List[ButlerURI]: 

1589 """Retrieve the file artifacts associated with the supplied refs. 

1590 

1591 Parameters 

1592 ---------- 

1593 refs : iterable of `DatasetRef` 

1594 The datasets for which file artifacts are to be retrieved. 

1595 A single ref can result in multiple files. The refs must 

1596 be resolved. 

1597 destination : `ButlerURI` 

1598 Location to write the file artifacts. 

1599 transfer : `str`, optional 

1600 Method to use to transfer the artifacts. Must be one of the options 

1601 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

1602 preserve_path : `bool`, optional 

1603 If `True` the full path of the file artifact within the datastore 

1604 is preserved. If `False` the final file component of the path 

1605 is used. 

1606 overwrite : `bool`, optional 

1607 If `True` allow transfers to overwrite existing files at the 

1608 destination. 

1609 

1610 Returns 

1611 ------- 

1612 targets : `list` of `ButlerURI` 

1613 URIs of file artifacts in destination location. Order is not 

1614 preserved. 

1615 """ 

1616 if not destination.isdir(): 1616 ↛ 1617line 1616 didn't jump to line 1617, because the condition on line 1616 was never true

1617 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1618 

1619 if transfer == "move": 

1620 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1621 

1622 # Source -> Destination 

1623 # This also helps filter out duplicate DatasetRef in the request 

1624 # that will map to the same underlying file transfer. 

1625 to_transfer: Dict[ButlerURI, ButlerURI] = {} 

1626 

1627 for ref in refs: 

1628 locations = self._get_dataset_locations_info(ref) 

1629 for location, _ in locations: 

1630 source_uri = location.uri 

1631 target_path: Union[str, ButlerURI] 

1632 if preserve_path: 

1633 target_path = location.pathInStore 

1634 if target_path.isabs(): 1634 ↛ 1637line 1634 didn't jump to line 1637, because the condition on line 1634 was never true

1635 # This is an absolute path to an external file. 

1636 # Use the full path. 

1637 target_path = target_path.relativeToPathRoot 

1638 else: 

1639 target_path = source_uri.basename() 

1640 target_uri = destination.join(target_path) 

1641 to_transfer[source_uri] = target_uri 

1642 

1643 # In theory can now parallelize the transfer 

1644 log.debug("Number of artifacts to transfer to %s: %d", 

1645 str(destination), len(to_transfer)) 

1646 for source_uri, target_uri in to_transfer.items(): 

1647 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1648 

1649 return list(to_transfer.values()) 

1650 

1651 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1652 """Load an InMemoryDataset from the store. 

1653 

1654 Parameters 

1655 ---------- 

1656 ref : `DatasetRef` 

1657 Reference to the required Dataset. 

1658 parameters : `dict` 

1659 `StorageClass`-specific parameters that specify, for example, 

1660 a slice of the dataset to be loaded. 

1661 

1662 Returns 

1663 ------- 

1664 inMemoryDataset : `object` 

1665 Requested dataset or slice thereof as an InMemoryDataset. 

1666 

1667 Raises 

1668 ------ 

1669 FileNotFoundError 

1670 Requested dataset can not be retrieved. 

1671 TypeError 

1672 Return value from formatter has unexpected type. 

1673 ValueError 

1674 Formatter failed to process the dataset. 

1675 """ 

1676 allGetInfo = self._prepare_for_get(ref, parameters) 

1677 refComponent = ref.datasetType.component() 

1678 

1679 # Supplied storage class for the component being read 

1680 refStorageClass = ref.datasetType.storageClass 

1681 

1682 # Create mapping from component name to related info 

1683 allComponents = {i.component: i for i in allGetInfo} 

1684 

1685 # By definition the dataset is disassembled if we have more 

1686 # than one record for it. 

1687 isDisassembled = len(allGetInfo) > 1 

1688 

1689 # Look for the special case where we are disassembled but the 

1690 # component is a derived component that was not written during 

1691 # disassembly. For this scenario we need to check that the 

1692 # component requested is listed as a derived component for the 

1693 # composite storage class 

1694 isDisassembledReadOnlyComponent = False 

1695 if isDisassembled and refComponent: 

1696 # The composite storage class should be accessible through 

1697 # the component dataset type 

1698 compositeStorageClass = ref.datasetType.parentStorageClass 

1699 

1700 # In the unlikely scenario where the composite storage 

1701 # class is not known, we can only assume that this is a 

1702 # normal component. If that assumption is wrong then the 

1703 # branch below that reads a persisted component will fail 

1704 # so there is no need to complain here. 

1705 if compositeStorageClass is not None: 1705 ↛ 1708line 1705 didn't jump to line 1708, because the condition on line 1705 was never false

1706 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1707 

1708 if isDisassembled and not refComponent: 

1709 # This was a disassembled dataset spread over multiple files 

1710 # and we need to put them all back together again. 

1711 # Read into memory and then assemble 

1712 

1713 # Check that the supplied parameters are suitable for the type read 

1714 refStorageClass.validateParameters(parameters) 

1715 

1716 # We want to keep track of all the parameters that were not used 

1717 # by formatters. We assume that if any of the component formatters 

1718 # use a parameter that we do not need to apply it again in the 

1719 # assembler. 

1720 usedParams = set() 

1721 

1722 components: Dict[str, Any] = {} 

1723 for getInfo in allGetInfo: 

1724 # assemblerParams are parameters not understood by the 

1725 # associated formatter. 

1726 usedParams.update(set(getInfo.formatterParams)) 

1727 

1728 component = getInfo.component 

1729 

1730 if component is None: 1730 ↛ 1731line 1730 didn't jump to line 1731, because the condition on line 1730 was never true

1731 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1732 

1733 # We do not want the formatter to think it's reading 

1734 # a component though because it is really reading a 

1735 # standalone dataset -- always tell reader it is not a 

1736 # component. 

1737 components[component] = self._read_artifact_into_memory(getInfo, 

1738 ref.makeComponentRef(component), 

1739 isComponent=False) 

1740 

1741 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1742 

1743 # Any unused parameters will have to be passed to the assembler 

1744 if parameters: 

1745 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1746 else: 

1747 unusedParams = {} 

1748 

1749 # Process parameters 

1750 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1751 parameters=unusedParams) 

1752 

1753 elif isDisassembledReadOnlyComponent: 

1754 

1755 compositeStorageClass = ref.datasetType.parentStorageClass 

1756 if compositeStorageClass is None: 1756 ↛ 1757line 1756 didn't jump to line 1757, because the condition on line 1756 was never true

1757 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since" 

1758 "no composite storage class is available.") 

1759 

1760 if refComponent is None: 1760 ↛ 1762line 1760 didn't jump to line 1762, because the condition on line 1760 was never true

1761 # Mainly for mypy 

1762 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1763 

1764 # Assume that every derived component can be calculated by 

1765 # forwarding the request to a single read/write component. 

1766 # Rather than guessing which rw component is the right one by 

1767 # scanning each for a derived component of the same name, 

1768 # we ask the storage class delegate directly which one is best to 

1769 # use. 

1770 compositeDelegate = compositeStorageClass.delegate() 

1771 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1772 set(allComponents)) 

1773 

1774 # Select the relevant component 

1775 rwInfo = allComponents[forwardedComponent] 

1776 

1777 # For now assume that read parameters are validated against 

1778 # the real component and not the requested component 

1779 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1780 forwardedStorageClass.validateParameters(parameters) 

1781 

1782 # The reference to use for the caching must refer to the forwarded 

1783 # component and not the derived component. 

1784 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

1785 

1786 # Unfortunately the FileDescriptor inside the formatter will have 

1787 # the wrong write storage class so we need to create a new one 

1788 # given the immutability constraint. 

1789 writeStorageClass = rwInfo.info.storageClass 

1790 

1791 # We may need to put some thought into parameters for read 

1792 # components but for now forward them on as is 

1793 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1794 readStorageClass=refStorageClass, 

1795 storageClass=writeStorageClass, 

1796 parameters=parameters), 

1797 ref.dataId) 

1798 

1799 # The assembler can not receive any parameter requests for a 

1800 # derived component at this time since the assembler will 

1801 # see the storage class of the derived component and those 

1802 # parameters will have to be handled by the formatter on the 

1803 # forwarded storage class. 

1804 assemblerParams: Dict[str, Any] = {} 

1805 

1806 # Need to created a new info that specifies the derived 

1807 # component and associated storage class 

1808 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1809 rwInfo.info, assemblerParams, {}, 

1810 refComponent, refStorageClass) 

1811 

1812 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, 

1813 cache_ref=cache_ref) 

1814 

1815 else: 

1816 # Single file request or component from that composite file 

1817 for lookup in (refComponent, None): 1817 ↛ 1822line 1817 didn't jump to line 1822, because the loop on line 1817 didn't complete

1818 if lookup in allComponents: 1818 ↛ 1817line 1818 didn't jump to line 1817, because the condition on line 1818 was never false

1819 getInfo = allComponents[lookup] 

1820 break 

1821 else: 

1822 raise FileNotFoundError(f"Component {refComponent} not found " 

1823 f"for ref {ref} in datastore {self.name}") 

1824 

1825 # Do not need the component itself if already disassembled 

1826 if isDisassembled: 

1827 isComponent = False 

1828 else: 

1829 isComponent = getInfo.component is not None 

1830 

1831 # For a component read of a composite we want the cache to 

1832 # be looking at the composite ref itself. 

1833 cache_ref = ref.makeCompositeRef() if isComponent else ref 

1834 

1835 # For a disassembled component we can validate parametersagainst 

1836 # the component storage class directly 

1837 if isDisassembled: 

1838 refStorageClass.validateParameters(parameters) 

1839 else: 

1840 # For an assembled composite this could be a derived 

1841 # component derived from a real component. The validity 

1842 # of the parameters is not clear. For now validate against 

1843 # the composite storage class 

1844 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1845 

1846 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, 

1847 cache_ref=cache_ref) 

1848 

1849 @transactional 

1850 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1851 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1852 

1853 Parameters 

1854 ---------- 

1855 inMemoryDataset : `object` 

1856 The dataset to store. 

1857 ref : `DatasetRef` 

1858 Reference to the associated Dataset. 

1859 

1860 Raises 

1861 ------ 

1862 TypeError 

1863 Supplied object and storage class are inconsistent. 

1864 DatasetTypeNotSupportedError 

1865 The associated `DatasetType` is not handled by this datastore. 

1866 

1867 Notes 

1868 ----- 

1869 If the datastore is configured to reject certain dataset types it 

1870 is possible that the put will fail and raise a 

1871 `DatasetTypeNotSupportedError`. The main use case for this is to 

1872 allow `ChainedDatastore` to put to multiple datastores without 

1873 requiring that every datastore accepts the dataset. 

1874 """ 

1875 

1876 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1877 # doDisassembly = True 

1878 

1879 artifacts = [] 

1880 if doDisassembly: 

1881 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1882 for component, componentInfo in components.items(): 

1883 # Don't recurse because we want to take advantage of 

1884 # bulk insert -- need a new DatasetRef that refers to the 

1885 # same dataset_id but has the component DatasetType 

1886 # DatasetType does not refer to the types of components 

1887 # So we construct one ourselves. 

1888 compRef = ref.makeComponentRef(component) 

1889 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1890 artifacts.append((compRef, storedInfo)) 

1891 else: 

1892 # Write the entire thing out 

1893 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1894 artifacts.append((ref, storedInfo)) 

1895 

1896 self._register_datasets(artifacts) 

1897 

1898 @transactional 

1899 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

1900 # At this point can safely remove these datasets from the cache 

1901 # to avoid confusion later on. If they are not trashed later 

1902 # the cache will simply be refilled. 

1903 self.cacheManager.remove_from_cache(ref) 

1904 

1905 # If we are in trust mode there will be nothing to move to 

1906 # the trash table and we will have to try to delete the file 

1907 # immediately. 

1908 if self.trustGetRequest: 

1909 # Try to keep the logic below for a single file trash. 

1910 if isinstance(ref, DatasetRef): 

1911 refs = {ref} 

1912 else: 

1913 # Will recreate ref at the end of this branch. 

1914 refs = set(ref) 

1915 

1916 # Determine which datasets are known to datastore directly. 

1917 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

1918 existing_ids = self._get_stored_records_associated_with_refs(refs) 

1919 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

1920 

1921 missing = refs - existing_refs 

1922 if missing: 

1923 # Do an explicit existence check on these refs. 

1924 # We only care about the artifacts at this point and not 

1925 # the dataset existence. 

1926 artifact_existence: Dict[ButlerURI, bool] = {} 

1927 _ = self.mexists(missing, artifact_existence) 

1928 uris = [uri for uri, exists in artifact_existence.items() if exists] 

1929 

1930 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

1931 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

1932 for uri in uris: 

1933 try: 

1934 uri.remove() 

1935 except Exception as e: 

1936 if ignore_errors: 

1937 log.debug("Artifact %s could not be removed: %s", uri, e) 

1938 continue 

1939 raise 

1940 

1941 # There is no point asking the code below to remove refs we 

1942 # know are missing so update it with the list of existing 

1943 # records. Try to retain one vs many logic. 

1944 if not existing_refs: 

1945 # Nothing more to do since none of the datasets were 

1946 # known to the datastore record table. 

1947 return 

1948 ref = list(existing_refs) 

1949 if len(ref) == 1: 

1950 ref = ref[0] 

1951 

1952 # Get file metadata and internal metadata 

1953 if not isinstance(ref, DatasetRef): 

1954 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

1955 # Assumed to be an iterable of refs so bulk mode enabled. 

1956 try: 

1957 self.bridge.moveToTrash(ref) 

1958 except Exception as e: 

1959 if ignore_errors: 

1960 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

1961 else: 

1962 raise 

1963 return 

1964 

1965 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

1966 

1967 fileLocations = self._get_dataset_locations_info(ref) 

1968 

1969 if not fileLocations: 

1970 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1971 if ignore_errors: 

1972 log.warning(err_msg) 

1973 return 

1974 else: 

1975 raise FileNotFoundError(err_msg) 

1976 

1977 for location, storedFileInfo in fileLocations: 

1978 if not self._artifact_exists(location): 1978 ↛ 1979line 1978 didn't jump to line 1979, because the condition on line 1978 was never true

1979 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1980 f"associated artifact ({location.uri}) is missing" 

1981 if ignore_errors: 

1982 log.warning(err_msg) 

1983 return 

1984 else: 

1985 raise FileNotFoundError(err_msg) 

1986 

1987 # Mark dataset as trashed 

1988 try: 

1989 self.bridge.moveToTrash([ref]) 

1990 except Exception as e: 

1991 if ignore_errors: 

1992 log.warning("Attempted to mark dataset (%s) to be trashed in datastore %s " 

1993 "but encountered an error: %s", ref, self.name, e) 

1994 pass 

1995 else: 

1996 raise 

1997 

1998 @transactional 

1999 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2000 """Remove all datasets from the trash. 

2001 

2002 Parameters 

2003 ---------- 

2004 ignore_errors : `bool` 

2005 If `True` return without error even if something went wrong. 

2006 Problems could occur if another process is simultaneously trying 

2007 to delete. 

2008 """ 

2009 log.debug("Emptying trash in datastore %s", self.name) 

2010 

2011 # Context manager will empty trash iff we finish it without raising. 

2012 # It will also automatically delete the relevant rows from the 

2013 # trash table and the records table. 

2014 with self.bridge.emptyTrash(self._table, record_class=StoredFileInfo, 

2015 record_column="path") as trash_data: 

2016 # Removing the artifacts themselves requires that the files are 

2017 # not also associated with refs that are not to be trashed. 

2018 # Therefore need to do a query with the file paths themselves 

2019 # and return all the refs associated with them. Can only delete 

2020 # a file if the refs to be trashed are the only refs associated 

2021 # with the file. 

2022 # This requires multiple copies of the trashed items 

2023 trashed, artifacts_to_keep = trash_data 

2024 

2025 if artifacts_to_keep is None: 

2026 # The bridge is not helping us so have to work it out 

2027 # ourselves. This is not going to be as efficient. 

2028 trashed = list(trashed) 

2029 

2030 # The instance check is for mypy since up to this point it 

2031 # does not know the type of info. 

2032 path_map = self._refs_associated_with_artifacts([info.path for _, info in trashed 

2033 if isinstance(info, StoredFileInfo)]) 

2034 

2035 for ref, info in trashed: 

2036 

2037 # Mypy needs to know this is not the base class 

2038 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2039 

2040 # Check for mypy 

2041 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2042 

2043 path_map[info.path].remove(ref.id) 

2044 if not path_map[info.path]: 2044 ↛ 2035line 2044 didn't jump to line 2035, because the condition on line 2044 was never false

2045 del path_map[info.path] 

2046 

2047 artifacts_to_keep = set(path_map) 

2048 

2049 for ref, info in trashed: 

2050 

2051 # Should not happen for this implementation but need 

2052 # to keep mypy happy. 

2053 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2054 

2055 # Mypy needs to know this is not the base class 

2056 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2057 

2058 # Check for mypy 

2059 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2060 

2061 if info.path in artifacts_to_keep: 

2062 # This is a multi-dataset artifact and we are not 

2063 # removing all associated refs. 

2064 continue 

2065 

2066 # Only trashed refs still known to datastore will be returned. 

2067 location = info.file_location(self.locationFactory) 

2068 

2069 # Point of no return for this artifact 

2070 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2071 try: 

2072 self._delete_artifact(location) 

2073 except FileNotFoundError: 

2074 # If the file itself has been deleted there is nothing 

2075 # we can do about it. It is possible that trash has 

2076 # been run in parallel in another process or someone 

2077 # decided to delete the file. It is unlikely to come 

2078 # back and so we should still continue with the removal 

2079 # of the entry from the trash table. It is also possible 

2080 # we removed it in a previous iteration if it was 

2081 # a multi-dataset artifact. The delete artifact method 

2082 # will log a debug message in this scenario. 

2083 # Distinguishing file missing before trash started and 

2084 # file already removed previously as part of this trash 

2085 # is not worth the distinction with regards to potential 

2086 # memory cost. 

2087 pass 

2088 except Exception as e: 

2089 if ignore_errors: 

2090 # Use a debug message here even though it's not 

2091 # a good situation. In some cases this can be 

2092 # caused by a race between user A and user B 

2093 # and neither of them has permissions for the 

2094 # other's files. Butler does not know about users 

2095 # and trash has no idea what collections these 

2096 # files were in (without guessing from a path). 

2097 log.debug("Encountered error removing artifact %s from datastore %s: %s", 

2098 location.uri, self.name, e) 

2099 else: 

2100 raise 

2101 

2102 @transactional 

2103 def transfer_from(self, source_datastore: Datastore, refs: Iterable[DatasetRef], 

2104 local_refs: Optional[Iterable[DatasetRef]] = None, 

2105 transfer: str = "auto", 

2106 artifact_existence: Optional[Dict[ButlerURI, bool]] = None) -> None: 

2107 # Docstring inherited 

2108 if type(self) is not type(source_datastore): 

2109 raise TypeError(f"Datastore mismatch between this datastore ({type(self)}) and the " 

2110 f"source datastore ({type(source_datastore)}).") 

2111 

2112 # Be explicit for mypy 

2113 if not isinstance(source_datastore, FileDatastore): 2113 ↛ 2114line 2113 didn't jump to line 2114, because the condition on line 2113 was never true

2114 raise TypeError("Can only transfer to a FileDatastore from another FileDatastore, not" 

2115 f" {type(source_datastore)}") 

2116 

2117 # Stop early if "direct" transfer mode is requested. That would 

2118 # require that the URI inside the source datastore should be stored 

2119 # directly in the target datastore, which seems unlikely to be useful 

2120 # since at any moment the source datastore could delete the file. 

2121 if transfer in ("direct", "split"): 

2122 raise ValueError(f"Can not transfer from a source datastore using {transfer} mode since" 

2123 " those files are controlled by the other datastore.") 

2124 

2125 # Empty existence lookup if none given. 

2126 if artifact_existence is None: 

2127 artifact_existence = {} 

2128 

2129 # We will go through the list multiple times so must convert 

2130 # generators to lists. 

2131 refs = list(refs) 

2132 

2133 if local_refs is None: 

2134 local_refs = refs 

2135 else: 

2136 local_refs = list(local_refs) 

2137 

2138 # In order to handle disassembled composites the code works 

2139 # at the records level since it can assume that internal APIs 

2140 # can be used. 

2141 # - If the record already exists in the destination this is assumed 

2142 # to be okay. 

2143 # - If there is no record but the source and destination URIs are 

2144 # identical no transfer is done but the record is added. 

2145 # - If the source record refers to an absolute URI currently assume 

2146 # that that URI should remain absolute and will be visible to the 

2147 # destination butler. May need to have a flag to indicate whether 

2148 # the dataset should be transferred. This will only happen if 

2149 # the detached Butler has had a local ingest. 

2150 

2151 # What we really want is all the records in the source datastore 

2152 # associated with these refs. Or derived ones if they don't exist 

2153 # in the source. 

2154 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2155 

2156 # The source dataset_ids are the keys in these records 

2157 source_ids = set(source_records) 

2158 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2159 

2160 # The not None check is to appease mypy 

2161 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

2162 missing_ids = requested_ids - source_ids 

2163 

2164 # Missing IDs can be okay if that datastore has allowed 

2165 # gets based on file existence. Should we transfer what we can 

2166 # or complain about it and warn? 

2167 if missing_ids and not source_datastore.trustGetRequest: 2167 ↛ 2168line 2167 didn't jump to line 2168, because the condition on line 2167 was never true

2168 raise ValueError(f"Some datasets are missing from source datastore {source_datastore}:" 

2169 f" {missing_ids}") 

2170 

2171 # Need to map these missing IDs to a DatasetRef so we can guess 

2172 # the details. 

2173 if missing_ids: 

2174 log.info("Number of expected datasets missing from source datastore records: %d out of %d", 

2175 len(missing_ids), len(requested_ids)) 

2176 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2177 

2178 # This should be chunked in case we end up having to check 

2179 # the file store since we need some log output to show 

2180 # progress. 

2181 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2182 records = {} 

2183 for missing in missing_ids_chunk: 

2184 # Ask the source datastore where the missing artifacts 

2185 # should be. An execution butler might not know about the 

2186 # artifacts even if they are there. 

2187 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2188 records[missing] = [info for _, info in expected] 

2189 

2190 # Call the mexist helper method in case we have not already 

2191 # checked these artifacts such that artifact_existence is 

2192 # empty. This allows us to benefit from parallelism. 

2193 # datastore.mexists() itself does not give us access to the 

2194 # derived datastore record. 

2195 log.log(VERBOSE, "Checking existence of %d datasets unknown to datastore", 

2196 len(records)) 

2197 ref_exists = source_datastore._process_mexists_records(id_to_ref, records, False, 

2198 artifact_existence=artifact_existence) 

2199 

2200 # Now go through the records and propagate the ones that exist. 

2201 location_factory = source_datastore.locationFactory 

2202 for missing, record_list in records.items(): 

2203 # Skip completely if the ref does not exist. 

2204 ref = id_to_ref[missing] 

2205 if not ref_exists[ref]: 

2206 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", 

2207 ref) 

2208 continue 

2209 # Check for file artifact to decide which parts of a 

2210 # disassembled composite do exist. If there is only a 

2211 # single record we don't even need to look because it can't 

2212 # be a composite and must exist. 

2213 if len(record_list) == 1: 

2214 dataset_records = record_list 

2215 else: 

2216 dataset_records = [record for record in record_list 

2217 if artifact_existence[record.file_location(location_factory).uri]] 

2218 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2219 

2220 # Rely on source_records being a defaultdict. 

2221 source_records[missing].extend(dataset_records) 

2222 

2223 # See if we already have these records 

2224 target_records = self._get_stored_records_associated_with_refs(local_refs) 

2225 

2226 # The artifacts to register 

2227 artifacts = [] 

2228 

2229 # Refs that already exist 

2230 already_present = [] 

2231 

2232 # Now can transfer the artifacts 

2233 for source_ref, target_ref in zip(refs, local_refs): 

2234 if target_ref.id in target_records: 

2235 # Already have an artifact for this. 

2236 already_present.append(target_ref) 

2237 continue 

2238 

2239 # mypy needs to know these are always resolved refs 

2240 for info in source_records[source_ref.getCheckedId()]: 

2241 source_location = info.file_location(source_datastore.locationFactory) 

2242 target_location = info.file_location(self.locationFactory) 

2243 if source_location == target_location: 2243 ↛ 2247line 2243 didn't jump to line 2247, because the condition on line 2243 was never true

2244 # Either the dataset is already in the target datastore 

2245 # (which is how execution butler currently runs) or 

2246 # it is an absolute URI. 

2247 if source_location.pathInStore.isabs(): 

2248 # Just because we can see the artifact when running 

2249 # the transfer doesn't mean it will be generally 

2250 # accessible to a user of this butler. For now warn 

2251 # but assume it will be accessible. 

2252 log.warning("Transfer request for an outside-datastore artifact has been found at %s", 

2253 source_location) 

2254 else: 

2255 # Need to transfer it to the new location. 

2256 # Assume we should always overwrite. If the artifact 

2257 # is there this might indicate that a previous transfer 

2258 # was interrupted but was not able to be rolled back 

2259 # completely (eg pre-emption) so follow Datastore default 

2260 # and overwrite. 

2261 target_location.uri.transfer_from(source_location.uri, transfer=transfer, 

2262 overwrite=True, transaction=self._transaction) 

2263 

2264 artifacts.append((target_ref, info)) 

2265 

2266 self._register_datasets(artifacts) 

2267 

2268 if already_present: 

2269 n_skipped = len(already_present) 

2270 log.info("Skipped transfer of %d dataset%s already present in datastore", n_skipped, 

2271 "" if n_skipped == 1 else "s") 

2272 

2273 @transactional 

2274 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2275 # Docstring inherited. 

2276 refs = list(refs) 

2277 self.bridge.forget(refs) 

2278 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

2279 

2280 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

2281 logFailures: bool = False) -> None: 

2282 """Validate some of the configuration for this datastore. 

2283 

2284 Parameters 

2285 ---------- 

2286 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2287 Entities to test against this configuration. Can be differing 

2288 types. 

2289 logFailures : `bool`, optional 

2290 If `True`, output a log message for every validation error 

2291 detected. 

2292 

2293 Raises 

2294 ------ 

2295 DatastoreValidationError 

2296 Raised if there is a validation problem with a configuration. 

2297 All the problems are reported in a single exception. 

2298 

2299 Notes 

2300 ----- 

2301 This method checks that all the supplied entities have valid file 

2302 templates and also have formatters defined. 

2303 """ 

2304 

2305 templateFailed = None 

2306 try: 

2307 self.templates.validateTemplates(entities, logFailures=logFailures) 

2308 except FileTemplateValidationError as e: 

2309 templateFailed = str(e) 

2310 

2311 formatterFailed = [] 

2312 for entity in entities: 

2313 try: 

2314 self.formatterFactory.getFormatterClass(entity) 

2315 except KeyError as e: 

2316 formatterFailed.append(str(e)) 

2317 if logFailures: 2317 ↛ 2312line 2317 didn't jump to line 2312, because the condition on line 2317 was never false

2318 log.critical("Formatter failure: %s", e) 

2319 

2320 if templateFailed or formatterFailed: 

2321 messages = [] 

2322 if templateFailed: 2322 ↛ 2323line 2322 didn't jump to line 2323, because the condition on line 2322 was never true

2323 messages.append(templateFailed) 

2324 if formatterFailed: 2324 ↛ 2326line 2324 didn't jump to line 2326, because the condition on line 2324 was never false

2325 messages.append(",".join(formatterFailed)) 

2326 msg = ";\n".join(messages) 

2327 raise DatastoreValidationError(msg) 

2328 

2329 def getLookupKeys(self) -> Set[LookupKey]: 

2330 # Docstring is inherited from base class 

2331 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

2332 self.constraints.getLookupKeys() 

2333 

2334 def validateKey(self, lookupKey: LookupKey, 

2335 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

2336 # Docstring is inherited from base class 

2337 # The key can be valid in either formatters or templates so we can 

2338 # only check the template if it exists 

2339 if lookupKey in self.templates: 

2340 try: 

2341 self.templates[lookupKey].validateTemplate(entity) 

2342 except FileTemplateValidationError as e: 

2343 raise DatastoreValidationError(e) from e 

2344 

2345 def export(self, refs: Iterable[DatasetRef], *, 

2346 directory: Optional[Union[ButlerURI, str]] = None, 

2347 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

2348 # Docstring inherited from Datastore.export. 

2349 if transfer is not None and directory is None: 2349 ↛ 2350line 2349 didn't jump to line 2350, because the condition on line 2349 was never true

2350 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

2351 "export directory given") 

2352 

2353 # Force the directory to be a URI object 

2354 directoryUri: Optional[ButlerURI] = None 

2355 if directory is not None: 2355 ↛ 2358line 2355 didn't jump to line 2358, because the condition on line 2355 was never false

2356 directoryUri = ButlerURI(directory, forceDirectory=True) 

2357 

2358 if transfer is not None and directoryUri is not None: 2358 ↛ 2363line 2358 didn't jump to line 2363, because the condition on line 2358 was never false

2359 # mypy needs the second test 

2360 if not directoryUri.exists(): 2360 ↛ 2361line 2360 didn't jump to line 2361, because the condition on line 2360 was never true

2361 raise FileNotFoundError(f"Export location {directory} does not exist") 

2362 

2363 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2364 for ref in progress.wrap(refs, "Exporting dataset files"): 

2365 fileLocations = self._get_dataset_locations_info(ref) 

2366 if not fileLocations: 2366 ↛ 2367line 2366 didn't jump to line 2367, because the condition on line 2366 was never true

2367 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2368 # For now we can not export disassembled datasets 

2369 if len(fileLocations) > 1: 

2370 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2371 location, storedFileInfo = fileLocations[0] 

2372 

2373 pathInStore = location.pathInStore.path 

2374 if transfer is None: 2374 ↛ 2378line 2374 didn't jump to line 2378, because the condition on line 2374 was never true

2375 # TODO: do we also need to return the readStorageClass somehow? 

2376 # We will use the path in store directly. If this is an 

2377 # absolute URI, preserve it. 

2378 if location.pathInStore.isabs(): 

2379 pathInStore = str(location.uri) 

2380 elif transfer == "direct": 2380 ↛ 2382line 2380 didn't jump to line 2382, because the condition on line 2380 was never true

2381 # Use full URIs to the remote store in the export 

2382 pathInStore = str(location.uri) 

2383 else: 

2384 # mypy needs help 

2385 assert directoryUri is not None, "directoryUri must be defined to get here" 

2386 storeUri = ButlerURI(location.uri) 

2387 

2388 # if the datastore has an absolute URI to a resource, we 

2389 # have two options: 

2390 # 1. Keep the absolute URI in the exported YAML 

2391 # 2. Allocate a new name in the local datastore and transfer 

2392 # it. 

2393 # For now go with option 2 

2394 if location.pathInStore.isabs(): 2394 ↛ 2395line 2394 didn't jump to line 2395, because the condition on line 2394 was never true

2395 template = self.templates.getTemplate(ref) 

2396 newURI = ButlerURI(template.format(ref), forceAbsolute=False) 

2397 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2398 

2399 exportUri = directoryUri.join(pathInStore) 

2400 exportUri.transfer_from(storeUri, transfer=transfer) 

2401 

2402 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2403 

2404 @staticmethod 

2405 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

2406 """Compute the checksum of the supplied file. 

2407 

2408 Parameters 

2409 ---------- 

2410 uri : `ButlerURI` 

2411 Name of resource to calculate checksum from. 

2412 algorithm : `str`, optional 

2413 Name of algorithm to use. Must be one of the algorithms supported 

2414 by :py:class`hashlib`. 

2415 block_size : `int` 

2416 Number of bytes to read from file at one time. 

2417 

2418 Returns 

2419 ------- 

2420 hexdigest : `str` 

2421 Hex digest of the file. 

2422 

2423 Notes 

2424 ----- 

2425 Currently returns None if the URI is for a remote resource. 

2426 """ 

2427 if algorithm not in hashlib.algorithms_guaranteed: 2427 ↛ 2428line 2427 didn't jump to line 2428, because the condition on line 2427 was never true

2428 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

2429 

2430 if not uri.isLocal: 2430 ↛ 2431line 2430 didn't jump to line 2431, because the condition on line 2430 was never true

2431 return None 

2432 

2433 hasher = hashlib.new(algorithm) 

2434 

2435 with uri.as_local() as local_uri: 

2436 with open(local_uri.ospath, "rb") as f: 

2437 for chunk in iter(lambda: f.read(block_size), b""): 

2438 hasher.update(chunk) 

2439 

2440 return hasher.hexdigest() 

2441 

2442 def needs_expanded_data_ids( 

2443 self, 

2444 transfer: Optional[str], 

2445 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2446 ) -> bool: 

2447 # Docstring inherited. 

2448 # This _could_ also use entity to inspect whether the filename template 

2449 # involves placeholders other than the required dimensions for its 

2450 # dataset type, but that's not necessary for correctness; it just 

2451 # enables more optimizations (perhaps only in theory). 

2452 return transfer not in ("direct", None)