Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore", ) 

26 

27import hashlib 

28import logging 

29import os 

30import tempfile 

31 

32from sqlalchemy import BigInteger, String 

33 

34from collections import defaultdict 

35from dataclasses import dataclass 

36from typing import ( 

37 TYPE_CHECKING, 

38 Any, 

39 ClassVar, 

40 Dict, 

41 Iterable, 

42 List, 

43 Mapping, 

44 Optional, 

45 Set, 

46 Tuple, 

47 Type, 

48 Union, 

49) 

50 

51from lsst.daf.butler import ( 

52 ButlerURI, 

53 CompositesMap, 

54 Config, 

55 FileDataset, 

56 DatasetId, 

57 DatasetRef, 

58 DatasetType, 

59 DatasetTypeNotSupportedError, 

60 Datastore, 

61 DatastoreCacheManager, 

62 DatastoreDisabledCacheManager, 

63 DatastoreConfig, 

64 DatastoreValidationError, 

65 FileDescriptor, 

66 FileTemplates, 

67 FileTemplateValidationError, 

68 Formatter, 

69 FormatterFactory, 

70 Location, 

71 LocationFactory, 

72 Progress, 

73 StorageClass, 

74 StoredFileInfo, 

75) 

76 

77from lsst.daf.butler import ddl 

78from lsst.daf.butler.registry.interfaces import ( 

79 ReadOnlyDatabaseError, 

80 DatastoreRegistryBridge, 

81) 

82 

83from lsst.daf.butler.core.repoRelocation import replaceRoot 

84from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

85from .genericDatastore import GenericBaseDatastore 

86 

87if TYPE_CHECKING: 87 ↛ 88line 87 didn't jump to line 88, because the condition on line 87 was never true

88 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager 

89 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

90 

91log = logging.getLogger(__name__) 

92 

93 

94class _IngestPrepData(Datastore.IngestPrepData): 

95 """Helper class for FileDatastore ingest implementation. 

96 

97 Parameters 

98 ---------- 

99 datasets : `list` of `FileDataset` 

100 Files to be ingested by this datastore. 

101 """ 

102 def __init__(self, datasets: List[FileDataset]): 

103 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

104 self.datasets = datasets 

105 

106 

107@dataclass(frozen=True) 

108class DatastoreFileGetInformation: 

109 """Collection of useful parameters needed to retrieve a file from 

110 a Datastore. 

111 """ 

112 

113 location: Location 

114 """The location from which to read the dataset.""" 

115 

116 formatter: Formatter 

117 """The `Formatter` to use to deserialize the dataset.""" 

118 

119 info: StoredFileInfo 

120 """Stored information about this file and its formatter.""" 

121 

122 assemblerParams: Dict[str, Any] 

123 """Parameters to use for post-processing the retrieved dataset.""" 

124 

125 formatterParams: Dict[str, Any] 

126 """Parameters that were understood by the associated formatter.""" 

127 

128 component: Optional[str] 

129 """The component to be retrieved (can be `None`).""" 

130 

131 readStorageClass: StorageClass 

132 """The `StorageClass` of the dataset being read.""" 

133 

134 

135class FileDatastore(GenericBaseDatastore): 

136 """Generic Datastore for file-based implementations. 

137 

138 Should always be sub-classed since key abstract methods are missing. 

139 

140 Parameters 

141 ---------- 

142 config : `DatastoreConfig` or `str` 

143 Configuration as either a `Config` object or URI to file. 

144 bridgeManager : `DatastoreRegistryBridgeManager` 

145 Object that manages the interface between `Registry` and datastores. 

146 butlerRoot : `str`, optional 

147 New datastore root to use to override the configuration value. 

148 

149 Raises 

150 ------ 

151 ValueError 

152 If root location does not exist and ``create`` is `False` in the 

153 configuration. 

154 """ 

155 

156 defaultConfigFile: ClassVar[Optional[str]] = None 

157 """Path to configuration defaults. Accessed within the ``config`` resource 

158 or relative to a search path. Can be None if no defaults specified. 

159 """ 

160 

161 root: ButlerURI 

162 """Root directory URI of this `Datastore`.""" 

163 

164 locationFactory: LocationFactory 

165 """Factory for creating locations relative to the datastore root.""" 

166 

167 formatterFactory: FormatterFactory 

168 """Factory for creating instances of formatters.""" 

169 

170 templates: FileTemplates 

171 """File templates that can be used by this `Datastore`.""" 

172 

173 composites: CompositesMap 

174 """Determines whether a dataset should be disassembled on put.""" 

175 

176 defaultConfigFile = "datastores/fileDatastore.yaml" 

177 """Path to configuration defaults. Accessed within the ``config`` resource 

178 or relative to a search path. Can be None if no defaults specified. 

179 """ 

180 

181 @classmethod 

182 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

183 """Set any filesystem-dependent config options for this Datastore to 

184 be appropriate for a new empty repository with the given root. 

185 

186 Parameters 

187 ---------- 

188 root : `str` 

189 URI to the root of the data repository. 

190 config : `Config` 

191 A `Config` to update. Only the subset understood by 

192 this component will be updated. Will not expand 

193 defaults. 

194 full : `Config` 

195 A complete config with all defaults expanded that can be 

196 converted to a `DatastoreConfig`. Read-only and will not be 

197 modified by this method. 

198 Repository-specific options that should not be obtained 

199 from defaults when Butler instances are constructed 

200 should be copied from ``full`` to ``config``. 

201 overwrite : `bool`, optional 

202 If `False`, do not modify a value in ``config`` if the value 

203 already exists. Default is always to overwrite with the provided 

204 ``root``. 

205 

206 Notes 

207 ----- 

208 If a keyword is explicitly defined in the supplied ``config`` it 

209 will not be overridden by this method if ``overwrite`` is `False`. 

210 This allows explicit values set in external configs to be retained. 

211 """ 

212 Config.updateParameters(DatastoreConfig, config, full, 

213 toUpdate={"root": root}, 

214 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

215 

216 @classmethod 

217 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

218 return ddl.TableSpec( 

219 fields=[ 

220 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

221 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

222 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

223 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

224 # Use empty string to indicate no component 

225 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

226 # TODO: should checksum be Base64Bytes instead? 

227 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

228 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

229 ], 

230 unique=frozenset(), 

231 indexes=[tuple(["path"])], 

232 ) 

233 

234 def __init__(self, config: Union[DatastoreConfig, str], 

235 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

236 super().__init__(config, bridgeManager) 

237 if "root" not in self.config: 237 ↛ 238line 237 didn't jump to line 238, because the condition on line 237 was never true

238 raise ValueError("No root directory specified in configuration") 

239 

240 # Name ourselves either using an explicit name or a name 

241 # derived from the (unexpanded) root 

242 if "name" in self.config: 

243 self.name = self.config["name"] 

244 else: 

245 # We use the unexpanded root in the name to indicate that this 

246 # datastore can be moved without having to update registry. 

247 self.name = "{}@{}".format(type(self).__name__, 

248 self.config["root"]) 

249 

250 # Support repository relocation in config 

251 # Existence of self.root is checked in subclass 

252 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

253 forceDirectory=True, forceAbsolute=True) 

254 

255 self.locationFactory = LocationFactory(self.root) 

256 self.formatterFactory = FormatterFactory() 

257 

258 # Now associate formatters with storage classes 

259 self.formatterFactory.registerFormatters(self.config["formatters"], 

260 universe=bridgeManager.universe) 

261 

262 # Read the file naming templates 

263 self.templates = FileTemplates(self.config["templates"], 

264 universe=bridgeManager.universe) 

265 

266 # See if composites should be disassembled 

267 self.composites = CompositesMap(self.config["composites"], 

268 universe=bridgeManager.universe) 

269 

270 tableName = self.config["records", "table"] 

271 try: 

272 # Storage of paths and formatters, keyed by dataset_id 

273 self._table = bridgeManager.opaque.register( 

274 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)) 

275 # Interface to Registry. 

276 self._bridge = bridgeManager.register(self.name) 

277 except ReadOnlyDatabaseError: 

278 # If the database is read only and we just tried and failed to 

279 # create a table, it means someone is trying to create a read-only 

280 # butler client for an empty repo. That should be okay, as long 

281 # as they then try to get any datasets before some other client 

282 # creates the table. Chances are they'rejust validating 

283 # configuration. 

284 pass 

285 

286 # Determine whether checksums should be used - default to False 

287 self.useChecksum = self.config.get("checksum", False) 

288 

289 # Determine whether we can fall back to configuration if a 

290 # requested dataset is not known to registry 

291 self.trustGetRequest = self.config.get("trust_get_request", False) 

292 

293 # Create a cache manager 

294 self.cacheManager: AbstractDatastoreCacheManager 

295 if "cached" in self.config: 295 ↛ 299line 295 didn't jump to line 299, because the condition on line 295 was never false

296 self.cacheManager = DatastoreCacheManager(self.config["cached"], 

297 universe=bridgeManager.universe) 

298 else: 

299 self.cacheManager = DatastoreDisabledCacheManager("", 

300 universe=bridgeManager.universe) 

301 

302 # Check existence and create directory structure if necessary 

303 if not self.root.exists(): 

304 if "create" not in self.config or not self.config["create"]: 304 ↛ 305line 304 didn't jump to line 305, because the condition on line 304 was never true

305 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

306 try: 

307 self.root.mkdir() 

308 except Exception as e: 

309 raise ValueError(f"Can not create datastore root '{self.root}', check permissions." 

310 f" Got error: {e}") from e 

311 

312 def __str__(self) -> str: 

313 return str(self.root) 

314 

315 @property 

316 def bridge(self) -> DatastoreRegistryBridge: 

317 return self._bridge 

318 

319 def _artifact_exists(self, location: Location) -> bool: 

320 """Check that an artifact exists in this datastore at the specified 

321 location. 

322 

323 Parameters 

324 ---------- 

325 location : `Location` 

326 Expected location of the artifact associated with this datastore. 

327 

328 Returns 

329 ------- 

330 exists : `bool` 

331 True if the location can be found, false otherwise. 

332 """ 

333 log.debug("Checking if resource exists: %s", location.uri) 

334 return location.uri.exists() 

335 

336 def _delete_artifact(self, location: Location) -> None: 

337 """Delete the artifact from the datastore. 

338 

339 Parameters 

340 ---------- 

341 location : `Location` 

342 Location of the artifact associated with this datastore. 

343 """ 

344 if location.pathInStore.isabs(): 344 ↛ 345line 344 didn't jump to line 345, because the condition on line 344 was never true

345 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

346 

347 try: 

348 location.uri.remove() 

349 except FileNotFoundError: 

350 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

351 raise 

352 except Exception as e: 

353 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

354 raise 

355 log.debug("Successfully deleted file: %s", location.uri) 

356 

357 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

358 # Docstring inherited from GenericBaseDatastore 

359 records = [info.to_record(ref) for ref, info in zip(refs, infos)] 

360 self._table.insert(*records) 

361 

362 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

363 # Docstring inherited from GenericBaseDatastore 

364 

365 # Look for the dataset_id -- there might be multiple matches 

366 # if we have disassembled the dataset. 

367 records = self._table.fetch(dataset_id=ref.id) 

368 return [StoredFileInfo.from_record(record) for record in records] 

369 

370 def _get_stored_records_associated_with_refs(self, 

371 refs: Iterable[DatasetIdRef] 

372 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

373 """Retrieve all records associated with the provided refs. 

374 

375 Parameters 

376 ---------- 

377 refs : iterable of `DatasetIdRef` 

378 The refs for which records are to be retrieved. 

379 

380 Returns 

381 ------- 

382 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

383 The matching records indexed by the ref ID. The number of entries 

384 in the dict can be smaller than the number of requested refs. 

385 """ 

386 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

387 

388 # Uniqueness is dataset_id + component so can have multiple records 

389 # per ref. 

390 records_by_ref = defaultdict(list) 

391 for record in records: 

392 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

393 return records_by_ref 

394 

395 def _refs_associated_with_artifacts(self, paths: List[Union[str, ButlerURI]]) -> Dict[str, 

396 Set[DatasetId]]: 

397 """Return paths and associated dataset refs. 

398 

399 Parameters 

400 ---------- 

401 paths : `list` of `str` or `ButlerURI` 

402 All the paths to include in search. 

403 

404 Returns 

405 ------- 

406 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

407 Mapping of each path to a set of associated database IDs. 

408 """ 

409 records = self._table.fetch(path=[str(path) for path in paths]) 

410 result = defaultdict(set) 

411 for row in records: 

412 result[row["path"]].add(row["dataset_id"]) 

413 return result 

414 

415 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]: 

416 """Return all dataset refs associated with the supplied path. 

417 

418 Parameters 

419 ---------- 

420 pathInStore : `ButlerURI` 

421 Path of interest in the data store. 

422 

423 Returns 

424 ------- 

425 ids : `set` of `int` 

426 All `DatasetRef` IDs associated with this path. 

427 """ 

428 records = list(self._table.fetch(path=str(pathInStore))) 

429 ids = {r["dataset_id"] for r in records} 

430 return ids 

431 

432 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

433 # Docstring inherited from GenericBaseDatastore 

434 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

435 

436 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

437 r"""Find all the `Location`\ s of the requested dataset in the 

438 `Datastore` and the associated stored file information. 

439 

440 Parameters 

441 ---------- 

442 ref : `DatasetRef` 

443 Reference to the required `Dataset`. 

444 

445 Returns 

446 ------- 

447 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

448 Location of the dataset within the datastore and 

449 stored information about each file and its formatter. 

450 """ 

451 # Get the file information (this will fail if no file) 

452 records = self.getStoredItemsInfo(ref) 

453 

454 # Use the path to determine the location -- we need to take 

455 # into account absolute URIs in the datastore record 

456 return [(r.file_location(self.locationFactory), r) for r in records] 

457 

458 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

459 """Check that there is only one dataset associated with the 

460 specified artifact. 

461 

462 Parameters 

463 ---------- 

464 ref : `DatasetRef` or `FakeDatasetRef` 

465 Dataset to be removed. 

466 location : `Location` 

467 The location of the artifact to be removed. 

468 

469 Returns 

470 ------- 

471 can_remove : `Bool` 

472 True if the artifact can be safely removed. 

473 """ 

474 # Can't ever delete absolute URIs. 

475 if location.pathInStore.isabs(): 

476 return False 

477 

478 # Get all entries associated with this path 

479 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

480 if not allRefs: 

481 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

482 

483 # Remove these refs from all the refs and if there is nothing left 

484 # then we can delete 

485 remainingRefs = allRefs - {ref.id} 

486 

487 if remainingRefs: 

488 return False 

489 return True 

490 

491 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, 

492 StoredFileInfo]]: 

493 """Predict the location and related file information of the requested 

494 dataset in this datastore. 

495 

496 Parameters 

497 ---------- 

498 ref : `DatasetRef` 

499 Reference to the required `Dataset`. 

500 

501 Returns 

502 ------- 

503 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

504 Expected Location of the dataset within the datastore and 

505 placeholder information about each file and its formatter. 

506 

507 Notes 

508 ----- 

509 Uses the current configuration to determine how we would expect the 

510 datastore files to have been written if we couldn't ask registry. 

511 This is safe so long as there has been no change to datastore 

512 configuration between writing the dataset and wanting to read it. 

513 Will not work for files that have been ingested without using the 

514 standard file template or default formatter. 

515 """ 

516 

517 # If we have a component ref we always need to ask the questions 

518 # of the composite. If the composite is disassembled this routine 

519 # should return all components. If the composite was not 

520 # disassembled the composite is what is stored regardless of 

521 # component request. Note that if the caller has disassembled 

522 # a composite there is no way for this guess to know that 

523 # without trying both the composite and component ref and seeing 

524 # if there is something at the component Location even without 

525 # disassembly being enabled. 

526 if ref.datasetType.isComponent(): 

527 ref = ref.makeCompositeRef() 

528 

529 # See if the ref is a composite that should be disassembled 

530 doDisassembly = self.composites.shouldBeDisassembled(ref) 

531 

532 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

533 

534 if doDisassembly: 

535 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

536 compRef = ref.makeComponentRef(component) 

537 location, formatter = self._determine_put_formatter_location(compRef) 

538 all_info.append((location, formatter, componentStorage, component)) 

539 

540 else: 

541 # Always use the composite ref if no disassembly 

542 location, formatter = self._determine_put_formatter_location(ref) 

543 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

544 

545 # Convert the list of tuples to have StoredFileInfo as second element 

546 return [(location, StoredFileInfo(formatter=formatter, 

547 path=location.pathInStore.path, 

548 storageClass=storageClass, 

549 component=component, 

550 checksum=None, 

551 file_size=-1)) 

552 for location, formatter, storageClass, component in all_info] 

553 

554 def _prepare_for_get(self, ref: DatasetRef, 

555 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

556 """Check parameters for ``get`` and obtain formatter and 

557 location. 

558 

559 Parameters 

560 ---------- 

561 ref : `DatasetRef` 

562 Reference to the required Dataset. 

563 parameters : `dict` 

564 `StorageClass`-specific parameters that specify, for example, 

565 a slice of the dataset to be loaded. 

566 

567 Returns 

568 ------- 

569 getInfo : `list` [`DatastoreFileGetInformation`] 

570 Parameters needed to retrieve each file. 

571 """ 

572 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

573 

574 # Get file metadata and internal metadata 

575 fileLocations = self._get_dataset_locations_info(ref) 

576 if not fileLocations: 

577 if not self.trustGetRequest: 

578 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

579 # Assume the dataset is where we think it should be 

580 fileLocations = self._get_expected_dataset_locations_info(ref) 

581 

582 # The storage class we want to use eventually 

583 refStorageClass = ref.datasetType.storageClass 

584 

585 if len(fileLocations) > 1: 

586 disassembled = True 

587 

588 # If trust is involved it is possible that there will be 

589 # components listed here that do not exist in the datastore. 

590 # Explicitly check for file artifact existence and filter out any 

591 # that are missing. 

592 if self.trustGetRequest: 

593 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

594 

595 # For now complain only if we have no components at all. One 

596 # component is probably a problem but we can punt that to the 

597 # assembler. 

598 if not fileLocations: 598 ↛ 599line 598 didn't jump to line 599, because the condition on line 598 was never true

599 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

600 

601 else: 

602 disassembled = False 

603 

604 # Is this a component request? 

605 refComponent = ref.datasetType.component() 

606 

607 fileGetInfo = [] 

608 for location, storedFileInfo in fileLocations: 

609 

610 # The storage class used to write the file 

611 writeStorageClass = storedFileInfo.storageClass 

612 

613 # If this has been disassembled we need read to match the write 

614 if disassembled: 

615 readStorageClass = writeStorageClass 

616 else: 

617 readStorageClass = refStorageClass 

618 

619 formatter = getInstanceOf(storedFileInfo.formatter, 

620 FileDescriptor(location, readStorageClass=readStorageClass, 

621 storageClass=writeStorageClass, parameters=parameters), 

622 ref.dataId) 

623 

624 formatterParams, notFormatterParams = formatter.segregateParameters() 

625 

626 # Of the remaining parameters, extract the ones supported by 

627 # this StorageClass (for components not all will be handled) 

628 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

629 

630 # The ref itself could be a component if the dataset was 

631 # disassembled by butler, or we disassembled in datastore and 

632 # components came from the datastore records 

633 component = storedFileInfo.component if storedFileInfo.component else refComponent 

634 

635 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

636 assemblerParams, formatterParams, 

637 component, readStorageClass)) 

638 

639 return fileGetInfo 

640 

641 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

642 """Check the arguments for ``put`` and obtain formatter and 

643 location. 

644 

645 Parameters 

646 ---------- 

647 inMemoryDataset : `object` 

648 The dataset to store. 

649 ref : `DatasetRef` 

650 Reference to the associated Dataset. 

651 

652 Returns 

653 ------- 

654 location : `Location` 

655 The location to write the dataset. 

656 formatter : `Formatter` 

657 The `Formatter` to use to write the dataset. 

658 

659 Raises 

660 ------ 

661 TypeError 

662 Supplied object and storage class are inconsistent. 

663 DatasetTypeNotSupportedError 

664 The associated `DatasetType` is not handled by this datastore. 

665 """ 

666 self._validate_put_parameters(inMemoryDataset, ref) 

667 return self._determine_put_formatter_location(ref) 

668 

669 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

670 """Calculate the formatter and output location to use for put. 

671 

672 Parameters 

673 ---------- 

674 ref : `DatasetRef` 

675 Reference to the associated Dataset. 

676 

677 Returns 

678 ------- 

679 location : `Location` 

680 The location to write the dataset. 

681 formatter : `Formatter` 

682 The `Formatter` to use to write the dataset. 

683 """ 

684 # Work out output file name 

685 try: 

686 template = self.templates.getTemplate(ref) 

687 except KeyError as e: 

688 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

689 

690 # Validate the template to protect against filenames from different 

691 # dataIds returning the same and causing overwrite confusion. 

692 template.validateTemplate(ref) 

693 

694 location = self.locationFactory.fromPath(template.format(ref)) 

695 

696 # Get the formatter based on the storage class 

697 storageClass = ref.datasetType.storageClass 

698 try: 

699 formatter = self.formatterFactory.getFormatter(ref, 

700 FileDescriptor(location, 

701 storageClass=storageClass), 

702 ref.dataId) 

703 except KeyError as e: 

704 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

705 f"{self.name}") from e 

706 

707 # Now that we know the formatter, update the location 

708 location = formatter.makeUpdatedLocation(location) 

709 

710 return location, formatter 

711 

712 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

713 # Docstring inherited from base class 

714 if transfer != "auto": 

715 return transfer 

716 

717 # See if the paths are within the datastore or not 

718 inside = [self._pathInStore(d.path) is not None for d in datasets] 

719 

720 if all(inside): 

721 transfer = None 

722 elif not any(inside): 722 ↛ 726line 722 didn't jump to line 726, because the condition on line 722 was never false

723 # Allow ButlerURI to use its own knowledge 

724 transfer = "auto" 

725 else: 

726 raise ValueError("Some datasets are inside the datastore and some are outside." 

727 " Please use an explicit transfer mode and not 'auto'.") 

728 

729 return transfer 

730 

731 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]: 

732 """Return path relative to datastore root 

733 

734 Parameters 

735 ---------- 

736 path : `str` or `ButlerURI` 

737 Path to dataset. Can be absolute URI. If relative assumed to 

738 be relative to the datastore. Returns path in datastore 

739 or raises an exception if the path it outside. 

740 

741 Returns 

742 ------- 

743 inStore : `str` 

744 Path relative to datastore root. Returns `None` if the file is 

745 outside the root. 

746 """ 

747 # Relative path will always be relative to datastore 

748 pathUri = ButlerURI(path, forceAbsolute=False) 

749 return pathUri.relative_to(self.root) 

750 

751 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *, 

752 transfer: Optional[str] = None) -> Union[str, ButlerURI]: 

753 """Standardize the path of a to-be-ingested file. 

754 

755 Parameters 

756 ---------- 

757 path : `str` or `ButlerURI` 

758 Path of a file to be ingested. 

759 transfer : `str`, optional 

760 How (and whether) the dataset should be added to the datastore. 

761 See `ingest` for details of transfer modes. 

762 This implementation is provided only so 

763 `NotImplementedError` can be raised if the mode is not supported; 

764 actual transfers are deferred to `_extractIngestInfo`. 

765 

766 Returns 

767 ------- 

768 path : `str` or `ButlerURI` 

769 New path in what the datastore considers standard form. If an 

770 absolute URI was given that will be returned unchanged. 

771 

772 Notes 

773 ----- 

774 Subclasses of `FileDatastore` can implement this method instead 

775 of `_prepIngest`. It should not modify the data repository or given 

776 file in any way. 

777 

778 Raises 

779 ------ 

780 NotImplementedError 

781 Raised if the datastore does not support the given transfer mode 

782 (including the case where ingest is not supported at all). 

783 FileNotFoundError 

784 Raised if one of the given files does not exist. 

785 """ 

786 if transfer not in (None, "direct") + self.root.transferModes: 786 ↛ 787line 786 didn't jump to line 787, because the condition on line 786 was never true

787 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

788 

789 # A relative URI indicates relative to datastore root 

790 srcUri = ButlerURI(path, forceAbsolute=False) 

791 if not srcUri.isabs(): 

792 srcUri = self.root.join(path) 

793 

794 if not srcUri.exists(): 

795 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

796 f"are assumed to be relative to {self.root} unless they are absolute.") 

797 

798 if transfer is None: 

799 relpath = srcUri.relative_to(self.root) 

800 if not relpath: 

801 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

802 f"within datastore ({self.root})") 

803 

804 # Return the relative path within the datastore for internal 

805 # transfer 

806 path = relpath 

807 

808 return path 

809 

810 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

811 formatter: Union[Formatter, Type[Formatter]], 

812 transfer: Optional[str] = None) -> StoredFileInfo: 

813 """Relocate (if necessary) and extract `StoredFileInfo` from a 

814 to-be-ingested file. 

815 

816 Parameters 

817 ---------- 

818 path : `str` or `ButlerURI` 

819 URI or path of a file to be ingested. 

820 ref : `DatasetRef` 

821 Reference for the dataset being ingested. Guaranteed to have 

822 ``dataset_id not None`. 

823 formatter : `type` or `Formatter` 

824 `Formatter` subclass to use for this dataset or an instance. 

825 transfer : `str`, optional 

826 How (and whether) the dataset should be added to the datastore. 

827 See `ingest` for details of transfer modes. 

828 

829 Returns 

830 ------- 

831 info : `StoredFileInfo` 

832 Internal datastore record for this file. This will be inserted by 

833 the caller; the `_extractIngestInfo` is only resposible for 

834 creating and populating the struct. 

835 

836 Raises 

837 ------ 

838 FileNotFoundError 

839 Raised if one of the given files does not exist. 

840 FileExistsError 

841 Raised if transfer is not `None` but the (internal) location the 

842 file would be moved to is already occupied. 

843 """ 

844 if self._transaction is None: 844 ↛ 845line 844 didn't jump to line 845, because the condition on line 844 was never true

845 raise RuntimeError("Ingest called without transaction enabled") 

846 

847 # Create URI of the source path, do not need to force a relative 

848 # path to absolute. 

849 srcUri = ButlerURI(path, forceAbsolute=False) 

850 

851 # Track whether we have read the size of the source yet 

852 have_sized = False 

853 

854 tgtLocation: Optional[Location] 

855 if transfer is None: 

856 # A relative path is assumed to be relative to the datastore 

857 # in this context 

858 if not srcUri.isabs(): 

859 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

860 else: 

861 # Work out the path in the datastore from an absolute URI 

862 # This is required to be within the datastore. 

863 pathInStore = srcUri.relative_to(self.root) 

864 if pathInStore is None: 864 ↛ 865line 864 didn't jump to line 865, because the condition on line 864 was never true

865 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

866 f"not within datastore {self.root}") 

867 tgtLocation = self.locationFactory.fromPath(pathInStore) 

868 elif transfer == "direct": 868 ↛ 873line 868 didn't jump to line 873, because the condition on line 868 was never true

869 # Want to store the full URI to the resource directly in 

870 # datastore. This is useful for referring to permanent archive 

871 # storage for raw data. 

872 # Trust that people know what they are doing. 

873 tgtLocation = None 

874 else: 

875 # Work out the name we want this ingested file to have 

876 # inside the datastore 

877 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

878 if not tgtLocation.uri.dirname().exists(): 

879 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

880 tgtLocation.uri.dirname().mkdir() 

881 

882 # if we are transferring from a local file to a remote location 

883 # it may be more efficient to get the size and checksum of the 

884 # local file rather than the transferred one 

885 if not srcUri.scheme or srcUri.scheme == "file": 885 ↛ 891line 885 didn't jump to line 891, because the condition on line 885 was never false

886 size = srcUri.size() 

887 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

888 have_sized = True 

889 

890 # transfer the resource to the destination 

891 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

892 

893 if tgtLocation is None: 893 ↛ 895line 893 didn't jump to line 895, because the condition on line 893 was never true

894 # This means we are using direct mode 

895 targetUri = srcUri 

896 targetPath = str(srcUri) 

897 else: 

898 targetUri = tgtLocation.uri 

899 targetPath = tgtLocation.pathInStore.path 

900 

901 # the file should exist in the datastore now 

902 if not have_sized: 

903 size = targetUri.size() 

904 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

905 

906 return StoredFileInfo(formatter=formatter, path=targetPath, 

907 storageClass=ref.datasetType.storageClass, 

908 component=ref.datasetType.component(), 

909 file_size=size, checksum=checksum) 

910 

911 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

912 # Docstring inherited from Datastore._prepIngest. 

913 filtered = [] 

914 for dataset in datasets: 

915 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

916 if not acceptable: 

917 continue 

918 else: 

919 dataset.refs = acceptable 

920 if dataset.formatter is None: 

921 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

922 else: 

923 assert isinstance(dataset.formatter, (type, str)) 

924 dataset.formatter = getClassOf(dataset.formatter) 

925 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

926 filtered.append(dataset) 

927 return _IngestPrepData(filtered) 

928 

929 @transactional 

930 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

931 # Docstring inherited from Datastore._finishIngest. 

932 refsAndInfos = [] 

933 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

934 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

935 # Do ingest as if the first dataset ref is associated with the file 

936 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

937 transfer=transfer) 

938 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

939 self._register_datasets(refsAndInfos) 

940 

941 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

942 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

943 """Given a source URI and a DatasetRef, determine the name the 

944 dataset will have inside datastore. 

945 

946 Parameters 

947 ---------- 

948 srcUri : `ButlerURI` 

949 URI to the source dataset file. 

950 ref : `DatasetRef` 

951 Ref associated with the newly-ingested dataset artifact. This 

952 is used to determine the name within the datastore. 

953 formatter : `Formatter` or Formatter class. 

954 Formatter to use for validation. Can be a class or an instance. 

955 

956 Returns 

957 ------- 

958 location : `Location` 

959 Target location for the newly-ingested dataset. 

960 """ 

961 # Ingesting a file from outside the datastore. 

962 # This involves a new name. 

963 template = self.templates.getTemplate(ref) 

964 location = self.locationFactory.fromPath(template.format(ref)) 

965 

966 # Get the extension 

967 ext = srcUri.getExtension() 

968 

969 # Update the destination to include that extension 

970 location.updateExtension(ext) 

971 

972 # Ask the formatter to validate this extension 

973 formatter.validateExtension(location) 

974 

975 return location 

976 

977 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

978 """Write out in memory dataset to datastore. 

979 

980 Parameters 

981 ---------- 

982 inMemoryDataset : `object` 

983 Dataset to write to datastore. 

984 ref : `DatasetRef` 

985 Registry information associated with this dataset. 

986 

987 Returns 

988 ------- 

989 info : `StoredFileInfo` 

990 Information describin the artifact written to the datastore. 

991 """ 

992 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

993 uri = location.uri 

994 

995 if not uri.dirname().exists(): 

996 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

997 uri.dirname().mkdir() 

998 

999 if self._transaction is None: 999 ↛ 1000line 999 didn't jump to line 1000, because the condition on line 999 was never true

1000 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1001 

1002 def _removeFileExists(uri: ButlerURI) -> None: 

1003 """Remove a file and do not complain if it is not there. 

1004 

1005 This is important since a formatter might fail before the file 

1006 is written and we should not confuse people by writing spurious 

1007 error messages to the log. 

1008 """ 

1009 try: 

1010 uri.remove() 

1011 except FileNotFoundError: 

1012 pass 

1013 

1014 # Register a callback to try to delete the uploaded data if 

1015 # something fails below 

1016 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1017 

1018 # For a local file, simply use the formatter directly 

1019 if uri.isLocal: 

1020 try: 

1021 formatter.write(inMemoryDataset) 

1022 except Exception as e: 

1023 raise RuntimeError(f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} " 

1024 f"to location {uri}") from e 

1025 log.debug("Successfully wrote python object to local file at %s", uri) 

1026 else: 

1027 # This is a remote URI, so first try bytes and write directly else 

1028 # fallback to a temporary file 

1029 try: 

1030 serializedDataset = formatter.toBytes(inMemoryDataset) 

1031 except NotImplementedError: 1031 ↛ 1050line 1031 didn't jump to line 1050

1032 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile: 

1033 # Need to configure the formatter to write to a different 

1034 # location and that needs us to overwrite internals 

1035 tmpLocation = Location(*os.path.split(tmpFile.name)) 

1036 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri) 

1037 with formatter._updateLocation(tmpLocation): 

1038 try: 

1039 formatter.write(inMemoryDataset) 

1040 except Exception as e: 

1041 raise RuntimeError(f"Failed to serialize dataset {ref} of type" 

1042 f" {type(inMemoryDataset)} to " 

1043 f"temporary location {tmpLocation.uri}") from e 

1044 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True) 

1045 

1046 # Cache if required 

1047 self.cacheManager.move_to_cache(tmpLocation.uri, ref) 

1048 

1049 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1050 except Exception as e: 

1051 raise RuntimeError(f"Failed to serialize dataset {ref} to bytes.") from e 

1052 else: 

1053 log.debug("Writing bytes directly to %s", uri) 

1054 uri.write(serializedDataset, overwrite=True) 

1055 log.debug("Successfully wrote bytes directly to %s", uri) 

1056 

1057 # URI is needed to resolve what ingest case are we dealing with 

1058 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1059 

1060 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

1061 ref: DatasetRef, isComponent: bool = False) -> Any: 

1062 """Read the artifact from datastore into in memory object. 

1063 

1064 Parameters 

1065 ---------- 

1066 getInfo : `DatastoreFileGetInformation` 

1067 Information about the artifact within the datastore. 

1068 ref : `DatasetRef` 

1069 The registry information associated with this artifact. 

1070 isComponent : `bool` 

1071 Flag to indicate if a component is being read from this artifact. 

1072 

1073 Returns 

1074 ------- 

1075 inMemoryDataset : `object` 

1076 The artifact as a python object. 

1077 """ 

1078 location = getInfo.location 

1079 uri = location.uri 

1080 log.debug("Accessing data from %s", uri) 

1081 

1082 # Cannot recalculate checksum but can compare size as a quick check 

1083 # Do not do this if the size is negative since that indicates 

1084 # we do not know. 

1085 recorded_size = getInfo.info.file_size 

1086 resource_size = uri.size() 

1087 if recorded_size >= 0 and resource_size != recorded_size: 1087 ↛ 1088line 1087 didn't jump to line 1088, because the condition on line 1087 was never true

1088 raise RuntimeError("Integrity failure in Datastore. " 

1089 f"Size of file {uri} ({resource_size}) " 

1090 f"does not match size recorded in registry of {recorded_size}") 

1091 

1092 # For the general case we have choices for how to proceed. 

1093 # 1. Always use a local file (downloading the remote resource to a 

1094 # temporary file if needed). 

1095 # 2. Use a threshold size and read into memory and use bytes. 

1096 # Use both for now with an arbitrary hand off size. 

1097 # This allows small datasets to be downloaded from remote object 

1098 # stores without requiring a temporary file. 

1099 

1100 formatter = getInfo.formatter 

1101 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1102 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1103 serializedDataset = uri.read() 

1104 log.debug("Deserializing %s from %d bytes from location %s with formatter %s", 

1105 f"component {getInfo.component}" if isComponent else "", 

1106 len(serializedDataset), uri, formatter.name()) 

1107 try: 

1108 result = formatter.fromBytes(serializedDataset, 

1109 component=getInfo.component if isComponent else None) 

1110 except Exception as e: 

1111 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1112 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1113 else: 

1114 # Read from file. 

1115 

1116 # Have to update the Location associated with the formatter 

1117 # because formatter.read does not allow an override. 

1118 # This could be improved. 

1119 location_updated = False 

1120 msg = "" 

1121 

1122 # First check in cache for local version. 

1123 # The cache will only be relevant for remote resources. 

1124 if not uri.isLocal: 

1125 cached_file = self.cacheManager.find_in_cache(ref, uri.getExtension()) 

1126 if cached_file is not None: 1126 ↛ 1127line 1126 didn't jump to line 1127, because the condition on line 1126 was never true

1127 msg = f"(via cache read of remote file {uri})" 

1128 uri = cached_file 

1129 location_updated = True 

1130 

1131 with uri.as_local() as local_uri: 

1132 

1133 # URI was remote and file was downloaded 

1134 if uri != local_uri: 

1135 cache_msg = "" 

1136 location_updated = True 

1137 

1138 # Cache the downloaded file if needed. 

1139 cached_uri = self.cacheManager.move_to_cache(local_uri, ref) 

1140 if cached_uri is not None: 1140 ↛ 1141line 1140 didn't jump to line 1141, because the condition on line 1140 was never true

1141 local_uri = cached_uri 

1142 cache_msg = " and cached" 

1143 

1144 msg = f"(via download to local file{cache_msg})" 

1145 

1146 # Calculate the (possibly) new location for the formatter 

1147 # to use. 

1148 newLocation = Location(*local_uri.split()) if location_updated else None 

1149 

1150 log.debug("Reading%s from location %s %s with formatter %s", 

1151 f" component {getInfo.component}" if isComponent else "", 

1152 uri, msg, formatter.name()) 

1153 try: 

1154 with formatter._updateLocation(newLocation): 

1155 result = formatter.read(component=getInfo.component if isComponent else None) 

1156 except Exception as e: 

1157 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1158 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1159 

1160 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

1161 isComponent=isComponent) 

1162 

1163 def knows(self, ref: DatasetRef) -> bool: 

1164 """Check if the dataset is known to the datastore. 

1165 

1166 Does not check for existence of any artifact. 

1167 

1168 Parameters 

1169 ---------- 

1170 ref : `DatasetRef` 

1171 Reference to the required dataset. 

1172 

1173 Returns 

1174 ------- 

1175 exists : `bool` 

1176 `True` if the dataset is known to the datastore. 

1177 """ 

1178 fileLocations = self._get_dataset_locations_info(ref) 

1179 if fileLocations: 

1180 return True 

1181 return False 

1182 

1183 def exists(self, ref: DatasetRef) -> bool: 

1184 """Check if the dataset exists in the datastore. 

1185 

1186 Parameters 

1187 ---------- 

1188 ref : `DatasetRef` 

1189 Reference to the required dataset. 

1190 

1191 Returns 

1192 ------- 

1193 exists : `bool` 

1194 `True` if the entity exists in the `Datastore`. 

1195 """ 

1196 fileLocations = self._get_dataset_locations_info(ref) 

1197 

1198 # if we are being asked to trust that registry might not be correct 

1199 # we ask for the expected locations and check them explicitly 

1200 if not fileLocations: 

1201 if not self.trustGetRequest: 

1202 return False 

1203 

1204 # When we are guessing a dataset location we can not check 

1205 # for the existence of every component since we can not 

1206 # know if every component was written. Instead we check 

1207 # for the existence of any of the expected locations. 

1208 for location, _ in self._get_expected_dataset_locations_info(ref): 1208 ↛ 1211line 1208 didn't jump to line 1211, because the loop on line 1208 didn't complete

1209 if self._artifact_exists(location): 1209 ↛ 1208line 1209 didn't jump to line 1208, because the condition on line 1209 was never false

1210 return True 

1211 return False 

1212 

1213 # All listed artifacts must exist. 

1214 for location, _ in fileLocations: 

1215 if not self._artifact_exists(location): 

1216 return False 

1217 

1218 return True 

1219 

1220 def getURIs(self, ref: DatasetRef, 

1221 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1222 """Return URIs associated with dataset. 

1223 

1224 Parameters 

1225 ---------- 

1226 ref : `DatasetRef` 

1227 Reference to the required dataset. 

1228 predict : `bool`, optional 

1229 If the datastore does not know about the dataset, should it 

1230 return a predicted URI or not? 

1231 

1232 Returns 

1233 ------- 

1234 primary : `ButlerURI` 

1235 The URI to the primary artifact associated with this dataset. 

1236 If the dataset was disassembled within the datastore this 

1237 may be `None`. 

1238 components : `dict` 

1239 URIs to any components associated with the dataset artifact. 

1240 Can be empty if there are no components. 

1241 """ 

1242 

1243 primary: Optional[ButlerURI] = None 

1244 components: Dict[str, ButlerURI] = {} 

1245 

1246 # if this has never been written then we have to guess 

1247 if not self.exists(ref): 

1248 if not predict: 

1249 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1250 

1251 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1252 

1253 if doDisassembly: 

1254 

1255 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1256 compRef = ref.makeComponentRef(component) 

1257 compLocation, _ = self._determine_put_formatter_location(compRef) 

1258 

1259 # Add a URI fragment to indicate this is a guess 

1260 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1261 

1262 else: 

1263 

1264 location, _ = self._determine_put_formatter_location(ref) 

1265 

1266 # Add a URI fragment to indicate this is a guess 

1267 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1268 

1269 return primary, components 

1270 

1271 # If this is a ref that we have written we can get the path. 

1272 # Get file metadata and internal metadata 

1273 fileLocations = self._get_dataset_locations_info(ref) 

1274 

1275 guessing = False 

1276 if not fileLocations: 

1277 if not self.trustGetRequest: 1277 ↛ 1278line 1277 didn't jump to line 1278, because the condition on line 1277 was never true

1278 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1279 fileLocations = self._get_expected_dataset_locations_info(ref) 

1280 guessing = True 

1281 

1282 if len(fileLocations) == 1: 

1283 # No disassembly so this is the primary URI 

1284 uri = fileLocations[0][0].uri 

1285 if guessing and not uri.exists(): 1285 ↛ 1286line 1285 didn't jump to line 1286, because the condition on line 1285 was never true

1286 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1287 primary = uri 

1288 

1289 else: 

1290 for location, storedFileInfo in fileLocations: 

1291 if storedFileInfo.component is None: 1291 ↛ 1292line 1291 didn't jump to line 1292, because the condition on line 1291 was never true

1292 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1293 uri = location.uri 

1294 if guessing and not uri.exists(): 1294 ↛ 1295line 1294 didn't jump to line 1295, because the condition on line 1294 was never true

1295 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1296 components[storedFileInfo.component] = uri 

1297 

1298 return primary, components 

1299 

1300 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1301 """URI to the Dataset. 

1302 

1303 Parameters 

1304 ---------- 

1305 ref : `DatasetRef` 

1306 Reference to the required Dataset. 

1307 predict : `bool` 

1308 If `True`, allow URIs to be returned of datasets that have not 

1309 been written. 

1310 

1311 Returns 

1312 ------- 

1313 uri : `str` 

1314 URI pointing to the dataset within the datastore. If the 

1315 dataset does not exist in the datastore, and if ``predict`` is 

1316 `True`, the URI will be a prediction and will include a URI 

1317 fragment "#predicted". 

1318 If the datastore does not have entities that relate well 

1319 to the concept of a URI the returned URI will be 

1320 descriptive. The returned URI is not guaranteed to be obtainable. 

1321 

1322 Raises 

1323 ------ 

1324 FileNotFoundError 

1325 Raised if a URI has been requested for a dataset that does not 

1326 exist and guessing is not allowed. 

1327 RuntimeError 

1328 Raised if a request is made for a single URI but multiple URIs 

1329 are associated with this dataset. 

1330 

1331 Notes 

1332 ----- 

1333 When a predicted URI is requested an attempt will be made to form 

1334 a reasonable URI based on file templates and the expected formatter. 

1335 """ 

1336 primary, components = self.getURIs(ref, predict) 

1337 if primary is None or components: 1337 ↛ 1338line 1337 didn't jump to line 1338, because the condition on line 1337 was never true

1338 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

1339 "Use Dataastore.getURIs() instead.") 

1340 return primary 

1341 

1342 def retrieveArtifacts(self, refs: Iterable[DatasetRef], 

1343 destination: ButlerURI, transfer: str = "auto", 

1344 preserve_path: bool = True, 

1345 overwrite: bool = False) -> List[ButlerURI]: 

1346 """Retrieve the file artifacts associated with the supplied refs. 

1347 

1348 Parameters 

1349 ---------- 

1350 refs : iterable of `DatasetRef` 

1351 The datasets for which file artifacts are to be retrieved. 

1352 A single ref can result in multiple files. The refs must 

1353 be resolved. 

1354 destination : `ButlerURI` 

1355 Location to write the file artifacts. 

1356 transfer : `str`, optional 

1357 Method to use to transfer the artifacts. Must be one of the options 

1358 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

1359 preserve_path : `bool`, optional 

1360 If `True` the full path of the file artifact within the datastore 

1361 is preserved. If `False` the final file component of the path 

1362 is used. 

1363 overwrite : `bool`, optional 

1364 If `True` allow transfers to overwrite existing files at the 

1365 destination. 

1366 

1367 Returns 

1368 ------- 

1369 targets : `list` of `ButlerURI` 

1370 URIs of file artifacts in destination location. Order is not 

1371 preserved. 

1372 """ 

1373 if not destination.isdir(): 1373 ↛ 1374line 1373 didn't jump to line 1374, because the condition on line 1373 was never true

1374 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1375 

1376 if transfer == "move": 

1377 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1378 

1379 # Source -> Destination 

1380 # This also helps filter out duplicate DatasetRef in the request 

1381 # that will map to the same underlying file transfer. 

1382 to_transfer: Dict[ButlerURI, ButlerURI] = {} 

1383 

1384 for ref in refs: 

1385 locations = self._get_dataset_locations_info(ref) 

1386 for location, _ in locations: 

1387 source_uri = location.uri 

1388 target_path: Union[str, ButlerURI] 

1389 if preserve_path: 

1390 target_path = location.pathInStore 

1391 if target_path.isabs(): 1391 ↛ 1394line 1391 didn't jump to line 1394, because the condition on line 1391 was never true

1392 # This is an absolute path to an external file. 

1393 # Use the full path. 

1394 target_path = target_path.relativeToPathRoot 

1395 else: 

1396 target_path = source_uri.basename() 

1397 target_uri = destination.join(target_path) 

1398 to_transfer[source_uri] = target_uri 

1399 

1400 # In theory can now parallelize the transfer 

1401 log.debug("Number of artifacts to transfer to %s: %d", 

1402 str(destination), len(to_transfer)) 

1403 for source_uri, target_uri in to_transfer.items(): 

1404 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1405 

1406 return list(to_transfer.values()) 

1407 

1408 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1409 """Load an InMemoryDataset from the store. 

1410 

1411 Parameters 

1412 ---------- 

1413 ref : `DatasetRef` 

1414 Reference to the required Dataset. 

1415 parameters : `dict` 

1416 `StorageClass`-specific parameters that specify, for example, 

1417 a slice of the dataset to be loaded. 

1418 

1419 Returns 

1420 ------- 

1421 inMemoryDataset : `object` 

1422 Requested dataset or slice thereof as an InMemoryDataset. 

1423 

1424 Raises 

1425 ------ 

1426 FileNotFoundError 

1427 Requested dataset can not be retrieved. 

1428 TypeError 

1429 Return value from formatter has unexpected type. 

1430 ValueError 

1431 Formatter failed to process the dataset. 

1432 """ 

1433 allGetInfo = self._prepare_for_get(ref, parameters) 

1434 refComponent = ref.datasetType.component() 

1435 

1436 # Supplied storage class for the component being read 

1437 refStorageClass = ref.datasetType.storageClass 

1438 

1439 # Create mapping from component name to related info 

1440 allComponents = {i.component: i for i in allGetInfo} 

1441 

1442 # By definition the dataset is disassembled if we have more 

1443 # than one record for it. 

1444 isDisassembled = len(allGetInfo) > 1 

1445 

1446 # Look for the special case where we are disassembled but the 

1447 # component is a derived component that was not written during 

1448 # disassembly. For this scenario we need to check that the 

1449 # component requested is listed as a derived component for the 

1450 # composite storage class 

1451 isDisassembledReadOnlyComponent = False 

1452 if isDisassembled and refComponent: 

1453 # The composite storage class should be accessible through 

1454 # the component dataset type 

1455 compositeStorageClass = ref.datasetType.parentStorageClass 

1456 

1457 # In the unlikely scenario where the composite storage 

1458 # class is not known, we can only assume that this is a 

1459 # normal component. If that assumption is wrong then the 

1460 # branch below that reads a persisted component will fail 

1461 # so there is no need to complain here. 

1462 if compositeStorageClass is not None: 1462 ↛ 1465line 1462 didn't jump to line 1465, because the condition on line 1462 was never false

1463 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1464 

1465 if isDisassembled and not refComponent: 

1466 # This was a disassembled dataset spread over multiple files 

1467 # and we need to put them all back together again. 

1468 # Read into memory and then assemble 

1469 

1470 # Check that the supplied parameters are suitable for the type read 

1471 refStorageClass.validateParameters(parameters) 

1472 

1473 # We want to keep track of all the parameters that were not used 

1474 # by formatters. We assume that if any of the component formatters 

1475 # use a parameter that we do not need to apply it again in the 

1476 # assembler. 

1477 usedParams = set() 

1478 

1479 components: Dict[str, Any] = {} 

1480 for getInfo in allGetInfo: 

1481 # assemblerParams are parameters not understood by the 

1482 # associated formatter. 

1483 usedParams.update(set(getInfo.formatterParams)) 

1484 

1485 component = getInfo.component 

1486 

1487 if component is None: 1487 ↛ 1488line 1487 didn't jump to line 1488, because the condition on line 1487 was never true

1488 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1489 

1490 # We do not want the formatter to think it's reading 

1491 # a component though because it is really reading a 

1492 # standalone dataset -- always tell reader it is not a 

1493 # component. 

1494 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1495 

1496 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1497 

1498 # Any unused parameters will have to be passed to the assembler 

1499 if parameters: 

1500 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1501 else: 

1502 unusedParams = {} 

1503 

1504 # Process parameters 

1505 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1506 parameters=unusedParams) 

1507 

1508 elif isDisassembledReadOnlyComponent: 

1509 

1510 compositeStorageClass = ref.datasetType.parentStorageClass 

1511 if compositeStorageClass is None: 1511 ↛ 1512line 1511 didn't jump to line 1512, because the condition on line 1511 was never true

1512 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since" 

1513 "no composite storage class is available.") 

1514 

1515 if refComponent is None: 1515 ↛ 1517line 1515 didn't jump to line 1517, because the condition on line 1515 was never true

1516 # Mainly for mypy 

1517 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1518 

1519 # Assume that every derived component can be calculated by 

1520 # forwarding the request to a single read/write component. 

1521 # Rather than guessing which rw component is the right one by 

1522 # scanning each for a derived component of the same name, 

1523 # we ask the storage class delegate directly which one is best to 

1524 # use. 

1525 compositeDelegate = compositeStorageClass.delegate() 

1526 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1527 set(allComponents)) 

1528 

1529 # Select the relevant component 

1530 rwInfo = allComponents[forwardedComponent] 

1531 

1532 # For now assume that read parameters are validated against 

1533 # the real component and not the requested component 

1534 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1535 forwardedStorageClass.validateParameters(parameters) 

1536 

1537 # Unfortunately the FileDescriptor inside the formatter will have 

1538 # the wrong write storage class so we need to create a new one 

1539 # given the immutability constraint. 

1540 writeStorageClass = rwInfo.info.storageClass 

1541 

1542 # We may need to put some thought into parameters for read 

1543 # components but for now forward them on as is 

1544 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1545 readStorageClass=refStorageClass, 

1546 storageClass=writeStorageClass, 

1547 parameters=parameters), 

1548 ref.dataId) 

1549 

1550 # The assembler can not receive any parameter requests for a 

1551 # derived component at this time since the assembler will 

1552 # see the storage class of the derived component and those 

1553 # parameters will have to be handled by the formatter on the 

1554 # forwarded storage class. 

1555 assemblerParams: Dict[str, Any] = {} 

1556 

1557 # Need to created a new info that specifies the derived 

1558 # component and associated storage class 

1559 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1560 rwInfo.info, assemblerParams, {}, 

1561 refComponent, refStorageClass) 

1562 

1563 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1564 

1565 else: 

1566 # Single file request or component from that composite file 

1567 for lookup in (refComponent, None): 1567 ↛ 1572line 1567 didn't jump to line 1572, because the loop on line 1567 didn't complete

1568 if lookup in allComponents: 1568 ↛ 1567line 1568 didn't jump to line 1567, because the condition on line 1568 was never false

1569 getInfo = allComponents[lookup] 

1570 break 

1571 else: 

1572 raise FileNotFoundError(f"Component {refComponent} not found " 

1573 f"for ref {ref} in datastore {self.name}") 

1574 

1575 # Do not need the component itself if already disassembled 

1576 if isDisassembled: 

1577 isComponent = False 

1578 else: 

1579 isComponent = getInfo.component is not None 

1580 

1581 # For a disassembled component we can validate parametersagainst 

1582 # the component storage class directly 

1583 if isDisassembled: 

1584 refStorageClass.validateParameters(parameters) 

1585 else: 

1586 # For an assembled composite this could be a derived 

1587 # component derived from a real component. The validity 

1588 # of the parameters is not clear. For now validate against 

1589 # the composite storage class 

1590 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1591 

1592 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 

1593 

1594 @transactional 

1595 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1596 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1597 

1598 Parameters 

1599 ---------- 

1600 inMemoryDataset : `object` 

1601 The dataset to store. 

1602 ref : `DatasetRef` 

1603 Reference to the associated Dataset. 

1604 

1605 Raises 

1606 ------ 

1607 TypeError 

1608 Supplied object and storage class are inconsistent. 

1609 DatasetTypeNotSupportedError 

1610 The associated `DatasetType` is not handled by this datastore. 

1611 

1612 Notes 

1613 ----- 

1614 If the datastore is configured to reject certain dataset types it 

1615 is possible that the put will fail and raise a 

1616 `DatasetTypeNotSupportedError`. The main use case for this is to 

1617 allow `ChainedDatastore` to put to multiple datastores without 

1618 requiring that every datastore accepts the dataset. 

1619 """ 

1620 

1621 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1622 # doDisassembly = True 

1623 

1624 artifacts = [] 

1625 if doDisassembly: 

1626 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1627 for component, componentInfo in components.items(): 

1628 # Don't recurse because we want to take advantage of 

1629 # bulk insert -- need a new DatasetRef that refers to the 

1630 # same dataset_id but has the component DatasetType 

1631 # DatasetType does not refer to the types of components 

1632 # So we construct one ourselves. 

1633 compRef = ref.makeComponentRef(component) 

1634 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1635 artifacts.append((compRef, storedInfo)) 

1636 else: 

1637 # Write the entire thing out 

1638 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1639 artifacts.append((ref, storedInfo)) 

1640 

1641 self._register_datasets(artifacts) 

1642 

1643 @transactional 

1644 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

1645 # Get file metadata and internal metadata 

1646 if not isinstance(ref, DatasetRef): 

1647 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

1648 # Assumed to be an iterable of refs so bulk mode enabled. 

1649 try: 

1650 self.bridge.moveToTrash(ref) 

1651 except Exception as e: 

1652 if ignore_errors: 

1653 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

1654 else: 

1655 raise 

1656 return 

1657 

1658 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

1659 

1660 fileLocations = self._get_dataset_locations_info(ref) 

1661 

1662 if not fileLocations: 

1663 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1664 if ignore_errors: 1664 ↛ 1665line 1664 didn't jump to line 1665, because the condition on line 1664 was never true

1665 log.warning(err_msg) 

1666 return 

1667 else: 

1668 raise FileNotFoundError(err_msg) 

1669 

1670 for location, storedFileInfo in fileLocations: 

1671 if not self._artifact_exists(location): 1671 ↛ 1672line 1671 didn't jump to line 1672, because the condition on line 1671 was never true

1672 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1673 f"associated artifact ({location.uri}) is missing" 

1674 if ignore_errors: 

1675 log.warning(err_msg) 

1676 return 

1677 else: 

1678 raise FileNotFoundError(err_msg) 

1679 

1680 # Mark dataset as trashed 

1681 try: 

1682 self.bridge.moveToTrash([ref]) 

1683 except Exception as e: 

1684 if ignore_errors: 

1685 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1686 f"but encountered an error: {e}") 

1687 pass 

1688 else: 

1689 raise 

1690 

1691 @transactional 

1692 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1693 """Remove all datasets from the trash. 

1694 

1695 Parameters 

1696 ---------- 

1697 ignore_errors : `bool` 

1698 If `True` return without error even if something went wrong. 

1699 Problems could occur if another process is simultaneously trying 

1700 to delete. 

1701 """ 

1702 log.debug("Emptying trash in datastore %s", self.name) 

1703 

1704 # Context manager will empty trash iff we finish it without raising. 

1705 # It will also automatically delete the relevant rows from the 

1706 # trash table and the records table. 

1707 with self.bridge.emptyTrash(self._table, record_class=StoredFileInfo, 

1708 record_column="path") as trash_data: 

1709 # Removing the artifacts themselves requires that the files are 

1710 # not also associated with refs that are not to be trashed. 

1711 # Therefore need to do a query with the file paths themselves 

1712 # and return all the refs associated with them. Can only delete 

1713 # a file if the refs to be trashed are the only refs associated 

1714 # with the file. 

1715 # This requires multiple copies of the trashed items 

1716 trashed, artifacts_to_keep = trash_data 

1717 

1718 if artifacts_to_keep is None: 

1719 # The bridge is not helping us so have to work it out 

1720 # ourselves. This is not going to be as efficient. 

1721 trashed = list(trashed) 

1722 

1723 # The instance check is for mypy since up to this point it 

1724 # does not know the type of info. 

1725 path_map = self._refs_associated_with_artifacts([info.path for _, info in trashed 

1726 if isinstance(info, StoredFileInfo)]) 

1727 

1728 for ref, info in trashed: 

1729 

1730 # Mypy needs to know this is not the base class 

1731 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

1732 

1733 # Check for mypy 

1734 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

1735 

1736 path_map[info.path].remove(ref.id) 

1737 if not path_map[info.path]: 1737 ↛ 1728line 1737 didn't jump to line 1728, because the condition on line 1737 was never false

1738 del path_map[info.path] 

1739 

1740 artifacts_to_keep = set(path_map) 

1741 

1742 for ref, info in trashed: 

1743 

1744 # Should not happen for this implementation but need 

1745 # to keep mypy happy. 

1746 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

1747 

1748 # Mypy needs to know this is not the base class 

1749 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

1750 

1751 # Check for mypy 

1752 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

1753 

1754 if info.path in artifacts_to_keep: 

1755 # This is a multi-dataset artifact and we are not 

1756 # removing all associated refs. 

1757 continue 

1758 

1759 # Only trashed refs still known to datastore will be returned. 

1760 location = info.file_location(self.locationFactory) 

1761 

1762 # Point of no return for this artifact 

1763 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1764 try: 

1765 self._delete_artifact(location) 

1766 except FileNotFoundError: 

1767 # If the file itself has been deleted there is nothing 

1768 # we can do about it. It is possible that trash has 

1769 # been run in parallel in another process or someone 

1770 # decided to delete the file. It is unlikely to come 

1771 # back and so we should still continue with the removal 

1772 # of the entry from the trash table. It is also possible 

1773 # we removed it in a previous iteration if it was 

1774 # a multi-dataset artifact. The delete artifact method 

1775 # will log a debug message in this scenario. 

1776 # Distinguishing file missing before trash started and 

1777 # file already removed previously as part of this trash 

1778 # is not worth the distinction with regards to potential 

1779 # memory cost. 

1780 pass 

1781 except Exception as e: 

1782 if ignore_errors: 

1783 # Use a debug message here even though it's not 

1784 # a good situation. In some cases this can be 

1785 # caused by a race between user A and user B 

1786 # and neither of them has permissions for the 

1787 # other's files. Butler does not know about users 

1788 # and trash has no idea what collections these 

1789 # files were in (without guessing from a path). 

1790 log.debug("Encountered error removing artifact %s from datastore %s: %s", 

1791 location.uri, self.name, e) 

1792 else: 

1793 raise 

1794 

1795 @transactional 

1796 def transfer_from(self, source_datastore: Datastore, refs: Iterable[DatasetRef], 

1797 local_refs: Optional[Iterable[DatasetRef]] = None, 

1798 transfer: str = "auto") -> None: 

1799 # Docstring inherited 

1800 if type(self) is not type(source_datastore): 1800 ↛ 1801line 1800 didn't jump to line 1801, because the condition on line 1800 was never true

1801 raise TypeError(f"Datastore mismatch between this datastore ({type(self)}) and the " 

1802 f"source datastore ({type(source_datastore)}).") 

1803 

1804 # Be explicit for mypy 

1805 if not isinstance(source_datastore, FileDatastore): 1805 ↛ 1806line 1805 didn't jump to line 1806, because the condition on line 1805 was never true

1806 raise TypeError("Can only transfer to a FileDatastore from another FileDatastore, not" 

1807 f" {type(source_datastore)}") 

1808 

1809 # Stop early if "direct" transfer mode is requested. That would 

1810 # require that the URI inside the source datastore should be stored 

1811 # directly in the target datastore, which seems unlikely to be useful 

1812 # since at any moment the source datastore could delete the file. 

1813 if transfer == "direct": 1813 ↛ 1814line 1813 didn't jump to line 1814, because the condition on line 1813 was never true

1814 raise ValueError("Can not transfer from a source datastore using direct mode since" 

1815 " those files are controlled by the other datastore.") 

1816 

1817 # We will go through the list multiple times so must convert 

1818 # generators to lists. 

1819 refs = list(refs) 

1820 

1821 if local_refs is None: 1821 ↛ 1822line 1821 didn't jump to line 1822, because the condition on line 1821 was never true

1822 local_refs = refs 

1823 else: 

1824 local_refs = list(local_refs) 

1825 

1826 # In order to handle disassembled composites the code works 

1827 # at the records level since it can assume that internal APIs 

1828 # can be used. 

1829 # - If the record already exists in the destination this is assumed 

1830 # to be okay. 

1831 # - If there is no record but the source and destination URIs are 

1832 # identical no transfer is done but the record is added. 

1833 # - If the source record refers to an absolute URI currently assume 

1834 # that that URI should remain absolute and will be visible to the 

1835 # destination butler. May need to have a flag to indicate whether 

1836 # the dataset should be transferred. This will only happen if 

1837 # the detached Butler has had a local ingest. 

1838 

1839 # What we really want is all the records in the source datastore 

1840 # associated with these refs. Or derived ones if they don't exist 

1841 # in the source. 

1842 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

1843 

1844 # The source dataset_ids are the keys in these records 

1845 source_ids = set(source_records) 

1846 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

1847 

1848 # The not None check is to appease mypy 

1849 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

1850 missing_ids = requested_ids - source_ids 

1851 

1852 # Missing IDs can be okay if that datastore has allowed 

1853 # gets based on file existence. Should we transfer what we can 

1854 # or complain about it and warn? 

1855 if missing_ids and not source_datastore.trustGetRequest: 1855 ↛ 1856line 1855 didn't jump to line 1856, because the condition on line 1855 was never true

1856 raise ValueError(f"Some datasets are missing from source datastore {source_datastore}:" 

1857 f" {missing_ids}") 

1858 

1859 # Need to map these missing IDs to a DatasetRef so we can guess 

1860 # the details. 

1861 if missing_ids: 1861 ↛ 1862line 1861 didn't jump to line 1862, because the condition on line 1861 was never true

1862 log.info("Number of expected datasets missing from source datastore records: %d out of %d", 

1863 len(missing_ids), len(requested_ids)) 

1864 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

1865 

1866 for missing in missing_ids: 

1867 # Ask the source datastore where the missing artifacts 

1868 # should be. An execution butler might not know about the 

1869 # artifacts even if they are there. 

1870 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

1871 

1872 # Not all components can be guaranteed to exist so this 

1873 # list has to filter those by checking to see if the 

1874 # artifact is really there. 

1875 records = [info for location, info in expected if location.uri.exists()] 

1876 if records: 

1877 source_records[missing].extend(records) 

1878 else: 

1879 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", 

1880 id_to_ref[missing]) 

1881 

1882 # See if we already have these records 

1883 target_records = self._get_stored_records_associated_with_refs(local_refs) 

1884 

1885 # The artifacts to register 

1886 artifacts = [] 

1887 

1888 # Refs that already exist 

1889 already_present = [] 

1890 

1891 # Now can transfer the artifacts 

1892 for source_ref, target_ref in zip(refs, local_refs): 

1893 if target_ref.id in target_records: 1893 ↛ 1895line 1893 didn't jump to line 1895, because the condition on line 1893 was never true

1894 # Already have an artifact for this. 

1895 already_present.append(target_ref) 

1896 continue 

1897 

1898 # mypy needs to know these are always resolved refs 

1899 for info in source_records[source_ref.getCheckedId()]: 

1900 source_location = info.file_location(source_datastore.locationFactory) 

1901 target_location = info.file_location(self.locationFactory) 

1902 if source_location == target_location: 1902 ↛ 1906line 1902 didn't jump to line 1906, because the condition on line 1902 was never true

1903 # Either the dataset is already in the target datastore 

1904 # (which is how execution butler currently runs) or 

1905 # it is an absolute URI. 

1906 if source_location.pathInStore.isabs(): 

1907 # Just because we can see the artifact when running 

1908 # the transfer doesn't mean it will be generally 

1909 # accessible to a user of this butler. For now warn 

1910 # but assume it will be accessible. 

1911 log.warning("Transfer request for an outside-datastore artifact has been found at %s", 

1912 source_location) 

1913 else: 

1914 # Need to transfer it to the new location. 

1915 # Assume we should always overwrite. If the artifact 

1916 # is there this might indicate that a previous transfer 

1917 # was interrupted but was not able to be rolled back 

1918 # completely (eg pre-emption) so follow Datastore default 

1919 # and overwrite. 

1920 target_location.uri.transfer_from(source_location.uri, transfer=transfer, 

1921 overwrite=True, transaction=self._transaction) 

1922 

1923 artifacts.append((target_ref, info)) 

1924 

1925 self._register_datasets(artifacts) 

1926 

1927 if already_present: 1927 ↛ 1928line 1927 didn't jump to line 1928, because the condition on line 1927 was never true

1928 n_skipped = len(already_present) 

1929 log.info("Skipped transfer of %d dataset%s already present in datastore", n_skipped, 

1930 "" if n_skipped == 1 else "s") 

1931 

1932 @transactional 

1933 def forget(self, refs: Iterable[DatasetRef]) -> None: 

1934 # Docstring inherited. 

1935 refs = list(refs) 

1936 self.bridge.forget(refs) 

1937 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

1938 

1939 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1940 logFailures: bool = False) -> None: 

1941 """Validate some of the configuration for this datastore. 

1942 

1943 Parameters 

1944 ---------- 

1945 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1946 Entities to test against this configuration. Can be differing 

1947 types. 

1948 logFailures : `bool`, optional 

1949 If `True`, output a log message for every validation error 

1950 detected. 

1951 

1952 Raises 

1953 ------ 

1954 DatastoreValidationError 

1955 Raised if there is a validation problem with a configuration. 

1956 All the problems are reported in a single exception. 

1957 

1958 Notes 

1959 ----- 

1960 This method checks that all the supplied entities have valid file 

1961 templates and also have formatters defined. 

1962 """ 

1963 

1964 templateFailed = None 

1965 try: 

1966 self.templates.validateTemplates(entities, logFailures=logFailures) 

1967 except FileTemplateValidationError as e: 

1968 templateFailed = str(e) 

1969 

1970 formatterFailed = [] 

1971 for entity in entities: 

1972 try: 

1973 self.formatterFactory.getFormatterClass(entity) 

1974 except KeyError as e: 

1975 formatterFailed.append(str(e)) 

1976 if logFailures: 1976 ↛ 1971line 1976 didn't jump to line 1971, because the condition on line 1976 was never false

1977 log.critical("Formatter failure: %s", e) 

1978 

1979 if templateFailed or formatterFailed: 

1980 messages = [] 

1981 if templateFailed: 1981 ↛ 1982line 1981 didn't jump to line 1982, because the condition on line 1981 was never true

1982 messages.append(templateFailed) 

1983 if formatterFailed: 1983 ↛ 1985line 1983 didn't jump to line 1985, because the condition on line 1983 was never false

1984 messages.append(",".join(formatterFailed)) 

1985 msg = ";\n".join(messages) 

1986 raise DatastoreValidationError(msg) 

1987 

1988 def getLookupKeys(self) -> Set[LookupKey]: 

1989 # Docstring is inherited from base class 

1990 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1991 self.constraints.getLookupKeys() 

1992 

1993 def validateKey(self, lookupKey: LookupKey, 

1994 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1995 # Docstring is inherited from base class 

1996 # The key can be valid in either formatters or templates so we can 

1997 # only check the template if it exists 

1998 if lookupKey in self.templates: 

1999 try: 

2000 self.templates[lookupKey].validateTemplate(entity) 

2001 except FileTemplateValidationError as e: 

2002 raise DatastoreValidationError(e) from e 

2003 

2004 def export(self, refs: Iterable[DatasetRef], *, 

2005 directory: Optional[Union[ButlerURI, str]] = None, 

2006 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

2007 # Docstring inherited from Datastore.export. 

2008 if transfer is not None and directory is None: 2008 ↛ 2009line 2008 didn't jump to line 2009, because the condition on line 2008 was never true

2009 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

2010 "export directory given") 

2011 

2012 # Force the directory to be a URI object 

2013 directoryUri: Optional[ButlerURI] = None 

2014 if directory is not None: 2014 ↛ 2017line 2014 didn't jump to line 2017, because the condition on line 2014 was never false

2015 directoryUri = ButlerURI(directory, forceDirectory=True) 

2016 

2017 if transfer is not None and directoryUri is not None: 2017 ↛ 2022line 2017 didn't jump to line 2022, because the condition on line 2017 was never false

2018 # mypy needs the second test 

2019 if not directoryUri.exists(): 2019 ↛ 2020line 2019 didn't jump to line 2020, because the condition on line 2019 was never true

2020 raise FileNotFoundError(f"Export location {directory} does not exist") 

2021 

2022 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2023 for ref in progress.wrap(refs, "Exporting dataset files"): 

2024 fileLocations = self._get_dataset_locations_info(ref) 

2025 if not fileLocations: 2025 ↛ 2026line 2025 didn't jump to line 2026, because the condition on line 2025 was never true

2026 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2027 # For now we can not export disassembled datasets 

2028 if len(fileLocations) > 1: 2028 ↛ 2029line 2028 didn't jump to line 2029, because the condition on line 2028 was never true

2029 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2030 location, storedFileInfo = fileLocations[0] 

2031 

2032 pathInStore = location.pathInStore.path 

2033 if transfer is None: 2033 ↛ 2036line 2033 didn't jump to line 2036, because the condition on line 2033 was never true

2034 # TODO: do we also need to return the readStorageClass somehow? 

2035 # We will use the path in store directly 

2036 pass 

2037 elif transfer == "direct": 2037 ↛ 2039line 2037 didn't jump to line 2039, because the condition on line 2037 was never true

2038 # Use full URIs to the remote store in the export 

2039 pathInStore = str(location.uri) 

2040 else: 

2041 # mypy needs help 

2042 assert directoryUri is not None, "directoryUri must be defined to get here" 

2043 storeUri = ButlerURI(location.uri) 

2044 

2045 # if the datastore has an absolute URI to a resource, we 

2046 # have two options: 

2047 # 1. Keep the absolute URI in the exported YAML 

2048 # 2. Allocate a new name in the local datastore and transfer 

2049 # it. 

2050 # For now go with option 2 

2051 if location.pathInStore.isabs(): 2051 ↛ 2052line 2051 didn't jump to line 2052, because the condition on line 2051 was never true

2052 template = self.templates.getTemplate(ref) 

2053 newURI = ButlerURI(template.format(ref), forceAbsolute=False) 

2054 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2055 

2056 exportUri = directoryUri.join(pathInStore) 

2057 exportUri.transfer_from(storeUri, transfer=transfer) 

2058 

2059 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2060 

2061 @staticmethod 

2062 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

2063 """Compute the checksum of the supplied file. 

2064 

2065 Parameters 

2066 ---------- 

2067 uri : `ButlerURI` 

2068 Name of resource to calculate checksum from. 

2069 algorithm : `str`, optional 

2070 Name of algorithm to use. Must be one of the algorithms supported 

2071 by :py:class`hashlib`. 

2072 block_size : `int` 

2073 Number of bytes to read from file at one time. 

2074 

2075 Returns 

2076 ------- 

2077 hexdigest : `str` 

2078 Hex digest of the file. 

2079 

2080 Notes 

2081 ----- 

2082 Currently returns None if the URI is for a remote resource. 

2083 """ 

2084 if algorithm not in hashlib.algorithms_guaranteed: 2084 ↛ 2085line 2084 didn't jump to line 2085, because the condition on line 2084 was never true

2085 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

2086 

2087 if not uri.isLocal: 2087 ↛ 2088line 2087 didn't jump to line 2088, because the condition on line 2087 was never true

2088 return None 

2089 

2090 hasher = hashlib.new(algorithm) 

2091 

2092 with uri.as_local() as local_uri: 

2093 with open(local_uri.ospath, "rb") as f: 

2094 for chunk in iter(lambda: f.read(block_size), b""): 

2095 hasher.update(chunk) 

2096 

2097 return hasher.hexdigest() 

2098 

2099 def needs_expanded_data_ids( 

2100 self, 

2101 transfer: Optional[str], 

2102 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2103 ) -> bool: 

2104 # Docstring inherited. 

2105 # This _could_ also use entity to inspect whether the filename template 

2106 # involves placeholders other than the required dimensions for its 

2107 # dataset type, but that's not necessary for correctness; it just 

2108 # enables more optimizations (perhaps only in theory). 

2109 return transfer not in ("direct", None)