Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore", ) 

26 

27import hashlib 

28import logging 

29 

30from sqlalchemy import BigInteger, String 

31 

32from collections import defaultdict 

33from dataclasses import dataclass 

34from typing import ( 

35 TYPE_CHECKING, 

36 Any, 

37 ClassVar, 

38 Dict, 

39 Iterable, 

40 List, 

41 Mapping, 

42 Optional, 

43 Set, 

44 Tuple, 

45 Type, 

46 Union, 

47) 

48 

49from lsst.daf.butler import ( 

50 ButlerURI, 

51 CompositesMap, 

52 Config, 

53 FileDataset, 

54 DatasetId, 

55 DatasetRef, 

56 DatasetType, 

57 DatasetTypeNotSupportedError, 

58 Datastore, 

59 DatastoreCacheManager, 

60 DatastoreDisabledCacheManager, 

61 DatastoreConfig, 

62 DatastoreValidationError, 

63 FileDescriptor, 

64 FileTemplates, 

65 FileTemplateValidationError, 

66 Formatter, 

67 FormatterFactory, 

68 Location, 

69 LocationFactory, 

70 Progress, 

71 StorageClass, 

72 StoredFileInfo, 

73) 

74 

75from lsst.daf.butler import ddl 

76from lsst.daf.butler.registry.interfaces import ( 

77 ReadOnlyDatabaseError, 

78 DatastoreRegistryBridge, 

79) 

80 

81from lsst.daf.butler.core.repoRelocation import replaceRoot 

82from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional, time_this 

83from .genericDatastore import GenericBaseDatastore 

84 

85if TYPE_CHECKING: 85 ↛ 86line 85 didn't jump to line 86, because the condition on line 85 was never true

86 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager 

87 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

88 

89log = logging.getLogger(__name__) 

90 

91 

92class _IngestPrepData(Datastore.IngestPrepData): 

93 """Helper class for FileDatastore ingest implementation. 

94 

95 Parameters 

96 ---------- 

97 datasets : `list` of `FileDataset` 

98 Files to be ingested by this datastore. 

99 """ 

100 def __init__(self, datasets: List[FileDataset]): 

101 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

102 self.datasets = datasets 

103 

104 

105@dataclass(frozen=True) 

106class DatastoreFileGetInformation: 

107 """Collection of useful parameters needed to retrieve a file from 

108 a Datastore. 

109 """ 

110 

111 location: Location 

112 """The location from which to read the dataset.""" 

113 

114 formatter: Formatter 

115 """The `Formatter` to use to deserialize the dataset.""" 

116 

117 info: StoredFileInfo 

118 """Stored information about this file and its formatter.""" 

119 

120 assemblerParams: Dict[str, Any] 

121 """Parameters to use for post-processing the retrieved dataset.""" 

122 

123 formatterParams: Dict[str, Any] 

124 """Parameters that were understood by the associated formatter.""" 

125 

126 component: Optional[str] 

127 """The component to be retrieved (can be `None`).""" 

128 

129 readStorageClass: StorageClass 

130 """The `StorageClass` of the dataset being read.""" 

131 

132 

133class FileDatastore(GenericBaseDatastore): 

134 """Generic Datastore for file-based implementations. 

135 

136 Should always be sub-classed since key abstract methods are missing. 

137 

138 Parameters 

139 ---------- 

140 config : `DatastoreConfig` or `str` 

141 Configuration as either a `Config` object or URI to file. 

142 bridgeManager : `DatastoreRegistryBridgeManager` 

143 Object that manages the interface between `Registry` and datastores. 

144 butlerRoot : `str`, optional 

145 New datastore root to use to override the configuration value. 

146 

147 Raises 

148 ------ 

149 ValueError 

150 If root location does not exist and ``create`` is `False` in the 

151 configuration. 

152 """ 

153 

154 defaultConfigFile: ClassVar[Optional[str]] = None 

155 """Path to configuration defaults. Accessed within the ``config`` resource 

156 or relative to a search path. Can be None if no defaults specified. 

157 """ 

158 

159 root: ButlerURI 

160 """Root directory URI of this `Datastore`.""" 

161 

162 locationFactory: LocationFactory 

163 """Factory for creating locations relative to the datastore root.""" 

164 

165 formatterFactory: FormatterFactory 

166 """Factory for creating instances of formatters.""" 

167 

168 templates: FileTemplates 

169 """File templates that can be used by this `Datastore`.""" 

170 

171 composites: CompositesMap 

172 """Determines whether a dataset should be disassembled on put.""" 

173 

174 defaultConfigFile = "datastores/fileDatastore.yaml" 

175 """Path to configuration defaults. Accessed within the ``config`` resource 

176 or relative to a search path. Can be None if no defaults specified. 

177 """ 

178 

179 @classmethod 

180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

181 """Set any filesystem-dependent config options for this Datastore to 

182 be appropriate for a new empty repository with the given root. 

183 

184 Parameters 

185 ---------- 

186 root : `str` 

187 URI to the root of the data repository. 

188 config : `Config` 

189 A `Config` to update. Only the subset understood by 

190 this component will be updated. Will not expand 

191 defaults. 

192 full : `Config` 

193 A complete config with all defaults expanded that can be 

194 converted to a `DatastoreConfig`. Read-only and will not be 

195 modified by this method. 

196 Repository-specific options that should not be obtained 

197 from defaults when Butler instances are constructed 

198 should be copied from ``full`` to ``config``. 

199 overwrite : `bool`, optional 

200 If `False`, do not modify a value in ``config`` if the value 

201 already exists. Default is always to overwrite with the provided 

202 ``root``. 

203 

204 Notes 

205 ----- 

206 If a keyword is explicitly defined in the supplied ``config`` it 

207 will not be overridden by this method if ``overwrite`` is `False`. 

208 This allows explicit values set in external configs to be retained. 

209 """ 

210 Config.updateParameters(DatastoreConfig, config, full, 

211 toUpdate={"root": root}, 

212 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

213 

214 @classmethod 

215 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

216 return ddl.TableSpec( 

217 fields=[ 

218 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

219 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

220 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

221 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

222 # Use empty string to indicate no component 

223 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

224 # TODO: should checksum be Base64Bytes instead? 

225 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

226 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

227 ], 

228 unique=frozenset(), 

229 indexes=[tuple(["path"])], 

230 ) 

231 

232 def __init__(self, config: Union[DatastoreConfig, str], 

233 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

234 super().__init__(config, bridgeManager) 

235 if "root" not in self.config: 235 ↛ 236line 235 didn't jump to line 236, because the condition on line 235 was never true

236 raise ValueError("No root directory specified in configuration") 

237 

238 # Name ourselves either using an explicit name or a name 

239 # derived from the (unexpanded) root 

240 if "name" in self.config: 

241 self.name = self.config["name"] 

242 else: 

243 # We use the unexpanded root in the name to indicate that this 

244 # datastore can be moved without having to update registry. 

245 self.name = "{}@{}".format(type(self).__name__, 

246 self.config["root"]) 

247 

248 # Support repository relocation in config 

249 # Existence of self.root is checked in subclass 

250 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

251 forceDirectory=True, forceAbsolute=True) 

252 

253 self.locationFactory = LocationFactory(self.root) 

254 self.formatterFactory = FormatterFactory() 

255 

256 # Now associate formatters with storage classes 

257 self.formatterFactory.registerFormatters(self.config["formatters"], 

258 universe=bridgeManager.universe) 

259 

260 # Read the file naming templates 

261 self.templates = FileTemplates(self.config["templates"], 

262 universe=bridgeManager.universe) 

263 

264 # See if composites should be disassembled 

265 self.composites = CompositesMap(self.config["composites"], 

266 universe=bridgeManager.universe) 

267 

268 tableName = self.config["records", "table"] 

269 try: 

270 # Storage of paths and formatters, keyed by dataset_id 

271 self._table = bridgeManager.opaque.register( 

272 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)) 

273 # Interface to Registry. 

274 self._bridge = bridgeManager.register(self.name) 

275 except ReadOnlyDatabaseError: 

276 # If the database is read only and we just tried and failed to 

277 # create a table, it means someone is trying to create a read-only 

278 # butler client for an empty repo. That should be okay, as long 

279 # as they then try to get any datasets before some other client 

280 # creates the table. Chances are they'rejust validating 

281 # configuration. 

282 pass 

283 

284 # Determine whether checksums should be used - default to False 

285 self.useChecksum = self.config.get("checksum", False) 

286 

287 # Determine whether we can fall back to configuration if a 

288 # requested dataset is not known to registry 

289 self.trustGetRequest = self.config.get("trust_get_request", False) 

290 

291 # Create a cache manager 

292 self.cacheManager: AbstractDatastoreCacheManager 

293 if "cached" in self.config: 293 ↛ 297line 293 didn't jump to line 297, because the condition on line 293 was never false

294 self.cacheManager = DatastoreCacheManager(self.config["cached"], 

295 universe=bridgeManager.universe) 

296 else: 

297 self.cacheManager = DatastoreDisabledCacheManager("", 

298 universe=bridgeManager.universe) 

299 

300 # Check existence and create directory structure if necessary 

301 if not self.root.exists(): 

302 if "create" not in self.config or not self.config["create"]: 302 ↛ 303line 302 didn't jump to line 303, because the condition on line 302 was never true

303 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

304 try: 

305 self.root.mkdir() 

306 except Exception as e: 

307 raise ValueError(f"Can not create datastore root '{self.root}', check permissions." 

308 f" Got error: {e}") from e 

309 

310 def __str__(self) -> str: 

311 return str(self.root) 

312 

313 @property 

314 def bridge(self) -> DatastoreRegistryBridge: 

315 return self._bridge 

316 

317 def _artifact_exists(self, location: Location) -> bool: 

318 """Check that an artifact exists in this datastore at the specified 

319 location. 

320 

321 Parameters 

322 ---------- 

323 location : `Location` 

324 Expected location of the artifact associated with this datastore. 

325 

326 Returns 

327 ------- 

328 exists : `bool` 

329 True if the location can be found, false otherwise. 

330 """ 

331 log.debug("Checking if resource exists: %s", location.uri) 

332 return location.uri.exists() 

333 

334 def _delete_artifact(self, location: Location) -> None: 

335 """Delete the artifact from the datastore. 

336 

337 Parameters 

338 ---------- 

339 location : `Location` 

340 Location of the artifact associated with this datastore. 

341 """ 

342 if location.pathInStore.isabs(): 342 ↛ 343line 342 didn't jump to line 343, because the condition on line 342 was never true

343 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

344 

345 try: 

346 location.uri.remove() 

347 except FileNotFoundError: 

348 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

349 raise 

350 except Exception as e: 

351 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

352 raise 

353 log.debug("Successfully deleted file: %s", location.uri) 

354 

355 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

356 # Docstring inherited from GenericBaseDatastore 

357 records = [info.to_record(ref) for ref, info in zip(refs, infos)] 

358 self._table.insert(*records) 

359 

360 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

361 # Docstring inherited from GenericBaseDatastore 

362 

363 # Look for the dataset_id -- there might be multiple matches 

364 # if we have disassembled the dataset. 

365 records = self._table.fetch(dataset_id=ref.id) 

366 return [StoredFileInfo.from_record(record) for record in records] 

367 

368 def _get_stored_records_associated_with_refs(self, 

369 refs: Iterable[DatasetIdRef] 

370 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

371 """Retrieve all records associated with the provided refs. 

372 

373 Parameters 

374 ---------- 

375 refs : iterable of `DatasetIdRef` 

376 The refs for which records are to be retrieved. 

377 

378 Returns 

379 ------- 

380 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

381 The matching records indexed by the ref ID. The number of entries 

382 in the dict can be smaller than the number of requested refs. 

383 """ 

384 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

385 

386 # Uniqueness is dataset_id + component so can have multiple records 

387 # per ref. 

388 records_by_ref = defaultdict(list) 

389 for record in records: 

390 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

391 return records_by_ref 

392 

393 def _refs_associated_with_artifacts(self, paths: List[Union[str, ButlerURI]]) -> Dict[str, 

394 Set[DatasetId]]: 

395 """Return paths and associated dataset refs. 

396 

397 Parameters 

398 ---------- 

399 paths : `list` of `str` or `ButlerURI` 

400 All the paths to include in search. 

401 

402 Returns 

403 ------- 

404 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

405 Mapping of each path to a set of associated database IDs. 

406 """ 

407 records = self._table.fetch(path=[str(path) for path in paths]) 

408 result = defaultdict(set) 

409 for row in records: 

410 result[row["path"]].add(row["dataset_id"]) 

411 return result 

412 

413 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]: 

414 """Return all dataset refs associated with the supplied path. 

415 

416 Parameters 

417 ---------- 

418 pathInStore : `ButlerURI` 

419 Path of interest in the data store. 

420 

421 Returns 

422 ------- 

423 ids : `set` of `int` 

424 All `DatasetRef` IDs associated with this path. 

425 """ 

426 records = list(self._table.fetch(path=str(pathInStore))) 

427 ids = {r["dataset_id"] for r in records} 

428 return ids 

429 

430 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

431 # Docstring inherited from GenericBaseDatastore 

432 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

433 

434 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

435 r"""Find all the `Location`\ s of the requested dataset in the 

436 `Datastore` and the associated stored file information. 

437 

438 Parameters 

439 ---------- 

440 ref : `DatasetRef` 

441 Reference to the required `Dataset`. 

442 

443 Returns 

444 ------- 

445 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

446 Location of the dataset within the datastore and 

447 stored information about each file and its formatter. 

448 """ 

449 # Get the file information (this will fail if no file) 

450 records = self.getStoredItemsInfo(ref) 

451 

452 # Use the path to determine the location -- we need to take 

453 # into account absolute URIs in the datastore record 

454 return [(r.file_location(self.locationFactory), r) for r in records] 

455 

456 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

457 """Check that there is only one dataset associated with the 

458 specified artifact. 

459 

460 Parameters 

461 ---------- 

462 ref : `DatasetRef` or `FakeDatasetRef` 

463 Dataset to be removed. 

464 location : `Location` 

465 The location of the artifact to be removed. 

466 

467 Returns 

468 ------- 

469 can_remove : `Bool` 

470 True if the artifact can be safely removed. 

471 """ 

472 # Can't ever delete absolute URIs. 

473 if location.pathInStore.isabs(): 

474 return False 

475 

476 # Get all entries associated with this path 

477 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

478 if not allRefs: 

479 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

480 

481 # Remove these refs from all the refs and if there is nothing left 

482 # then we can delete 

483 remainingRefs = allRefs - {ref.id} 

484 

485 if remainingRefs: 

486 return False 

487 return True 

488 

489 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, 

490 StoredFileInfo]]: 

491 """Predict the location and related file information of the requested 

492 dataset in this datastore. 

493 

494 Parameters 

495 ---------- 

496 ref : `DatasetRef` 

497 Reference to the required `Dataset`. 

498 

499 Returns 

500 ------- 

501 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

502 Expected Location of the dataset within the datastore and 

503 placeholder information about each file and its formatter. 

504 

505 Notes 

506 ----- 

507 Uses the current configuration to determine how we would expect the 

508 datastore files to have been written if we couldn't ask registry. 

509 This is safe so long as there has been no change to datastore 

510 configuration between writing the dataset and wanting to read it. 

511 Will not work for files that have been ingested without using the 

512 standard file template or default formatter. 

513 """ 

514 

515 # If we have a component ref we always need to ask the questions 

516 # of the composite. If the composite is disassembled this routine 

517 # should return all components. If the composite was not 

518 # disassembled the composite is what is stored regardless of 

519 # component request. Note that if the caller has disassembled 

520 # a composite there is no way for this guess to know that 

521 # without trying both the composite and component ref and seeing 

522 # if there is something at the component Location even without 

523 # disassembly being enabled. 

524 if ref.datasetType.isComponent(): 

525 ref = ref.makeCompositeRef() 

526 

527 # See if the ref is a composite that should be disassembled 

528 doDisassembly = self.composites.shouldBeDisassembled(ref) 

529 

530 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

531 

532 if doDisassembly: 

533 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

534 compRef = ref.makeComponentRef(component) 

535 location, formatter = self._determine_put_formatter_location(compRef) 

536 all_info.append((location, formatter, componentStorage, component)) 

537 

538 else: 

539 # Always use the composite ref if no disassembly 

540 location, formatter = self._determine_put_formatter_location(ref) 

541 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

542 

543 # Convert the list of tuples to have StoredFileInfo as second element 

544 return [(location, StoredFileInfo(formatter=formatter, 

545 path=location.pathInStore.path, 

546 storageClass=storageClass, 

547 component=component, 

548 checksum=None, 

549 file_size=-1)) 

550 for location, formatter, storageClass, component in all_info] 

551 

552 def _prepare_for_get(self, ref: DatasetRef, 

553 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

554 """Check parameters for ``get`` and obtain formatter and 

555 location. 

556 

557 Parameters 

558 ---------- 

559 ref : `DatasetRef` 

560 Reference to the required Dataset. 

561 parameters : `dict` 

562 `StorageClass`-specific parameters that specify, for example, 

563 a slice of the dataset to be loaded. 

564 

565 Returns 

566 ------- 

567 getInfo : `list` [`DatastoreFileGetInformation`] 

568 Parameters needed to retrieve each file. 

569 """ 

570 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

571 

572 # Get file metadata and internal metadata 

573 fileLocations = self._get_dataset_locations_info(ref) 

574 if not fileLocations: 

575 if not self.trustGetRequest: 

576 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

577 # Assume the dataset is where we think it should be 

578 fileLocations = self._get_expected_dataset_locations_info(ref) 

579 

580 # The storage class we want to use eventually 

581 refStorageClass = ref.datasetType.storageClass 

582 

583 if len(fileLocations) > 1: 

584 disassembled = True 

585 

586 # If trust is involved it is possible that there will be 

587 # components listed here that do not exist in the datastore. 

588 # Explicitly check for file artifact existence and filter out any 

589 # that are missing. 

590 if self.trustGetRequest: 

591 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

592 

593 # For now complain only if we have no components at all. One 

594 # component is probably a problem but we can punt that to the 

595 # assembler. 

596 if not fileLocations: 596 ↛ 597line 596 didn't jump to line 597, because the condition on line 596 was never true

597 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

598 

599 else: 

600 disassembled = False 

601 

602 # Is this a component request? 

603 refComponent = ref.datasetType.component() 

604 

605 fileGetInfo = [] 

606 for location, storedFileInfo in fileLocations: 

607 

608 # The storage class used to write the file 

609 writeStorageClass = storedFileInfo.storageClass 

610 

611 # If this has been disassembled we need read to match the write 

612 if disassembled: 

613 readStorageClass = writeStorageClass 

614 else: 

615 readStorageClass = refStorageClass 

616 

617 formatter = getInstanceOf(storedFileInfo.formatter, 

618 FileDescriptor(location, readStorageClass=readStorageClass, 

619 storageClass=writeStorageClass, parameters=parameters), 

620 ref.dataId) 

621 

622 formatterParams, notFormatterParams = formatter.segregateParameters() 

623 

624 # Of the remaining parameters, extract the ones supported by 

625 # this StorageClass (for components not all will be handled) 

626 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

627 

628 # The ref itself could be a component if the dataset was 

629 # disassembled by butler, or we disassembled in datastore and 

630 # components came from the datastore records 

631 component = storedFileInfo.component if storedFileInfo.component else refComponent 

632 

633 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

634 assemblerParams, formatterParams, 

635 component, readStorageClass)) 

636 

637 return fileGetInfo 

638 

639 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

640 """Check the arguments for ``put`` and obtain formatter and 

641 location. 

642 

643 Parameters 

644 ---------- 

645 inMemoryDataset : `object` 

646 The dataset to store. 

647 ref : `DatasetRef` 

648 Reference to the associated Dataset. 

649 

650 Returns 

651 ------- 

652 location : `Location` 

653 The location to write the dataset. 

654 formatter : `Formatter` 

655 The `Formatter` to use to write the dataset. 

656 

657 Raises 

658 ------ 

659 TypeError 

660 Supplied object and storage class are inconsistent. 

661 DatasetTypeNotSupportedError 

662 The associated `DatasetType` is not handled by this datastore. 

663 """ 

664 self._validate_put_parameters(inMemoryDataset, ref) 

665 return self._determine_put_formatter_location(ref) 

666 

667 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

668 """Calculate the formatter and output location to use for put. 

669 

670 Parameters 

671 ---------- 

672 ref : `DatasetRef` 

673 Reference to the associated Dataset. 

674 

675 Returns 

676 ------- 

677 location : `Location` 

678 The location to write the dataset. 

679 formatter : `Formatter` 

680 The `Formatter` to use to write the dataset. 

681 """ 

682 # Work out output file name 

683 try: 

684 template = self.templates.getTemplate(ref) 

685 except KeyError as e: 

686 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

687 

688 # Validate the template to protect against filenames from different 

689 # dataIds returning the same and causing overwrite confusion. 

690 template.validateTemplate(ref) 

691 

692 location = self.locationFactory.fromPath(template.format(ref)) 

693 

694 # Get the formatter based on the storage class 

695 storageClass = ref.datasetType.storageClass 

696 try: 

697 formatter = self.formatterFactory.getFormatter(ref, 

698 FileDescriptor(location, 

699 storageClass=storageClass), 

700 ref.dataId) 

701 except KeyError as e: 

702 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

703 f"{self.name}") from e 

704 

705 # Now that we know the formatter, update the location 

706 location = formatter.makeUpdatedLocation(location) 

707 

708 return location, formatter 

709 

710 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

711 # Docstring inherited from base class 

712 if transfer != "auto": 

713 return transfer 

714 

715 # See if the paths are within the datastore or not 

716 inside = [self._pathInStore(d.path) is not None for d in datasets] 

717 

718 if all(inside): 

719 transfer = None 

720 elif not any(inside): 720 ↛ 729line 720 didn't jump to line 729, because the condition on line 720 was never false

721 # Allow ButlerURI to use its own knowledge 

722 transfer = "auto" 

723 else: 

724 # This can happen when importing from a datastore that 

725 # has had some datasets ingested using "direct" mode. 

726 # Also allow ButlerURI to sort it out but warn about it. 

727 # This can happen if you are importing from a datastore 

728 # that had some direct transfer datasets. 

729 log.warning("Some datasets are inside the datastore and some are outside. Using 'split' " 

730 "transfer mode. This assumes that the files outside the datastore are " 

731 "still accessible to the new butler since they will not be copied into " 

732 "the target datastore.") 

733 transfer = "split" 

734 

735 return transfer 

736 

737 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]: 

738 """Return path relative to datastore root 

739 

740 Parameters 

741 ---------- 

742 path : `str` or `ButlerURI` 

743 Path to dataset. Can be absolute URI. If relative assumed to 

744 be relative to the datastore. Returns path in datastore 

745 or raises an exception if the path it outside. 

746 

747 Returns 

748 ------- 

749 inStore : `str` 

750 Path relative to datastore root. Returns `None` if the file is 

751 outside the root. 

752 """ 

753 # Relative path will always be relative to datastore 

754 pathUri = ButlerURI(path, forceAbsolute=False) 

755 return pathUri.relative_to(self.root) 

756 

757 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *, 

758 transfer: Optional[str] = None) -> Union[str, ButlerURI]: 

759 """Standardize the path of a to-be-ingested file. 

760 

761 Parameters 

762 ---------- 

763 path : `str` or `ButlerURI` 

764 Path of a file to be ingested. 

765 transfer : `str`, optional 

766 How (and whether) the dataset should be added to the datastore. 

767 See `ingest` for details of transfer modes. 

768 This implementation is provided only so 

769 `NotImplementedError` can be raised if the mode is not supported; 

770 actual transfers are deferred to `_extractIngestInfo`. 

771 

772 Returns 

773 ------- 

774 path : `str` or `ButlerURI` 

775 New path in what the datastore considers standard form. If an 

776 absolute URI was given that will be returned unchanged. 

777 

778 Notes 

779 ----- 

780 Subclasses of `FileDatastore` can implement this method instead 

781 of `_prepIngest`. It should not modify the data repository or given 

782 file in any way. 

783 

784 Raises 

785 ------ 

786 NotImplementedError 

787 Raised if the datastore does not support the given transfer mode 

788 (including the case where ingest is not supported at all). 

789 FileNotFoundError 

790 Raised if one of the given files does not exist. 

791 """ 

792 if transfer not in (None, "direct", "split") + self.root.transferModes: 792 ↛ 793line 792 didn't jump to line 793, because the condition on line 792 was never true

793 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

794 

795 # A relative URI indicates relative to datastore root 

796 srcUri = ButlerURI(path, forceAbsolute=False) 

797 if not srcUri.isabs(): 

798 srcUri = self.root.join(path) 

799 

800 if not srcUri.exists(): 

801 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

802 f"are assumed to be relative to {self.root} unless they are absolute.") 

803 

804 if transfer is None: 

805 relpath = srcUri.relative_to(self.root) 

806 if not relpath: 

807 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

808 f"within datastore ({self.root})") 

809 

810 # Return the relative path within the datastore for internal 

811 # transfer 

812 path = relpath 

813 

814 return path 

815 

816 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

817 formatter: Union[Formatter, Type[Formatter]], 

818 transfer: Optional[str] = None) -> StoredFileInfo: 

819 """Relocate (if necessary) and extract `StoredFileInfo` from a 

820 to-be-ingested file. 

821 

822 Parameters 

823 ---------- 

824 path : `str` or `ButlerURI` 

825 URI or path of a file to be ingested. 

826 ref : `DatasetRef` 

827 Reference for the dataset being ingested. Guaranteed to have 

828 ``dataset_id not None`. 

829 formatter : `type` or `Formatter` 

830 `Formatter` subclass to use for this dataset or an instance. 

831 transfer : `str`, optional 

832 How (and whether) the dataset should be added to the datastore. 

833 See `ingest` for details of transfer modes. 

834 

835 Returns 

836 ------- 

837 info : `StoredFileInfo` 

838 Internal datastore record for this file. This will be inserted by 

839 the caller; the `_extractIngestInfo` is only resposible for 

840 creating and populating the struct. 

841 

842 Raises 

843 ------ 

844 FileNotFoundError 

845 Raised if one of the given files does not exist. 

846 FileExistsError 

847 Raised if transfer is not `None` but the (internal) location the 

848 file would be moved to is already occupied. 

849 """ 

850 if self._transaction is None: 850 ↛ 851line 850 didn't jump to line 851, because the condition on line 850 was never true

851 raise RuntimeError("Ingest called without transaction enabled") 

852 

853 # Create URI of the source path, do not need to force a relative 

854 # path to absolute. 

855 srcUri = ButlerURI(path, forceAbsolute=False) 

856 

857 # Track whether we have read the size of the source yet 

858 have_sized = False 

859 

860 tgtLocation: Optional[Location] 

861 if transfer is None or transfer == "split": 

862 # A relative path is assumed to be relative to the datastore 

863 # in this context 

864 if not srcUri.isabs(): 

865 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

866 else: 

867 # Work out the path in the datastore from an absolute URI 

868 # This is required to be within the datastore. 

869 pathInStore = srcUri.relative_to(self.root) 

870 if pathInStore is None and transfer is None: 870 ↛ 871line 870 didn't jump to line 871, because the condition on line 870 was never true

871 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

872 f"not within datastore {self.root}") 

873 if pathInStore: 873 ↛ 875line 873 didn't jump to line 875, because the condition on line 873 was never false

874 tgtLocation = self.locationFactory.fromPath(pathInStore) 

875 elif transfer == "split": 

876 # Outside the datastore but treat that as a direct ingest 

877 # instead. 

878 tgtLocation = None 

879 else: 

880 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for" 

881 f" URI {srcUri}") 

882 elif transfer == "direct": 882 ↛ 887line 882 didn't jump to line 887, because the condition on line 882 was never true

883 # Want to store the full URI to the resource directly in 

884 # datastore. This is useful for referring to permanent archive 

885 # storage for raw data. 

886 # Trust that people know what they are doing. 

887 tgtLocation = None 

888 else: 

889 # Work out the name we want this ingested file to have 

890 # inside the datastore 

891 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

892 if not tgtLocation.uri.dirname().exists(): 

893 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

894 tgtLocation.uri.dirname().mkdir() 

895 

896 # if we are transferring from a local file to a remote location 

897 # it may be more efficient to get the size and checksum of the 

898 # local file rather than the transferred one 

899 if not srcUri.scheme or srcUri.scheme == "file": 899 ↛ 905line 899 didn't jump to line 905, because the condition on line 899 was never false

900 size = srcUri.size() 

901 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

902 have_sized = True 

903 

904 # transfer the resource to the destination 

905 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

906 

907 if tgtLocation is None: 907 ↛ 909line 907 didn't jump to line 909, because the condition on line 907 was never true

908 # This means we are using direct mode 

909 targetUri = srcUri 

910 targetPath = str(srcUri) 

911 else: 

912 targetUri = tgtLocation.uri 

913 targetPath = tgtLocation.pathInStore.path 

914 

915 # the file should exist in the datastore now 

916 if not have_sized: 

917 size = targetUri.size() 

918 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

919 

920 return StoredFileInfo(formatter=formatter, path=targetPath, 

921 storageClass=ref.datasetType.storageClass, 

922 component=ref.datasetType.component(), 

923 file_size=size, checksum=checksum) 

924 

925 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

926 # Docstring inherited from Datastore._prepIngest. 

927 filtered = [] 

928 for dataset in datasets: 

929 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

930 if not acceptable: 

931 continue 

932 else: 

933 dataset.refs = acceptable 

934 if dataset.formatter is None: 

935 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

936 else: 

937 assert isinstance(dataset.formatter, (type, str)) 

938 dataset.formatter = getClassOf(dataset.formatter) 

939 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

940 filtered.append(dataset) 

941 return _IngestPrepData(filtered) 

942 

943 @transactional 

944 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

945 # Docstring inherited from Datastore._finishIngest. 

946 refsAndInfos = [] 

947 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

948 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

949 # Do ingest as if the first dataset ref is associated with the file 

950 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

951 transfer=transfer) 

952 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

953 self._register_datasets(refsAndInfos) 

954 

955 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

956 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

957 """Given a source URI and a DatasetRef, determine the name the 

958 dataset will have inside datastore. 

959 

960 Parameters 

961 ---------- 

962 srcUri : `ButlerURI` 

963 URI to the source dataset file. 

964 ref : `DatasetRef` 

965 Ref associated with the newly-ingested dataset artifact. This 

966 is used to determine the name within the datastore. 

967 formatter : `Formatter` or Formatter class. 

968 Formatter to use for validation. Can be a class or an instance. 

969 

970 Returns 

971 ------- 

972 location : `Location` 

973 Target location for the newly-ingested dataset. 

974 """ 

975 # Ingesting a file from outside the datastore. 

976 # This involves a new name. 

977 template = self.templates.getTemplate(ref) 

978 location = self.locationFactory.fromPath(template.format(ref)) 

979 

980 # Get the extension 

981 ext = srcUri.getExtension() 

982 

983 # Update the destination to include that extension 

984 location.updateExtension(ext) 

985 

986 # Ask the formatter to validate this extension 

987 formatter.validateExtension(location) 

988 

989 return location 

990 

991 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

992 """Write out in memory dataset to datastore. 

993 

994 Parameters 

995 ---------- 

996 inMemoryDataset : `object` 

997 Dataset to write to datastore. 

998 ref : `DatasetRef` 

999 Registry information associated with this dataset. 

1000 

1001 Returns 

1002 ------- 

1003 info : `StoredFileInfo` 

1004 Information describin the artifact written to the datastore. 

1005 """ 

1006 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1007 uri = location.uri 

1008 

1009 if not uri.dirname().exists(): 

1010 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1011 uri.dirname().mkdir() 

1012 

1013 if self._transaction is None: 1013 ↛ 1014line 1013 didn't jump to line 1014, because the condition on line 1013 was never true

1014 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1015 

1016 def _removeFileExists(uri: ButlerURI) -> None: 

1017 """Remove a file and do not complain if it is not there. 

1018 

1019 This is important since a formatter might fail before the file 

1020 is written and we should not confuse people by writing spurious 

1021 error messages to the log. 

1022 """ 

1023 try: 

1024 uri.remove() 

1025 except FileNotFoundError: 

1026 pass 

1027 

1028 # Register a callback to try to delete the uploaded data if 

1029 # something fails below 

1030 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1031 

1032 # For a local file, simply use the formatter directly 

1033 if uri.isLocal: 

1034 try: 

1035 formatter.write(inMemoryDataset) 

1036 except Exception as e: 

1037 raise RuntimeError(f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} " 

1038 f"to location {uri}") from e 

1039 log.debug("Successfully wrote python object to local file at %s", uri) 

1040 else: 

1041 # This is a remote URI. Some datasets can be serialized directly 

1042 # to bytes and sent to the remote datastore without writing a 

1043 # file. If the dataset is intended to be saved to the cache 

1044 # a file is always written and direct write to the remote 

1045 # datastore is bypassed. 

1046 data_written = False 

1047 if not self.cacheManager.should_be_cached(ref): 

1048 try: 

1049 serializedDataset = formatter.toBytes(inMemoryDataset) 

1050 except NotImplementedError: 

1051 # Fallback to the file writing option. 

1052 pass 

1053 except Exception as e: 

1054 raise RuntimeError(f"Failed to serialize dataset {ref} " 

1055 f"of type {type(inMemoryDataset)} to bytes.") from e 

1056 else: 

1057 log.debug("Writing bytes directly to %s", uri) 

1058 uri.write(serializedDataset, overwrite=True) 

1059 log.debug("Successfully wrote bytes directly to %s", uri) 

1060 data_written = True 

1061 

1062 if not data_written: 

1063 # Did not write the bytes directly to object store so instead 

1064 # write to temporary file. 

1065 with ButlerURI.temporary_uri(suffix=uri.getExtension()) as temporary_uri: 

1066 # Need to configure the formatter to write to a different 

1067 # location and that needs us to overwrite internals 

1068 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1069 with formatter._updateLocation(Location(None, temporary_uri)): 

1070 try: 

1071 formatter.write(inMemoryDataset) 

1072 except Exception as e: 

1073 raise RuntimeError(f"Failed to serialize dataset {ref} of type" 

1074 f" {type(inMemoryDataset)} to " 

1075 f"temporary location {temporary_uri}") from e 

1076 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True) 

1077 

1078 # Cache if required 

1079 self.cacheManager.move_to_cache(temporary_uri, ref) 

1080 

1081 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1082 

1083 # URI is needed to resolve what ingest case are we dealing with 

1084 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1085 

1086 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

1087 ref: DatasetRef, isComponent: bool = False, 

1088 cache_ref: Optional[DatasetRef] = None) -> Any: 

1089 """Read the artifact from datastore into in memory object. 

1090 

1091 Parameters 

1092 ---------- 

1093 getInfo : `DatastoreFileGetInformation` 

1094 Information about the artifact within the datastore. 

1095 ref : `DatasetRef` 

1096 The registry information associated with this artifact. 

1097 isComponent : `bool` 

1098 Flag to indicate if a component is being read from this artifact. 

1099 cache_ref : `DatasetRef`, optional 

1100 The DatasetRef to use when looking up the file in the cache. 

1101 This ref must have the same ID as the supplied ref but can 

1102 be a parent ref or component ref to indicate to the cache whether 

1103 a composite file is being requested from the cache or a component 

1104 file. Without this the cache will default to the supplied ref but 

1105 it can get confused with read-only derived components for 

1106 disassembled composites. 

1107 

1108 Returns 

1109 ------- 

1110 inMemoryDataset : `object` 

1111 The artifact as a python object. 

1112 """ 

1113 location = getInfo.location 

1114 uri = location.uri 

1115 log.debug("Accessing data from %s", uri) 

1116 

1117 if cache_ref is None: 

1118 cache_ref = ref 

1119 if cache_ref.id != ref.id: 1119 ↛ 1120line 1119 didn't jump to line 1120, because the condition on line 1119 was never true

1120 raise ValueError("The supplied cache dataset ref refers to a different dataset than expected:" 

1121 f" {ref.id} != {cache_ref.id}") 

1122 

1123 # Cannot recalculate checksum but can compare size as a quick check 

1124 # Do not do this if the size is negative since that indicates 

1125 # we do not know. 

1126 recorded_size = getInfo.info.file_size 

1127 resource_size = uri.size() 

1128 if recorded_size >= 0 and resource_size != recorded_size: 1128 ↛ 1129line 1128 didn't jump to line 1129, because the condition on line 1128 was never true

1129 raise RuntimeError("Integrity failure in Datastore. " 

1130 f"Size of file {uri} ({resource_size}) " 

1131 f"does not match size recorded in registry of {recorded_size}") 

1132 

1133 # For the general case we have choices for how to proceed. 

1134 # 1. Always use a local file (downloading the remote resource to a 

1135 # temporary file if needed). 

1136 # 2. Use a threshold size and read into memory and use bytes. 

1137 # Use both for now with an arbitrary hand off size. 

1138 # This allows small datasets to be downloaded from remote object 

1139 # stores without requiring a temporary file. 

1140 

1141 formatter = getInfo.formatter 

1142 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1143 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1144 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1145 if cached_file is not None: 

1146 desired_uri = cached_file 

1147 msg = f" (cached version of {uri})" 

1148 else: 

1149 desired_uri = uri 

1150 msg = "" 

1151 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1152 serializedDataset = desired_uri.read() 

1153 log.debug("Deserializing %s from %d bytes from location %s with formatter %s", 

1154 f"component {getInfo.component}" if isComponent else "", 

1155 len(serializedDataset), uri, formatter.name()) 

1156 try: 

1157 result = formatter.fromBytes(serializedDataset, 

1158 component=getInfo.component if isComponent else None) 

1159 except Exception as e: 

1160 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1161 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1162 else: 

1163 # Read from file. 

1164 

1165 # Have to update the Location associated with the formatter 

1166 # because formatter.read does not allow an override. 

1167 # This could be improved. 

1168 location_updated = False 

1169 msg = "" 

1170 

1171 # First check in cache for local version. 

1172 # The cache will only be relevant for remote resources but 

1173 # no harm in always asking. Context manager ensures that cache 

1174 # file is not deleted during cache expiration. 

1175 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1176 if cached_file is not None: 

1177 msg = f"(via cache read of remote file {uri})" 

1178 uri = cached_file 

1179 location_updated = True 

1180 

1181 with uri.as_local() as local_uri: 

1182 

1183 can_be_cached = False 

1184 if uri != local_uri: 1184 ↛ 1186line 1184 didn't jump to line 1186, because the condition on line 1184 was never true

1185 # URI was remote and file was downloaded 

1186 cache_msg = "" 

1187 location_updated = True 

1188 

1189 if self.cacheManager.should_be_cached(cache_ref): 

1190 # In this scenario we want to ask if the downloaded 

1191 # file should be cached but we should not cache 

1192 # it until after we've used it (to ensure it can't 

1193 # be expired whilst we are using it). 

1194 can_be_cached = True 

1195 

1196 # Say that it is "likely" to be cached because 

1197 # if the formatter read fails we will not be 

1198 # caching this file. 

1199 cache_msg = " and likely cached" 

1200 

1201 msg = f"(via download to local file{cache_msg})" 

1202 

1203 # Calculate the (possibly) new location for the formatter 

1204 # to use. 

1205 newLocation = Location(*local_uri.split()) if location_updated else None 

1206 

1207 log.debug("Reading%s from location %s %s with formatter %s", 

1208 f" component {getInfo.component}" if isComponent else "", 

1209 uri, msg, formatter.name()) 

1210 try: 

1211 with formatter._updateLocation(newLocation): 

1212 with time_this(log, msg="Reading%s from location %s %s with formatter %s", 

1213 args=(f" component {getInfo.component}" if isComponent else "", 

1214 uri, msg, formatter.name())): 

1215 result = formatter.read(component=getInfo.component if isComponent else None) 

1216 except Exception as e: 

1217 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1218 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1219 

1220 # File was read successfully so can move to cache 

1221 if can_be_cached: 1221 ↛ 1222line 1221 didn't jump to line 1222, because the condition on line 1221 was never true

1222 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1223 

1224 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

1225 isComponent=isComponent) 

1226 

1227 def knows(self, ref: DatasetRef) -> bool: 

1228 """Check if the dataset is known to the datastore. 

1229 

1230 Does not check for existence of any artifact. 

1231 

1232 Parameters 

1233 ---------- 

1234 ref : `DatasetRef` 

1235 Reference to the required dataset. 

1236 

1237 Returns 

1238 ------- 

1239 exists : `bool` 

1240 `True` if the dataset is known to the datastore. 

1241 """ 

1242 fileLocations = self._get_dataset_locations_info(ref) 

1243 if fileLocations: 

1244 return True 

1245 return False 

1246 

1247 def exists(self, ref: DatasetRef) -> bool: 

1248 """Check if the dataset exists in the datastore. 

1249 

1250 Parameters 

1251 ---------- 

1252 ref : `DatasetRef` 

1253 Reference to the required dataset. 

1254 

1255 Returns 

1256 ------- 

1257 exists : `bool` 

1258 `True` if the entity exists in the `Datastore`. 

1259 """ 

1260 fileLocations = self._get_dataset_locations_info(ref) 

1261 

1262 # if we are being asked to trust that registry might not be correct 

1263 # we ask for the expected locations and check them explicitly 

1264 if not fileLocations: 

1265 if not self.trustGetRequest: 

1266 return False 

1267 

1268 # When we are guessing a dataset location we can not check 

1269 # for the existence of every component since we can not 

1270 # know if every component was written. Instead we check 

1271 # for the existence of any of the expected locations. 

1272 for location, _ in self._get_expected_dataset_locations_info(ref): 1272 ↛ 1275line 1272 didn't jump to line 1275, because the loop on line 1272 didn't complete

1273 if self._artifact_exists(location): 1273 ↛ 1272line 1273 didn't jump to line 1272, because the condition on line 1273 was never false

1274 return True 

1275 return False 

1276 

1277 # All listed artifacts must exist. 

1278 for location, _ in fileLocations: 

1279 if not self._artifact_exists(location): 

1280 return False 

1281 

1282 return True 

1283 

1284 def getURIs(self, ref: DatasetRef, 

1285 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1286 """Return URIs associated with dataset. 

1287 

1288 Parameters 

1289 ---------- 

1290 ref : `DatasetRef` 

1291 Reference to the required dataset. 

1292 predict : `bool`, optional 

1293 If the datastore does not know about the dataset, should it 

1294 return a predicted URI or not? 

1295 

1296 Returns 

1297 ------- 

1298 primary : `ButlerURI` 

1299 The URI to the primary artifact associated with this dataset. 

1300 If the dataset was disassembled within the datastore this 

1301 may be `None`. 

1302 components : `dict` 

1303 URIs to any components associated with the dataset artifact. 

1304 Can be empty if there are no components. 

1305 """ 

1306 

1307 primary: Optional[ButlerURI] = None 

1308 components: Dict[str, ButlerURI] = {} 

1309 

1310 # if this has never been written then we have to guess 

1311 if not self.exists(ref): 

1312 if not predict: 

1313 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1314 

1315 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1316 

1317 if doDisassembly: 

1318 

1319 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1320 compRef = ref.makeComponentRef(component) 

1321 compLocation, _ = self._determine_put_formatter_location(compRef) 

1322 

1323 # Add a URI fragment to indicate this is a guess 

1324 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1325 

1326 else: 

1327 

1328 location, _ = self._determine_put_formatter_location(ref) 

1329 

1330 # Add a URI fragment to indicate this is a guess 

1331 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1332 

1333 return primary, components 

1334 

1335 # If this is a ref that we have written we can get the path. 

1336 # Get file metadata and internal metadata 

1337 fileLocations = self._get_dataset_locations_info(ref) 

1338 

1339 guessing = False 

1340 if not fileLocations: 

1341 if not self.trustGetRequest: 1341 ↛ 1342line 1341 didn't jump to line 1342, because the condition on line 1341 was never true

1342 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1343 fileLocations = self._get_expected_dataset_locations_info(ref) 

1344 guessing = True 

1345 

1346 if len(fileLocations) == 1: 

1347 # No disassembly so this is the primary URI 

1348 uri = fileLocations[0][0].uri 

1349 if guessing and not uri.exists(): 1349 ↛ 1350line 1349 didn't jump to line 1350, because the condition on line 1349 was never true

1350 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1351 primary = uri 

1352 

1353 else: 

1354 for location, storedFileInfo in fileLocations: 

1355 if storedFileInfo.component is None: 1355 ↛ 1356line 1355 didn't jump to line 1356, because the condition on line 1355 was never true

1356 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1357 uri = location.uri 

1358 if guessing and not uri.exists(): 1358 ↛ 1362line 1358 didn't jump to line 1362, because the condition on line 1358 was never true

1359 # If we are trusting then it is entirely possible for 

1360 # some components to be missing. In that case we skip 

1361 # to the next component. 

1362 if self.trustGetRequest: 

1363 continue 

1364 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1365 components[storedFileInfo.component] = uri 

1366 

1367 return primary, components 

1368 

1369 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1370 """URI to the Dataset. 

1371 

1372 Parameters 

1373 ---------- 

1374 ref : `DatasetRef` 

1375 Reference to the required Dataset. 

1376 predict : `bool` 

1377 If `True`, allow URIs to be returned of datasets that have not 

1378 been written. 

1379 

1380 Returns 

1381 ------- 

1382 uri : `str` 

1383 URI pointing to the dataset within the datastore. If the 

1384 dataset does not exist in the datastore, and if ``predict`` is 

1385 `True`, the URI will be a prediction and will include a URI 

1386 fragment "#predicted". 

1387 If the datastore does not have entities that relate well 

1388 to the concept of a URI the returned URI will be 

1389 descriptive. The returned URI is not guaranteed to be obtainable. 

1390 

1391 Raises 

1392 ------ 

1393 FileNotFoundError 

1394 Raised if a URI has been requested for a dataset that does not 

1395 exist and guessing is not allowed. 

1396 RuntimeError 

1397 Raised if a request is made for a single URI but multiple URIs 

1398 are associated with this dataset. 

1399 

1400 Notes 

1401 ----- 

1402 When a predicted URI is requested an attempt will be made to form 

1403 a reasonable URI based on file templates and the expected formatter. 

1404 """ 

1405 primary, components = self.getURIs(ref, predict) 

1406 if primary is None or components: 1406 ↛ 1407line 1406 didn't jump to line 1407, because the condition on line 1406 was never true

1407 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

1408 "Use Dataastore.getURIs() instead.") 

1409 return primary 

1410 

1411 def retrieveArtifacts(self, refs: Iterable[DatasetRef], 

1412 destination: ButlerURI, transfer: str = "auto", 

1413 preserve_path: bool = True, 

1414 overwrite: bool = False) -> List[ButlerURI]: 

1415 """Retrieve the file artifacts associated with the supplied refs. 

1416 

1417 Parameters 

1418 ---------- 

1419 refs : iterable of `DatasetRef` 

1420 The datasets for which file artifacts are to be retrieved. 

1421 A single ref can result in multiple files. The refs must 

1422 be resolved. 

1423 destination : `ButlerURI` 

1424 Location to write the file artifacts. 

1425 transfer : `str`, optional 

1426 Method to use to transfer the artifacts. Must be one of the options 

1427 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

1428 preserve_path : `bool`, optional 

1429 If `True` the full path of the file artifact within the datastore 

1430 is preserved. If `False` the final file component of the path 

1431 is used. 

1432 overwrite : `bool`, optional 

1433 If `True` allow transfers to overwrite existing files at the 

1434 destination. 

1435 

1436 Returns 

1437 ------- 

1438 targets : `list` of `ButlerURI` 

1439 URIs of file artifacts in destination location. Order is not 

1440 preserved. 

1441 """ 

1442 if not destination.isdir(): 1442 ↛ 1443line 1442 didn't jump to line 1443, because the condition on line 1442 was never true

1443 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1444 

1445 if transfer == "move": 

1446 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1447 

1448 # Source -> Destination 

1449 # This also helps filter out duplicate DatasetRef in the request 

1450 # that will map to the same underlying file transfer. 

1451 to_transfer: Dict[ButlerURI, ButlerURI] = {} 

1452 

1453 for ref in refs: 

1454 locations = self._get_dataset_locations_info(ref) 

1455 for location, _ in locations: 

1456 source_uri = location.uri 

1457 target_path: Union[str, ButlerURI] 

1458 if preserve_path: 

1459 target_path = location.pathInStore 

1460 if target_path.isabs(): 1460 ↛ 1463line 1460 didn't jump to line 1463, because the condition on line 1460 was never true

1461 # This is an absolute path to an external file. 

1462 # Use the full path. 

1463 target_path = target_path.relativeToPathRoot 

1464 else: 

1465 target_path = source_uri.basename() 

1466 target_uri = destination.join(target_path) 

1467 to_transfer[source_uri] = target_uri 

1468 

1469 # In theory can now parallelize the transfer 

1470 log.debug("Number of artifacts to transfer to %s: %d", 

1471 str(destination), len(to_transfer)) 

1472 for source_uri, target_uri in to_transfer.items(): 

1473 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1474 

1475 return list(to_transfer.values()) 

1476 

1477 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1478 """Load an InMemoryDataset from the store. 

1479 

1480 Parameters 

1481 ---------- 

1482 ref : `DatasetRef` 

1483 Reference to the required Dataset. 

1484 parameters : `dict` 

1485 `StorageClass`-specific parameters that specify, for example, 

1486 a slice of the dataset to be loaded. 

1487 

1488 Returns 

1489 ------- 

1490 inMemoryDataset : `object` 

1491 Requested dataset or slice thereof as an InMemoryDataset. 

1492 

1493 Raises 

1494 ------ 

1495 FileNotFoundError 

1496 Requested dataset can not be retrieved. 

1497 TypeError 

1498 Return value from formatter has unexpected type. 

1499 ValueError 

1500 Formatter failed to process the dataset. 

1501 """ 

1502 allGetInfo = self._prepare_for_get(ref, parameters) 

1503 refComponent = ref.datasetType.component() 

1504 

1505 # Supplied storage class for the component being read 

1506 refStorageClass = ref.datasetType.storageClass 

1507 

1508 # Create mapping from component name to related info 

1509 allComponents = {i.component: i for i in allGetInfo} 

1510 

1511 # By definition the dataset is disassembled if we have more 

1512 # than one record for it. 

1513 isDisassembled = len(allGetInfo) > 1 

1514 

1515 # Look for the special case where we are disassembled but the 

1516 # component is a derived component that was not written during 

1517 # disassembly. For this scenario we need to check that the 

1518 # component requested is listed as a derived component for the 

1519 # composite storage class 

1520 isDisassembledReadOnlyComponent = False 

1521 if isDisassembled and refComponent: 

1522 # The composite storage class should be accessible through 

1523 # the component dataset type 

1524 compositeStorageClass = ref.datasetType.parentStorageClass 

1525 

1526 # In the unlikely scenario where the composite storage 

1527 # class is not known, we can only assume that this is a 

1528 # normal component. If that assumption is wrong then the 

1529 # branch below that reads a persisted component will fail 

1530 # so there is no need to complain here. 

1531 if compositeStorageClass is not None: 1531 ↛ 1534line 1531 didn't jump to line 1534, because the condition on line 1531 was never false

1532 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1533 

1534 if isDisassembled and not refComponent: 

1535 # This was a disassembled dataset spread over multiple files 

1536 # and we need to put them all back together again. 

1537 # Read into memory and then assemble 

1538 

1539 # Check that the supplied parameters are suitable for the type read 

1540 refStorageClass.validateParameters(parameters) 

1541 

1542 # We want to keep track of all the parameters that were not used 

1543 # by formatters. We assume that if any of the component formatters 

1544 # use a parameter that we do not need to apply it again in the 

1545 # assembler. 

1546 usedParams = set() 

1547 

1548 components: Dict[str, Any] = {} 

1549 for getInfo in allGetInfo: 

1550 # assemblerParams are parameters not understood by the 

1551 # associated formatter. 

1552 usedParams.update(set(getInfo.formatterParams)) 

1553 

1554 component = getInfo.component 

1555 

1556 if component is None: 1556 ↛ 1557line 1556 didn't jump to line 1557, because the condition on line 1556 was never true

1557 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1558 

1559 # We do not want the formatter to think it's reading 

1560 # a component though because it is really reading a 

1561 # standalone dataset -- always tell reader it is not a 

1562 # component. 

1563 components[component] = self._read_artifact_into_memory(getInfo, 

1564 ref.makeComponentRef(component), 

1565 isComponent=False) 

1566 

1567 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1568 

1569 # Any unused parameters will have to be passed to the assembler 

1570 if parameters: 

1571 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1572 else: 

1573 unusedParams = {} 

1574 

1575 # Process parameters 

1576 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1577 parameters=unusedParams) 

1578 

1579 elif isDisassembledReadOnlyComponent: 

1580 

1581 compositeStorageClass = ref.datasetType.parentStorageClass 

1582 if compositeStorageClass is None: 1582 ↛ 1583line 1582 didn't jump to line 1583, because the condition on line 1582 was never true

1583 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since" 

1584 "no composite storage class is available.") 

1585 

1586 if refComponent is None: 1586 ↛ 1588line 1586 didn't jump to line 1588, because the condition on line 1586 was never true

1587 # Mainly for mypy 

1588 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1589 

1590 # Assume that every derived component can be calculated by 

1591 # forwarding the request to a single read/write component. 

1592 # Rather than guessing which rw component is the right one by 

1593 # scanning each for a derived component of the same name, 

1594 # we ask the storage class delegate directly which one is best to 

1595 # use. 

1596 compositeDelegate = compositeStorageClass.delegate() 

1597 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1598 set(allComponents)) 

1599 

1600 # Select the relevant component 

1601 rwInfo = allComponents[forwardedComponent] 

1602 

1603 # For now assume that read parameters are validated against 

1604 # the real component and not the requested component 

1605 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1606 forwardedStorageClass.validateParameters(parameters) 

1607 

1608 # The reference to use for the caching must refer to the forwarded 

1609 # component and not the derived component. 

1610 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

1611 

1612 # Unfortunately the FileDescriptor inside the formatter will have 

1613 # the wrong write storage class so we need to create a new one 

1614 # given the immutability constraint. 

1615 writeStorageClass = rwInfo.info.storageClass 

1616 

1617 # We may need to put some thought into parameters for read 

1618 # components but for now forward them on as is 

1619 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1620 readStorageClass=refStorageClass, 

1621 storageClass=writeStorageClass, 

1622 parameters=parameters), 

1623 ref.dataId) 

1624 

1625 # The assembler can not receive any parameter requests for a 

1626 # derived component at this time since the assembler will 

1627 # see the storage class of the derived component and those 

1628 # parameters will have to be handled by the formatter on the 

1629 # forwarded storage class. 

1630 assemblerParams: Dict[str, Any] = {} 

1631 

1632 # Need to created a new info that specifies the derived 

1633 # component and associated storage class 

1634 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1635 rwInfo.info, assemblerParams, {}, 

1636 refComponent, refStorageClass) 

1637 

1638 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, 

1639 cache_ref=cache_ref) 

1640 

1641 else: 

1642 # Single file request or component from that composite file 

1643 for lookup in (refComponent, None): 1643 ↛ 1648line 1643 didn't jump to line 1648, because the loop on line 1643 didn't complete

1644 if lookup in allComponents: 1644 ↛ 1643line 1644 didn't jump to line 1643, because the condition on line 1644 was never false

1645 getInfo = allComponents[lookup] 

1646 break 

1647 else: 

1648 raise FileNotFoundError(f"Component {refComponent} not found " 

1649 f"for ref {ref} in datastore {self.name}") 

1650 

1651 # Do not need the component itself if already disassembled 

1652 if isDisassembled: 

1653 isComponent = False 

1654 else: 

1655 isComponent = getInfo.component is not None 

1656 

1657 # For a component read of a composite we want the cache to 

1658 # be looking at the composite ref itself. 

1659 cache_ref = ref.makeCompositeRef() if isComponent else ref 

1660 

1661 # For a disassembled component we can validate parametersagainst 

1662 # the component storage class directly 

1663 if isDisassembled: 

1664 refStorageClass.validateParameters(parameters) 

1665 else: 

1666 # For an assembled composite this could be a derived 

1667 # component derived from a real component. The validity 

1668 # of the parameters is not clear. For now validate against 

1669 # the composite storage class 

1670 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1671 

1672 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, 

1673 cache_ref=cache_ref) 

1674 

1675 @transactional 

1676 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1677 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1678 

1679 Parameters 

1680 ---------- 

1681 inMemoryDataset : `object` 

1682 The dataset to store. 

1683 ref : `DatasetRef` 

1684 Reference to the associated Dataset. 

1685 

1686 Raises 

1687 ------ 

1688 TypeError 

1689 Supplied object and storage class are inconsistent. 

1690 DatasetTypeNotSupportedError 

1691 The associated `DatasetType` is not handled by this datastore. 

1692 

1693 Notes 

1694 ----- 

1695 If the datastore is configured to reject certain dataset types it 

1696 is possible that the put will fail and raise a 

1697 `DatasetTypeNotSupportedError`. The main use case for this is to 

1698 allow `ChainedDatastore` to put to multiple datastores without 

1699 requiring that every datastore accepts the dataset. 

1700 """ 

1701 

1702 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1703 # doDisassembly = True 

1704 

1705 artifacts = [] 

1706 if doDisassembly: 

1707 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1708 for component, componentInfo in components.items(): 

1709 # Don't recurse because we want to take advantage of 

1710 # bulk insert -- need a new DatasetRef that refers to the 

1711 # same dataset_id but has the component DatasetType 

1712 # DatasetType does not refer to the types of components 

1713 # So we construct one ourselves. 

1714 compRef = ref.makeComponentRef(component) 

1715 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1716 artifacts.append((compRef, storedInfo)) 

1717 else: 

1718 # Write the entire thing out 

1719 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1720 artifacts.append((ref, storedInfo)) 

1721 

1722 self._register_datasets(artifacts) 

1723 

1724 @transactional 

1725 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

1726 # At this point can safely remove these datasets from the cache 

1727 # to avoid confusion later on. If they are not trashed later 

1728 # the cache will simply be refilled. 

1729 self.cacheManager.remove_from_cache(ref) 

1730 

1731 # Get file metadata and internal metadata 

1732 if not isinstance(ref, DatasetRef): 

1733 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

1734 # Assumed to be an iterable of refs so bulk mode enabled. 

1735 try: 

1736 self.bridge.moveToTrash(ref) 

1737 except Exception as e: 

1738 if ignore_errors: 

1739 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

1740 else: 

1741 raise 

1742 return 

1743 

1744 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

1745 

1746 fileLocations = self._get_dataset_locations_info(ref) 

1747 

1748 if not fileLocations: 

1749 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1750 if ignore_errors: 1750 ↛ 1751line 1750 didn't jump to line 1751, because the condition on line 1750 was never true

1751 log.warning(err_msg) 

1752 return 

1753 else: 

1754 raise FileNotFoundError(err_msg) 

1755 

1756 for location, storedFileInfo in fileLocations: 

1757 if not self._artifact_exists(location): 1757 ↛ 1758line 1757 didn't jump to line 1758, because the condition on line 1757 was never true

1758 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1759 f"associated artifact ({location.uri}) is missing" 

1760 if ignore_errors: 

1761 log.warning(err_msg) 

1762 return 

1763 else: 

1764 raise FileNotFoundError(err_msg) 

1765 

1766 # Mark dataset as trashed 

1767 try: 

1768 self.bridge.moveToTrash([ref]) 

1769 except Exception as e: 

1770 if ignore_errors: 

1771 log.warning("Attempted to mark dataset (%s) to be trashed in datastore %s " 

1772 "but encountered an error: %s", ref, self.name, e) 

1773 pass 

1774 else: 

1775 raise 

1776 

1777 @transactional 

1778 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1779 """Remove all datasets from the trash. 

1780 

1781 Parameters 

1782 ---------- 

1783 ignore_errors : `bool` 

1784 If `True` return without error even if something went wrong. 

1785 Problems could occur if another process is simultaneously trying 

1786 to delete. 

1787 """ 

1788 log.debug("Emptying trash in datastore %s", self.name) 

1789 

1790 # Context manager will empty trash iff we finish it without raising. 

1791 # It will also automatically delete the relevant rows from the 

1792 # trash table and the records table. 

1793 with self.bridge.emptyTrash(self._table, record_class=StoredFileInfo, 

1794 record_column="path") as trash_data: 

1795 # Removing the artifacts themselves requires that the files are 

1796 # not also associated with refs that are not to be trashed. 

1797 # Therefore need to do a query with the file paths themselves 

1798 # and return all the refs associated with them. Can only delete 

1799 # a file if the refs to be trashed are the only refs associated 

1800 # with the file. 

1801 # This requires multiple copies of the trashed items 

1802 trashed, artifacts_to_keep = trash_data 

1803 

1804 if artifacts_to_keep is None: 

1805 # The bridge is not helping us so have to work it out 

1806 # ourselves. This is not going to be as efficient. 

1807 trashed = list(trashed) 

1808 

1809 # The instance check is for mypy since up to this point it 

1810 # does not know the type of info. 

1811 path_map = self._refs_associated_with_artifacts([info.path for _, info in trashed 

1812 if isinstance(info, StoredFileInfo)]) 

1813 

1814 for ref, info in trashed: 

1815 

1816 # Mypy needs to know this is not the base class 

1817 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

1818 

1819 # Check for mypy 

1820 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

1821 

1822 path_map[info.path].remove(ref.id) 

1823 if not path_map[info.path]: 1823 ↛ 1814line 1823 didn't jump to line 1814, because the condition on line 1823 was never false

1824 del path_map[info.path] 

1825 

1826 artifacts_to_keep = set(path_map) 

1827 

1828 for ref, info in trashed: 

1829 

1830 # Should not happen for this implementation but need 

1831 # to keep mypy happy. 

1832 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

1833 

1834 # Mypy needs to know this is not the base class 

1835 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

1836 

1837 # Check for mypy 

1838 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

1839 

1840 if info.path in artifacts_to_keep: 

1841 # This is a multi-dataset artifact and we are not 

1842 # removing all associated refs. 

1843 continue 

1844 

1845 # Only trashed refs still known to datastore will be returned. 

1846 location = info.file_location(self.locationFactory) 

1847 

1848 # Point of no return for this artifact 

1849 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1850 try: 

1851 self._delete_artifact(location) 

1852 except FileNotFoundError: 

1853 # If the file itself has been deleted there is nothing 

1854 # we can do about it. It is possible that trash has 

1855 # been run in parallel in another process or someone 

1856 # decided to delete the file. It is unlikely to come 

1857 # back and so we should still continue with the removal 

1858 # of the entry from the trash table. It is also possible 

1859 # we removed it in a previous iteration if it was 

1860 # a multi-dataset artifact. The delete artifact method 

1861 # will log a debug message in this scenario. 

1862 # Distinguishing file missing before trash started and 

1863 # file already removed previously as part of this trash 

1864 # is not worth the distinction with regards to potential 

1865 # memory cost. 

1866 pass 

1867 except Exception as e: 

1868 if ignore_errors: 

1869 # Use a debug message here even though it's not 

1870 # a good situation. In some cases this can be 

1871 # caused by a race between user A and user B 

1872 # and neither of them has permissions for the 

1873 # other's files. Butler does not know about users 

1874 # and trash has no idea what collections these 

1875 # files were in (without guessing from a path). 

1876 log.debug("Encountered error removing artifact %s from datastore %s: %s", 

1877 location.uri, self.name, e) 

1878 else: 

1879 raise 

1880 

1881 @transactional 

1882 def transfer_from(self, source_datastore: Datastore, refs: Iterable[DatasetRef], 

1883 local_refs: Optional[Iterable[DatasetRef]] = None, 

1884 transfer: str = "auto") -> None: 

1885 # Docstring inherited 

1886 if type(self) is not type(source_datastore): 1886 ↛ 1887line 1886 didn't jump to line 1887, because the condition on line 1886 was never true

1887 raise TypeError(f"Datastore mismatch between this datastore ({type(self)}) and the " 

1888 f"source datastore ({type(source_datastore)}).") 

1889 

1890 # Be explicit for mypy 

1891 if not isinstance(source_datastore, FileDatastore): 1891 ↛ 1892line 1891 didn't jump to line 1892, because the condition on line 1891 was never true

1892 raise TypeError("Can only transfer to a FileDatastore from another FileDatastore, not" 

1893 f" {type(source_datastore)}") 

1894 

1895 # Stop early if "direct" transfer mode is requested. That would 

1896 # require that the URI inside the source datastore should be stored 

1897 # directly in the target datastore, which seems unlikely to be useful 

1898 # since at any moment the source datastore could delete the file. 

1899 if transfer in ("direct", "split"): 1899 ↛ 1900line 1899 didn't jump to line 1900, because the condition on line 1899 was never true

1900 raise ValueError("Can not transfer from a source datastore using direct mode since" 

1901 " those files are controlled by the other datastore.") 

1902 

1903 # We will go through the list multiple times so must convert 

1904 # generators to lists. 

1905 refs = list(refs) 

1906 

1907 if local_refs is None: 1907 ↛ 1908line 1907 didn't jump to line 1908, because the condition on line 1907 was never true

1908 local_refs = refs 

1909 else: 

1910 local_refs = list(local_refs) 

1911 

1912 # In order to handle disassembled composites the code works 

1913 # at the records level since it can assume that internal APIs 

1914 # can be used. 

1915 # - If the record already exists in the destination this is assumed 

1916 # to be okay. 

1917 # - If there is no record but the source and destination URIs are 

1918 # identical no transfer is done but the record is added. 

1919 # - If the source record refers to an absolute URI currently assume 

1920 # that that URI should remain absolute and will be visible to the 

1921 # destination butler. May need to have a flag to indicate whether 

1922 # the dataset should be transferred. This will only happen if 

1923 # the detached Butler has had a local ingest. 

1924 

1925 # What we really want is all the records in the source datastore 

1926 # associated with these refs. Or derived ones if they don't exist 

1927 # in the source. 

1928 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

1929 

1930 # The source dataset_ids are the keys in these records 

1931 source_ids = set(source_records) 

1932 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

1933 

1934 # The not None check is to appease mypy 

1935 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

1936 missing_ids = requested_ids - source_ids 

1937 

1938 # Missing IDs can be okay if that datastore has allowed 

1939 # gets based on file existence. Should we transfer what we can 

1940 # or complain about it and warn? 

1941 if missing_ids and not source_datastore.trustGetRequest: 1941 ↛ 1942line 1941 didn't jump to line 1942, because the condition on line 1941 was never true

1942 raise ValueError(f"Some datasets are missing from source datastore {source_datastore}:" 

1943 f" {missing_ids}") 

1944 

1945 # Need to map these missing IDs to a DatasetRef so we can guess 

1946 # the details. 

1947 if missing_ids: 1947 ↛ 1948line 1947 didn't jump to line 1948, because the condition on line 1947 was never true

1948 log.info("Number of expected datasets missing from source datastore records: %d out of %d", 

1949 len(missing_ids), len(requested_ids)) 

1950 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

1951 

1952 for missing in missing_ids: 

1953 # Ask the source datastore where the missing artifacts 

1954 # should be. An execution butler might not know about the 

1955 # artifacts even if they are there. 

1956 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

1957 

1958 # Not all components can be guaranteed to exist so this 

1959 # list has to filter those by checking to see if the 

1960 # artifact is really there. 

1961 records = [info for location, info in expected if location.uri.exists()] 

1962 if records: 

1963 source_records[missing].extend(records) 

1964 else: 

1965 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", 

1966 id_to_ref[missing]) 

1967 

1968 # See if we already have these records 

1969 target_records = self._get_stored_records_associated_with_refs(local_refs) 

1970 

1971 # The artifacts to register 

1972 artifacts = [] 

1973 

1974 # Refs that already exist 

1975 already_present = [] 

1976 

1977 # Now can transfer the artifacts 

1978 for source_ref, target_ref in zip(refs, local_refs): 

1979 if target_ref.id in target_records: 1979 ↛ 1981line 1979 didn't jump to line 1981, because the condition on line 1979 was never true

1980 # Already have an artifact for this. 

1981 already_present.append(target_ref) 

1982 continue 

1983 

1984 # mypy needs to know these are always resolved refs 

1985 for info in source_records[source_ref.getCheckedId()]: 

1986 source_location = info.file_location(source_datastore.locationFactory) 

1987 target_location = info.file_location(self.locationFactory) 

1988 if source_location == target_location: 1988 ↛ 1992line 1988 didn't jump to line 1992, because the condition on line 1988 was never true

1989 # Either the dataset is already in the target datastore 

1990 # (which is how execution butler currently runs) or 

1991 # it is an absolute URI. 

1992 if source_location.pathInStore.isabs(): 

1993 # Just because we can see the artifact when running 

1994 # the transfer doesn't mean it will be generally 

1995 # accessible to a user of this butler. For now warn 

1996 # but assume it will be accessible. 

1997 log.warning("Transfer request for an outside-datastore artifact has been found at %s", 

1998 source_location) 

1999 else: 

2000 # Need to transfer it to the new location. 

2001 # Assume we should always overwrite. If the artifact 

2002 # is there this might indicate that a previous transfer 

2003 # was interrupted but was not able to be rolled back 

2004 # completely (eg pre-emption) so follow Datastore default 

2005 # and overwrite. 

2006 target_location.uri.transfer_from(source_location.uri, transfer=transfer, 

2007 overwrite=True, transaction=self._transaction) 

2008 

2009 artifacts.append((target_ref, info)) 

2010 

2011 self._register_datasets(artifacts) 

2012 

2013 if already_present: 2013 ↛ 2014line 2013 didn't jump to line 2014, because the condition on line 2013 was never true

2014 n_skipped = len(already_present) 

2015 log.info("Skipped transfer of %d dataset%s already present in datastore", n_skipped, 

2016 "" if n_skipped == 1 else "s") 

2017 

2018 @transactional 

2019 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2020 # Docstring inherited. 

2021 refs = list(refs) 

2022 self.bridge.forget(refs) 

2023 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

2024 

2025 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

2026 logFailures: bool = False) -> None: 

2027 """Validate some of the configuration for this datastore. 

2028 

2029 Parameters 

2030 ---------- 

2031 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2032 Entities to test against this configuration. Can be differing 

2033 types. 

2034 logFailures : `bool`, optional 

2035 If `True`, output a log message for every validation error 

2036 detected. 

2037 

2038 Raises 

2039 ------ 

2040 DatastoreValidationError 

2041 Raised if there is a validation problem with a configuration. 

2042 All the problems are reported in a single exception. 

2043 

2044 Notes 

2045 ----- 

2046 This method checks that all the supplied entities have valid file 

2047 templates and also have formatters defined. 

2048 """ 

2049 

2050 templateFailed = None 

2051 try: 

2052 self.templates.validateTemplates(entities, logFailures=logFailures) 

2053 except FileTemplateValidationError as e: 

2054 templateFailed = str(e) 

2055 

2056 formatterFailed = [] 

2057 for entity in entities: 

2058 try: 

2059 self.formatterFactory.getFormatterClass(entity) 

2060 except KeyError as e: 

2061 formatterFailed.append(str(e)) 

2062 if logFailures: 2062 ↛ 2057line 2062 didn't jump to line 2057, because the condition on line 2062 was never false

2063 log.critical("Formatter failure: %s", e) 

2064 

2065 if templateFailed or formatterFailed: 

2066 messages = [] 

2067 if templateFailed: 2067 ↛ 2068line 2067 didn't jump to line 2068, because the condition on line 2067 was never true

2068 messages.append(templateFailed) 

2069 if formatterFailed: 2069 ↛ 2071line 2069 didn't jump to line 2071, because the condition on line 2069 was never false

2070 messages.append(",".join(formatterFailed)) 

2071 msg = ";\n".join(messages) 

2072 raise DatastoreValidationError(msg) 

2073 

2074 def getLookupKeys(self) -> Set[LookupKey]: 

2075 # Docstring is inherited from base class 

2076 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

2077 self.constraints.getLookupKeys() 

2078 

2079 def validateKey(self, lookupKey: LookupKey, 

2080 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

2081 # Docstring is inherited from base class 

2082 # The key can be valid in either formatters or templates so we can 

2083 # only check the template if it exists 

2084 if lookupKey in self.templates: 

2085 try: 

2086 self.templates[lookupKey].validateTemplate(entity) 

2087 except FileTemplateValidationError as e: 

2088 raise DatastoreValidationError(e) from e 

2089 

2090 def export(self, refs: Iterable[DatasetRef], *, 

2091 directory: Optional[Union[ButlerURI, str]] = None, 

2092 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

2093 # Docstring inherited from Datastore.export. 

2094 if transfer is not None and directory is None: 2094 ↛ 2095line 2094 didn't jump to line 2095, because the condition on line 2094 was never true

2095 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

2096 "export directory given") 

2097 

2098 # Force the directory to be a URI object 

2099 directoryUri: Optional[ButlerURI] = None 

2100 if directory is not None: 2100 ↛ 2103line 2100 didn't jump to line 2103, because the condition on line 2100 was never false

2101 directoryUri = ButlerURI(directory, forceDirectory=True) 

2102 

2103 if transfer is not None and directoryUri is not None: 2103 ↛ 2108line 2103 didn't jump to line 2108, because the condition on line 2103 was never false

2104 # mypy needs the second test 

2105 if not directoryUri.exists(): 2105 ↛ 2106line 2105 didn't jump to line 2106, because the condition on line 2105 was never true

2106 raise FileNotFoundError(f"Export location {directory} does not exist") 

2107 

2108 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2109 for ref in progress.wrap(refs, "Exporting dataset files"): 

2110 fileLocations = self._get_dataset_locations_info(ref) 

2111 if not fileLocations: 2111 ↛ 2112line 2111 didn't jump to line 2112, because the condition on line 2111 was never true

2112 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2113 # For now we can not export disassembled datasets 

2114 if len(fileLocations) > 1: 

2115 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2116 location, storedFileInfo = fileLocations[0] 

2117 

2118 pathInStore = location.pathInStore.path 

2119 if transfer is None: 2119 ↛ 2123line 2119 didn't jump to line 2123, because the condition on line 2119 was never true

2120 # TODO: do we also need to return the readStorageClass somehow? 

2121 # We will use the path in store directly. If this is an 

2122 # absolute URI, preserve it. 

2123 if location.pathInStore.isabs(): 

2124 pathInStore = str(location.uri) 

2125 elif transfer == "direct": 2125 ↛ 2127line 2125 didn't jump to line 2127, because the condition on line 2125 was never true

2126 # Use full URIs to the remote store in the export 

2127 pathInStore = str(location.uri) 

2128 else: 

2129 # mypy needs help 

2130 assert directoryUri is not None, "directoryUri must be defined to get here" 

2131 storeUri = ButlerURI(location.uri) 

2132 

2133 # if the datastore has an absolute URI to a resource, we 

2134 # have two options: 

2135 # 1. Keep the absolute URI in the exported YAML 

2136 # 2. Allocate a new name in the local datastore and transfer 

2137 # it. 

2138 # For now go with option 2 

2139 if location.pathInStore.isabs(): 2139 ↛ 2140line 2139 didn't jump to line 2140, because the condition on line 2139 was never true

2140 template = self.templates.getTemplate(ref) 

2141 newURI = ButlerURI(template.format(ref), forceAbsolute=False) 

2142 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2143 

2144 exportUri = directoryUri.join(pathInStore) 

2145 exportUri.transfer_from(storeUri, transfer=transfer) 

2146 

2147 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2148 

2149 @staticmethod 

2150 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

2151 """Compute the checksum of the supplied file. 

2152 

2153 Parameters 

2154 ---------- 

2155 uri : `ButlerURI` 

2156 Name of resource to calculate checksum from. 

2157 algorithm : `str`, optional 

2158 Name of algorithm to use. Must be one of the algorithms supported 

2159 by :py:class`hashlib`. 

2160 block_size : `int` 

2161 Number of bytes to read from file at one time. 

2162 

2163 Returns 

2164 ------- 

2165 hexdigest : `str` 

2166 Hex digest of the file. 

2167 

2168 Notes 

2169 ----- 

2170 Currently returns None if the URI is for a remote resource. 

2171 """ 

2172 if algorithm not in hashlib.algorithms_guaranteed: 2172 ↛ 2173line 2172 didn't jump to line 2173, because the condition on line 2172 was never true

2173 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

2174 

2175 if not uri.isLocal: 2175 ↛ 2176line 2175 didn't jump to line 2176, because the condition on line 2175 was never true

2176 return None 

2177 

2178 hasher = hashlib.new(algorithm) 

2179 

2180 with uri.as_local() as local_uri: 

2181 with open(local_uri.ospath, "rb") as f: 

2182 for chunk in iter(lambda: f.read(block_size), b""): 

2183 hasher.update(chunk) 

2184 

2185 return hasher.hexdigest() 

2186 

2187 def needs_expanded_data_ids( 

2188 self, 

2189 transfer: Optional[str], 

2190 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2191 ) -> bool: 

2192 # Docstring inherited. 

2193 # This _could_ also use entity to inspect whether the filename template 

2194 # involves placeholders other than the required dimensions for its 

2195 # dataset type, but that's not necessary for correctness; it just 

2196 # enables more optimizations (perhaps only in theory). 

2197 return transfer not in ("direct", None)