Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 84%

887 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore",) 

26 

27import hashlib 

28import logging 

29from collections import defaultdict 

30from dataclasses import dataclass 

31from typing import ( 

32 TYPE_CHECKING, 

33 Any, 

34 ClassVar, 

35 Dict, 

36 Iterable, 

37 List, 

38 Mapping, 

39 Optional, 

40 Set, 

41 Tuple, 

42 Type, 

43 Union, 

44) 

45 

46from lsst.daf.butler import ( 

47 CompositesMap, 

48 Config, 

49 DatasetId, 

50 DatasetRef, 

51 DatasetType, 

52 DatasetTypeNotSupportedError, 

53 Datastore, 

54 DatastoreCacheManager, 

55 DatastoreConfig, 

56 DatastoreDisabledCacheManager, 

57 DatastoreRecordData, 

58 DatastoreValidationError, 

59 FileDataset, 

60 FileDescriptor, 

61 FileTemplates, 

62 FileTemplateValidationError, 

63 Formatter, 

64 FormatterFactory, 

65 Location, 

66 LocationFactory, 

67 Progress, 

68 StorageClass, 

69 StoredDatastoreItemInfo, 

70 StoredFileInfo, 

71 ddl, 

72) 

73from lsst.daf.butler.core.repoRelocation import replaceRoot 

74from lsst.daf.butler.core.utils import transactional 

75from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

76from lsst.resources import ResourcePath, ResourcePathExpression 

77from lsst.utils.introspection import get_class_of, get_instance_of 

78from lsst.utils.iteration import chunk_iterable 

79 

80# For VERBOSE logging usage. 

81from lsst.utils.logging import VERBOSE, getLogger 

82from lsst.utils.timer import time_this 

83from sqlalchemy import BigInteger, String 

84 

85from ..registry.interfaces import FakeDatasetRef 

86from .genericDatastore import GenericBaseDatastore 

87 

88if TYPE_CHECKING: 88 ↛ 89 (line 88 didn't jump to line 89 because the condition on line 88 was never true)

89 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

90 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

91 

92log = getLogger(__name__) 

93 

94 

95class _IngestPrepData(Datastore.IngestPrepData): 

96 """Helper class for FileDatastore ingest implementation. 

97 

98 Parameters 

99 ---------- 

100 datasets : `list` of `FileDataset` 

101 Files to be ingested by this datastore. 

102 """ 

103 

104 def __init__(self, datasets: List[FileDataset]): 

105 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

106 self.datasets = datasets 

107 

108 

109@dataclass(frozen=True) 

110class DatastoreFileGetInformation: 

111 """Collection of useful parameters needed to retrieve a file from 

112 a Datastore. 

113 """ 

114 

115 location: Location 

116 """The location from which to read the dataset.""" 

117 

118 formatter: Formatter 

119 """The `Formatter` to use to deserialize the dataset.""" 

120 

121 info: StoredFileInfo 

122 """Stored information about this file and its formatter.""" 

123 

124 assemblerParams: Mapping[str, Any] 

125 """Parameters to use for post-processing the retrieved dataset.""" 

126 

127 formatterParams: Mapping[str, Any] 

128 """Parameters that were understood by the associated formatter.""" 

129 

130 component: Optional[str] 

131 """The component to be retrieved (can be `None`).""" 

132 

133 readStorageClass: StorageClass 

134 """The `StorageClass` of the dataset being read.""" 

135 

136 

137class FileDatastore(GenericBaseDatastore): 

138 """Generic Datastore for file-based implementations. 

139 

140 Should always be sub-classed since key abstract methods are missing. 

141 

142 Parameters 

143 ---------- 

144 config : `DatastoreConfig` or `str` 

145 Configuration as either a `Config` object or URI to file. 

146 bridgeManager : `DatastoreRegistryBridgeManager` 

147 Object that manages the interface between `Registry` and datastores. 

148 butlerRoot : `str`, optional 

149 New datastore root to use to override the configuration value. 

150 

151 Raises 

152 ------ 

153 ValueError 

154 If root location does not exist and ``create`` is `False` in the 

155 configuration. 

156 """ 

157 

158 defaultConfigFile: ClassVar[Optional[str]] = None 

159 """Path to configuration defaults. Accessed within the ``config`` resource 

160 or relative to a search path. Can be None if no defaults specified. 

161 """ 

162 

163 root: ResourcePath 

164 """Root directory URI of this `Datastore`.""" 

165 

166 locationFactory: LocationFactory 

167 """Factory for creating locations relative to the datastore root.""" 

168 

169 formatterFactory: FormatterFactory 

170 """Factory for creating instances of formatters.""" 

171 

172 templates: FileTemplates 

173 """File templates that can be used by this `Datastore`.""" 

174 

175 composites: CompositesMap 

176 """Determines whether a dataset should be disassembled on put.""" 

177 

178 defaultConfigFile = "datastores/fileDatastore.yaml" 

179 """Path to configuration defaults. Accessed within the ``config`` resource 

180 or relative to a search path. Can be None if no defaults specified. 

181 """ 

182 

183 @classmethod 

184 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

185 """Set any filesystem-dependent config options for this Datastore to 

186 be appropriate for a new empty repository with the given root. 

187 

188 Parameters 

189 ---------- 

190 root : `str` 

191 URI to the root of the data repository. 

192 config : `Config` 

193 A `Config` to update. Only the subset understood by 

194 this component will be updated. Will not expand 

195 defaults. 

196 full : `Config` 

197 A complete config with all defaults expanded that can be 

198 converted to a `DatastoreConfig`. Read-only and will not be 

199 modified by this method. 

200 Repository-specific options that should not be obtained 

201 from defaults when Butler instances are constructed 

202 should be copied from ``full`` to ``config``. 

203 overwrite : `bool`, optional 

204 If `False`, do not modify a value in ``config`` if the value 

205 already exists. Default is always to overwrite with the provided 

206 ``root``. 

207 

208 Notes 

209 ----- 

210 If a keyword is explicitly defined in the supplied ``config`` it 

211 will not be overridden by this method if ``overwrite`` is `False`. 

212 This allows explicit values set in external configs to be retained. 

213 """ 

214 Config.updateParameters( 

215 DatastoreConfig, 

216 config, 

217 full, 

218 toUpdate={"root": root}, 

219 toCopy=("cls", ("records", "table")), 

220 overwrite=overwrite, 

221 ) 

222 

223 @classmethod 

224 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

225 return ddl.TableSpec( 

226 fields=[ 

227 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

228 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

229 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

230 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

231 # Use empty string to indicate no component 

232 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

233 # TODO: should checksum be Base64Bytes instead? 

234 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

235 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

236 ], 

237 unique=frozenset(), 

238 indexes=[tuple(["path"])], 

239 ) 

240 
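
For orientation, a single row in the opaque records table defined by ``makeTableSpec`` might look like the dict below. This is only a hedged sketch: the path, formatter, storage class and size values are invented for illustration, and the real column types come from ``datasetIdColumnType`` and the ``ddl.FieldSpec`` definitions above.

# Hypothetical record for the file-datastore records table; one row
# exists per (dataset_id, component) pair, with "" meaning "no component".
example_record = {
    "dataset_id": 42,  # or a UUID, depending on datasetIdColumnType
    "path": "raw/r/raw_20200101_000001.fits",  # relative to the datastore root
    "formatter": "mypackage.formatters.MyFitsFormatter",  # hypothetical class name
    "storage_class": "Exposure",
    "component": "",
    "checksum": None,  # only populated when checksums are enabled
    "file_size": 16779840,
}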

241 def __init__( 

242 self, 

243 config: Union[DatastoreConfig, str], 

244 bridgeManager: DatastoreRegistryBridgeManager, 

245 butlerRoot: Optional[str] = None, 

246 ): 

247 super().__init__(config, bridgeManager) 

248 if "root" not in self.config: 248 ↛ 249line 248 didn't jump to line 249, because the condition on line 248 was never true

249 raise ValueError("No root directory specified in configuration") 

250 

251 self._bridgeManager = bridgeManager 

252 

253 # Name ourselves either using an explicit name or a name 

254 # derived from the (unexpanded) root 

255 if "name" in self.config: 

256 self.name = self.config["name"] 

257 else: 

258 # We use the unexpanded root in the name to indicate that this 

259 # datastore can be moved without having to update registry. 

260 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

261 

262 # Support repository relocation in config 

263 # Existence of self.root is checked in subclass 

264 self.root = ResourcePath( 

265 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

266 ) 

267 

268 self.locationFactory = LocationFactory(self.root) 

269 self.formatterFactory = FormatterFactory() 

270 

271 # Now associate formatters with storage classes 

272 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

273 

274 # Read the file naming templates 

275 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

276 

277 # See if composites should be disassembled 

278 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

279 

280 tableName = self.config["records", "table"] 

281 try: 

282 # Storage of paths and formatters, keyed by dataset_id 

283 self._table = bridgeManager.opaque.register( 

284 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

285 ) 

286 # Interface to Registry. 

287 self._bridge = bridgeManager.register(self.name) 

288 except ReadOnlyDatabaseError: 

289 # If the database is read only and we just tried and failed to 

290 # create a table, it means someone is trying to create a read-only 

291 # butler client for an empty repo. That should be okay, as long 

292 # as they don't then try to get any datasets before some other 

293 # client creates the table. Chances are they're just validating 

294 # configuration. 

295 pass 

296 

297 # Determine whether checksums should be used - default to False 

298 self.useChecksum = self.config.get("checksum", False) 

299 

300 # Determine whether we can fall back to configuration if a 

301 # requested dataset is not known to registry 

302 self.trustGetRequest = self.config.get("trust_get_request", False) 

303 

304 # Create a cache manager 

305 self.cacheManager: AbstractDatastoreCacheManager 

306 if "cached" in self.config: 306 ↛ 309line 306 didn't jump to line 309, because the condition on line 306 was never false

307 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

308 else: 

309 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

310 

311 # Check existence and create directory structure if necessary 

312 if not self.root.exists(): 

313 if "create" not in self.config or not self.config["create"]: 313 ↛ 314line 313 didn't jump to line 314, because the condition on line 313 was never true

314 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

315 try: 

316 self.root.mkdir() 

317 except Exception as e: 

318 raise ValueError( 

319 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

320 ) from e 

321 
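
As a rough, non-authoritative sketch, the configuration keys consumed by ``__init__`` above can be laid out as follows. It is written as a Python dict rather than the usual YAML, and every value shown is an illustrative assumption; only the key names are taken from the code.

example_config = {
    "cls": "lsst.daf.butler.datastores.fileDatastore.FileDatastore",
    "root": "<butlerRoot>/datastore",  # expanded via replaceRoot(..., butlerRoot)
    "create": True,  # allow mkdir of a missing root
    "records": {"table": "file_datastore_records"},  # opaque table name
    "formatters": {},  # mapping used by FormatterFactory.registerFormatters
    "templates": {},  # file naming templates (FileTemplates)
    "composites": {},  # disassembly rules (CompositesMap)
    "cached": {},  # DatastoreCacheManager configuration
    "checksum": False,  # sets self.useChecksum
    "trust_get_request": False,  # sets self.trustGetRequest
}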

322 def __str__(self) -> str: 

323 return str(self.root) 

324 

325 @property 

326 def bridge(self) -> DatastoreRegistryBridge: 

327 return self._bridge 

328 

329 def _artifact_exists(self, location: Location) -> bool: 

330 """Check that an artifact exists in this datastore at the specified 

331 location. 

332 

333 Parameters 

334 ---------- 

335 location : `Location` 

336 Expected location of the artifact associated with this datastore. 

337 

338 Returns 

339 ------- 

340 exists : `bool` 

341 `True` if the location can be found, `False` otherwise. 

342 """ 

343 log.debug("Checking if resource exists: %s", location.uri) 

344 return location.uri.exists() 

345 

346 def _delete_artifact(self, location: Location) -> None: 

347 """Delete the artifact from the datastore. 

348 

349 Parameters 

350 ---------- 

351 location : `Location` 

352 Location of the artifact associated with this datastore. 

353 """ 

354 if location.pathInStore.isabs(): 354 ↛ 355 (line 354 didn't jump to line 355 because the condition on line 354 was never true)

355 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

356 

357 try: 

358 location.uri.remove() 

359 except FileNotFoundError: 

360 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

361 raise 

362 except Exception as e: 

363 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

364 raise 

365 log.debug("Successfully deleted file: %s", location.uri) 

366 

367 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

368 # Docstring inherited from GenericBaseDatastore 

369 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos)] 

370 self._table.insert(*records) 

371 

372 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

373 # Docstring inherited from GenericBaseDatastore 

374 

375 # Look for the dataset_id -- there might be multiple matches 

376 # if we have disassembled the dataset. 

377 records = self._table.fetch(dataset_id=ref.id) 

378 return [StoredFileInfo.from_record(record) for record in records] 

379 

380 def _get_stored_records_associated_with_refs( 

381 self, refs: Iterable[DatasetIdRef] 

382 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

383 """Retrieve all records associated with the provided refs. 

384 

385 Parameters 

386 ---------- 

387 refs : iterable of `DatasetIdRef` 

388 The refs for which records are to be retrieved. 

389 

390 Returns 

391 ------- 

392 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

393 The matching records indexed by the ref ID. The number of entries 

394 in the dict can be smaller than the number of requested refs. 

395 """ 

396 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

397 

398 # Uniqueness is dataset_id + component so can have multiple records 

399 # per ref. 

400 records_by_ref = defaultdict(list) 

401 for record in records: 

402 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

403 return records_by_ref 

404 

405 def _refs_associated_with_artifacts( 

406 self, paths: List[Union[str, ResourcePath]] 

407 ) -> Dict[str, Set[DatasetId]]: 

408 """Return paths and associated dataset refs. 

409 

410 Parameters 

411 ---------- 

412 paths : `list` of `str` or `lsst.resources.ResourcePath` 

413 All the paths to include in search. 

414 

415 Returns 

416 ------- 

417 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

418 Mapping of each path to a set of associated database IDs. 

419 """ 

420 records = self._table.fetch(path=[str(path) for path in paths]) 

421 result = defaultdict(set) 

422 for row in records: 

423 result[row["path"]].add(row["dataset_id"]) 

424 return result 

425 

426 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]: 

427 """Return all dataset refs associated with the supplied path. 

428 

429 Parameters 

430 ---------- 

431 pathInStore : `lsst.resources.ResourcePath` 

432 Path of interest in the data store. 

433 

434 Returns 

435 ------- 

436 ids : `set` of `int` 

437 All `DatasetRef` IDs associated with this path. 

438 """ 

439 records = list(self._table.fetch(path=str(pathInStore))) 

440 ids = {r["dataset_id"] for r in records} 

441 return ids 

442 

443 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

444 # Docstring inherited from GenericBaseDatastore 

445 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

446 

447 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

448 r"""Find all the `Location`\ s of the requested dataset in the 

449 `Datastore` and the associated stored file information. 

450 

451 Parameters 

452 ---------- 

453 ref : `DatasetRef` 

454 Reference to the required `Dataset`. 

455 

456 Returns 

457 ------- 

458 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

459 Location of the dataset within the datastore and 

460 stored information about each file and its formatter. 

461 """ 

462 # Get the file information (this will fail if no file) 

463 records = self.getStoredItemsInfo(ref) 

464 

465 # Use the path to determine the location -- we need to take 

466 # into account absolute URIs in the datastore record 

467 return [(r.file_location(self.locationFactory), r) for r in records] 

468 

469 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

470 """Check that there is only one dataset associated with the 

471 specified artifact. 

472 

473 Parameters 

474 ---------- 

475 ref : `DatasetRef` or `FakeDatasetRef` 

476 Dataset to be removed. 

477 location : `Location` 

478 The location of the artifact to be removed. 

479 

480 Returns 

481 ------- 

482 can_remove : `bool` 

483 True if the artifact can be safely removed. 

484 """ 

485 # Can't ever delete absolute URIs. 

486 if location.pathInStore.isabs(): 

487 return False 

488 

489 # Get all entries associated with this path 

490 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

491 if not allRefs: 

492 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

493 

494 # Remove these refs from all the refs and if there is nothing left 

495 # then we can delete 

496 remainingRefs = allRefs - {ref.id} 

497 

498 if remainingRefs: 

499 return False 

500 return True 

501 

502 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]: 

503 """Predict the location and related file information of the requested 

504 dataset in this datastore. 

505 

506 Parameters 

507 ---------- 

508 ref : `DatasetRef` 

509 Reference to the required `Dataset`. 

510 

511 Returns 

512 ------- 

513 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

514 Expected Location of the dataset within the datastore and 

515 placeholder information about each file and its formatter. 

516 

517 Notes 

518 ----- 

519 Uses the current configuration to determine how we would expect the 

520 datastore files to have been written if we couldn't ask registry. 

521 This is safe so long as there has been no change to datastore 

522 configuration between writing the dataset and wanting to read it. 

523 Will not work for files that have been ingested without using the 

524 standard file template or default formatter. 

525 """ 

526 

527 # If we have a component ref we always need to ask the questions 

528 # of the composite. If the composite is disassembled this routine 

529 # should return all components. If the composite was not 

530 # disassembled the composite is what is stored regardless of 

531 # component request. Note that if the caller has disassembled 

532 # a composite there is no way for this guess to know that 

533 # without trying both the composite and component ref and seeing 

534 # if there is something at the component Location even without 

535 # disassembly being enabled. 

536 if ref.datasetType.isComponent(): 

537 ref = ref.makeCompositeRef() 

538 

539 # See if the ref is a composite that should be disassembled 

540 doDisassembly = self.composites.shouldBeDisassembled(ref) 

541 

542 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

543 

544 if doDisassembly: 

545 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

546 compRef = ref.makeComponentRef(component) 

547 location, formatter = self._determine_put_formatter_location(compRef) 

548 all_info.append((location, formatter, componentStorage, component)) 

549 

550 else: 

551 # Always use the composite ref if no disassembly 

552 location, formatter = self._determine_put_formatter_location(ref) 

553 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

554 

555 # Convert the list of tuples to have StoredFileInfo as second element 

556 return [ 

557 ( 

558 location, 

559 StoredFileInfo( 

560 formatter=formatter, 

561 path=location.pathInStore.path, 

562 storageClass=storageClass, 

563 component=component, 

564 checksum=None, 

565 file_size=-1, 

566 dataset_id=ref.getCheckedId(), 

567 ), 

568 ) 

569 for location, formatter, storageClass, component in all_info 

570 ] 

571 

572 def _prepare_for_get( 

573 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None 

574 ) -> List[DatastoreFileGetInformation]: 

575 """Check parameters for ``get`` and obtain formatter and 

576 location. 

577 

578 Parameters 

579 ---------- 

580 ref : `DatasetRef` 

581 Reference to the required Dataset. 

582 parameters : `dict` 

583 `StorageClass`-specific parameters that specify, for example, 

584 a slice of the dataset to be loaded. 

585 

586 Returns 

587 ------- 

588 getInfo : `list` [`DatastoreFileGetInformation`] 

589 Parameters needed to retrieve each file. 

590 """ 

591 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

592 

593 # Get file metadata and internal metadata 

594 fileLocations = self._get_dataset_locations_info(ref) 

595 if not fileLocations: 

596 if not self.trustGetRequest: 

597 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

598 # Assume the dataset is where we think it should be 

599 fileLocations = self._get_expected_dataset_locations_info(ref) 

600 

601 # The storage class we want to use eventually 

602 refStorageClass = ref.datasetType.storageClass 

603 

604 if len(fileLocations) > 1: 

605 disassembled = True 

606 

607 # If trust is involved it is possible that there will be 

608 # components listed here that do not exist in the datastore. 

609 # Explicitly check for file artifact existence and filter out any 

610 # that are missing. 

611 if self.trustGetRequest: 

612 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

613 

614 # For now complain only if we have no components at all. One 

615 # component is probably a problem but we can punt that to the 

616 # assembler. 

617 if not fileLocations: 617 ↛ 618 (line 617 didn't jump to line 618 because the condition on line 617 was never true)

618 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

619 

620 else: 

621 disassembled = False 

622 

623 # Is this a component request? 

624 refComponent = ref.datasetType.component() 

625 

626 fileGetInfo = [] 

627 for location, storedFileInfo in fileLocations: 

628 

629 # The storage class used to write the file 

630 writeStorageClass = storedFileInfo.storageClass 

631 

632 # If this has been disassembled we need read to match the write 

633 if disassembled: 

634 readStorageClass = writeStorageClass 

635 else: 

636 readStorageClass = refStorageClass 

637 

638 formatter = get_instance_of( 

639 storedFileInfo.formatter, 

640 FileDescriptor( 

641 location, 

642 readStorageClass=readStorageClass, 

643 storageClass=writeStorageClass, 

644 parameters=parameters, 

645 ), 

646 ref.dataId, 

647 ) 

648 

649 formatterParams, notFormatterParams = formatter.segregateParameters() 

650 

651 # Of the remaining parameters, extract the ones supported by 

652 # this StorageClass (for components not all will be handled) 

653 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

654 

655 # The ref itself could be a component if the dataset was 

656 # disassembled by butler, or we disassembled in datastore and 

657 # components came from the datastore records 

658 component = storedFileInfo.component if storedFileInfo.component else refComponent 

659 

660 fileGetInfo.append( 

661 DatastoreFileGetInformation( 

662 location, 

663 formatter, 

664 storedFileInfo, 

665 assemblerParams, 

666 formatterParams, 

667 component, 

668 readStorageClass, 

669 ) 

670 ) 

671 

672 return fileGetInfo 

673 

674 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

675 """Check the arguments for ``put`` and obtain formatter and 

676 location. 

677 

678 Parameters 

679 ---------- 

680 inMemoryDataset : `object` 

681 The dataset to store. 

682 ref : `DatasetRef` 

683 Reference to the associated Dataset. 

684 

685 Returns 

686 ------- 

687 location : `Location` 

688 The location to write the dataset. 

689 formatter : `Formatter` 

690 The `Formatter` to use to write the dataset. 

691 

692 Raises 

693 ------ 

694 TypeError 

695 Supplied object and storage class are inconsistent. 

696 DatasetTypeNotSupportedError 

697 The associated `DatasetType` is not handled by this datastore. 

698 """ 

699 self._validate_put_parameters(inMemoryDataset, ref) 

700 return self._determine_put_formatter_location(ref) 

701 

702 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

703 """Calculate the formatter and output location to use for put. 

704 

705 Parameters 

706 ---------- 

707 ref : `DatasetRef` 

708 Reference to the associated Dataset. 

709 

710 Returns 

711 ------- 

712 location : `Location` 

713 The location to write the dataset. 

714 formatter : `Formatter` 

715 The `Formatter` to use to write the dataset. 

716 """ 

717 # Work out output file name 

718 try: 

719 template = self.templates.getTemplate(ref) 

720 except KeyError as e: 

721 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

722 

723 # Validate the template to protect against filenames from different 

724 # dataIds returning the same and causing overwrite confusion. 

725 template.validateTemplate(ref) 

726 

727 location = self.locationFactory.fromPath(template.format(ref)) 

728 

729 # Get the formatter based on the storage class 

730 storageClass = ref.datasetType.storageClass 

731 try: 

732 formatter = self.formatterFactory.getFormatter( 

733 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

734 ) 

735 except KeyError as e: 

736 raise DatasetTypeNotSupportedError( 

737 f"Unable to find formatter for {ref} in datastore {self.name}" 

738 ) from e 

739 

740 # Now that we know the formatter, update the location 

741 location = formatter.makeUpdatedLocation(location) 

742 

743 return location, formatter 

744 

745 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

746 # Docstring inherited from base class 

747 if transfer != "auto": 

748 return transfer 

749 

750 # See if the paths are within the datastore or not 

751 inside = [self._pathInStore(d.path) is not None for d in datasets] 

752 

753 if all(inside): 

754 transfer = None 

755 elif not any(inside): 755 ↛ 764 (line 755 didn't jump to line 764 because the condition on line 755 was never false)

756 # Allow ResourcePath to use its own knowledge 

757 transfer = "auto" 

758 else: 

759 # This can happen when importing from a datastore that 

760 # has had some datasets ingested using "direct" mode, i.e. 

761 # the source datastore contains some direct-transfer datasets. 

762 # Allow ResourcePath to sort out this mixed case, but warn 

763 # about it since the external files must remain accessible. 

764 log.warning( 

765 "Some datasets are inside the datastore and some are outside. Using 'split' " 

766 "transfer mode. This assumes that the files outside the datastore are " 

767 "still accessible to the new butler since they will not be copied into " 

768 "the target datastore." 

769 ) 

770 transfer = "split" 

771 

772 return transfer 

773 

774 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]: 

775 """Return path relative to datastore root 

776 

777 Parameters 

778 ---------- 

779 path : `lsst.resources.ResourcePathExpression` 

780 Path to dataset. Can be an absolute URI. If relative, it is 

781 assumed to be relative to the datastore root; a path outside 

782 the root is reported by returning `None`. 

783 

784 Returns 

785 ------- 

786 inStore : `str` 

787 Path relative to datastore root. Returns `None` if the file is 

788 outside the root. 

789 """ 

790 # Relative path will always be relative to datastore 

791 pathUri = ResourcePath(path, forceAbsolute=False) 

792 return pathUri.relative_to(self.root) 

793 
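
A minimal sketch of the ``ResourcePath.relative_to`` behaviour that ``_pathInStore`` relies on; the paths are hypothetical.

from lsst.resources import ResourcePath

root = ResourcePath("/repo/main/", forceDirectory=True)

# A path under the root resolves to its relative form.
ResourcePath("/repo/main/raw/image_01.fits").relative_to(root)  # "raw/image_01.fits"

# A path outside the root resolves to None, which _pathInStore
# passes straight back to the caller.
ResourcePath("/elsewhere/image_01.fits").relative_to(root)  # None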

794 def _standardizeIngestPath( 

795 self, path: Union[str, ResourcePath], *, transfer: Optional[str] = None 

796 ) -> Union[str, ResourcePath]: 

797 """Standardize the path of a to-be-ingested file. 

798 

799 Parameters 

800 ---------- 

801 path : `str` or `lsst.resources.ResourcePath` 

802 Path of a file to be ingested. This parameter is not expected 

803 to support all the types that can be used to construct a 

804 `~lsst.resources.ResourcePath`. 

805 transfer : `str`, optional 

806 How (and whether) the dataset should be added to the datastore. 

807 See `ingest` for details of transfer modes. 

808 This implementation is provided only so 

809 `NotImplementedError` can be raised if the mode is not supported; 

810 actual transfers are deferred to `_extractIngestInfo`. 

811 

812 Returns 

813 ------- 

814 path : `str` or `lsst.resources.ResourcePath` 

815 New path in what the datastore considers standard form. If an 

816 absolute URI was given, it will be returned unchanged. 

817 

818 Notes 

819 ----- 

820 Subclasses of `FileDatastore` can implement this method instead 

821 of `_prepIngest`. It should not modify the data repository or given 

822 file in any way. 

823 

824 Raises 

825 ------ 

826 NotImplementedError 

827 Raised if the datastore does not support the given transfer mode 

828 (including the case where ingest is not supported at all). 

829 FileNotFoundError 

830 Raised if one of the given files does not exist. 

831 """ 

832 if transfer not in (None, "direct", "split") + self.root.transferModes: 832 ↛ 833 (line 832 didn't jump to line 833 because the condition on line 832 was never true)

833 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

834 

835 # A relative URI indicates relative to datastore root 

836 srcUri = ResourcePath(path, forceAbsolute=False) 

837 if not srcUri.isabs(): 

838 srcUri = self.root.join(path) 

839 

840 if not srcUri.exists(): 

841 raise FileNotFoundError( 

842 f"Resource at {srcUri} does not exist; note that paths to ingest " 

843 f"are assumed to be relative to {self.root} unless they are absolute." 

844 ) 

845 

846 if transfer is None: 

847 relpath = srcUri.relative_to(self.root) 

848 if not relpath: 

849 raise RuntimeError( 

850 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

851 ) 

852 

853 # Return the relative path within the datastore for internal 

854 # transfer 

855 path = relpath 

856 

857 return path 

858 

859 def _extractIngestInfo( 

860 self, 

861 path: ResourcePathExpression, 

862 ref: DatasetRef, 

863 *, 

864 formatter: Union[Formatter, Type[Formatter]], 

865 transfer: Optional[str] = None, 

866 record_validation_info: bool = True, 

867 ) -> StoredFileInfo: 

868 """Relocate (if necessary) and extract `StoredFileInfo` from a 

869 to-be-ingested file. 

870 

871 Parameters 

872 ---------- 

873 path : `lsst.resources.ResourcePathExpression` 

874 URI or path of a file to be ingested. 

875 ref : `DatasetRef` 

876 Reference for the dataset being ingested. Guaranteed to have 

877 ``dataset_id is not None``. 

878 formatter : `type` or `Formatter` 

879 `Formatter` subclass to use for this dataset or an instance. 

880 transfer : `str`, optional 

881 How (and whether) the dataset should be added to the datastore. 

882 See `ingest` for details of transfer modes. 

883 record_validation_info : `bool`, optional 

884 If `True`, the default, the datastore can record validation 

885 information associated with the file. If `False` the datastore 

886 will not attempt to track any information such as checksums 

887 or file sizes. This can be useful if such information is tracked 

888 in an external system or if the file is to be compressed in place. 

889 It is up to the datastore whether this parameter is relevant. 

890 

891 Returns 

892 ------- 

893 info : `StoredFileInfo` 

894 Internal datastore record for this file. This will be inserted by 

895 the caller; the `_extractIngestInfo` is only responsible for 

896 creating and populating the struct. 

897 

898 Raises 

899 ------ 

900 FileNotFoundError 

901 Raised if one of the given files does not exist. 

902 FileExistsError 

903 Raised if transfer is not `None` but the (internal) location the 

904 file would be moved to is already occupied. 

905 """ 

906 if self._transaction is None: 906 ↛ 907 (line 906 didn't jump to line 907 because the condition on line 906 was never true)

907 raise RuntimeError("Ingest called without transaction enabled") 

908 

909 # Create URI of the source path, do not need to force a relative 

910 # path to absolute. 

911 srcUri = ResourcePath(path, forceAbsolute=False) 

912 

913 # Track whether we have read the size of the source yet 

914 have_sized = False 

915 

916 tgtLocation: Optional[Location] 

917 if transfer is None or transfer == "split": 

918 # A relative path is assumed to be relative to the datastore 

919 # in this context 

920 if not srcUri.isabs(): 

921 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

922 else: 

923 # Work out the path in the datastore from an absolute URI 

924 # This is required to be within the datastore. 

925 pathInStore = srcUri.relative_to(self.root) 

926 if pathInStore is None and transfer is None: 926 ↛ 927 (line 926 didn't jump to line 927 because the condition on line 926 was never true)

927 raise RuntimeError( 

928 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

929 ) 

930 if pathInStore: 930 ↛ 932 (line 930 didn't jump to line 932 because the condition on line 930 was never false)

931 tgtLocation = self.locationFactory.fromPath(pathInStore) 

932 elif transfer == "split": 

933 # Outside the datastore but treat that as a direct ingest 

934 # instead. 

935 tgtLocation = None 

936 else: 

937 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

938 elif transfer == "direct": 938 ↛ 943 (line 938 didn't jump to line 943 because the condition on line 938 was never true)

939 # Want to store the full URI to the resource directly in 

940 # datastore. This is useful for referring to permanent archive 

941 # storage for raw data. 

942 # Trust that people know what they are doing. 

943 tgtLocation = None 

944 else: 

945 # Work out the name we want this ingested file to have 

946 # inside the datastore 

947 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

948 if not tgtLocation.uri.dirname().exists(): 

949 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

950 tgtLocation.uri.dirname().mkdir() 

951 

952 # if we are transferring from a local file to a remote location 

953 # it may be more efficient to get the size and checksum of the 

954 # local file rather than the transferred one 

955 if record_validation_info and srcUri.isLocal: 

956 size = srcUri.size() 

957 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

958 have_sized = True 

959 

960 # Transfer the resource to the destination. 

961 # Allow overwrite of an existing file. This matches the behavior 

962 # of datastore.put() in that it trusts that registry would not 

963 # be asking to overwrite unless registry thought that the 

964 # overwrite was allowed. 

965 tgtLocation.uri.transfer_from( 

966 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

967 ) 

968 

969 if tgtLocation is None: 969 ↛ 971 (line 969 didn't jump to line 971 because the condition on line 969 was never true)

970 # This means we are using direct mode 

971 targetUri = srcUri 

972 targetPath = str(srcUri) 

973 else: 

974 targetUri = tgtLocation.uri 

975 targetPath = tgtLocation.pathInStore.path 

976 

977 # the file should exist in the datastore now 

978 if record_validation_info: 

979 if not have_sized: 

980 size = targetUri.size() 

981 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

982 else: 

983 # Not recording any file information. 

984 size = -1 

985 checksum = None 

986 

987 return StoredFileInfo( 

988 formatter=formatter, 

989 path=targetPath, 

990 storageClass=ref.datasetType.storageClass, 

991 component=ref.datasetType.component(), 

992 file_size=size, 

993 checksum=checksum, 

994 dataset_id=ref.getCheckedId(), 

995 ) 

996 

997 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

998 # Docstring inherited from Datastore._prepIngest. 

999 filtered = [] 

1000 for dataset in datasets: 

1001 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

1002 if not acceptable: 

1003 continue 

1004 else: 

1005 dataset.refs = acceptable 

1006 if dataset.formatter is None: 

1007 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1008 else: 

1009 assert isinstance(dataset.formatter, (type, str)) 

1010 formatter_class = get_class_of(dataset.formatter) 

1011 if not issubclass(formatter_class, Formatter): 1011 ↛ 1012 (line 1011 didn't jump to line 1012 because the condition on line 1011 was never true)

1012 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1013 dataset.formatter = formatter_class 

1014 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1015 filtered.append(dataset) 

1016 return _IngestPrepData(filtered) 

1017 
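
A hedged usage sketch for the ingest preparation step: the file paths are hypothetical and ``ref1``/``ref2`` are assumed to be pre-existing `DatasetRef` instances that satisfy the datastore constraints.

from lsst.daf.butler import FileDataset

datasets = [
    FileDataset(path="external/image_01.fits", refs=[ref1]),
    FileDataset(path="external/image_02.fits", refs=[ref2]),
]
# Filters out unacceptable refs, resolves formatter classes and
# standardizes the paths relative to the datastore root.
prep_data = datastore._prepIngest(*datasets, transfer="copy")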

1018 @transactional 

1019 def _finishIngest( 

1020 self, 

1021 prepData: Datastore.IngestPrepData, 

1022 *, 

1023 transfer: Optional[str] = None, 

1024 record_validation_info: bool = True, 

1025 ) -> None: 

1026 # Docstring inherited from Datastore._finishIngest. 

1027 refsAndInfos = [] 

1028 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1029 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1030 # Do ingest as if the first dataset ref is associated with the file 

1031 info = self._extractIngestInfo( 

1032 dataset.path, 

1033 dataset.refs[0], 

1034 formatter=dataset.formatter, 

1035 transfer=transfer, 

1036 record_validation_info=record_validation_info, 

1037 ) 

1038 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1039 self._register_datasets(refsAndInfos) 

1040 

1041 def _calculate_ingested_datastore_name( 

1042 self, srcUri: ResourcePath, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]] 

1043 ) -> Location: 

1044 """Given a source URI and a DatasetRef, determine the name the 

1045 dataset will have inside datastore. 

1046 

1047 Parameters 

1048 ---------- 

1049 srcUri : `lsst.resources.ResourcePath` 

1050 URI to the source dataset file. 

1051 ref : `DatasetRef` 

1052 Ref associated with the newly-ingested dataset artifact. This 

1053 is used to determine the name within the datastore. 

1054 formatter : `Formatter` or Formatter class. 

1055 Formatter to use for validation. Can be a class or an instance. 

1056 

1057 Returns 

1058 ------- 

1059 location : `Location` 

1060 Target location for the newly-ingested dataset. 

1061 """ 

1062 # Ingesting a file from outside the datastore. 

1063 # This involves a new name. 

1064 template = self.templates.getTemplate(ref) 

1065 location = self.locationFactory.fromPath(template.format(ref)) 

1066 

1067 # Get the extension 

1068 ext = srcUri.getExtension() 

1069 

1070 # Update the destination to include that extension 

1071 location.updateExtension(ext) 

1072 

1073 # Ask the formatter to validate this extension 

1074 formatter.validateExtension(location) 

1075 

1076 return location 

1077 

1078 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1079 """Write out in memory dataset to datastore. 

1080 

1081 Parameters 

1082 ---------- 

1083 inMemoryDataset : `object` 

1084 Dataset to write to datastore. 

1085 ref : `DatasetRef` 

1086 Registry information associated with this dataset. 

1087 

1088 Returns 

1089 ------- 

1090 info : `StoredFileInfo` 

1091 Information describing the artifact written to the datastore. 

1092 """ 

1093 # May need to coerce the in memory dataset to the correct 

1094 # python type. 

1095 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1096 

1097 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1098 uri = location.uri 

1099 

1100 if not uri.dirname().exists(): 

1101 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1102 uri.dirname().mkdir() 

1103 

1104 if self._transaction is None: 1104 ↛ 1105 (line 1104 didn't jump to line 1105 because the condition on line 1104 was never true)

1105 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1106 

1107 def _removeFileExists(uri: ResourcePath) -> None: 

1108 """Remove a file and do not complain if it is not there. 

1109 

1110 This is important since a formatter might fail before the file 

1111 is written and we should not confuse people by writing spurious 

1112 error messages to the log. 

1113 """ 

1114 try: 

1115 uri.remove() 

1116 except FileNotFoundError: 

1117 pass 

1118 

1119 # Register a callback to try to delete the uploaded data if 

1120 # something fails below 

1121 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1122 

1123 # For a local file, simply use the formatter directly 

1124 if uri.isLocal: 

1125 try: 

1126 formatter.write(inMemoryDataset) 

1127 except Exception as e: 

1128 raise RuntimeError( 

1129 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to location {uri}" 

1130 ) from e 

1131 log.debug("Successfully wrote python object to local file at %s", uri) 

1132 else: 

1133 # This is a remote URI. Some datasets can be serialized directly 

1134 # to bytes and sent to the remote datastore without writing a 

1135 # file. If the dataset is intended to be saved to the cache 

1136 # a file is always written and direct write to the remote 

1137 # datastore is bypassed. 

1138 data_written = False 

1139 if not self.cacheManager.should_be_cached(ref): 

1140 try: 

1141 serializedDataset = formatter.toBytes(inMemoryDataset) 

1142 except NotImplementedError: 

1143 # Fallback to the file writing option. 

1144 pass 

1145 except Exception as e: 

1146 raise RuntimeError( 

1147 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1148 ) from e 

1149 else: 

1150 log.debug("Writing bytes directly to %s", uri) 

1151 uri.write(serializedDataset, overwrite=True) 

1152 log.debug("Successfully wrote bytes directly to %s", uri) 

1153 data_written = True 

1154 

1155 if not data_written: 

1156 # Did not write the bytes directly to object store so instead 

1157 # write to temporary file. 

1158 with ResourcePath.temporary_uri(suffix=uri.getExtension()) as temporary_uri: 

1159 # Need to configure the formatter to write to a different 

1160 # location and that needs us to overwrite internals 

1161 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1162 with formatter._updateLocation(Location(None, temporary_uri)): 

1163 try: 

1164 formatter.write(inMemoryDataset) 

1165 except Exception as e: 

1166 raise RuntimeError( 

1167 f"Failed to serialize dataset {ref} of type" 

1168 f" {type(inMemoryDataset)} to " 

1169 f"temporary location {temporary_uri}" 

1170 ) from e 

1171 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True) 

1172 

1173 # Cache if required 

1174 self.cacheManager.move_to_cache(temporary_uri, ref) 

1175 

1176 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1177 

1178 # URI is needed to resolve which ingest case we are dealing with 

1179 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1180 

1181 def _read_artifact_into_memory( 

1182 self, 

1183 getInfo: DatastoreFileGetInformation, 

1184 ref: DatasetRef, 

1185 isComponent: bool = False, 

1186 cache_ref: Optional[DatasetRef] = None, 

1187 ) -> Any: 

1188 """Read the artifact from datastore into in memory object. 

1189 

1190 Parameters 

1191 ---------- 

1192 getInfo : `DatastoreFileGetInformation` 

1193 Information about the artifact within the datastore. 

1194 ref : `DatasetRef` 

1195 The registry information associated with this artifact. 

1196 isComponent : `bool` 

1197 Flag to indicate if a component is being read from this artifact. 

1198 cache_ref : `DatasetRef`, optional 

1199 The DatasetRef to use when looking up the file in the cache. 

1200 This ref must have the same ID as the supplied ref but can 

1201 be a parent ref or component ref to indicate to the cache whether 

1202 a composite file is being requested from the cache or a component 

1203 file. Without this the cache will default to the supplied ref but 

1204 it can get confused with read-only derived components for 

1205 disassembled composites. 

1206 

1207 Returns 

1208 ------- 

1209 inMemoryDataset : `object` 

1210 The artifact as a python object. 

1211 """ 

1212 location = getInfo.location 

1213 uri = location.uri 

1214 log.debug("Accessing data from %s", uri) 

1215 

1216 if cache_ref is None: 

1217 cache_ref = ref 

1218 if cache_ref.id != ref.id: 1218 ↛ 1219 (line 1218 didn't jump to line 1219 because the condition on line 1218 was never true)

1219 raise ValueError( 

1220 "The supplied cache dataset ref refers to a different dataset than expected:" 

1221 f" {ref.id} != {cache_ref.id}" 

1222 ) 

1223 

1224 # Cannot recalculate checksum but can compare size as a quick check 

1225 # Do not do this if the size is negative since that indicates 

1226 # we do not know. 

1227 recorded_size = getInfo.info.file_size 

1228 resource_size = uri.size() 

1229 if recorded_size >= 0 and resource_size != recorded_size: 1229 ↛ 1230 (line 1229 didn't jump to line 1230 because the condition on line 1229 was never true)

1230 raise RuntimeError( 

1231 "Integrity failure in Datastore. " 

1232 f"Size of file {uri} ({resource_size}) " 

1233 f"does not match size recorded in registry of {recorded_size}" 

1234 ) 

1235 

1236 # For the general case we have choices for how to proceed. 

1237 # 1. Always use a local file (downloading the remote resource to a 

1238 # temporary file if needed). 

1239 # 2. Use a threshold size and read into memory and use bytes. 

1240 # Use both for now with an arbitrary hand off size. 

1241 # This allows small datasets to be downloaded from remote object 

1242 # stores without requiring a temporary file. 

1243 

1244 formatter = getInfo.formatter 

1245 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1246 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1247 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1248 if cached_file is not None: 

1249 desired_uri = cached_file 

1250 msg = f" (cached version of {uri})" 

1251 else: 

1252 desired_uri = uri 

1253 msg = "" 

1254 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1255 serializedDataset = desired_uri.read() 

1256 log.debug( 

1257 "Deserializing %s from %d bytes from location %s with formatter %s", 

1258 f"component {getInfo.component}" if isComponent else "", 

1259 len(serializedDataset), 

1260 uri, 

1261 formatter.name(), 

1262 ) 

1263 try: 

1264 result = formatter.fromBytes( 

1265 serializedDataset, component=getInfo.component if isComponent else None 

1266 ) 

1267 except Exception as e: 

1268 raise ValueError( 

1269 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1270 f" ({ref.datasetType.name} from {uri}): {e}" 

1271 ) from e 

1272 else: 

1273 # Read from file. 

1274 

1275 # Have to update the Location associated with the formatter 

1276 # because formatter.read does not allow an override. 

1277 # This could be improved. 

1278 location_updated = False 

1279 msg = "" 

1280 

1281 # First check in cache for local version. 

1282 # The cache will only be relevant for remote resources but 

1283 # no harm in always asking. Context manager ensures that cache 

1284 # file is not deleted during cache expiration. 

1285 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1286 if cached_file is not None: 

1287 msg = f"(via cache read of remote file {uri})" 

1288 uri = cached_file 

1289 location_updated = True 

1290 

1291 with uri.as_local() as local_uri: 

1292 

1293 can_be_cached = False 

1294 if uri != local_uri: 1294 ↛ 1296 (line 1294 didn't jump to line 1296 because the condition on line 1294 was never true)

1295 # URI was remote and file was downloaded 

1296 cache_msg = "" 

1297 location_updated = True 

1298 

1299 if self.cacheManager.should_be_cached(cache_ref): 

1300 # In this scenario we want to ask if the downloaded 

1301 # file should be cached but we should not cache 

1302 # it until after we've used it (to ensure it can't 

1303 # be expired whilst we are using it). 

1304 can_be_cached = True 

1305 

1306 # Say that it is "likely" to be cached because 

1307 # if the formatter read fails we will not be 

1308 # caching this file. 

1309 cache_msg = " and likely cached" 

1310 

1311 msg = f"(via download to local file{cache_msg})" 

1312 

1313 # Calculate the (possibly) new location for the formatter 

1314 # to use. 

1315 newLocation = Location(*local_uri.split()) if location_updated else None 

1316 

1317 log.debug( 

1318 "Reading%s from location %s %s with formatter %s", 

1319 f" component {getInfo.component}" if isComponent else "", 

1320 uri, 

1321 msg, 

1322 formatter.name(), 

1323 ) 

1324 try: 

1325 with formatter._updateLocation(newLocation): 

1326 with time_this( 

1327 log, 

1328 msg="Reading%s from location %s %s with formatter %s", 

1329 args=( 

1330 f" component {getInfo.component}" if isComponent else "", 

1331 uri, 

1332 msg, 

1333 formatter.name(), 

1334 ), 

1335 ): 

1336 result = formatter.read(component=getInfo.component if isComponent else None) 

1337 except Exception as e: 

1338 raise ValueError( 

1339 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1340 f" ({ref.datasetType.name} from {uri}): {e}" 

1341 ) from e 

1342 

1343 # File was read successfully so can move to cache 

1344 if can_be_cached: 1344 ↛ 1345 (line 1344 didn't jump to line 1345 because the condition on line 1344 was never true)

1345 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1346 

1347 return self._post_process_get( 

1348 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent 

1349 ) 

1350 

1351 def knows(self, ref: DatasetRef) -> bool: 

1352 """Check if the dataset is known to the datastore. 

1353 

1354 Does not check for existence of any artifact. 

1355 

1356 Parameters 

1357 ---------- 

1358 ref : `DatasetRef` 

1359 Reference to the required dataset. 

1360 

1361 Returns 

1362 ------- 

1363 exists : `bool` 

1364 `True` if the dataset is known to the datastore. 

1365 """ 

1366 fileLocations = self._get_dataset_locations_info(ref) 

1367 if fileLocations: 

1368 return True 

1369 return False 

1370 
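
A brief hedged sketch contrasting ``knows`` with a physical check: ``knows`` only consults the datastore records, so a dataset whose artifact was deleted out-of-band can still be reported as known. ``datastore`` and ``ref`` are assumed to exist already.

if datastore.knows(ref):
    # Records exist, but the artifact itself may still be missing;
    # use mexists() when physical presence matters.
    locations = datastore._get_dataset_locations_info(ref)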

1371 def _process_mexists_records( 

1372 self, 

1373 id_to_ref: Dict[DatasetId, DatasetRef], 

1374 records: Dict[DatasetId, List[StoredFileInfo]], 

1375 all_required: bool, 

1376 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

1377 ) -> Dict[DatasetRef, bool]: 

1378 """Helper function for mexists that checks the given records. 

1379 

1380 Parameters 

1381 ---------- 

1382 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1383 Mapping of the dataset ID to the dataset ref itself. 

1384 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1385 Records as generally returned by 

1386 ``_get_stored_records_associated_with_refs``. 

1387 all_required : `bool` 

1388 Flag indicating whether all artifacts associated with a dataset 

1389 ID must exist for that dataset to be reported as existing. 

1390 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1391 Optional mapping of datastore artifact to existence. Updated by 

1392 this method with details of all artifacts tested. Can be `None` 

1393 if the caller is not interested. 

1394 

1395 Returns 

1396 ------- 

1397 existence : `dict` of [`DatasetRef`, `bool`] 

1398 Mapping from dataset to boolean indicating existence. 

1399 """ 

1400 # The URIs to be checked and a mapping of those URIs to 

1401 # the dataset ID. 

1402 uris_to_check: List[ResourcePath] = [] 

1403 location_map: Dict[ResourcePath, DatasetId] = {} 

1404 

1405 location_factory = self.locationFactory 

1406 

1407 uri_existence: Dict[ResourcePath, bool] = {} 

1408 for ref_id, infos in records.items(): 

1409 # Key is the dataset Id, value is list of StoredItemInfo 

1410 uris = [info.file_location(location_factory).uri for info in infos] 

1411 location_map.update({uri: ref_id for uri in uris}) 

1412 

1413 # Check the local cache directly for a dataset corresponding 

1414 # to the remote URI. 

1415 if self.cacheManager.file_count > 0: 

1416 ref = id_to_ref[ref_id] 

1417 for uri, storedFileInfo in zip(uris, infos): 

1418 check_ref = ref 

1419 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 1419 ↛ 1420 (line 1419 didn't jump to line 1420 because the condition on line 1419 was never true)

1420 check_ref = ref.makeComponentRef(component) 

1421 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1422 # Proxy for URI existence. 

1423 uri_existence[uri] = True 

1424 else: 

1425 uris_to_check.append(uri) 

1426 else: 

1427 # Check all of them. 

1428 uris_to_check.extend(uris) 

1429 

1430 if artifact_existence is not None: 

1431 # If a URI has already been checked remove it from the list 

1432 # and immediately add the status to the output dict. 

1433 filtered_uris_to_check = [] 

1434 for uri in uris_to_check: 

1435 if uri in artifact_existence: 

1436 uri_existence[uri] = artifact_existence[uri] 

1437 else: 

1438 filtered_uris_to_check.append(uri) 

1439 uris_to_check = filtered_uris_to_check 

1440 

1441 # Results. 

1442 dataset_existence: Dict[DatasetRef, bool] = {} 

1443 

1444 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1445 for uri, exists in uri_existence.items(): 

1446 dataset_id = location_map[uri] 

1447 ref = id_to_ref[dataset_id] 

1448 

1449 # Disassembled composite needs to check all locations. 

1450 # all_required indicates whether all need to exist or not. 

1451 if ref in dataset_existence: 

1452 if all_required: 

1453 exists = dataset_existence[ref] and exists 

1454 else: 

1455 exists = dataset_existence[ref] or exists 

1456 dataset_existence[ref] = exists 

1457 

1458 if artifact_existence is not None: 

1459 artifact_existence.update(uri_existence) 

1460 

1461 return dataset_existence 

1462 

1463 def mexists( 

1464 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1465 ) -> Dict[DatasetRef, bool]: 

1466 """Check the existence of multiple datasets at once. 

1467 

1468 Parameters 

1469 ---------- 

1470 refs : iterable of `DatasetRef` 

1471 The datasets to be checked. 

1472 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1473 Optional mapping of datastore artifact to existence. Updated by 

1474 this method with details of all artifacts tested. Can be `None` 

1475 if the caller is not interested. 

1476 

1477 Returns 

1478 ------- 

1479 existence : `dict` of [`DatasetRef`, `bool`] 

1480 Mapping from dataset to boolean indicating existence. 

1481 

1482 Notes 

1483 ----- 

1484 To minimize potentially costly remote existence checks, the local 

1485 cache is checked as a proxy for existence. If a file for this 

1486 `DatasetRef` exists in the cache, no check is done for the actual URI. This 

1487 could result in possibly unexpected behavior if the dataset itself 

1488 has been removed from the datastore by another process whilst it is 

1489 still in the cache. 

1490 """ 

1491 chunk_size = 10_000 

1492 dataset_existence: Dict[DatasetRef, bool] = {} 

1493 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1494 n_found_total = 0 

1495 n_checked = 0 

1496 n_chunks = 0 

1497 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1498 chunk_result = self._mexists(chunk, artifact_existence) 

1499 if log.isEnabledFor(VERBOSE): 

1500 n_results = len(chunk_result) 

1501 n_checked += n_results 

1502 # Can treat the booleans as 0, 1 integers and sum them. 

1503 n_found = sum(chunk_result.values()) 

1504 n_found_total += n_found 

1505 log.verbose( 

1506 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)", 

1507 n_chunks, 

1508 n_found, 

1509 n_results, 

1510 n_found_total, 

1511 n_checked, 

1512 ) 

1513 dataset_existence.update(chunk_result) 

1514 n_chunks += 1 

1515 

1516 return dataset_existence 

1517 
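A short usage sketch for the bulk check above; `datastore` and `refs` are assumed to exist already (a configured FileDatastore and an iterable of resolved DatasetRef objects):

from typing import Dict

from lsst.resources import ResourcePath

# Pass a dict to collect per-artifact results for reuse in later calls.
artifact_existence: Dict[ResourcePath, bool] = {}
existence = datastore.mexists(refs, artifact_existence)

# Datasets the datastore could not find.
missing = [ref for ref, found in existence.items() if not found]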

1518 def _mexists( 

1519 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1520 ) -> Dict[DatasetRef, bool]: 

1521 """Check the existence of multiple datasets at once. 

1522 

1523 Parameters 

1524 ---------- 

1525 refs : iterable of `DatasetRef` 

1526 The datasets to be checked. 

1527 

1528 Returns 

1529 ------- 

1530 existence : `dict` of [`DatasetRef`, `bool`] 

1531 Mapping from dataset to boolean indicating existence. 

1532 """ 

1533 # Need a mapping of dataset_id to dataset ref since the API 

1534 # works with dataset_id 

1535 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

1536 

1537 # Set of all IDs we are checking for. 

1538 requested_ids = set(id_to_ref.keys()) 

1539 

1540 # The records themselves. Could be missing some entries. 

1541 records = self._get_stored_records_associated_with_refs(refs) 

1542 

1543 dataset_existence = self._process_mexists_records( 

1544 id_to_ref, records, True, artifact_existence=artifact_existence 

1545 ) 

1546 

1547 # Set of IDs that have been handled. 

1548 handled_ids = {ref.id for ref in dataset_existence.keys()} 

1549 

1550 missing_ids = requested_ids - handled_ids 

1551 if missing_ids: 

1552 if not self.trustGetRequest: 

1553 # Must assume these do not exist 

1554 for missing in missing_ids: 

1555 dataset_existence[id_to_ref[missing]] = False 

1556 else: 

1557 log.debug( 

1558 "%d out of %d datasets were not known to datastore during initial existence check.", 

1559 len(missing_ids), 

1560 len(requested_ids), 

1561 ) 

1562 

1563 # Construct data structure identical to that returned 

1564 # by _get_stored_records_associated_with_refs() but using 

1565 # guessed names. 

1566 records = {} 

1567 for missing in missing_ids: 

1568 expected = self._get_expected_dataset_locations_info(id_to_ref[missing]) 

1569 records[missing] = [info for _, info in expected] 

1570 

1571 dataset_existence.update( 

1572 self._process_mexists_records( 

1573 id_to_ref, records, False, artifact_existence=artifact_existence 

1574 ) 

1575 ) 

1576 

1577 return dataset_existence 

1578 

1579 def exists(self, ref: DatasetRef) -> bool: 

1580 """Check if the dataset exists in the datastore. 

1581 

1582 Parameters 

1583 ---------- 

1584 ref : `DatasetRef` 

1585 Reference to the required dataset. 

1586 

1587 Returns 

1588 ------- 

1589 exists : `bool` 

1590 `True` if the entity exists in the `Datastore`. 

1591 

1592 Notes 

1593 ----- 

1594 The local cache is checked as a proxy for existence in the remote 

1595 object store. It is possible that another process on a different 

1596 compute node could remove the file from the object store even 

1597 though it is present in the local cache. 

1598 """ 

1599 fileLocations = self._get_dataset_locations_info(ref) 

1600 

1601 # If we are being asked to trust that the registry might not be 

1602 # correct, we ask for the expected locations and check them explicitly. 

1603 if not fileLocations: 

1604 if not self.trustGetRequest: 

1605 return False 

1606 

1607 # First check the cache. If it is not found we must check 

1608 # the datastore itself. Assume that any component in the cache 

1609 # means that the dataset does exist somewhere. 

1610 if self.cacheManager.known_to_cache(ref): 1610 ↛ 1611line 1610 didn't jump to line 1611, because the condition on line 1610 was never true

1611 return True 

1612 

1613 # When we are guessing a dataset location we can not check 

1614 # for the existence of every component since we can not 

1615 # know if every component was written. Instead we check 

1616 # for the existence of any of the expected locations. 

1617 for location, _ in self._get_expected_dataset_locations_info(ref): 

1618 if self._artifact_exists(location): 

1619 return True 

1620 return False 

1621 

1622 # All listed artifacts must exist. 

1623 for location, storedFileInfo in fileLocations: 

1624 # Checking in cache needs the component ref. 

1625 check_ref = ref 

1626 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1627 check_ref = ref.makeComponentRef(component) 

1628 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1629 continue 

1630 

1631 if not self._artifact_exists(location): 

1632 return False 

1633 

1634 return True 

1635 
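For a single dataset the equivalent check is a plain boolean; a sketch, again assuming `datastore` and `ref` already exist:

if datastore.exists(ref):
    # All listed artifacts were found (or the local cache was used as a
    # proxy, with the caveat described in the Notes above).
    payload = datastore.get(ref)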

1636 def getURIs( 

1637 self, ref: DatasetRef, predict: bool = False 

1638 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]: 

1639 """Return URIs associated with dataset. 

1640 

1641 Parameters 

1642 ---------- 

1643 ref : `DatasetRef` 

1644 Reference to the required dataset. 

1645 predict : `bool`, optional 

1646 If the datastore does not know about the dataset, should it 

1647 return a predicted URI or not? 

1648 

1649 Returns 

1650 ------- 

1651 primary : `lsst.resources.ResourcePath` 

1652 The URI to the primary artifact associated with this dataset. 

1653 If the dataset was disassembled within the datastore this 

1654 may be `None`. 

1655 components : `dict` 

1656 URIs to any components associated with the dataset artifact. 

1657 Can be empty if there are no components. 

1658 """ 

1659 

1660 primary: Optional[ResourcePath] = None 

1661 components: Dict[str, ResourcePath] = {} 

1662 

1663 # if this has never been written then we have to guess 

1664 if not self.exists(ref): 

1665 if not predict: 

1666 raise FileNotFoundError(f"Dataset {ref} not in this datastore") 

1667 

1668 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1669 

1670 if doDisassembly: 

1671 

1672 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1673 compRef = ref.makeComponentRef(component) 

1674 compLocation, _ = self._determine_put_formatter_location(compRef) 

1675 

1676 # Add a URI fragment to indicate this is a guess 

1677 components[component] = ResourcePath(compLocation.uri.geturl() + "#predicted") 

1678 

1679 else: 

1680 

1681 location, _ = self._determine_put_formatter_location(ref) 

1682 

1683 # Add a URI fragment to indicate this is a guess 

1684 primary = ResourcePath(location.uri.geturl() + "#predicted") 

1685 

1686 return primary, components 

1687 

1688 # If this is a ref that we have written we can get the path. 

1689 # Get file metadata and internal metadata 

1690 fileLocations = self._get_dataset_locations_info(ref) 

1691 

1692 guessing = False 

1693 if not fileLocations: 

1694 if not self.trustGetRequest: 1694 ↛ 1695line 1694 didn't jump to line 1695, because the condition on line 1694 was never true

1695 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1696 fileLocations = self._get_expected_dataset_locations_info(ref) 

1697 guessing = True 

1698 

1699 if len(fileLocations) == 1: 

1700 # No disassembly so this is the primary URI 

1701 uri = fileLocations[0][0].uri 

1702 if guessing and not uri.exists(): 1702 ↛ 1703line 1702 didn't jump to line 1703, because the condition on line 1702 was never true

1703 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1704 primary = uri 

1705 

1706 else: 

1707 for location, storedFileInfo in fileLocations: 

1708 if storedFileInfo.component is None: 1708 ↛ 1709line 1708 didn't jump to line 1709, because the condition on line 1708 was never true

1709 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1710 uri = location.uri 

1711 if guessing and not uri.exists(): 1711 ↛ 1715line 1711 didn't jump to line 1715, because the condition on line 1711 was never true

1712 # If we are trusting then it is entirely possible for 

1713 # some components to be missing. In that case we skip 

1714 # to the next component. 

1715 if self.trustGetRequest: 

1716 continue 

1717 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1718 components[storedFileInfo.component] = uri 

1719 

1720 return primary, components 

1721 
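A sketch of handling the two return values, assuming `datastore` and `ref` as before; predicted URIs carry the "#predicted" fragment added above:

primary, components = datastore.getURIs(ref, predict=True)
if primary is not None:
    print("single artifact:", primary)
else:
    # Disassembled composite: one URI per stored component.
    for name, uri in components.items():
        print(f"component {name}: {uri}")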

1722 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1723 """URI to the Dataset. 

1724 

1725 Parameters 

1726 ---------- 

1727 ref : `DatasetRef` 

1728 Reference to the required Dataset. 

1729 predict : `bool` 

1730 If `True`, allow URIs to be returned of datasets that have not 

1731 been written. 

1732 

1733 Returns 

1734 ------- 

1735 uri : `lsst.resources.ResourcePath` 

1736 URI pointing to the dataset within the datastore. If the 

1737 dataset does not exist in the datastore, and if ``predict`` is 

1738 `True`, the URI will be a prediction and will include a URI 

1739 fragment "#predicted". 

1740 If the datastore does not have entities that relate well 

1741 to the concept of a URI the returned URI will be 

1742 descriptive. The returned URI is not guaranteed to be obtainable. 

1743 

1744 Raises 

1745 ------ 

1746 FileNotFoundError 

1747 Raised if a URI has been requested for a dataset that does not 

1748 exist and guessing is not allowed. 

1749 RuntimeError 

1750 Raised if a request is made for a single URI but multiple URIs 

1751 are associated with this dataset. 

1752 

1753 Notes 

1754 ----- 

1755 When a predicted URI is requested an attempt will be made to form 

1756 a reasonable URI based on file templates and the expected formatter. 

1757 """ 

1758 primary, components = self.getURIs(ref, predict) 

1759 if primary is None or components: 1759 ↛ 1760line 1759 didn't jump to line 1760, because the condition on line 1759 was never true

1760 raise RuntimeError( 

1761 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1762 ) 

1763 return primary 

1764 

1765 def retrieveArtifacts( 

1766 self, 

1767 refs: Iterable[DatasetRef], 

1768 destination: ResourcePath, 

1769 transfer: str = "auto", 

1770 preserve_path: bool = True, 

1771 overwrite: bool = False, 

1772 ) -> List[ResourcePath]: 

1773 """Retrieve the file artifacts associated with the supplied refs. 

1774 

1775 Parameters 

1776 ---------- 

1777 refs : iterable of `DatasetRef` 

1778 The datasets for which file artifacts are to be retrieved. 

1779 A single ref can result in multiple files. The refs must 

1780 be resolved. 

1781 destination : `lsst.resources.ResourcePath` 

1782 Location to write the file artifacts. 

1783 transfer : `str`, optional 

1784 Method to use to transfer the artifacts. Must be one of the options 

1785 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1786 "move" is not allowed. 

1787 preserve_path : `bool`, optional 

1788 If `True` the full path of the file artifact within the datastore 

1789 is preserved. If `False` the final file component of the path 

1790 is used. 

1791 overwrite : `bool`, optional 

1792 If `True` allow transfers to overwrite existing files at the 

1793 destination. 

1794 

1795 Returns 

1796 ------- 

1797 targets : `list` of `lsst.resources.ResourcePath` 

1798 URIs of file artifacts in destination location. Order is not 

1799 preserved. 

1800 """ 

1801 if not destination.isdir(): 1801 ↛ 1802line 1801 didn't jump to line 1802, because the condition on line 1801 was never true

1802 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1803 

1804 if transfer == "move": 

1805 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1806 

1807 # Source -> Destination 

1808 # This also helps filter out duplicate DatasetRef in the request 

1809 # that will map to the same underlying file transfer. 

1810 to_transfer: Dict[ResourcePath, ResourcePath] = {} 

1811 

1812 for ref in refs: 

1813 locations = self._get_dataset_locations_info(ref) 

1814 for location, _ in locations: 

1815 source_uri = location.uri 

1816 target_path: ResourcePathExpression 

1817 if preserve_path: 

1818 target_path = location.pathInStore 

1819 if target_path.isabs(): 1819 ↛ 1822line 1819 didn't jump to line 1822, because the condition on line 1819 was never true

1820 # This is an absolute path to an external file. 

1821 # Use the full path. 

1822 target_path = target_path.relativeToPathRoot 

1823 else: 

1824 target_path = source_uri.basename() 

1825 target_uri = destination.join(target_path) 

1826 to_transfer[source_uri] = target_uri 

1827 

1828 # In theory can now parallelize the transfer 

1829 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

1830 for source_uri, target_uri in to_transfer.items(): 

1831 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1832 

1833 return list(to_transfer.values()) 

1834 
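A sketch of pulling the underlying files out of the datastore, assuming `datastore` and `refs` as before; the destination directory is an illustrative local path:

from lsst.resources import ResourcePath

destination = ResourcePath("/tmp/artifact-export/", forceDirectory=True)
targets = datastore.retrieveArtifacts(
    refs,
    destination,
    transfer="copy",     # "move" is rejected above
    preserve_path=True,  # keep the datastore-relative layout
    overwrite=False,
)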

1835 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1836 """Load an InMemoryDataset from the store. 

1837 

1838 Parameters 

1839 ---------- 

1840 ref : `DatasetRef` 

1841 Reference to the required Dataset. 

1842 parameters : `dict` 

1843 `StorageClass`-specific parameters that specify, for example, 

1844 a slice of the dataset to be loaded. 

1845 

1846 Returns 

1847 ------- 

1848 inMemoryDataset : `object` 

1849 Requested dataset or slice thereof as an InMemoryDataset. 

1850 

1851 Raises 

1852 ------ 

1853 FileNotFoundError 

1854 Requested dataset can not be retrieved. 

1855 TypeError 

1856 Return value from formatter has unexpected type. 

1857 ValueError 

1858 Formatter failed to process the dataset. 

1859 """ 

1860 allGetInfo = self._prepare_for_get(ref, parameters) 

1861 refComponent = ref.datasetType.component() 

1862 

1863 # Supplied storage class for the component being read 

1864 refStorageClass = ref.datasetType.storageClass 

1865 

1866 # Create mapping from component name to related info 

1867 allComponents = {i.component: i for i in allGetInfo} 

1868 

1869 # By definition the dataset is disassembled if we have more 

1870 # than one record for it. 

1871 isDisassembled = len(allGetInfo) > 1 

1872 

1873 # Look for the special case where we are disassembled but the 

1874 # component is a derived component that was not written during 

1875 # disassembly. For this scenario we need to check that the 

1876 # component requested is listed as a derived component for the 

1877 # composite storage class 

1878 isDisassembledReadOnlyComponent = False 

1879 if isDisassembled and refComponent: 

1880 # The composite storage class should be accessible through 

1881 # the component dataset type 

1882 compositeStorageClass = ref.datasetType.parentStorageClass 

1883 

1884 # In the unlikely scenario where the composite storage 

1885 # class is not known, we can only assume that this is a 

1886 # normal component. If that assumption is wrong then the 

1887 # branch below that reads a persisted component will fail 

1888 # so there is no need to complain here. 

1889 if compositeStorageClass is not None: 1889 ↛ 1892line 1889 didn't jump to line 1892, because the condition on line 1889 was never false

1890 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1891 

1892 if isDisassembled and not refComponent: 

1893 # This was a disassembled dataset spread over multiple files 

1894 # and we need to put them all back together again. 

1895 # Read into memory and then assemble 

1896 

1897 # Check that the supplied parameters are suitable for the type read 

1898 refStorageClass.validateParameters(parameters) 

1899 

1900 # We want to keep track of all the parameters that were not used 

1901 # by formatters. We assume that if any of the component formatters 

1902 # use a parameter that we do not need to apply it again in the 

1903 # assembler. 

1904 usedParams = set() 

1905 

1906 components: Dict[str, Any] = {} 

1907 for getInfo in allGetInfo: 

1908 # assemblerParams are parameters not understood by the 

1909 # associated formatter. 

1910 usedParams.update(set(getInfo.formatterParams)) 

1911 

1912 component = getInfo.component 

1913 

1914 if component is None: 1914 ↛ 1915line 1914 didn't jump to line 1915, because the condition on line 1914 was never true

1915 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1916 

1917 # We do not want the formatter to think it's reading 

1918 # a component though because it is really reading a 

1919 # standalone dataset -- always tell reader it is not a 

1920 # component. 

1921 components[component] = self._read_artifact_into_memory( 

1922 getInfo, ref.makeComponentRef(component), isComponent=False 

1923 ) 

1924 

1925 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1926 

1927 # Any unused parameters will have to be passed to the assembler 

1928 if parameters: 

1929 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1930 else: 

1931 unusedParams = {} 

1932 

1933 # Process parameters 

1934 return ref.datasetType.storageClass.delegate().handleParameters( 

1935 inMemoryDataset, parameters=unusedParams 

1936 ) 

1937 

1938 elif isDisassembledReadOnlyComponent: 

1939 

1940 compositeStorageClass = ref.datasetType.parentStorageClass 

1941 if compositeStorageClass is None: 1941 ↛ 1942line 1941 didn't jump to line 1942, because the condition on line 1941 was never true

1942 raise RuntimeError( 

1943 f"Unable to retrieve derived component '{refComponent}' since" 

1944 "no composite storage class is available." 

1945 ) 

1946 

1947 if refComponent is None: 1947 ↛ 1949line 1947 didn't jump to line 1949, because the condition on line 1947 was never true

1948 # Mainly for mypy 

1949 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1950 

1951 # Assume that every derived component can be calculated by 

1952 # forwarding the request to a single read/write component. 

1953 # Rather than guessing which rw component is the right one by 

1954 # scanning each for a derived component of the same name, 

1955 # we ask the storage class delegate directly which one is best to 

1956 # use. 

1957 compositeDelegate = compositeStorageClass.delegate() 

1958 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

1959 refComponent, set(allComponents) 

1960 ) 

1961 

1962 # Select the relevant component 

1963 rwInfo = allComponents[forwardedComponent] 

1964 

1965 # For now assume that read parameters are validated against 

1966 # the real component and not the requested component 

1967 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1968 forwardedStorageClass.validateParameters(parameters) 

1969 

1970 # The reference to use for the caching must refer to the forwarded 

1971 # component and not the derived component. 

1972 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

1973 

1974 # Unfortunately the FileDescriptor inside the formatter will have 

1975 # the wrong write storage class so we need to create a new one 

1976 # given the immutability constraint. 

1977 writeStorageClass = rwInfo.info.storageClass 

1978 

1979 # We may need to put some thought into parameters for read 

1980 # components but for now forward them on as is 

1981 readFormatter = type(rwInfo.formatter)( 

1982 FileDescriptor( 

1983 rwInfo.location, 

1984 readStorageClass=refStorageClass, 

1985 storageClass=writeStorageClass, 

1986 parameters=parameters, 

1987 ), 

1988 ref.dataId, 

1989 ) 

1990 

1991 # The assembler can not receive any parameter requests for a 

1992 # derived component at this time since the assembler will 

1993 # see the storage class of the derived component and those 

1994 # parameters will have to be handled by the formatter on the 

1995 # forwarded storage class. 

1996 assemblerParams: Dict[str, Any] = {} 

1997 

1998 # Need to create a new info that specifies the derived 

1999 # component and associated storage class 

2000 readInfo = DatastoreFileGetInformation( 

2001 rwInfo.location, 

2002 readFormatter, 

2003 rwInfo.info, 

2004 assemblerParams, 

2005 {}, 

2006 refComponent, 

2007 refStorageClass, 

2008 ) 

2009 

2010 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

2011 

2012 else: 

2013 # Single file request or component from that composite file 

2014 for lookup in (refComponent, None): 2014 ↛ 2019line 2014 didn't jump to line 2019, because the loop on line 2014 didn't complete

2015 if lookup in allComponents: 2015 ↛ 2014line 2015 didn't jump to line 2014, because the condition on line 2015 was never false

2016 getInfo = allComponents[lookup] 

2017 break 

2018 else: 

2019 raise FileNotFoundError( 

2020 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

2021 ) 

2022 

2023 # Do not need the component itself if already disassembled 

2024 if isDisassembled: 

2025 isComponent = False 

2026 else: 

2027 isComponent = getInfo.component is not None 

2028 

2029 # For a component read of a composite we want the cache to 

2030 # be looking at the composite ref itself. 

2031 cache_ref = ref.makeCompositeRef() if isComponent else ref 

2032 

2033 # For a disassembled component we can validate parameters against 

2034 # the component storage class directly 

2035 if isDisassembled: 

2036 refStorageClass.validateParameters(parameters) 

2037 else: 

2038 # For an assembled composite this could be a derived 

2039 # component derived from a real component. The validity 

2040 # of the parameters is not clear. For now validate against 

2041 # the composite storage class 

2042 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

2043 

2044 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

2045 
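A sketch of the read paths described above, assuming `datastore` and `ref` as before; the component name and parameter shown are placeholders and depend on the storage class in use:

# Whole dataset (a disassembled composite is reassembled on read).
dataset = datastore.get(ref)

# A single component, read via a component ref ("wcs" is a placeholder).
component = datastore.get(ref.makeComponentRef("wcs"))

# StorageClass parameters, e.g. requesting a subset on read
# ("bbox" and the bbox object are placeholders).
subset = datastore.get(ref, parameters={"bbox": bbox})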

2046 @transactional 

2047 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2048 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2049 

2050 Parameters 

2051 ---------- 

2052 inMemoryDataset : `object` 

2053 The dataset to store. 

2054 ref : `DatasetRef` 

2055 Reference to the associated Dataset. 

2056 

2057 Raises 

2058 ------ 

2059 TypeError 

2060 Supplied object and storage class are inconsistent. 

2061 DatasetTypeNotSupportedError 

2062 The associated `DatasetType` is not handled by this datastore. 

2063 

2064 Notes 

2065 ----- 

2066 If the datastore is configured to reject certain dataset types it 

2067 is possible that the put will fail and raise a 

2068 `DatasetTypeNotSupportedError`. The main use case for this is to 

2069 allow `ChainedDatastore` to put to multiple datastores without 

2070 requiring that every datastore accepts the dataset. 

2071 """ 

2072 

2073 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2074 # doDisassembly = True 

2075 

2076 artifacts = [] 

2077 if doDisassembly: 

2078 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2079 if components is None: 2079 ↛ 2080line 2079 didn't jump to line 2080, because the condition on line 2079 was never true

2080 raise RuntimeError( 

2081 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2082 f"with storage class {ref.datasetType.storageClass.name} " 

2083 "is configured to be disassembled, but cannot be." 

2084 ) 

2085 for component, componentInfo in components.items(): 

2086 # Don't recurse because we want to take advantage of 

2087 # bulk insert -- need a new DatasetRef that refers to the 

2088 # same dataset_id but has the component DatasetType 

2089 # DatasetType does not refer to the types of components 

2090 # So we construct one ourselves. 

2091 compRef = ref.makeComponentRef(component) 

2092 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2093 artifacts.append((compRef, storedInfo)) 

2094 else: 

2095 # Write the entire thing out 

2096 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2097 artifacts.append((ref, storedInfo)) 

2098 

2099 self._register_datasets(artifacts) 

2100 
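A sketch of a write, assuming `datastore`, `inMemoryDataset`, and `ref` as before; the exception handling mirrors the behaviour documented in the Notes above:

from lsst.daf.butler import DatasetTypeNotSupportedError

try:
    datastore.put(inMemoryDataset, ref)
except DatasetTypeNotSupportedError:
    # This datastore is configured to reject the dataset type; a
    # ChainedDatastore would simply try its other member datastores.
    pass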

2101 @transactional 

2102 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

2103 # At this point can safely remove these datasets from the cache 

2104 # to avoid confusion later on. If they are not trashed later 

2105 # the cache will simply be refilled. 

2106 self.cacheManager.remove_from_cache(ref) 

2107 

2108 # If we are in trust mode there will be nothing to move to 

2109 # the trash table and we will have to try to delete the file 

2110 # immediately. 

2111 if self.trustGetRequest: 

2112 # Try to keep the logic below for a single file trash. 

2113 if isinstance(ref, DatasetRef): 

2114 refs = {ref} 

2115 else: 

2116 # Will recreate ref at the end of this branch. 

2117 refs = set(ref) 

2118 

2119 # Determine which datasets are known to datastore directly. 

2120 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

2121 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2122 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2123 

2124 missing = refs - existing_refs 

2125 if missing: 

2126 # Do an explicit existence check on these refs. 

2127 # We only care about the artifacts at this point and not 

2128 # the dataset existence. 

2129 artifact_existence: Dict[ResourcePath, bool] = {} 

2130 _ = self.mexists(missing, artifact_existence) 

2131 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2132 

2133 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2134 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2135 for uri in uris: 

2136 try: 

2137 uri.remove() 

2138 except Exception as e: 

2139 if ignore_errors: 

2140 log.debug("Artifact %s could not be removed: %s", uri, e) 

2141 continue 

2142 raise 

2143 

2144 # There is no point asking the code below to remove refs we 

2145 # know are missing so update it with the list of existing 

2146 # records. Try to retain one vs many logic. 

2147 if not existing_refs: 

2148 # Nothing more to do since none of the datasets were 

2149 # known to the datastore record table. 

2150 return 

2151 ref = list(existing_refs) 

2152 if len(ref) == 1: 

2153 ref = ref[0] 

2154 

2155 # Get file metadata and internal metadata 

2156 if not isinstance(ref, DatasetRef): 

2157 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2158 # Assumed to be an iterable of refs so bulk mode enabled. 

2159 try: 

2160 self.bridge.moveToTrash(ref) 

2161 except Exception as e: 

2162 if ignore_errors: 

2163 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2164 else: 

2165 raise 

2166 return 

2167 

2168 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2169 

2170 fileLocations = self._get_dataset_locations_info(ref) 

2171 

2172 if not fileLocations: 

2173 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2174 if ignore_errors: 

2175 log.warning(err_msg) 

2176 return 

2177 else: 

2178 raise FileNotFoundError(err_msg) 

2179 

2180 for location, storedFileInfo in fileLocations: 

2181 if not self._artifact_exists(location): 2181 ↛ 2182line 2181 didn't jump to line 2182

2182 err_msg = ( 

2183 f"Dataset is known to datastore {self.name} but " 

2184 f"associated artifact ({location.uri}) is missing" 

2185 ) 

2186 if ignore_errors: 

2187 log.warning(err_msg) 

2188 return 

2189 else: 

2190 raise FileNotFoundError(err_msg) 

2191 

2192 # Mark dataset as trashed 

2193 try: 

2194 self.bridge.moveToTrash([ref]) 

2195 except Exception as e: 

2196 if ignore_errors: 

2197 log.warning( 

2198 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2199 "but encountered an error: %s", 

2200 ref, 

2201 self.name, 

2202 e, 

2203 ) 

2204 pass 

2205 else: 

2206 raise 

2207 

2208 @transactional 

2209 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2210 """Remove all datasets from the trash. 

2211 

2212 Parameters 

2213 ---------- 

2214 ignore_errors : `bool` 

2215 If `True` return without error even if something went wrong. 

2216 Problems could occur if another process is simultaneously trying 

2217 to delete. 

2218 """ 

2219 log.debug("Emptying trash in datastore %s", self.name) 

2220 

2221 # Context manager will empty trash iff we finish it without raising. 

2222 # It will also automatically delete the relevant rows from the 

2223 # trash table and the records table. 

2224 with self.bridge.emptyTrash( 

2225 self._table, record_class=StoredFileInfo, record_column="path" 

2226 ) as trash_data: 

2227 # Removing the artifacts themselves requires that the files are 

2228 # not also associated with refs that are not to be trashed. 

2229 # Therefore need to do a query with the file paths themselves 

2230 # and return all the refs associated with them. Can only delete 

2231 # a file if the refs to be trashed are the only refs associated 

2232 # with the file. 

2233 # This requires multiple copies of the trashed items 

2234 trashed, artifacts_to_keep = trash_data 

2235 

2236 if artifacts_to_keep is None: 

2237 # The bridge is not helping us so have to work it out 

2238 # ourselves. This is not going to be as efficient. 

2239 trashed = list(trashed) 

2240 

2241 # The instance check is for mypy since up to this point it 

2242 # does not know the type of info. 

2243 path_map = self._refs_associated_with_artifacts( 

2244 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2245 ) 

2246 

2247 for ref, info in trashed: 

2248 

2249 # Mypy needs to know this is not the base class 

2250 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2251 

2252 # Check for mypy 

2253 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2254 

2255 path_map[info.path].remove(ref.id) 

2256 if not path_map[info.path]: 2256 ↛ 2247line 2256 didn't jump to line 2247, because the condition on line 2256 was never false

2257 del path_map[info.path] 

2258 

2259 artifacts_to_keep = set(path_map) 

2260 

2261 for ref, info in trashed: 

2262 

2263 # Should not happen for this implementation but need 

2264 # to keep mypy happy. 

2265 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2266 

2267 # Mypy needs to know this is not the base class 

2268 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2269 

2270 # Check for mypy 

2271 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2272 

2273 if info.path in artifacts_to_keep: 

2274 # This is a multi-dataset artifact and we are not 

2275 # removing all associated refs. 

2276 continue 

2277 

2278 # Only trashed refs still known to datastore will be returned. 

2279 location = info.file_location(self.locationFactory) 

2280 

2281 # Point of no return for this artifact 

2282 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2283 try: 

2284 self._delete_artifact(location) 

2285 except FileNotFoundError: 

2286 # If the file itself has been deleted there is nothing 

2287 # we can do about it. It is possible that trash has 

2288 # been run in parallel in another process or someone 

2289 # decided to delete the file. It is unlikely to come 

2290 # back and so we should still continue with the removal 

2291 # of the entry from the trash table. It is also possible 

2292 # we removed it in a previous iteration if it was 

2293 # a multi-dataset artifact. The delete artifact method 

2294 # will log a debug message in this scenario. 

2295 # Distinguishing file missing before trash started and 

2296 # file already removed previously as part of this trash 

2297 # is not worth the distinction with regards to potential 

2298 # memory cost. 

2299 pass 

2300 except Exception as e: 

2301 if ignore_errors: 

2302 # Use a debug message here even though it's not 

2303 # a good situation. In some cases this can be 

2304 # caused by a race between user A and user B 

2305 # and neither of them has permissions for the 

2306 # other's files. Butler does not know about users 

2307 # and trash has no idea what collections these 

2308 # files were in (without guessing from a path). 

2309 log.debug( 

2310 "Encountered error removing artifact %s from datastore %s: %s", 

2311 location.uri, 

2312 self.name, 

2313 e, 

2314 ) 

2315 else: 

2316 raise 

2317 
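Deletion is two-phased, as the two methods above show; a sketch, assuming `datastore` and `refs` as before:

# Phase 1: move the records to the trash table (or, in trust mode,
# delete artifacts unknown to the record table right away).
datastore.trash(refs, ignore_errors=True)

# Phase 2: delete artifacts whose every associated ref was trashed.
datastore.emptyTrash(ignore_errors=True)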

2318 @transactional 

2319 def transfer_from( 

2320 self, 

2321 source_datastore: Datastore, 

2322 refs: Iterable[DatasetRef], 

2323 local_refs: Optional[Iterable[DatasetRef]] = None, 

2324 transfer: str = "auto", 

2325 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

2326 ) -> None: 

2327 # Docstring inherited 

2328 if type(self) is not type(source_datastore): 

2329 raise TypeError( 

2330 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2331 f"source datastore ({type(source_datastore)})." 

2332 ) 

2333 

2334 # Be explicit for mypy 

2335 if not isinstance(source_datastore, FileDatastore): 2335 ↛ 2336line 2335 didn't jump to line 2336, because the condition on line 2335 was never true

2336 raise TypeError( 

2337 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2338 f" {type(source_datastore)}" 

2339 ) 

2340 

2341 # Stop early if "direct" transfer mode is requested. That would 

2342 # require that the URI inside the source datastore should be stored 

2343 # directly in the target datastore, which seems unlikely to be useful 

2344 # since at any moment the source datastore could delete the file. 

2345 if transfer in ("direct", "split"): 

2346 raise ValueError( 

2347 f"Can not transfer from a source datastore using {transfer} mode since" 

2348 " those files are controlled by the other datastore." 

2349 ) 

2350 

2351 # Empty existence lookup if none given. 

2352 if artifact_existence is None: 

2353 artifact_existence = {} 

2354 

2355 # We will go through the list multiple times so must convert 

2356 # generators to lists. 

2357 refs = list(refs) 

2358 

2359 if local_refs is None: 

2360 local_refs = refs 

2361 else: 

2362 local_refs = list(local_refs) 

2363 

2364 # In order to handle disassembled composites the code works 

2365 # at the records level since it can assume that internal APIs 

2366 # can be used. 

2367 # - If the record already exists in the destination this is assumed 

2368 # to be okay. 

2369 # - If there is no record but the source and destination URIs are 

2370 # identical no transfer is done but the record is added. 

2371 # - If the source record refers to an absolute URI currently assume 

2372 # that that URI should remain absolute and will be visible to the 

2373 # destination butler. May need to have a flag to indicate whether 

2374 # the dataset should be transferred. This will only happen if 

2375 # the detached Butler has had a local ingest. 

2376 

2377 # What we really want is all the records in the source datastore 

2378 # associated with these refs. Or derived ones if they don't exist 

2379 # in the source. 

2380 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2381 

2382 # The source dataset_ids are the keys in these records 

2383 source_ids = set(source_records) 

2384 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2385 

2386 # The not None check is to appease mypy 

2387 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

2388 missing_ids = requested_ids - source_ids 

2389 

2390 # Missing IDs can be okay if that datastore has allowed 

2391 # gets based on file existence. Should we transfer what we can 

2392 # or complain about it and warn? 

2393 if missing_ids and not source_datastore.trustGetRequest: 2393 ↛ 2394line 2393 didn't jump to line 2394, because the condition on line 2393 was never true

2394 raise ValueError( 

2395 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2396 ) 

2397 

2398 # Need to map these missing IDs to a DatasetRef so we can guess 

2399 # the details. 

2400 if missing_ids: 

2401 log.info( 

2402 "Number of expected datasets missing from source datastore records: %d out of %d", 

2403 len(missing_ids), 

2404 len(requested_ids), 

2405 ) 

2406 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2407 

2408 # This should be chunked in case we end up having to check 

2409 # the file store since we need some log output to show 

2410 # progress. 

2411 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2412 records = {} 

2413 for missing in missing_ids_chunk: 

2414 # Ask the source datastore where the missing artifacts 

2415 # should be. An execution butler might not know about the 

2416 # artifacts even if they are there. 

2417 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2418 records[missing] = [info for _, info in expected] 

2419 

2420 # Call the mexists helper method in case we have not already 

2421 # checked these artifacts such that artifact_existence is 

2422 # empty. This allows us to benefit from parallelism. 

2423 # datastore.mexists() itself does not give us access to the 

2424 # derived datastore record. 

2425 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2426 ref_exists = source_datastore._process_mexists_records( 

2427 id_to_ref, records, False, artifact_existence=artifact_existence 

2428 ) 

2429 

2430 # Now go through the records and propagate the ones that exist. 

2431 location_factory = source_datastore.locationFactory 

2432 for missing, record_list in records.items(): 

2433 # Skip completely if the ref does not exist. 

2434 ref = id_to_ref[missing] 

2435 if not ref_exists[ref]: 

2436 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2437 continue 

2438 # Check for file artifact to decide which parts of a 

2439 # disassembled composite do exist. If there is only a 

2440 # single record we don't even need to look because it can't 

2441 # be a composite and must exist. 

2442 if len(record_list) == 1: 

2443 dataset_records = record_list 

2444 else: 

2445 dataset_records = [ 

2446 record 

2447 for record in record_list 

2448 if artifact_existence[record.file_location(location_factory).uri] 

2449 ] 

2450 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2451 

2452 # Rely on source_records being a defaultdict. 

2453 source_records[missing].extend(dataset_records) 

2454 

2455 # See if we already have these records 

2456 target_records = self._get_stored_records_associated_with_refs(local_refs) 

2457 

2458 # The artifacts to register 

2459 artifacts = [] 

2460 

2461 # Refs that already exist 

2462 already_present = [] 

2463 

2464 # Now can transfer the artifacts 

2465 for source_ref, target_ref in zip(refs, local_refs): 

2466 if target_ref.id in target_records: 

2467 # Already have an artifact for this. 

2468 already_present.append(target_ref) 

2469 continue 

2470 

2471 # mypy needs to know these are always resolved refs 

2472 for info in source_records[source_ref.getCheckedId()]: 

2473 source_location = info.file_location(source_datastore.locationFactory) 

2474 target_location = info.file_location(self.locationFactory) 

2475 if source_location == target_location: 2475 ↛ 2479line 2475 didn't jump to line 2479, because the condition on line 2475 was never true

2476 # Either the dataset is already in the target datastore 

2477 # (which is how execution butler currently runs) or 

2478 # it is an absolute URI. 

2479 if source_location.pathInStore.isabs(): 

2480 # Just because we can see the artifact when running 

2481 # the transfer doesn't mean it will be generally 

2482 # accessible to a user of this butler. For now warn 

2483 # but assume it will be accessible. 

2484 log.warning( 

2485 "Transfer request for an outside-datastore artifact has been found at %s", 

2486 source_location, 

2487 ) 

2488 else: 

2489 # Need to transfer it to the new location. 

2490 # Assume we should always overwrite. If the artifact 

2491 # is there this might indicate that a previous transfer 

2492 # was interrupted but was not able to be rolled back 

2493 # completely (eg pre-emption) so follow Datastore default 

2494 # and overwrite. 

2495 target_location.uri.transfer_from( 

2496 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2497 ) 

2498 

2499 artifacts.append((target_ref, info)) 

2500 

2501 self._register_datasets(artifacts) 

2502 

2503 if already_present: 

2504 n_skipped = len(already_present) 

2505 log.info( 

2506 "Skipped transfer of %d dataset%s already present in datastore", 

2507 n_skipped, 

2508 "" if n_skipped == 1 else "s", 

2509 ) 

2510 
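A sketch of copying datasets between two file datastores, assuming `target_datastore`, `source_datastore`, and `refs` already exist; "direct" and "split" modes are rejected above because the source would retain control of those files:

from typing import Dict

from lsst.resources import ResourcePath

# Reuse earlier existence checks if any are available.
artifact_existence: Dict[ResourcePath, bool] = {}
target_datastore.transfer_from(
    source_datastore,
    refs,
    transfer="copy",
    artifact_existence=artifact_existence,
)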

2511 @transactional 

2512 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2513 # Docstring inherited. 

2514 refs = list(refs) 

2515 self.bridge.forget(refs) 

2516 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

2517 

2518 def validateConfiguration( 

2519 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

2520 ) -> None: 

2521 """Validate some of the configuration for this datastore. 

2522 

2523 Parameters 

2524 ---------- 

2525 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2526 Entities to test against this configuration. Can be differing 

2527 types. 

2528 logFailures : `bool`, optional 

2529 If `True`, output a log message for every validation error 

2530 detected. 

2531 

2532 Raises 

2533 ------ 

2534 DatastoreValidationError 

2535 Raised if there is a validation problem with a configuration. 

2536 All the problems are reported in a single exception. 

2537 

2538 Notes 

2539 ----- 

2540 This method checks that all the supplied entities have valid file 

2541 templates and also have formatters defined. 

2542 """ 

2543 

2544 templateFailed = None 

2545 try: 

2546 self.templates.validateTemplates(entities, logFailures=logFailures) 

2547 except FileTemplateValidationError as e: 

2548 templateFailed = str(e) 

2549 

2550 formatterFailed = [] 

2551 for entity in entities: 

2552 try: 

2553 self.formatterFactory.getFormatterClass(entity) 

2554 except KeyError as e: 

2555 formatterFailed.append(str(e)) 

2556 if logFailures: 2556 ↛ 2551line 2556 didn't jump to line 2551, because the condition on line 2556 was never false

2557 log.critical("Formatter failure: %s", e) 

2558 

2559 if templateFailed or formatterFailed: 

2560 messages = [] 

2561 if templateFailed: 2561 ↛ 2562line 2561 didn't jump to line 2562, because the condition on line 2561 was never true

2562 messages.append(templateFailed) 

2563 if formatterFailed: 2563 ↛ 2565line 2563 didn't jump to line 2565, because the condition on line 2563 was never false

2564 messages.append(",".join(formatterFailed)) 

2565 msg = ";\n".join(messages) 

2566 raise DatastoreValidationError(msg) 

2567 
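A sketch of running the validation, assuming `datastore` and an iterable `entities` of DatasetType, DatasetRef, or StorageClass objects:

from lsst.daf.butler import DatastoreValidationError

try:
    datastore.validateConfiguration(entities, logFailures=True)
except DatastoreValidationError as err:
    # Template and formatter problems are reported in one exception.
    print(err)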

2568 def getLookupKeys(self) -> Set[LookupKey]: 

2569 # Docstring is inherited from base class 

2570 return ( 

2571 self.templates.getLookupKeys() 

2572 | self.formatterFactory.getLookupKeys() 

2573 | self.constraints.getLookupKeys() 

2574 ) 

2575 

2576 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

2577 # Docstring is inherited from base class 

2578 # The key can be valid in either formatters or templates so we can 

2579 # only check the template if it exists 

2580 if lookupKey in self.templates: 

2581 try: 

2582 self.templates[lookupKey].validateTemplate(entity) 

2583 except FileTemplateValidationError as e: 

2584 raise DatastoreValidationError(e) from e 

2585 

2586 def export( 

2587 self, 

2588 refs: Iterable[DatasetRef], 

2589 *, 

2590 directory: Optional[ResourcePathExpression] = None, 

2591 transfer: Optional[str] = "auto", 

2592 ) -> Iterable[FileDataset]: 

2593 # Docstring inherited from Datastore.export. 

2594 if transfer is not None and directory is None: 2594 ↛ 2595line 2594 didn't jump to line 2595, because the condition on line 2594 was never true

2595 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2596 

2597 # Force the directory to be a URI object 

2598 directoryUri: Optional[ResourcePath] = None 

2599 if directory is not None: 2599 ↛ 2602line 2599 didn't jump to line 2602, because the condition on line 2599 was never false

2600 directoryUri = ResourcePath(directory, forceDirectory=True) 

2601 

2602 if transfer is not None and directoryUri is not None: 2602 ↛ 2607line 2602 didn't jump to line 2607, because the condition on line 2602 was never false

2603 # mypy needs the second test 

2604 if not directoryUri.exists(): 2604 ↛ 2605line 2604 didn't jump to line 2605, because the condition on line 2604 was never true

2605 raise FileNotFoundError(f"Export location {directory} does not exist") 

2606 

2607 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2608 for ref in progress.wrap(refs, "Exporting dataset files"): 

2609 fileLocations = self._get_dataset_locations_info(ref) 

2610 if not fileLocations: 2610 ↛ 2611line 2610 didn't jump to line 2611, because the condition on line 2610 was never true

2611 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2612 # For now we can not export disassembled datasets 

2613 if len(fileLocations) > 1: 

2614 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2615 location, storedFileInfo = fileLocations[0] 

2616 

2617 pathInStore = location.pathInStore.path 

2618 if transfer is None: 2618 ↛ 2622line 2618 didn't jump to line 2622, because the condition on line 2618 was never true

2619 # TODO: do we also need to return the readStorageClass somehow? 

2620 # We will use the path in store directly. If this is an 

2621 # absolute URI, preserve it. 

2622 if location.pathInStore.isabs(): 

2623 pathInStore = str(location.uri) 

2624 elif transfer == "direct": 2624 ↛ 2626line 2624 didn't jump to line 2626, because the condition on line 2624 was never true

2625 # Use full URIs to the remote store in the export 

2626 pathInStore = str(location.uri) 

2627 else: 

2628 # mypy needs help 

2629 assert directoryUri is not None, "directoryUri must be defined to get here" 

2630 storeUri = ResourcePath(location.uri) 

2631 

2632 # if the datastore has an absolute URI to a resource, we 

2633 # have two options: 

2634 # 1. Keep the absolute URI in the exported YAML 

2635 # 2. Allocate a new name in the local datastore and transfer 

2636 # it. 

2637 # For now go with option 2 

2638 if location.pathInStore.isabs(): 2638 ↛ 2639line 2638 didn't jump to line 2639, because the condition on line 2638 was never true

2639 template = self.templates.getTemplate(ref) 

2640 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2641 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2642 

2643 exportUri = directoryUri.join(pathInStore) 

2644 exportUri.transfer_from(storeUri, transfer=transfer) 

2645 

2646 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2647 
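A sketch of exporting file datasets, assuming `datastore` and `refs` as before; the directory is an illustrative local path and must already exist:

for file_dataset in datastore.export(refs, directory="/tmp/export", transfer="copy"):
    # Each yielded FileDataset carries the exported path and the
    # formatter needed to read it back.
    print(file_dataset.path, file_dataset.formatter)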

2648 @staticmethod 

2649 def computeChecksum( 

2650 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192 

2651 ) -> Optional[str]: 

2652 """Compute the checksum of the supplied file. 

2653 

2654 Parameters 

2655 ---------- 

2656 uri : `lsst.resources.ResourcePath` 

2657 Name of resource to calculate checksum from. 

2658 algorithm : `str`, optional 

2659 Name of algorithm to use. Must be one of the algorithms supported 

2660 by the :py:mod:`hashlib` module. 

2661 block_size : `int` 

2662 Number of bytes to read from file at one time. 

2663 

2664 Returns 

2665 ------- 

2666 hexdigest : `str` 

2667 Hex digest of the file. 

2668 

2669 Notes 

2670 ----- 

2671 Currently returns None if the URI is for a remote resource. 

2672 """ 

2673 if algorithm not in hashlib.algorithms_guaranteed: 2673 ↛ 2674line 2673 didn't jump to line 2674, because the condition on line 2673 was never true

2674 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

2675 

2676 if not uri.isLocal: 2676 ↛ 2677line 2676 didn't jump to line 2677, because the condition on line 2676 was never true

2677 return None 

2678 

2679 hasher = hashlib.new(algorithm) 

2680 

2681 with uri.as_local() as local_uri: 

2682 with open(local_uri.ospath, "rb") as f: 

2683 for chunk in iter(lambda: f.read(block_size), b""): 

2684 hasher.update(chunk) 

2685 

2686 return hasher.hexdigest() 

2687 
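Because this is a static method it can be used on its own; a sketch with a placeholder local file path:

from lsst.daf.butler.datastores.fileDatastore import FileDatastore
from lsst.resources import ResourcePath

uri = ResourcePath("/tmp/example.fits")  # placeholder local file
digest = FileDatastore.computeChecksum(uri, algorithm="blake2b")
if digest is None:
    # Non-local (remote) resources are currently skipped.
    print("checksum not computed for remote resource")
else:
    print(digest)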

2688 def needs_expanded_data_ids( 

2689 self, 

2690 transfer: Optional[str], 

2691 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2692 ) -> bool: 

2693 # Docstring inherited. 

2694 # This _could_ also use entity to inspect whether the filename template 

2695 # involves placeholders other than the required dimensions for its 

2696 # dataset type, but that's not necessary for correctness; it just 

2697 # enables more optimizations (perhaps only in theory). 

2698 return transfer not in ("direct", None) 

2699 

2700 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2701 # Docstring inherited from the base class. 

2702 record_data = data.get(self.name) 

2703 if not record_data: 2703 ↛ 2704line 2703 didn't jump to line 2704, because the condition on line 2703 was never true

2704 return 

2705 

2706 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records.keys()) 

2707 

2708 # TODO: Verify that there are no unexpected table names in the dict? 

2709 unpacked_records = [] 

2710 for dataset_data in record_data.records.values(): 

2711 records = dataset_data.get(self._table.name) 

2712 if records: 2712 ↛ 2710line 2712 didn't jump to line 2710, because the condition on line 2712 was never false

2713 for info in records: 

2714 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

2715 unpacked_records.append(info.to_record()) 

2716 if unpacked_records: 

2717 self._table.insert(*unpacked_records) 

2718 

2719 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

2720 # Docstring inherited from the base class. 

2721 exported_refs = list(self._bridge.check(refs)) 

2722 ids = {ref.getCheckedId() for ref in exported_refs} 

2723 records: defaultdict[DatasetId, defaultdict[str, List[StoredDatastoreItemInfo]]] = defaultdict( 

2724 lambda: defaultdict(list), {id: defaultdict(list) for id in ids} 

2725 ) 

2726 for row in self._table.fetch(dataset_id=ids): 

2727 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

2728 records[info.dataset_id][self._table.name].append(info) 

2729 

2730 record_data = DatastoreRecordData(records=records) 

2731 return {self.name: record_data}