Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 85%

942 statements  

coverage.py v6.5.0, created at 2023-02-07 10:25 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore",) 

26 

27import hashlib 

28import logging 

29from collections import defaultdict 

30from dataclasses import dataclass 

31from typing import ( 

32 TYPE_CHECKING, 

33 Any, 

34 ClassVar, 

35 Dict, 

36 Iterable, 

37 List, 

38 Mapping, 

39 Optional, 

40 Sequence, 

41 Set, 

42 Tuple, 

43 Type, 

44 Union, 

45) 

46 

47from lsst.daf.butler import ( 

48 CompositesMap, 

49 Config, 

50 DatasetId, 

51 DatasetRef, 

52 DatasetRefURIs, 

53 DatasetType, 

54 DatasetTypeNotSupportedError, 

55 Datastore, 

56 DatastoreCacheManager, 

57 DatastoreConfig, 

58 DatastoreDisabledCacheManager, 

59 DatastoreRecordData, 

60 DatastoreValidationError, 

61 FileDataset, 

62 FileDescriptor, 

63 FileTemplates, 

64 FileTemplateValidationError, 

65 Formatter, 

66 FormatterFactory, 

67 Location, 

68 LocationFactory, 

69 Progress, 

70 StorageClass, 

71 StoredDatastoreItemInfo, 

72 StoredFileInfo, 

73 ddl, 

74) 

75from lsst.daf.butler.core.repoRelocation import replaceRoot 

76from lsst.daf.butler.core.utils import transactional 

77from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

78from lsst.resources import ResourcePath, ResourcePathExpression 

79from lsst.utils.introspection import get_class_of, get_instance_of 

80from lsst.utils.iteration import chunk_iterable 

81 

82# For VERBOSE logging usage. 

83from lsst.utils.logging import VERBOSE, getLogger 

84from lsst.utils.timer import time_this 

85from sqlalchemy import BigInteger, String 

86 

87from ..registry.interfaces import FakeDatasetRef 

88from .genericDatastore import GenericBaseDatastore 

89 

90if TYPE_CHECKING: 90 ↛ 91 (line 90 didn't jump to line 91, because the condition on line 90 was never true)

91 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

92 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

93 

94log = getLogger(__name__) 

95 

96 

97class _IngestPrepData(Datastore.IngestPrepData): 

98 """Helper class for FileDatastore ingest implementation. 

99 

100 Parameters 

101 ---------- 

102 datasets : `list` of `FileDataset` 

103 Files to be ingested by this datastore. 

104 """ 

105 

106 def __init__(self, datasets: List[FileDataset]): 

107 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

108 self.datasets = datasets 

109 

110 

111@dataclass(frozen=True) 

112class DatastoreFileGetInformation: 

113 """Collection of useful parameters needed to retrieve a file from 

114 a Datastore. 

115 """ 

116 

117 location: Location 

118 """The location from which to read the dataset.""" 

119 

120 formatter: Formatter 

121 """The `Formatter` to use to deserialize the dataset.""" 

122 

123 info: StoredFileInfo 

124 """Stored information about this file and its formatter.""" 

125 

126 assemblerParams: Mapping[str, Any] 

127 """Parameters to use for post-processing the retrieved dataset.""" 

128 

129 formatterParams: Mapping[str, Any] 

130 """Parameters that were understood by the associated formatter.""" 

131 

132 component: Optional[str] 

133 """The component to be retrieved (can be `None`).""" 

134 

135 readStorageClass: StorageClass 

136 """The `StorageClass` of the dataset being read.""" 

137 

138 

139class FileDatastore(GenericBaseDatastore): 

140 """Generic Datastore for file-based implementations. 

141 

142 Should always be sub-classed since key abstract methods are missing. 

143 

144 Parameters 

145 ---------- 

146 config : `DatastoreConfig` or `str` 

147 Configuration as either a `Config` object or URI to file. 

148 bridgeManager : `DatastoreRegistryBridgeManager` 

149 Object that manages the interface between `Registry` and datastores. 

150 butlerRoot : `str`, optional 

151 New datastore root to use to override the configuration value. 

152 

153 Raises 

154 ------ 

155 ValueError 

156 If root location does not exist and ``create`` is `False` in the 

157 configuration. 

158 """ 

159 

160 defaultConfigFile: ClassVar[Optional[str]] = None 

161 """Path to configuration defaults. Accessed within the ``config`` resource 

162 or relative to a search path. Can be None if no defaults specified. 

163 """ 

164 

165 root: ResourcePath 

166 """Root directory URI of this `Datastore`.""" 

167 

168 locationFactory: LocationFactory 

169 """Factory for creating locations relative to the datastore root.""" 

170 

171 formatterFactory: FormatterFactory 

172 """Factory for creating instances of formatters.""" 

173 

174 templates: FileTemplates 

175 """File templates that can be used by this `Datastore`.""" 

176 

177 composites: CompositesMap 

178 """Determines whether a dataset should be disassembled on put.""" 

179 

180 defaultConfigFile = "datastores/fileDatastore.yaml" 

181 """Path to configuration defaults. Accessed within the ``config`` resource 

182 or relative to a search path. Can be None if no defaults specified. 

183 """ 

184 

185 @classmethod 

186 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

187 """Set any filesystem-dependent config options for this Datastore to 

188 be appropriate for a new empty repository with the given root. 

189 

190 Parameters 

191 ---------- 

192 root : `str` 

193 URI to the root of the data repository. 

194 config : `Config` 

195 A `Config` to update. Only the subset understood by 

196 this component will be updated. Will not expand 

197 defaults. 

198 full : `Config` 

199 A complete config with all defaults expanded that can be 

200 converted to a `DatastoreConfig`. Read-only and will not be 

201 modified by this method. 

202 Repository-specific options that should not be obtained 

203 from defaults when Butler instances are constructed 

204 should be copied from ``full`` to ``config``. 

205 overwrite : `bool`, optional 

206 If `False`, do not modify a value in ``config`` if the value 

207 already exists. Default is always to overwrite with the provided 

208 ``root``. 

209 

210 Notes 

211 ----- 

212 If a keyword is explicitly defined in the supplied ``config`` it 

213 will not be overridden by this method if ``overwrite`` is `False`. 

214 This allows explicit values set in external configs to be retained. 

215 """ 

216 Config.updateParameters( 

217 DatastoreConfig, 

218 config, 

219 full, 

220 toUpdate={"root": root}, 

221 toCopy=("cls", ("records", "table")), 

222 overwrite=overwrite, 

223 ) 

224 

225 @classmethod 

226 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

227 return ddl.TableSpec( 

228 fields=[ 

229 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

230 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

231 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

232 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

233 # Use empty string to indicate no component 

234 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

235 # TODO: should checksum be Base64Bytes instead? 

236 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

237 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

238 ], 

239 unique=frozenset(), 

240 indexes=[ddl.IndexSpec("path")], 

241 ) 

242 
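# Illustrative aside, not part of the original module: a sketch of the kind
# of row described by the spec above, with hypothetical values (the real
# dataset_id column type is supplied by ``bridgeManager.datasetIdColumnType``):
#
#     {
#         "dataset_id": 42,
#         "path": "run/datasetType/file.json",
#         "formatter": "lsst.daf.butler.formatters.json.JsonFormatter",
#         "storage_class": "StructuredDataDict",
#         "component": "",   # empty string means "no component"
#         "checksum": None,
#         "file_size": 1234,
#     }
#
# ``addStoredItemInfo`` below inserts rows of this shape via
# ``StoredFileInfo.to_record`` and ``getStoredItemsInfo`` rebuilds
# ``StoredFileInfo`` objects from them with ``from_record``.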

243 def __init__( 

244 self, 

245 config: Union[DatastoreConfig, str], 

246 bridgeManager: DatastoreRegistryBridgeManager, 

247 butlerRoot: str | None = None, 

248 ): 

249 super().__init__(config, bridgeManager) 

250 if "root" not in self.config: 250 ↛ 251 (line 250 didn't jump to line 251, because the condition on line 250 was never true)

251 raise ValueError("No root directory specified in configuration") 

252 

253 self._bridgeManager = bridgeManager 

254 

255 # Name ourselves either using an explicit name or a name 

256 # derived from the (unexpanded) root 

257 if "name" in self.config: 

258 self.name = self.config["name"] 

259 else: 

260 # We use the unexpanded root in the name to indicate that this 

261 # datastore can be moved without having to update registry. 

262 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

263 

264 # Support repository relocation in config 

265 # Existence of self.root is checked in subclass 

266 self.root = ResourcePath( 

267 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

268 ) 

269 

270 self.locationFactory = LocationFactory(self.root) 

271 self.formatterFactory = FormatterFactory() 

272 

273 # Now associate formatters with storage classes 

274 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

275 

276 # Read the file naming templates 

277 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

278 

279 # See if composites should be disassembled 

280 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

281 

282 tableName = self.config["records", "table"] 

283 try: 

284 # Storage of paths and formatters, keyed by dataset_id 

285 self._table = bridgeManager.opaque.register( 

286 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

287 ) 

288 # Interface to Registry. 

289 self._bridge = bridgeManager.register(self.name) 

290 except ReadOnlyDatabaseError: 

291 # If the database is read only and we just tried and failed to 

292 # create a table, it means someone is trying to create a read-only 

293 # butler client for an empty repo. That should be okay, as long 

294 # as they then try to get any datasets before some other client 

295 # creates the table. Chances are they're just validating

296 # configuration. 

297 pass 

298 

299 # Determine whether checksums should be used - default to False 

300 self.useChecksum = self.config.get("checksum", False) 

301 

302 # Determine whether we can fall back to configuration if a 

303 # requested dataset is not known to registry 

304 self.trustGetRequest = self.config.get("trust_get_request", False) 

305 

306 # Create a cache manager 

307 self.cacheManager: AbstractDatastoreCacheManager 

308 if "cached" in self.config: 308 ↛ 311 (line 308 didn't jump to line 311, because the condition on line 308 was never false)

309 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

310 else: 

311 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

312 

313 # Check existence and create directory structure if necessary 

314 if not self.root.exists(): 

315 if "create" not in self.config or not self.config["create"]: 315 ↛ 316 (line 315 didn't jump to line 316, because the condition on line 315 was never true)

316 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

317 try: 

318 self.root.mkdir() 

319 except Exception as e: 

320 raise ValueError( 

321 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

322 ) from e 

323 

324 def __str__(self) -> str: 

325 return str(self.root) 

326 

327 @property 

328 def bridge(self) -> DatastoreRegistryBridge: 

329 return self._bridge 

330 

331 def _artifact_exists(self, location: Location) -> bool: 

332 """Check that an artifact exists in this datastore at the specified 

333 location. 

334 

335 Parameters 

336 ---------- 

337 location : `Location` 

338 Expected location of the artifact associated with this datastore. 

339 

340 Returns 

341 ------- 

342 exists : `bool` 

343 `True` if the location can be found, `False` otherwise. 

344 """ 

345 log.debug("Checking if resource exists: %s", location.uri) 

346 return location.uri.exists() 

347 

348 def _delete_artifact(self, location: Location) -> None: 

349 """Delete the artifact from the datastore. 

350 

351 Parameters 

352 ---------- 

353 location : `Location` 

354 Location of the artifact associated with this datastore. 

355 """ 

356 if location.pathInStore.isabs(): 356 ↛ 357 (line 356 didn't jump to line 357, because the condition on line 356 was never true)

357 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

358 

359 try: 

360 location.uri.remove() 

361 except FileNotFoundError: 

362 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

363 raise 

364 except Exception as e: 

365 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

366 raise 

367 log.debug("Successfully deleted file: %s", location.uri) 

368 

369 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

370 # Docstring inherited from GenericBaseDatastore 

371 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos)] 

372 self._table.insert(*records, transaction=self._transaction) 

373 

374 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

375 # Docstring inherited from GenericBaseDatastore 

376 

377 # Look for the dataset_id -- there might be multiple matches 

378 # if we have disassembled the dataset. 

379 records = self._table.fetch(dataset_id=ref.id) 

380 return [StoredFileInfo.from_record(record) for record in records] 

381 

382 def _get_stored_records_associated_with_refs( 

383 self, refs: Iterable[DatasetIdRef] 

384 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

385 """Retrieve all records associated with the provided refs. 

386 

387 Parameters 

388 ---------- 

389 refs : iterable of `DatasetIdRef` 

390 The refs for which records are to be retrieved. 

391 

392 Returns 

393 ------- 

394 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

395 The matching records indexed by the ref ID. The number of entries 

396 in the dict can be smaller than the number of requested refs. 

397 """ 

398 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

399 

400 # Uniqueness is dataset_id + component so can have multiple records 

401 # per ref. 

402 records_by_ref = defaultdict(list) 

403 for record in records: 

404 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

405 return records_by_ref 

406 

407 def _refs_associated_with_artifacts( 

408 self, paths: List[Union[str, ResourcePath]] 

409 ) -> Dict[str, Set[DatasetId]]: 

410 """Return paths and associated dataset refs. 

411 

412 Parameters 

413 ---------- 

414 paths : `list` of `str` or `lsst.resources.ResourcePath` 

415 All the paths to include in search. 

416 

417 Returns 

418 ------- 

419 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

420 Mapping of each path to a set of associated database IDs. 

421 """ 

422 records = self._table.fetch(path=[str(path) for path in paths]) 

423 result = defaultdict(set) 

424 for row in records: 

425 result[row["path"]].add(row["dataset_id"]) 

426 return result 

427 

428 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]: 

429 """Return all dataset refs associated with the supplied path. 

430 

431 Parameters 

432 ---------- 

433 pathInStore : `lsst.resources.ResourcePath` 

434 Path of interest in the data store. 

435 

436 Returns 

437 ------- 

438 ids : `set` of `int` 

439 All `DatasetRef` IDs associated with this path. 

440 """ 

441 records = list(self._table.fetch(path=str(pathInStore))) 

442 ids = {r["dataset_id"] for r in records} 

443 return ids 

444 

445 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

446 # Docstring inherited from GenericBaseDatastore 

447 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

448 

449 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

450 r"""Find all the `Location`\ s of the requested dataset in the 

451 `Datastore` and the associated stored file information. 

452 

453 Parameters 

454 ---------- 

455 ref : `DatasetRef` 

456 Reference to the required `Dataset`. 

457 

458 Returns 

459 ------- 

460 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

461 Location of the dataset within the datastore and 

462 stored information about each file and its formatter. 

463 """ 

464 # Get the file information (this will fail if no file) 

465 records = self.getStoredItemsInfo(ref) 

466 

467 # Use the path to determine the location -- we need to take 

468 # into account absolute URIs in the datastore record 

469 return [(r.file_location(self.locationFactory), r) for r in records] 

470 

471 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

472 """Check that there is only one dataset associated with the 

473 specified artifact. 

474 

475 Parameters 

476 ---------- 

477 ref : `DatasetRef` or `FakeDatasetRef` 

478 Dataset to be removed. 

479 location : `Location` 

480 The location of the artifact to be removed. 

481 

482 Returns 

483 ------- 

484 can_remove : `bool` 

485 True if the artifact can be safely removed. 

486 """ 

487 # Can't ever delete absolute URIs. 

488 if location.pathInStore.isabs(): 

489 return False 

490 

491 # Get all entries associated with this path 

492 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

493 if not allRefs: 

494 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

495 

496 # Remove these refs from all the refs and if there is nothing left 

497 # then we can delete 

498 remainingRefs = allRefs - {ref.id} 

499 

500 if remainingRefs: 

501 return False 

502 return True 

503 

504 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]: 

505 """Predict the location and related file information of the requested 

506 dataset in this datastore. 

507 

508 Parameters 

509 ---------- 

510 ref : `DatasetRef` 

511 Reference to the required `Dataset`. 

512 

513 Returns 

514 ------- 

515 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

516 Expected Location of the dataset within the datastore and 

517 placeholder information about each file and its formatter. 

518 

519 Notes 

520 ----- 

521 Uses the current configuration to determine how we would expect the 

522 datastore files to have been written if we couldn't ask registry. 

523 This is safe so long as there has been no change to datastore 

524 configuration between writing the dataset and wanting to read it. 

525 Will not work for files that have been ingested without using the 

526 standard file template or default formatter. 

527 """ 

528 

529 # If we have a component ref we always need to ask the questions 

530 # of the composite. If the composite is disassembled this routine 

531 # should return all components. If the composite was not 

532 # disassembled the composite is what is stored regardless of 

533 # component request. Note that if the caller has disassembled 

534 # a composite there is no way for this guess to know that 

535 # without trying both the composite and component ref and seeing 

536 # if there is something at the component Location even without 

537 # disassembly being enabled. 

538 if ref.datasetType.isComponent(): 

539 ref = ref.makeCompositeRef() 

540 

541 # See if the ref is a composite that should be disassembled 

542 doDisassembly = self.composites.shouldBeDisassembled(ref) 

543 

544 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

545 

546 if doDisassembly: 

547 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

548 compRef = ref.makeComponentRef(component) 

549 location, formatter = self._determine_put_formatter_location(compRef) 

550 all_info.append((location, formatter, componentStorage, component)) 

551 

552 else: 

553 # Always use the composite ref if no disassembly 

554 location, formatter = self._determine_put_formatter_location(ref) 

555 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

556 

557 # Convert the list of tuples to have StoredFileInfo as second element 

558 return [ 

559 ( 

560 location, 

561 StoredFileInfo( 

562 formatter=formatter, 

563 path=location.pathInStore.path, 

564 storageClass=storageClass, 

565 component=component, 

566 checksum=None, 

567 file_size=-1, 

568 dataset_id=ref.getCheckedId(), 

569 ), 

570 ) 

571 for location, formatter, storageClass, component in all_info 

572 ] 

573 

574 def _prepare_for_get( 

575 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None 

576 ) -> List[DatastoreFileGetInformation]: 

577 """Check parameters for ``get`` and obtain formatter and 

578 location. 

579 

580 Parameters 

581 ---------- 

582 ref : `DatasetRef` 

583 Reference to the required Dataset. 

584 parameters : `dict` 

585 `StorageClass`-specific parameters that specify, for example, 

586 a slice of the dataset to be loaded. 

587 

588 Returns 

589 ------- 

590 getInfo : `list` [`DatastoreFileGetInformation`] 

591 Parameters needed to retrieve each file. 

592 """ 

593 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

594 

595 # Get file metadata and internal metadata 

596 fileLocations = self._get_dataset_locations_info(ref) 

597 if not fileLocations: 

598 if not self.trustGetRequest: 

599 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

600 # Assume the dataset is where we think it should be 

601 fileLocations = self._get_expected_dataset_locations_info(ref) 

602 

603 # The storage class we want to use eventually 

604 refStorageClass = ref.datasetType.storageClass 

605 

606 if len(fileLocations) > 1: 

607 disassembled = True 

608 

609 # If trust is involved it is possible that there will be 

610 # components listed here that do not exist in the datastore. 

611 # Explicitly check for file artifact existence and filter out any 

612 # that are missing. 

613 if self.trustGetRequest: 

614 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

615 

616 # For now complain only if we have no components at all. One 

617 # component is probably a problem but we can punt that to the 

618 # assembler. 

619 if not fileLocations: 619 ↛ 620 (line 619 didn't jump to line 620, because the condition on line 619 was never true)

620 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

621 

622 else: 

623 disassembled = False 

624 

625 # Is this a component request? 

626 refComponent = ref.datasetType.component() 

627 

628 fileGetInfo = [] 

629 for location, storedFileInfo in fileLocations: 

630 # The storage class used to write the file 

631 writeStorageClass = storedFileInfo.storageClass 

632 

633 # If this has been disassembled we need read to match the write 

634 if disassembled: 

635 readStorageClass = writeStorageClass 

636 else: 

637 readStorageClass = refStorageClass 

638 

639 formatter = get_instance_of( 

640 storedFileInfo.formatter, 

641 FileDescriptor( 

642 location, 

643 readStorageClass=readStorageClass, 

644 storageClass=writeStorageClass, 

645 parameters=parameters, 

646 ), 

647 ref.dataId, 

648 ) 

649 

650 formatterParams, notFormatterParams = formatter.segregateParameters() 

651 

652 # Of the remaining parameters, extract the ones supported by 

653 # this StorageClass (for components not all will be handled) 

654 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

655 

656 # The ref itself could be a component if the dataset was 

657 # disassembled by butler, or we disassembled in datastore and 

658 # components came from the datastore records 

659 component = storedFileInfo.component if storedFileInfo.component else refComponent 

660 

661 fileGetInfo.append( 

662 DatastoreFileGetInformation( 

663 location, 

664 formatter, 

665 storedFileInfo, 

666 assemblerParams, 

667 formatterParams, 

668 component, 

669 readStorageClass, 

670 ) 

671 ) 

672 

673 return fileGetInfo 

674 

675 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

676 """Check the arguments for ``put`` and obtain formatter and 

677 location. 

678 

679 Parameters 

680 ---------- 

681 inMemoryDataset : `object` 

682 The dataset to store. 

683 ref : `DatasetRef` 

684 Reference to the associated Dataset. 

685 

686 Returns 

687 ------- 

688 location : `Location` 

689 The location to write the dataset. 

690 formatter : `Formatter` 

691 The `Formatter` to use to write the dataset. 

692 

693 Raises 

694 ------ 

695 TypeError 

696 Supplied object and storage class are inconsistent. 

697 DatasetTypeNotSupportedError 

698 The associated `DatasetType` is not handled by this datastore. 

699 """ 

700 self._validate_put_parameters(inMemoryDataset, ref) 

701 return self._determine_put_formatter_location(ref) 

702 

703 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

704 """Calculate the formatter and output location to use for put. 

705 

706 Parameters 

707 ---------- 

708 ref : `DatasetRef` 

709 Reference to the associated Dataset. 

710 

711 Returns 

712 ------- 

713 location : `Location` 

714 The location to write the dataset. 

715 formatter : `Formatter` 

716 The `Formatter` to use to write the dataset. 

717 """ 

718 # Work out output file name 

719 try: 

720 template = self.templates.getTemplate(ref) 

721 except KeyError as e: 

722 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

723 

724 # Validate the template to protect against filenames from different 

725 # dataIds returning the same and causing overwrite confusion. 

726 template.validateTemplate(ref) 

727 

728 location = self.locationFactory.fromPath(template.format(ref)) 

729 

730 # Get the formatter based on the storage class 

731 storageClass = ref.datasetType.storageClass 

732 try: 

733 formatter = self.formatterFactory.getFormatter( 

734 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

735 ) 

736 except KeyError as e: 

737 raise DatasetTypeNotSupportedError( 

738 f"Unable to find formatter for {ref} in datastore {self.name}" 

739 ) from e 

740 

741 # Now that we know the formatter, update the location 

742 location = formatter.makeUpdatedLocation(location) 

743 

744 return location, formatter 

745 
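# Illustrative sketch, not part of the original file, of the put-side flow
# implemented above (the template string is hypothetical):
#
#     template = self.templates.getTemplate(ref)      # e.g. "{run}/{datasetType}/{id}"
#     location = self.locationFactory.fromPath(template.format(ref))
#     formatter = self.formatterFactory.getFormatter(
#         ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
#     )
#     location = formatter.makeUpdatedLocation(location)
#
# The final step means the file extension is owned by the formatter rather
# than by the file template.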

746 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

747 # Docstring inherited from base class 

748 if transfer != "auto": 

749 return transfer 

750 

751 # See if the paths are within the datastore or not 

752 inside = [self._pathInStore(d.path) is not None for d in datasets] 

753 

754 if all(inside): 

755 transfer = None 

756 elif not any(inside): 756 ↛ 765 (line 756 didn't jump to line 765, because the condition on line 756 was never false)

757 # Allow ResourcePath to use its own knowledge 

758 transfer = "auto" 

759 else: 

760 # This can happen when importing from a datastore that 

761 # has had some datasets ingested using "direct" mode. 

762 # Also allow ResourcePath to sort it out, but warn about it:

763 # the files outside the datastore are not copied, so they must

764 # remain accessible to the new butler.

765 log.warning( 

766 "Some datasets are inside the datastore and some are outside. Using 'split' " 

767 "transfer mode. This assumes that the files outside the datastore are " 

768 "still accessible to the new butler since they will not be copied into " 

769 "the target datastore." 

770 ) 

771 transfer = "split" 

772 

773 return transfer 

774 
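# Illustrative summary, not part of the original file, of the "auto"
# resolution performed above:
#
#     every path inside the datastore root  -> transfer=None   (use in place)
#     no path inside the datastore root     -> transfer="auto" (ResourcePath decides)
#     a mixture of inside and outside paths -> transfer="split" (outside files are
#                                              referenced, not copied)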

775 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]: 

776 """Return path relative to datastore root 

777 

778 Parameters 

779 ---------- 

780 path : `lsst.resources.ResourcePathExpression` 

781 Path to dataset. Can be absolute URI. If relative assumed to 

782 be relative to the datastore. Returns the path within the

783 datastore, or `None` if the path is outside it.

784 

785 Returns 

786 ------- 

787 inStore : `str` 

788 Path relative to datastore root. Returns `None` if the file is 

789 outside the root. 

790 """ 

791 # Relative path will always be relative to datastore 

792 pathUri = ResourcePath(path, forceAbsolute=False) 

793 return pathUri.relative_to(self.root) 

794 
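# Illustrative example, not part of the original file, assuming a
# hypothetical datastore root of "file:///repo/butler":
#
#     self._pathInStore("file:///repo/butler/raw/file.fits")  -> "raw/file.fits"
#     self._pathInStore("file:///elsewhere/file.fits")        -> None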

795 def _standardizeIngestPath( 

796 self, path: Union[str, ResourcePath], *, transfer: Optional[str] = None 

797 ) -> Union[str, ResourcePath]: 

798 """Standardize the path of a to-be-ingested file. 

799 

800 Parameters 

801 ---------- 

802 path : `str` or `lsst.resources.ResourcePath` 

803 Path of a file to be ingested. This parameter is not expected 

804 to be all the types that can be used to construct a 

805 `~lsst.resources.ResourcePath`. 

806 transfer : `str`, optional 

807 How (and whether) the dataset should be added to the datastore. 

808 See `ingest` for details of transfer modes. 

809 This implementation is provided only so 

810 `NotImplementedError` can be raised if the mode is not supported; 

811 actual transfers are deferred to `_extractIngestInfo`. 

812 

813 Returns 

814 ------- 

815 path : `str` or `lsst.resources.ResourcePath` 

816 New path in what the datastore considers standard form. If an 

817 absolute URI was given that will be returned unchanged. 

818 

819 Notes 

820 ----- 

821 Subclasses of `FileDatastore` can implement this method instead 

822 of `_prepIngest`. It should not modify the data repository or given 

823 file in any way. 

824 

825 Raises 

826 ------ 

827 NotImplementedError 

828 Raised if the datastore does not support the given transfer mode 

829 (including the case where ingest is not supported at all). 

830 FileNotFoundError 

831 Raised if one of the given files does not exist. 

832 """ 

833 if transfer not in (None, "direct", "split") + self.root.transferModes: 833 ↛ 834 (line 833 didn't jump to line 834, because the condition on line 833 was never true)

834 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

835 

836 # A relative URI indicates relative to datastore root 

837 srcUri = ResourcePath(path, forceAbsolute=False) 

838 if not srcUri.isabs(): 

839 srcUri = self.root.join(path) 

840 

841 if not srcUri.exists(): 

842 raise FileNotFoundError( 

843 f"Resource at {srcUri} does not exist; note that paths to ingest " 

844 f"are assumed to be relative to {self.root} unless they are absolute." 

845 ) 

846 

847 if transfer is None: 

848 relpath = srcUri.relative_to(self.root) 

849 if not relpath: 

850 raise RuntimeError( 

851 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

852 ) 

853 

854 # Return the relative path within the datastore for internal 

855 # transfer 

856 path = relpath 

857 

858 return path 

859 

860 def _extractIngestInfo( 

861 self, 

862 path: ResourcePathExpression, 

863 ref: DatasetRef, 

864 *, 

865 formatter: Union[Formatter, Type[Formatter]], 

866 transfer: Optional[str] = None, 

867 record_validation_info: bool = True, 

868 ) -> StoredFileInfo: 

869 """Relocate (if necessary) and extract `StoredFileInfo` from a 

870 to-be-ingested file. 

871 

872 Parameters 

873 ---------- 

874 path : `lsst.resources.ResourcePathExpression` 

875 URI or path of a file to be ingested. 

876 ref : `DatasetRef` 

877 Reference for the dataset being ingested. Guaranteed to have 

878 ``dataset_id is not None``. 

879 formatter : `type` or `Formatter` 

880 `Formatter` subclass to use for this dataset or an instance. 

881 transfer : `str`, optional 

882 How (and whether) the dataset should be added to the datastore. 

883 See `ingest` for details of transfer modes. 

884 record_validation_info : `bool`, optional 

885 If `True`, the default, the datastore can record validation 

886 information associated with the file. If `False` the datastore 

887 will not attempt to track any information such as checksums 

888 or file sizes. This can be useful if such information is tracked 

889 in an external system or if the file is to be compressed in place. 

890 It is up to the datastore whether this parameter is relevant. 

891 

892 Returns 

893 ------- 

894 info : `StoredFileInfo` 

895 Internal datastore record for this file. This will be inserted by 

896 the caller; the `_extractIngestInfo` is only responsible for 

897 creating and populating the struct. 

898 

899 Raises 

900 ------ 

901 FileNotFoundError 

902 Raised if one of the given files does not exist. 

903 FileExistsError 

904 Raised if transfer is not `None` but the (internal) location the 

905 file would be moved to is already occupied. 

906 """ 

907 if self._transaction is None: 907 ↛ 908 (line 907 didn't jump to line 908, because the condition on line 907 was never true)

908 raise RuntimeError("Ingest called without transaction enabled") 

909 

910 # Create URI of the source path, do not need to force a relative 

911 # path to absolute. 

912 srcUri = ResourcePath(path, forceAbsolute=False) 

913 

914 # Track whether we have read the size of the source yet 

915 have_sized = False 

916 

917 tgtLocation: Optional[Location] 

918 if transfer is None or transfer == "split": 

919 # A relative path is assumed to be relative to the datastore 

920 # in this context 

921 if not srcUri.isabs(): 

922 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

923 else: 

924 # Work out the path in the datastore from an absolute URI 

925 # This is required to be within the datastore. 

926 pathInStore = srcUri.relative_to(self.root) 

927 if pathInStore is None and transfer is None: 927 ↛ 928 (line 927 didn't jump to line 928, because the condition on line 927 was never true)

928 raise RuntimeError( 

929 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

930 ) 

931 if pathInStore: 931 ↛ 933 (line 931 didn't jump to line 933, because the condition on line 931 was never false)

932 tgtLocation = self.locationFactory.fromPath(pathInStore) 

933 elif transfer == "split": 

934 # Outside the datastore but treat that as a direct ingest 

935 # instead. 

936 tgtLocation = None 

937 else: 

938 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

939 elif transfer == "direct": 939 ↛ 944 (line 939 didn't jump to line 944, because the condition on line 939 was never true)

940 # Want to store the full URI to the resource directly in 

941 # datastore. This is useful for referring to permanent archive 

942 # storage for raw data. 

943 # Trust that people know what they are doing. 

944 tgtLocation = None 

945 else: 

946 # Work out the name we want this ingested file to have 

947 # inside the datastore 

948 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

949 if not tgtLocation.uri.dirname().exists(): 

950 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

951 tgtLocation.uri.dirname().mkdir() 

952 

953 # if we are transferring from a local file to a remote location 

954 # it may be more efficient to get the size and checksum of the 

955 # local file rather than the transferred one 

956 if record_validation_info and srcUri.isLocal: 

957 size = srcUri.size() 

958 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

959 have_sized = True 

960 

961 # Transfer the resource to the destination. 

962 # Allow overwrite of an existing file. This matches the behavior 

963 # of datastore.put() in that it trusts that registry would not 

964 # be asking to overwrite unless registry thought that the 

965 # overwrite was allowed. 

966 tgtLocation.uri.transfer_from( 

967 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

968 ) 

969 

970 if tgtLocation is None: 970 ↛ 972 (line 970 didn't jump to line 972, because the condition on line 970 was never true)

971 # This means we are using direct mode 

972 targetUri = srcUri 

973 targetPath = str(srcUri) 

974 else: 

975 targetUri = tgtLocation.uri 

976 targetPath = tgtLocation.pathInStore.path 

977 

978 # the file should exist in the datastore now 

979 if record_validation_info: 

980 if not have_sized: 

981 size = targetUri.size() 

982 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

983 else: 

984 # Not recording any file information. 

985 size = -1 

986 checksum = None 

987 

988 return StoredFileInfo( 

989 formatter=formatter, 

990 path=targetPath, 

991 storageClass=ref.datasetType.storageClass, 

992 component=ref.datasetType.component(), 

993 file_size=size, 

994 checksum=checksum, 

995 dataset_id=ref.getCheckedId(), 

996 ) 

997 
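# Illustrative summary, not part of the original file, of how the transfer
# mode chosen above maps to the path recorded for an ingested file:
#
#     transfer=None      file must already be inside the root; its relative
#                        path is recorded unchanged
#     transfer="split"   inside the root -> relative path recorded;
#                        outside the root -> full URI recorded (as for "direct")
#     transfer="direct"  the full source URI is recorded and nothing is moved
#     other modes (e.g. "copy", "move", "link")
#                        the file is transferred to a template-derived
#                        location inside the root and that relative path
#                        is recorded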

998 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

999 # Docstring inherited from Datastore._prepIngest. 

1000 filtered = [] 

1001 for dataset in datasets: 

1002 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

1003 if not acceptable: 

1004 continue 

1005 else: 

1006 dataset.refs = acceptable 

1007 if dataset.formatter is None: 

1008 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1009 else: 

1010 assert isinstance(dataset.formatter, (type, str)) 

1011 formatter_class = get_class_of(dataset.formatter) 

1012 if not issubclass(formatter_class, Formatter): 1012 ↛ 1013 (line 1012 didn't jump to line 1013, because the condition on line 1012 was never true)

1013 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1014 dataset.formatter = formatter_class 

1015 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1016 filtered.append(dataset) 

1017 return _IngestPrepData(filtered) 

1018 

1019 @transactional 

1020 def _finishIngest( 

1021 self, 

1022 prepData: Datastore.IngestPrepData, 

1023 *, 

1024 transfer: Optional[str] = None, 

1025 record_validation_info: bool = True, 

1026 ) -> None: 

1027 # Docstring inherited from Datastore._finishIngest. 

1028 refsAndInfos = [] 

1029 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1030 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1031 # Do ingest as if the first dataset ref is associated with the file 

1032 info = self._extractIngestInfo( 

1033 dataset.path, 

1034 dataset.refs[0], 

1035 formatter=dataset.formatter, 

1036 transfer=transfer, 

1037 record_validation_info=record_validation_info, 

1038 ) 

1039 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1040 self._register_datasets(refsAndInfos) 

1041 

1042 def _calculate_ingested_datastore_name( 

1043 self, srcUri: ResourcePath, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]] 

1044 ) -> Location: 

1045 """Given a source URI and a DatasetRef, determine the name the 

1046 dataset will have inside datastore. 

1047 

1048 Parameters 

1049 ---------- 

1050 srcUri : `lsst.resources.ResourcePath` 

1051 URI to the source dataset file. 

1052 ref : `DatasetRef` 

1053 Ref associated with the newly-ingested dataset artifact. This 

1054 is used to determine the name within the datastore. 

1055 formatter : `Formatter` or `Formatter` class. 

1056 Formatter to use for validation. Can be a class or an instance. 

1057 

1058 Returns 

1059 ------- 

1060 location : `Location` 

1061 Target location for the newly-ingested dataset. 

1062 """ 

1063 # Ingesting a file from outside the datastore. 

1064 # This involves a new name. 

1065 template = self.templates.getTemplate(ref) 

1066 location = self.locationFactory.fromPath(template.format(ref)) 

1067 

1068 # Get the extension 

1069 ext = srcUri.getExtension() 

1070 

1071 # Update the destination to include that extension 

1072 location.updateExtension(ext) 

1073 

1074 # Ask the formatter to validate this extension 

1075 formatter.validateExtension(location) 

1076 

1077 return location 

1078 
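# Illustrative example, not part of the original file: with a hypothetical
# template "{run}/{datasetType}/{id}", ingesting "file:///data/input.fits"
# would place the artifact at "<run>/<datasetType>/<id>.fits" inside the
# datastore -- the template chooses the name, the source extension is kept,
# and the formatter is asked to validate that extension.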

1079 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1080 """Write out in memory dataset to datastore. 

1081 

1082 Parameters 

1083 ---------- 

1084 inMemoryDataset : `object` 

1085 Dataset to write to datastore. 

1086 ref : `DatasetRef` 

1087 Registry information associated with this dataset. 

1088 

1089 Returns 

1090 ------- 

1091 info : `StoredFileInfo` 

1092 Information describing the artifact written to the datastore. 

1093 """ 

1094 # May need to coerce the in memory dataset to the correct 

1095 # python type. 

1096 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1097 

1098 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1099 uri = location.uri 

1100 

1101 if not uri.dirname().exists(): 

1102 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1103 uri.dirname().mkdir() 

1104 

1105 if self._transaction is None: 1105 ↛ 1106 (line 1105 didn't jump to line 1106, because the condition on line 1105 was never true)

1106 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1107 

1108 def _removeFileExists(uri: ResourcePath) -> None: 

1109 """Remove a file and do not complain if it is not there. 

1110 

1111 This is important since a formatter might fail before the file 

1112 is written and we should not confuse people by writing spurious 

1113 error messages to the log. 

1114 """ 

1115 try: 

1116 uri.remove() 

1117 except FileNotFoundError: 

1118 pass 

1119 

1120 # Register a callback to try to delete the uploaded data if 

1121 # something fails below 

1122 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1123 

1124 data_written = False 

1125 if not uri.isLocal: 

1126 # This is a remote URI. Some datasets can be serialized directly 

1127 # to bytes and sent to the remote datastore without writing a 

1128 # file. If the dataset is intended to be saved to the cache 

1129 # a file is always written and direct write to the remote 

1130 # datastore is bypassed. 

1131 if not self.cacheManager.should_be_cached(ref): 

1132 try: 

1133 serializedDataset = formatter.toBytes(inMemoryDataset) 

1134 except NotImplementedError: 

1135 # Fallback to the file writing option. 

1136 pass 

1137 except Exception as e: 

1138 raise RuntimeError( 

1139 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1140 ) from e 

1141 else: 

1142 log.debug("Writing bytes directly to %s", uri) 

1143 uri.write(serializedDataset, overwrite=True) 

1144 log.debug("Successfully wrote bytes directly to %s", uri) 

1145 data_written = True 

1146 

1147 if not data_written: 

1148 # Did not write the bytes directly to object store so instead 

1149 # write to temporary file. Always write to a temporary even if 

1150 # using a local file system -- that gives us atomic writes. 

1151 # If a process is killed as the file is being written we do not 

1152 # want it to remain in the correct place but in corrupt state. 

1153 # For local files write to the output directory not temporary dir. 

1154 prefix = uri.dirname() if uri.isLocal else None 

1155 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri: 

1156 # Need to configure the formatter to write to a different 

1157 # location and that needs us to overwrite internals 

1158 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1159 with formatter._updateLocation(Location(None, temporary_uri)): 

1160 try: 

1161 formatter.write(inMemoryDataset) 

1162 except Exception as e: 

1163 raise RuntimeError( 

1164 f"Failed to serialize dataset {ref} of type" 

1165 f" {type(inMemoryDataset)} to " 

1166 f"temporary location {temporary_uri}" 

1167 ) from e 

1168 

1169 # Use move for a local file since that becomes an efficient 

1170 # os.rename. For remote resources we use copy to allow the 

1171 # file to be cached afterwards. 

1172 transfer = "move" if uri.isLocal else "copy" 

1173 

1174 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True) 

1175 

1176 if transfer == "copy": 

1177 # Cache if required 

1178 self.cacheManager.move_to_cache(temporary_uri, ref) 

1179 

1180 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1181 

1182 # URI is needed to resolve which ingest case we are dealing with 

1183 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1184 
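# Illustrative decision sketch, not part of the original file, for the
# write path implemented above:
#
#     remote URI and dataset not destined for the cache:
#         try formatter.toBytes() and upload the bytes directly
#     otherwise (local URI, cacheable dataset, or toBytes() unsupported):
#         formatter.write() to a temporary file, then "move" it into place
#         locally or "copy" it to the remote store (copying leaves the
#         temporary file free to be moved into the local cache)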

1185 def _read_artifact_into_memory( 

1186 self, 

1187 getInfo: DatastoreFileGetInformation, 

1188 ref: DatasetRef, 

1189 isComponent: bool = False, 

1190 cache_ref: Optional[DatasetRef] = None, 

1191 ) -> Any: 

1192 """Read the artifact from datastore into in memory object. 

1193 

1194 Parameters 

1195 ---------- 

1196 getInfo : `DatastoreFileGetInformation` 

1197 Information about the artifact within the datastore. 

1198 ref : `DatasetRef` 

1199 The registry information associated with this artifact. 

1200 isComponent : `bool` 

1201 Flag to indicate if a component is being read from this artifact. 

1202 cache_ref : `DatasetRef`, optional 

1203 The DatasetRef to use when looking up the file in the cache. 

1204 This ref must have the same ID as the supplied ref but can 

1205 be a parent ref or component ref to indicate to the cache whether 

1206 a composite file is being requested from the cache or a component 

1207 file. Without this the cache will default to the supplied ref but 

1208 it can get confused with read-only derived components for 

1209 disassembled composites. 

1210 

1211 Returns 

1212 ------- 

1213 inMemoryDataset : `object` 

1214 The artifact as a python object. 

1215 """ 

1216 location = getInfo.location 

1217 uri = location.uri 

1218 log.debug("Accessing data from %s", uri) 

1219 

1220 if cache_ref is None: 

1221 cache_ref = ref 

1222 if cache_ref.id != ref.id: 1222 ↛ 1223 (line 1222 didn't jump to line 1223, because the condition on line 1222 was never true)

1223 raise ValueError( 

1224 "The supplied cache dataset ref refers to a different dataset than expected:" 

1225 f" {ref.id} != {cache_ref.id}" 

1226 ) 

1227 

1228 # Cannot recalculate checksum but can compare size as a quick check 

1229 # Do not do this if the size is negative since that indicates 

1230 # we do not know. 

1231 recorded_size = getInfo.info.file_size 

1232 resource_size = uri.size() 

1233 if recorded_size >= 0 and resource_size != recorded_size: 1233 ↛ 1234 (line 1233 didn't jump to line 1234, because the condition on line 1233 was never true)

1234 raise RuntimeError( 

1235 "Integrity failure in Datastore. " 

1236 f"Size of file {uri} ({resource_size}) " 

1237 f"does not match size recorded in registry of {recorded_size}" 

1238 ) 

1239 

1240 # For the general case we have choices for how to proceed. 

1241 # 1. Always use a local file (downloading the remote resource to a 

1242 # temporary file if needed). 

1243 # 2. Use a threshold size and read into memory and use bytes. 

1244 # Use both for now with an arbitrary hand off size. 

1245 # This allows small datasets to be downloaded from remote object 

1246 # stores without requiring a temporary file. 

1247 

1248 formatter = getInfo.formatter 

1249 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1250 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1251 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1252 if cached_file is not None: 

1253 desired_uri = cached_file 

1254 msg = f" (cached version of {uri})" 

1255 else: 

1256 desired_uri = uri 

1257 msg = "" 

1258 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1259 serializedDataset = desired_uri.read() 

1260 log.debug( 

1261 "Deserializing %s from %d bytes from location %s with formatter %s", 

1262 f"component {getInfo.component}" if isComponent else "", 

1263 len(serializedDataset), 

1264 uri, 

1265 formatter.name(), 

1266 ) 

1267 try: 

1268 result = formatter.fromBytes( 

1269 serializedDataset, component=getInfo.component if isComponent else None 

1270 ) 

1271 except Exception as e: 

1272 raise ValueError( 

1273 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1274 f" ({ref.datasetType.name} from {uri}): {e}" 

1275 ) from e 

1276 else: 

1277 # Read from file. 

1278 

1279 # Have to update the Location associated with the formatter 

1280 # because formatter.read does not allow an override. 

1281 # This could be improved. 

1282 location_updated = False 

1283 msg = "" 

1284 

1285 # First check in cache for local version. 

1286 # The cache will only be relevant for remote resources but 

1287 # no harm in always asking. Context manager ensures that cache 

1288 # file is not deleted during cache expiration. 

1289 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1290 if cached_file is not None: 

1291 msg = f"(via cache read of remote file {uri})" 

1292 uri = cached_file 

1293 location_updated = True 

1294 

1295 with uri.as_local() as local_uri: 

1296 can_be_cached = False 

1297 if uri != local_uri: 1297 ↛ 1299 (line 1297 didn't jump to line 1299, because the condition on line 1297 was never true)

1298 # URI was remote and file was downloaded 

1299 cache_msg = "" 

1300 location_updated = True 

1301 

1302 if self.cacheManager.should_be_cached(cache_ref): 

1303 # In this scenario we want to ask if the downloaded 

1304 # file should be cached but we should not cache 

1305 # it until after we've used it (to ensure it can't 

1306 # be expired whilst we are using it). 

1307 can_be_cached = True 

1308 

1309 # Say that it is "likely" to be cached because 

1310 # if the formatter read fails we will not be 

1311 # caching this file. 

1312 cache_msg = " and likely cached" 

1313 

1314 msg = f"(via download to local file{cache_msg})" 

1315 

1316 # Calculate the (possibly) new location for the formatter 

1317 # to use. 

1318 newLocation = Location(*local_uri.split()) if location_updated else None 

1319 

1320 log.debug( 

1321 "Reading%s from location %s %s with formatter %s", 

1322 f" component {getInfo.component}" if isComponent else "", 

1323 uri, 

1324 msg, 

1325 formatter.name(), 

1326 ) 

1327 try: 

1328 with formatter._updateLocation(newLocation): 

1329 with time_this( 

1330 log, 

1331 msg="Reading%s from location %s %s with formatter %s", 

1332 args=( 

1333 f" component {getInfo.component}" if isComponent else "", 

1334 uri, 

1335 msg, 

1336 formatter.name(), 

1337 ), 

1338 ): 

1339 result = formatter.read(component=getInfo.component if isComponent else None) 

1340 except Exception as e: 

1341 raise ValueError( 

1342 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1343 f" ({ref.datasetType.name} from {uri}): {e}" 

1344 ) from e 

1345 

1346 # File was read successfully so can move to cache 

1347 if can_be_cached: 1347 ↛ 1348 (line 1347 didn't jump to line 1348, because the condition on line 1347 was never true)

1348 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1349 

1350 return self._post_process_get( 

1351 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent 

1352 ) 

1353 

1354 def knows(self, ref: DatasetRef) -> bool: 

1355 """Check if the dataset is known to the datastore. 

1356 

1357 Does not check for existence of any artifact. 

1358 

1359 Parameters 

1360 ---------- 

1361 ref : `DatasetRef` 

1362 Reference to the required dataset. 

1363 

1364 Returns 

1365 ------- 

1366 exists : `bool` 

1367 `True` if the dataset is known to the datastore. 

1368 """ 

1369 fileLocations = self._get_dataset_locations_info(ref) 

1370 if fileLocations: 

1371 return True 

1372 return False 

1373 

1374 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

1375 # Docstring inherited from the base class. 

1376 

1377 # The records themselves. Could be missing some entries. 

1378 records = self._get_stored_records_associated_with_refs(refs) 

1379 

1380 return {ref: ref.id in records for ref in refs} 

1381 

1382 def _process_mexists_records( 

1383 self, 

1384 id_to_ref: Dict[DatasetId, DatasetRef], 

1385 records: Dict[DatasetId, List[StoredFileInfo]], 

1386 all_required: bool, 

1387 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

1388 ) -> Dict[DatasetRef, bool]: 

1389 """Helper function for mexists that checks the given records. 

1390 

1391 Parameters 

1392 ---------- 

1393 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1394 Mapping of the dataset ID to the dataset ref itself. 

1395 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1396 Records as generally returned by 

1397 ``_get_stored_records_associated_with_refs``. 

1398 all_required : `bool` 

1399 Flag to indicate whether all artifacts associated with a

1400 dataset ID must exist for the dataset to be considered present.

1401 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1402 Optional mapping of datastore artifact to existence. Updated by 

1403 this method with details of all artifacts tested. Can be `None` 

1404 if the caller is not interested. 

1405 

1406 Returns 

1407 ------- 

1408 existence : `dict` of [`DatasetRef`, `bool`] 

1409 Mapping from dataset to boolean indicating existence. 

1410 """ 

1411 # The URIs to be checked and a mapping of those URIs to 

1412 # the dataset ID. 

1413 uris_to_check: List[ResourcePath] = [] 

1414 location_map: Dict[ResourcePath, DatasetId] = {} 

1415 

1416 location_factory = self.locationFactory 

1417 

1418 uri_existence: Dict[ResourcePath, bool] = {} 

1419 for ref_id, infos in records.items(): 

1420 # Key is the dataset ID, value is a list of StoredFileInfo 

1421 uris = [info.file_location(location_factory).uri for info in infos] 

1422 location_map.update({uri: ref_id for uri in uris}) 

1423 

1424 # Check the local cache directly for a dataset corresponding 

1425 # to the remote URI. 

1426 if self.cacheManager.file_count > 0: 1426 ↛ 1427 (line 1426 didn't jump to line 1427, because the condition on line 1426 was never true)

1427 ref = id_to_ref[ref_id] 

1428 for uri, storedFileInfo in zip(uris, infos): 

1429 check_ref = ref 

1430 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1431 check_ref = ref.makeComponentRef(component) 

1432 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1433 # Proxy for URI existence. 

1434 uri_existence[uri] = True 

1435 else: 

1436 uris_to_check.append(uri) 

1437 else: 

1438 # Check all of them. 

1439 uris_to_check.extend(uris) 

1440 

1441 if artifact_existence is not None: 

1442 # If a URI has already been checked remove it from the list 

1443 # and immediately add the status to the output dict. 

1444 filtered_uris_to_check = [] 

1445 for uri in uris_to_check: 

1446 if uri in artifact_existence: 

1447 uri_existence[uri] = artifact_existence[uri] 

1448 else: 

1449 filtered_uris_to_check.append(uri) 

1450 uris_to_check = filtered_uris_to_check 

1451 

1452 # Results. 

1453 dataset_existence: Dict[DatasetRef, bool] = {} 

1454 

1455 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1456 for uri, exists in uri_existence.items(): 

1457 dataset_id = location_map[uri] 

1458 ref = id_to_ref[dataset_id] 

1459 

1460 # Disassembled composite needs to check all locations. 

1461 # all_required indicates whether all need to exist or not. 

1462 if ref in dataset_existence: 

1463 if all_required: 

1464 exists = dataset_existence[ref] and exists 

1465 else: 

1466 exists = dataset_existence[ref] or exists 

1467 dataset_existence[ref] = exists 

1468 

1469 if artifact_existence is not None: 

1470 artifact_existence.update(uri_existence) 

1471 

1472 return dataset_existence 

1473 
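
The ``all_required`` flag controls how per-artifact existence results are folded into a single answer per dataset: a disassembled composite is reported as existing only if every artifact exists when the flag is `True`, or if any artifact exists when it is `False`. A minimal, self-contained sketch of just that reduction (plain Python, independent of the datastore classes; the dataset IDs are illustrative):

    from typing import Dict, List

    def combine_existence(per_artifact: Dict[str, List[bool]], all_required: bool) -> Dict[str, bool]:
        """Fold per-artifact existence flags into per-dataset existence."""
        return {
            dataset_id: (all(flags) if all_required else any(flags))
            for dataset_id, flags in per_artifact.items()
        }

    # One composite with a missing component artifact:
    flags = {"dataset-1": [True, False, True]}
    assert combine_existence(flags, all_required=True) == {"dataset-1": False}
    assert combine_existence(flags, all_required=False) == {"dataset-1": True}
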

1474 def mexists( 

1475 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1476 ) -> Dict[DatasetRef, bool]: 

1477 """Check the existence of multiple datasets at once. 

1478 

1479 Parameters 

1480 ---------- 

1481 refs : iterable of `DatasetRef` 

1482 The datasets to be checked. 

1483 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1484 Optional mapping of datastore artifact to existence. Updated by 

1485 this method with details of all artifacts tested. Can be `None` 

1486 if the caller is not interested. 

1487 

1488 Returns 

1489 ------- 

1490 existence : `dict` of [`DatasetRef`, `bool`] 

1491 Mapping from dataset to boolean indicating existence. 

1492 

1493 Notes 

1494 ----- 

1495 To minimize potentially costly remote existence checks, the local 

1496 cache is checked as a proxy for existence. If a file for this 

1497 `DatasetRef` does exist no check is done for the actual URI. This 

1498 could result in possibly unexpected behavior if the dataset itself 

1499 has been removed from the datastore by another process whilst it is 

1500 still in the cache. 

1501 """ 

1502 chunk_size = 10_000 

1503 dataset_existence: Dict[DatasetRef, bool] = {} 

1504 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1505 n_found_total = 0 

1506 n_checked = 0 

1507 n_chunks = 0 

1508 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1509 chunk_result = self._mexists(chunk, artifact_existence) 

1510 if log.isEnabledFor(VERBOSE): 

1511 n_results = len(chunk_result) 

1512 n_checked += n_results 

1513 # Can treat the booleans as 0, 1 integers and sum them. 

1514 n_found = sum(chunk_result.values()) 

1515 n_found_total += n_found 

1516 log.verbose( 

1517 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)", 

1518 n_chunks, 

1519 n_found, 

1520 n_results, 

1521 n_found_total, 

1522 n_checked, 

1523 ) 

1524 dataset_existence.update(chunk_result) 

1525 n_chunks += 1 

1526 

1527 return dataset_existence 

1528 
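
A minimal usage sketch for mexists(), assuming a configured FileDatastore instance named ``datastore`` and an iterable of resolved DatasetRef objects named ``refs`` (both hypothetical here). The method chunks the refs internally; passing a shared ``artifact_existence`` dictionary lets repeated calls reuse earlier per-URI existence checks:

    from typing import Dict
    from lsst.resources import ResourcePath

    # Shared per-URI existence cache, updated in place by mexists().
    artifact_existence: Dict[ResourcePath, bool] = {}
    existence = datastore.mexists(refs, artifact_existence=artifact_existence)

    missing = [ref for ref, exists in existence.items() if not exists]
    print(f"{len(missing)} of {len(existence)} datasets are missing from the datastore")
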

1529 def _mexists( 

1530 self, refs: Sequence[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1531 ) -> Dict[DatasetRef, bool]: 

1532 """Check the existence of multiple datasets at once. 

1533 

1534 Parameters 

1535 ---------- 

1536 refs : iterable of `DatasetRef` 

1537 The datasets to be checked. 

1538 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1539 Optional mapping of datastore artifact to existence. Updated by 

1540 this method with details of all artifacts tested. Can be `None` 

1541 if the caller is not interested. 

1542 

1543 Returns 

1544 ------- 

1545 existence : `dict` of [`DatasetRef`, `bool`] 

1546 Mapping from dataset to boolean indicating existence. 

1547 """ 

1548 # Need a mapping of dataset_id to dataset ref since the API 

1549 # works with dataset_id 

1550 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

1551 

1552 # Set of all IDs we are checking for. 

1553 requested_ids = set(id_to_ref.keys()) 

1554 

1555 # The records themselves. Could be missing some entries. 

1556 records = self._get_stored_records_associated_with_refs(refs) 

1557 

1558 dataset_existence = self._process_mexists_records( 

1559 id_to_ref, records, True, artifact_existence=artifact_existence 

1560 ) 

1561 

1562 # Set of IDs that have been handled. 

1563 handled_ids = {ref.id for ref in dataset_existence.keys()} 

1564 

1565 missing_ids = requested_ids - handled_ids 

1566 if missing_ids: 

1567 dataset_existence.update( 

1568 self._mexists_check_expected( 

1569 [id_to_ref[missing] for missing in missing_ids], artifact_existence 

1570 ) 

1571 ) 

1572 

1573 return dataset_existence 

1574 

1575 def _mexists_check_expected( 

1576 self, refs: Sequence[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1577 ) -> Dict[DatasetRef, bool]: 

1578 """Check existence of refs that are not known to datastore. 

1579 

1580 Parameters 

1581 ---------- 

1582 refs : iterable of `DatasetRef` 

1583 The datasets to be checked. These are assumed not to be known 

1584 to datastore. 

1585 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1586 Optional mapping of datastore artifact to existence. Updated by 

1587 this method with details of all artifacts tested. Can be `None` 

1588 if the caller is not interested. 

1589 

1590 Returns 

1591 ------- 

1592 existence : `dict` of [`DatasetRef`, `bool`] 

1593 Mapping from dataset to boolean indicating existence. 

1594 """ 

1595 dataset_existence: Dict[DatasetRef, bool] = {} 

1596 if not self.trustGetRequest: 

1597 # Must assume these do not exist 

1598 for ref in refs: 

1599 dataset_existence[ref] = False 

1600 else: 

1601 log.debug( 

1602 "%d datasets were not known to datastore during initial existence check.", 

1603 len(refs), 

1604 ) 

1605 

1606 # Construct data structure identical to that returned 

1607 # by _get_stored_records_associated_with_refs() but using 

1608 # guessed names. 

1609 records = {} 

1610 id_to_ref = {} 

1611 for missing_ref in refs: 

1612 expected = self._get_expected_dataset_locations_info(missing_ref) 

1613 dataset_id = missing_ref.getCheckedId() 

1614 records[dataset_id] = [info for _, info in expected] 

1615 id_to_ref[dataset_id] = missing_ref 

1616 

1617 dataset_existence.update( 

1618 self._process_mexists_records( 

1619 id_to_ref, 

1620 records, 

1621 False, 

1622 artifact_existence=artifact_existence, 

1623 ) 

1624 ) 

1625 

1626 return dataset_existence 

1627 

1628 def exists(self, ref: DatasetRef) -> bool: 

1629 """Check if the dataset exists in the datastore. 

1630 

1631 Parameters 

1632 ---------- 

1633 ref : `DatasetRef` 

1634 Reference to the required dataset. 

1635 

1636 Returns 

1637 ------- 

1638 exists : `bool` 

1639 `True` if the entity exists in the `Datastore`. 

1640 

1641 Notes 

1642 ----- 

1643 The local cache is checked as a proxy for existence in the remote 

1644 object store. It is possible that another process on a different 

1645 compute node could remove the file from the object store even 

1646 though it is present in the local cache. 

1647 """ 

1648 fileLocations = self._get_dataset_locations_info(ref) 

1649 

1650 # If we are being asked to trust that the registry might not be correct,

1651 # we ask for the expected locations and check them explicitly.

1652 if not fileLocations: 

1653 if not self.trustGetRequest: 

1654 return False 

1655 

1656 # First check the cache. If it is not found we must check 

1657 # the datastore itself. Assume that any component in the cache 

1658 # means that the dataset does exist somewhere. 

1659 if self.cacheManager.known_to_cache(ref): 1659 ↛ 1660line 1659 didn't jump to line 1660, because the condition on line 1659 was never true

1660 return True 

1661 

1662 # When we are guessing a dataset location we can not check 

1663 # for the existence of every component since we can not 

1664 # know if every component was written. Instead we check 

1665 # for the existence of any of the expected locations. 

1666 for location, _ in self._get_expected_dataset_locations_info(ref): 

1667 if self._artifact_exists(location): 

1668 return True 

1669 return False 

1670 

1671 # All listed artifacts must exist. 

1672 for location, storedFileInfo in fileLocations: 

1673 # Checking in cache needs the component ref. 

1674 check_ref = ref 

1675 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1676 check_ref = ref.makeComponentRef(component) 

1677 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1678 continue 

1679 

1680 if not self._artifact_exists(location): 1680 ↛ 1681line 1680 didn't jump to line 1681, because the condition on line 1680 was never true

1681 return False 

1682 

1683 return True 

1684 

1685 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

1686 """Return URIs associated with dataset. 

1687 

1688 Parameters 

1689 ---------- 

1690 ref : `DatasetRef` 

1691 Reference to the required dataset. 

1692 predict : `bool`, optional 

1693 If the datastore does not know about the dataset, should it 

1694 return a predicted URI or not? 

1695 

1696 Returns 

1697 ------- 

1698 uris : `DatasetRefURIs` 

1699 The URI to the primary artifact associated with this dataset (if 

1700 the dataset was disassembled within the datastore this may be 

1701 `None`), and the URIs to any components associated with the dataset 

1702 artifact (which can be empty if there are no components).

1703 """ 

1704 many = self.getManyURIs([ref], predict=predict, allow_missing=False) 

1705 return many[ref] 

1706 

1707 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1708 """URI to the Dataset. 

1709 

1710 Parameters 

1711 ---------- 

1712 ref : `DatasetRef` 

1713 Reference to the required Dataset. 

1714 predict : `bool` 

1715 If `True`, allow URIs to be returned of datasets that have not 

1716 been written. 

1717 

1718 Returns 

1719 ------- 

1720 uri : `lsst.resources.ResourcePath`

1721 URI pointing to the dataset within the datastore. If the 

1722 dataset does not exist in the datastore, and if ``predict`` is 

1723 `True`, the URI will be a prediction and will include a URI 

1724 fragment "#predicted". 

1725 If the datastore does not have entities that relate well 

1726 to the concept of a URI the returned URI will be 

1727 descriptive. The returned URI is not guaranteed to be obtainable. 

1728 

1729 Raises 

1730 ------ 

1731 FileNotFoundError 

1732 Raised if a URI has been requested for a dataset that does not 

1733 exist and guessing is not allowed. 

1734 RuntimeError 

1735 Raised if a request is made for a single URI but multiple URIs 

1736 are associated with this dataset. 

1737 

1738 Notes 

1739 ----- 

1740 When a predicted URI is requested an attempt will be made to form 

1741 a reasonable URI based on file templates and the expected formatter. 

1742 """ 

1743 primary, components = self.getURIs(ref, predict) 

1744 if primary is None or components: 1744 ↛ 1745line 1744 didn't jump to line 1745, because the condition on line 1744 was never true

1745 raise RuntimeError( 

1746 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1747 ) 

1748 return primary 

1749 
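
A usage sketch for getURI(), again assuming a hypothetical configured ``datastore`` and resolved ``ref``. With ``predict=True`` a URI can be returned even for a dataset that has not been written yet; per the docstring it carries a "#predicted" fragment, which a caller can detect via geturl(). A disassembled composite raises RuntimeError and must go through getURIs() instead:

    try:
        uri = datastore.getURI(ref, predict=True)
    except RuntimeError:
        # Disassembled composite: ask for the primary and component URIs.
        primary, components = datastore.getURIs(ref, predict=True)
    else:
        if uri.geturl().endswith("#predicted"):
            print(f"Dataset has not been written yet; predicted location: {uri}")
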

1750 def _predict_URIs( 

1751 self, 

1752 ref: DatasetRef, 

1753 ) -> DatasetRefURIs: 

1754 """Predict the URIs of a dataset ref. 

1755 

1756 Parameters 

1757 ---------- 

1758 ref : `DatasetRef` 

1759 Reference to the required Dataset. 

1760 

1761 Returns 

1762 ------- 

1763 uris : `DatasetRefURIs`

1764 Primary and component URIs. URIs will contain a URI fragment 

1765 "#predicted". 

1766 """ 

1767 uris = DatasetRefURIs() 

1768 

1769 if self.composites.shouldBeDisassembled(ref): 

1770 for component, _ in ref.datasetType.storageClass.components.items(): 

1771 comp_ref = ref.makeComponentRef(component) 

1772 comp_location, _ = self._determine_put_formatter_location(comp_ref) 

1773 

1774 # Add the "#predicted" URI fragment to indicate this is a 

1775 # guess 

1776 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted") 

1777 

1778 else: 

1779 location, _ = self._determine_put_formatter_location(ref) 

1780 

1781 # Add the "#predicted" URI fragment to indicate this is a guess 

1782 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted") 

1783 

1784 return uris 

1785 

1786 def getManyURIs( 

1787 self, 

1788 refs: Iterable[DatasetRef], 

1789 predict: bool = False, 

1790 allow_missing: bool = False, 

1791 ) -> Dict[DatasetRef, DatasetRefURIs]: 

1792 # Docstring inherited 

1793 

1794 uris: Dict[DatasetRef, DatasetRefURIs] = {} 

1795 

1796 records = self._get_stored_records_associated_with_refs(refs) 

1797 records_keys = records.keys() 

1798 

1799 existing_refs = tuple(ref for ref in refs if ref.id in records_keys) 

1800 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys) 

1801 

1802 # Have to handle trustGetRequest mode by checking for the existence 

1803 # of the missing refs on disk. 

1804 if missing_refs: 

1805 dataset_existence = self._mexists_check_expected(missing_refs, None) 

1806 really_missing = set() 

1807 not_missing = set() 

1808 for ref, exists in dataset_existence.items(): 

1809 if exists: 

1810 not_missing.add(ref) 

1811 else: 

1812 really_missing.add(ref) 

1813 

1814 if not_missing: 

1815 # Need to recalculate the missing/existing split. 

1816 existing_refs = existing_refs + tuple(not_missing) 

1817 missing_refs = tuple(really_missing) 

1818 

1819 for ref in missing_refs: 

1820 # if this has never been written then we have to guess 

1821 if not predict: 

1822 if not allow_missing: 

1823 raise FileNotFoundError("Dataset {} not in this datastore.".format(ref)) 

1824 else: 

1825 uris[ref] = self._predict_URIs(ref) 

1826 

1827 for ref in existing_refs: 

1828 file_infos = records[ref.getCheckedId()] 

1829 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos] 

1830 uris[ref] = self._locations_to_URI(ref, file_locations) 

1831 

1832 return uris 

1833 
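
getManyURIs() amortises the record lookup over many refs. A sketch under the same hypothetical ``datastore``/``refs`` assumptions, using ``allow_missing=True`` so unknown datasets are skipped rather than raising FileNotFoundError:

    uri_map = datastore.getManyURIs(refs, predict=False, allow_missing=True)
    for ref, dataset_uris in uri_map.items():
        if dataset_uris.primaryURI is not None:
            print(ref, "->", dataset_uris.primaryURI)
        else:
            # Disassembled composite: one URI per component.
            for component, uri in dataset_uris.componentURIs.items():
                print(ref, component, "->", uri)
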

1834 def _locations_to_URI( 

1835 self, 

1836 ref: DatasetRef, 

1837 file_locations: Sequence[Tuple[Location, StoredFileInfo]], 

1838 ) -> DatasetRefURIs: 

1839 """Convert one or more file locations associated with a DatasetRef 

1840 to a DatasetRefURIs. 

1841 

1842 Parameters 

1843 ---------- 

1844 ref : `DatasetRef` 

1845 Reference to the dataset. 

1846 file_locations : Sequence[Tuple[Location, StoredFileInfo]] 

1847 Each item in the sequence is the location of the dataset within the 

1848 datastore and stored information about the file and its formatter. 

1849 If there is only one item in the sequence then it is treated as the 

1850 primary URI. If there is more than one item then they are treated 

1851 as component URIs. If there are no items then an error is raised 

1852 unless ``self.trustGetRequest`` is `True`. 

1853 

1854 Returns 

1855 ------- 

1856 uris : `DatasetRefURIs`

1857 Represents the primary URI or component URIs described by the 

1858 inputs. 

1859 

1860 Raises 

1861 ------ 

1862 RuntimeError 

1863 If no file locations are passed in and ``self.trustGetRequest`` is 

1864 `False`. 

1865 FileNotFoundError 

1866 If a passed-in URI does not exist and ``self.trustGetRequest`` 

1867 is `False`. 

1868 RuntimeError 

1869 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is 

1870 unexpected). 

1871 """ 

1872 

1873 guessing = False 

1874 uris = DatasetRefURIs() 

1875 

1876 if not file_locations: 

1877 if not self.trustGetRequest: 1877 ↛ 1878line 1877 didn't jump to line 1878, because the condition on line 1877 was never true

1878 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1879 file_locations = self._get_expected_dataset_locations_info(ref) 

1880 guessing = True 

1881 

1882 if len(file_locations) == 1: 

1883 # No disassembly so this is the primary URI 

1884 uris.primaryURI = file_locations[0][0].uri 

1885 if guessing and not uris.primaryURI.exists(): 1885 ↛ 1886line 1885 didn't jump to line 1886, because the condition on line 1885 was never true

1886 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist") 

1887 else: 

1888 for location, file_info in file_locations: 

1889 if file_info.component is None: 1889 ↛ 1890line 1889 didn't jump to line 1890, because the condition on line 1889 was never true

1890 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1891 if guessing and not location.uri.exists(): 1891 ↛ 1895line 1891 didn't jump to line 1895, because the condition on line 1891 was never true

1892 # If we are trusting then it is entirely possible for 

1893 # some components to be missing. In that case we skip 

1894 # to the next component. 

1895 if self.trustGetRequest: 

1896 continue 

1897 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist") 

1898 uris.componentURIs[file_info.component] = location.uri 

1899 

1900 return uris 

1901 

1902 def retrieveArtifacts( 

1903 self, 

1904 refs: Iterable[DatasetRef], 

1905 destination: ResourcePath, 

1906 transfer: str = "auto", 

1907 preserve_path: bool = True, 

1908 overwrite: bool = False, 

1909 ) -> List[ResourcePath]: 

1910 """Retrieve the file artifacts associated with the supplied refs. 

1911 

1912 Parameters 

1913 ---------- 

1914 refs : iterable of `DatasetRef` 

1915 The datasets for which file artifacts are to be retrieved. 

1916 A single ref can result in multiple files. The refs must 

1917 be resolved. 

1918 destination : `lsst.resources.ResourcePath` 

1919 Location to write the file artifacts. 

1920 transfer : `str`, optional 

1921 Method to use to transfer the artifacts. Must be one of the options 

1922 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1923 "move" is not allowed. 

1924 preserve_path : `bool`, optional 

1925 If `True` the full path of the file artifact within the datastore 

1926 is preserved. If `False` the final file component of the path 

1927 is used. 

1928 overwrite : `bool`, optional 

1929 If `True` allow transfers to overwrite existing files at the 

1930 destination. 

1931 

1932 Returns 

1933 ------- 

1934 targets : `list` of `lsst.resources.ResourcePath` 

1935 URIs of file artifacts in destination location. Order is not 

1936 preserved. 

1937 """ 

1938 if not destination.isdir(): 1938 ↛ 1939line 1938 didn't jump to line 1939, because the condition on line 1938 was never true

1939 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1940 

1941 if transfer == "move": 

1942 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1943 

1944 # Source -> Destination 

1945 # This also helps filter out duplicate DatasetRef in the request 

1946 # that will map to the same underlying file transfer. 

1947 to_transfer: Dict[ResourcePath, ResourcePath] = {} 

1948 

1949 for ref in refs: 

1950 locations = self._get_dataset_locations_info(ref) 

1951 for location, _ in locations: 

1952 source_uri = location.uri 

1953 target_path: ResourcePathExpression 

1954 if preserve_path: 

1955 target_path = location.pathInStore 

1956 if target_path.isabs(): 1956 ↛ 1959line 1956 didn't jump to line 1959, because the condition on line 1956 was never true

1957 # This is an absolute path to an external file. 

1958 # Use the full path. 

1959 target_path = target_path.relativeToPathRoot 

1960 else: 

1961 target_path = source_uri.basename() 

1962 target_uri = destination.join(target_path) 

1963 to_transfer[source_uri] = target_uri 

1964 

1965 # In theory can now parallelize the transfer 

1966 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

1967 for source_uri, target_uri in to_transfer.items(): 

1968 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1969 

1970 return list(to_transfer.values()) 

1971 
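
A sketch of retrieving the file artifacts for a set of refs into a local directory, with the same hypothetical ``datastore`` and ``refs``; the destination path is purely illustrative. The destination must be a directory URI and the "move" transfer mode is rejected:

    from lsst.resources import ResourcePath

    destination = ResourcePath("/tmp/exported_artifacts/", forceDirectory=True)
    targets = datastore.retrieveArtifacts(
        refs,
        destination,
        transfer="copy",      # any ResourcePath.transfer_from() mode except "move"
        preserve_path=True,   # keep the datastore-relative directory structure
        overwrite=False,
    )
    print(f"Wrote {len(targets)} file artifacts under {destination}")
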

1972 def get( 

1973 self, 

1974 ref: DatasetRef, 

1975 parameters: Optional[Mapping[str, Any]] = None, 

1976 storageClass: Optional[Union[StorageClass, str]] = None, 

1977 ) -> Any: 

1978 """Load an InMemoryDataset from the store. 

1979 

1980 Parameters 

1981 ---------- 

1982 ref : `DatasetRef` 

1983 Reference to the required Dataset. 

1984 parameters : `dict` 

1985 `StorageClass`-specific parameters that specify, for example, 

1986 a slice of the dataset to be loaded. 

1987 storageClass : `StorageClass` or `str`, optional 

1988 The storage class to be used to override the Python type 

1989 returned by this method. By default the returned type matches 

1990 the dataset type definition for this dataset. Specifying a 

1991 read `StorageClass` can force a different type to be returned. 

1992 This type must be compatible with the original type. 

1993 

1994 Returns 

1995 ------- 

1996 inMemoryDataset : `object` 

1997 Requested dataset or slice thereof as an InMemoryDataset. 

1998 

1999 Raises 

2000 ------ 

2001 FileNotFoundError 

2002 Requested dataset can not be retrieved. 

2003 TypeError 

2004 Return value from formatter has unexpected type. 

2005 ValueError 

2006 Formatter failed to process the dataset. 

2007 """ 

2008 # Supplied storage class for the component being read is either 

2009 # from the ref itself or an override if we want to force 

2010 # type conversion. 

2011 if storageClass is not None: 

2012 ref = ref.overrideStorageClass(storageClass) 

2013 refStorageClass = ref.datasetType.storageClass 

2014 

2015 allGetInfo = self._prepare_for_get(ref, parameters) 

2016 refComponent = ref.datasetType.component() 

2017 

2018 # Create mapping from component name to related info 

2019 allComponents = {i.component: i for i in allGetInfo} 

2020 

2021 # By definition the dataset is disassembled if we have more 

2022 # than one record for it. 

2023 isDisassembled = len(allGetInfo) > 1 

2024 

2025 # Look for the special case where we are disassembled but the 

2026 # component is a derived component that was not written during 

2027 # disassembly. For this scenario we need to check that the 

2028 # component requested is listed as a derived component for the 

2029 # composite storage class 

2030 isDisassembledReadOnlyComponent = False 

2031 if isDisassembled and refComponent: 

2032 # The composite storage class should be accessible through 

2033 # the component dataset type 

2034 compositeStorageClass = ref.datasetType.parentStorageClass 

2035 

2036 # In the unlikely scenario where the composite storage 

2037 # class is not known, we can only assume that this is a 

2038 # normal component. If that assumption is wrong then the 

2039 # branch below that reads a persisted component will fail 

2040 # so there is no need to complain here. 

2041 if compositeStorageClass is not None: 2041 ↛ 2044line 2041 didn't jump to line 2044, because the condition on line 2041 was never false

2042 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

2043 

2044 if isDisassembled and not refComponent: 

2045 # This was a disassembled dataset spread over multiple files 

2046 # and we need to put them all back together again. 

2047 # Read into memory and then assemble 

2048 

2049 # Check that the supplied parameters are suitable for the type read 

2050 refStorageClass.validateParameters(parameters) 

2051 

2052 # We want to keep track of all the parameters that were not used 

2053 # by formatters. We assume that if any of the component formatters 

2054 # use a parameter that we do not need to apply it again in the 

2055 # assembler. 

2056 usedParams = set() 

2057 

2058 components: Dict[str, Any] = {} 

2059 for getInfo in allGetInfo: 

2060 # assemblerParams are parameters not understood by the 

2061 # associated formatter. 

2062 usedParams.update(set(getInfo.formatterParams)) 

2063 

2064 component = getInfo.component 

2065 

2066 if component is None: 2066 ↛ 2067line 2066 didn't jump to line 2067, because the condition on line 2066 was never true

2067 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

2068 

2069 # We do not want the formatter to think it's reading 

2070 # a component though because it is really reading a 

2071 # standalone dataset -- always tell reader it is not a 

2072 # component. 

2073 components[component] = self._read_artifact_into_memory( 

2074 getInfo, ref.makeComponentRef(component), isComponent=False 

2075 ) 

2076 

2077 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

2078 

2079 # Any unused parameters will have to be passed to the assembler 

2080 if parameters: 

2081 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

2082 else: 

2083 unusedParams = {} 

2084 

2085 # Process parameters 

2086 return ref.datasetType.storageClass.delegate().handleParameters( 

2087 inMemoryDataset, parameters=unusedParams 

2088 ) 

2089 

2090 elif isDisassembledReadOnlyComponent: 

2091 compositeStorageClass = ref.datasetType.parentStorageClass 

2092 if compositeStorageClass is None: 2092 ↛ 2093line 2092 didn't jump to line 2093, because the condition on line 2092 was never true

2093 raise RuntimeError( 

2094 f"Unable to retrieve derived component '{refComponent}' since" 

2095 "no composite storage class is available." 

2096 ) 

2097 

2098 if refComponent is None: 2098 ↛ 2100line 2098 didn't jump to line 2100, because the condition on line 2098 was never true

2099 # Mainly for mypy 

2100 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

2101 

2102 # Assume that every derived component can be calculated by 

2103 # forwarding the request to a single read/write component. 

2104 # Rather than guessing which rw component is the right one by 

2105 # scanning each for a derived component of the same name, 

2106 # we ask the storage class delegate directly which one is best to 

2107 # use. 

2108 compositeDelegate = compositeStorageClass.delegate() 

2109 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

2110 refComponent, set(allComponents) 

2111 ) 

2112 

2113 # Select the relevant component 

2114 rwInfo = allComponents[forwardedComponent] 

2115 

2116 # For now assume that read parameters are validated against 

2117 # the real component and not the requested component 

2118 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

2119 forwardedStorageClass.validateParameters(parameters) 

2120 

2121 # The reference to use for the caching must refer to the forwarded 

2122 # component and not the derived component. 

2123 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

2124 

2125 # Unfortunately the FileDescriptor inside the formatter will have 

2126 # the wrong write storage class so we need to create a new one 

2127 # given the immutability constraint. 

2128 writeStorageClass = rwInfo.info.storageClass 

2129 

2130 # We may need to put some thought into parameters for read 

2131 # components but for now forward them on as is 

2132 readFormatter = type(rwInfo.formatter)( 

2133 FileDescriptor( 

2134 rwInfo.location, 

2135 readStorageClass=refStorageClass, 

2136 storageClass=writeStorageClass, 

2137 parameters=parameters, 

2138 ), 

2139 ref.dataId, 

2140 ) 

2141 

2142 # The assembler can not receive any parameter requests for a 

2143 # derived component at this time since the assembler will 

2144 # see the storage class of the derived component and those 

2145 # parameters will have to be handled by the formatter on the 

2146 # forwarded storage class. 

2147 assemblerParams: Dict[str, Any] = {} 

2148 

2149 # Need to create a new info that specifies the derived 

2150 # component and associated storage class 

2151 readInfo = DatastoreFileGetInformation( 

2152 rwInfo.location, 

2153 readFormatter, 

2154 rwInfo.info, 

2155 assemblerParams, 

2156 {}, 

2157 refComponent, 

2158 refStorageClass, 

2159 ) 

2160 

2161 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

2162 

2163 else: 

2164 # Single file request or component from that composite file 

2165 for lookup in (refComponent, None): 2165 ↛ 2170line 2165 didn't jump to line 2170, because the loop on line 2165 didn't complete

2166 if lookup in allComponents: 2166 ↛ 2165line 2166 didn't jump to line 2165, because the condition on line 2166 was never false

2167 getInfo = allComponents[lookup] 

2168 break 

2169 else: 

2170 raise FileNotFoundError( 

2171 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

2172 ) 

2173 

2174 # Do not need the component itself if already disassembled 

2175 if isDisassembled: 

2176 isComponent = False 

2177 else: 

2178 isComponent = getInfo.component is not None 

2179 

2180 # For a component read of a composite we want the cache to 

2181 # be looking at the composite ref itself. 

2182 cache_ref = ref.makeCompositeRef() if isComponent else ref 

2183 

2184 # For a disassembled component we can validate parameters against 

2185 # the component storage class directly 

2186 if isDisassembled: 

2187 refStorageClass.validateParameters(parameters) 

2188 else: 

2189 # For an assembled composite this could be a derived 

2190 # component derived from a real component. The validity 

2191 # of the parameters is not clear. For now validate against 

2192 # the composite storage class 

2193 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

2194 

2195 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

2196 
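
A sketch of reading a dataset back with get(), assuming the hypothetical ``datastore`` and a resolved ``ref``. The ``parameters`` mapping is storage-class specific (the "bbox" key and ``some_bbox`` value below are purely illustrative), and ``storageClass`` can force a compatible alternative Python type (the name below is also illustrative):

    # Plain read using the dataset type's own storage class.
    obj = datastore.get(ref)

    # Read a subset; the parameter name must be one understood by the
    # dataset's storage class (hypothetical here).
    subset = datastore.get(ref, parameters={"bbox": some_bbox})

    # Force a compatible read type by naming a different storage class
    # (hypothetical name).
    converted = datastore.get(ref, storageClass="SomeCompatibleStorageClass")
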

2197 @transactional 

2198 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2199 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2200 

2201 Parameters 

2202 ---------- 

2203 inMemoryDataset : `object` 

2204 The dataset to store. 

2205 ref : `DatasetRef` 

2206 Reference to the associated Dataset. 

2207 

2208 Raises 

2209 ------ 

2210 TypeError 

2211 Supplied object and storage class are inconsistent. 

2212 DatasetTypeNotSupportedError 

2213 The associated `DatasetType` is not handled by this datastore. 

2214 

2215 Notes 

2216 ----- 

2217 If the datastore is configured to reject certain dataset types it 

2218 is possible that the put will fail and raise a 

2219 `DatasetTypeNotSupportedError`. The main use case for this is to 

2220 allow `ChainedDatastore` to put to multiple datastores without 

2221 requiring that every datastore accepts the dataset. 

2222 """ 

2223 

2224 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2225 # doDisassembly = True 

2226 

2227 artifacts = [] 

2228 if doDisassembly: 

2229 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2230 if components is None: 2230 ↛ 2231line 2230 didn't jump to line 2231, because the condition on line 2230 was never true

2231 raise RuntimeError( 

2232 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2233 f"with storage class {ref.datasetType.storageClass.name} " 

2234 "is configured to be disassembled, but cannot be." 

2235 ) 

2236 for component, componentInfo in components.items(): 

2237 # Don't recurse because we want to take advantage of 

2238 # bulk insert -- need a new DatasetRef that refers to the 

2239 # same dataset_id but has the component DatasetType 

2240 # DatasetType does not refer to the types of components 

2241 # So we construct one ourselves. 

2242 compRef = ref.makeComponentRef(component) 

2243 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2244 artifacts.append((compRef, storedInfo)) 

2245 else: 

2246 # Write the entire thing out 

2247 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2248 artifacts.append((ref, storedInfo)) 

2249 

2250 self._register_datasets(artifacts) 

2251 
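
A sketch of a put/get round trip, assuming a hypothetical configured ``datastore``, an in-memory object ``dataset``, and a resolved ``ref`` whose dataset type matches the object. Whether the object is written as one file or disassembled into per-component artifacts is decided by the datastore's composites configuration, not by the caller:

    from lsst.daf.butler import DatasetTypeNotSupportedError

    try:
        datastore.put(dataset, ref)
    except DatasetTypeNotSupportedError:
        # This datastore is configured to reject this dataset type; a
        # ChainedDatastore would simply move on to its next datastore.
        pass
    else:
        round_tripped = datastore.get(ref)
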

2252 @transactional 

2253 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

2254 # At this point can safely remove these datasets from the cache 

2255 # to avoid confusion later on. If they are not trashed later 

2256 # the cache will simply be refilled. 

2257 self.cacheManager.remove_from_cache(ref) 

2258 

2259 # If we are in trust mode there will be nothing to move to 

2260 # the trash table and we will have to try to delete the file 

2261 # immediately. 

2262 if self.trustGetRequest: 

2263 # Try to keep the logic below for a single file trash. 

2264 if isinstance(ref, DatasetRef): 

2265 refs = {ref} 

2266 else: 

2267 # Will recreate ref at the end of this branch. 

2268 refs = set(ref) 

2269 

2270 # Determine which datasets are known to datastore directly. 

2271 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

2272 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2273 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2274 

2275 missing = refs - existing_refs 

2276 if missing: 

2277 # Do an explicit existence check on these refs. 

2278 # We only care about the artifacts at this point and not 

2279 # the dataset existence. 

2280 artifact_existence: Dict[ResourcePath, bool] = {} 

2281 _ = self.mexists(missing, artifact_existence) 

2282 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2283 

2284 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2285 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2286 for uri in uris: 

2287 try: 

2288 uri.remove() 

2289 except Exception as e: 

2290 if ignore_errors: 

2291 log.debug("Artifact %s could not be removed: %s", uri, e) 

2292 continue 

2293 raise 

2294 

2295 # There is no point asking the code below to remove refs we 

2296 # know are missing so update it with the list of existing 

2297 # records. Try to retain one vs many logic. 

2298 if not existing_refs: 

2299 # Nothing more to do since none of the datasets were 

2300 # known to the datastore record table. 

2301 return 

2302 ref = list(existing_refs) 

2303 if len(ref) == 1: 

2304 ref = ref[0] 

2305 

2306 # Get file metadata and internal metadata 

2307 if not isinstance(ref, DatasetRef): 

2308 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2309 # Assumed to be an iterable of refs so bulk mode enabled. 

2310 try: 

2311 self.bridge.moveToTrash(ref, transaction=self._transaction) 

2312 except Exception as e: 

2313 if ignore_errors: 

2314 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2315 else: 

2316 raise 

2317 return 

2318 

2319 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2320 

2321 fileLocations = self._get_dataset_locations_info(ref) 

2322 

2323 if not fileLocations: 

2324 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2325 if ignore_errors: 

2326 log.warning(err_msg) 

2327 return 

2328 else: 

2329 raise FileNotFoundError(err_msg) 

2330 

2331 for location, storedFileInfo in fileLocations: 

2332 if not self._artifact_exists(location): 2332 ↛ 2333line 2332 didn't jump to line 2333

2333 err_msg = ( 

2334 f"Dataset is known to datastore {self.name} but " 

2335 f"associated artifact ({location.uri}) is missing" 

2336 ) 

2337 if ignore_errors: 

2338 log.warning(err_msg) 

2339 return 

2340 else: 

2341 raise FileNotFoundError(err_msg) 

2342 

2343 # Mark dataset as trashed 

2344 try: 

2345 self.bridge.moveToTrash([ref], transaction=self._transaction) 

2346 except Exception as e: 

2347 if ignore_errors: 

2348 log.warning( 

2349 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2350 "but encountered an error: %s", 

2351 ref, 

2352 self.name, 

2353 e, 

2354 ) 

2355 pass 

2356 else: 

2357 raise 

2358 

2359 @transactional 

2360 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2361 """Remove all datasets from the trash. 

2362 

2363 Parameters 

2364 ---------- 

2365 ignore_errors : `bool` 

2366 If `True` return without error even if something went wrong. 

2367 Problems could occur if another process is simultaneously trying 

2368 to delete. 

2369 """ 

2370 log.debug("Emptying trash in datastore %s", self.name) 

2371 

2372 # Context manager will empty trash iff we finish it without raising. 

2373 # It will also automatically delete the relevant rows from the 

2374 # trash table and the records table. 

2375 with self.bridge.emptyTrash( 

2376 self._table, record_class=StoredFileInfo, record_column="path" 

2377 ) as trash_data: 

2378 # Removing the artifacts themselves requires that the files are 

2379 # not also associated with refs that are not to be trashed. 

2380 # Therefore need to do a query with the file paths themselves 

2381 # and return all the refs associated with them. Can only delete 

2382 # a file if the refs to be trashed are the only refs associated 

2383 # with the file. 

2384 # This requires multiple copies of the trashed items 

2385 trashed, artifacts_to_keep = trash_data 

2386 

2387 if artifacts_to_keep is None: 

2388 # The bridge is not helping us so have to work it out 

2389 # ourselves. This is not going to be as efficient. 

2390 trashed = list(trashed) 

2391 

2392 # The instance check is for mypy since up to this point it 

2393 # does not know the type of info. 

2394 path_map = self._refs_associated_with_artifacts( 

2395 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2396 ) 

2397 

2398 for ref, info in trashed: 

2399 # Mypy needs to know this is not the base class 

2400 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2401 

2402 # Check for mypy 

2403 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2404 

2405 path_map[info.path].remove(ref.id) 

2406 if not path_map[info.path]: 2406 ↛ 2398line 2406 didn't jump to line 2398, because the condition on line 2406 was never false

2407 del path_map[info.path] 

2408 

2409 artifacts_to_keep = set(path_map) 

2410 

2411 for ref, info in trashed: 

2412 # Should not happen for this implementation but need 

2413 # to keep mypy happy. 

2414 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2415 

2416 # Mypy needs to know this is not the base class 

2417 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2418 

2419 # Check for mypy 

2420 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2421 

2422 if info.path in artifacts_to_keep: 

2423 # This is a multi-dataset artifact and we are not 

2424 # removing all associated refs. 

2425 continue 

2426 

2427 # Only trashed refs still known to datastore will be returned. 

2428 location = info.file_location(self.locationFactory) 

2429 

2430 # Point of no return for this artifact 

2431 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2432 try: 

2433 self._delete_artifact(location) 

2434 except FileNotFoundError: 

2435 # If the file itself has been deleted there is nothing 

2436 # we can do about it. It is possible that trash has 

2437 # been run in parallel in another process or someone 

2438 # decided to delete the file. It is unlikely to come 

2439 # back and so we should still continue with the removal 

2440 # of the entry from the trash table. It is also possible 

2441 # we removed it in a previous iteration if it was 

2442 # a multi-dataset artifact. The delete artifact method 

2443 # will log a debug message in this scenario. 

2444 # Distinguishing a file that was missing before trash started

2445 # from one already removed earlier in this trash run is not

2446 # worth the potential memory cost of tracking which case

2447 # applies.

2448 pass 

2449 except Exception as e: 

2450 if ignore_errors: 

2451 # Use a debug message here even though it's not 

2452 # a good situation. In some cases this can be 

2453 # caused by a race between user A and user B 

2454 # and neither of them has permissions for the 

2455 # other's files. Butler does not know about users 

2456 # and trash has no idea what collections these 

2457 # files were in (without guessing from a path). 

2458 log.debug( 

2459 "Encountered error removing artifact %s from datastore %s: %s", 

2460 location.uri, 

2461 self.name, 

2462 e, 

2463 ) 

2464 else: 

2465 raise 

2466 
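
Deletion is a two-step process: trash() marks the datasets as trashed (removing artifacts immediately only in trust mode, when they are not known to the record table), and emptyTrash() later deletes the remaining artifacts while keeping any file still shared with refs that were not trashed. A short sketch under the usual hypothetical ``datastore``/``refs`` assumptions:

    # Mark the datasets as trashed; problems are logged rather than raised.
    datastore.trash(refs, ignore_errors=True)

    # Later (possibly from another process) remove the trashed artifacts.
    datastore.emptyTrash(ignore_errors=True)
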

2467 @transactional 

2468 def transfer_from( 

2469 self, 

2470 source_datastore: Datastore, 

2471 refs: Iterable[DatasetRef], 

2472 local_refs: Optional[Iterable[DatasetRef]] = None, 

2473 transfer: str = "auto", 

2474 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

2475 ) -> None: 

2476 # Docstring inherited 

2477 if type(self) is not type(source_datastore): 

2478 raise TypeError( 

2479 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2480 f"source datastore ({type(source_datastore)})." 

2481 ) 

2482 

2483 # Be explicit for mypy 

2484 if not isinstance(source_datastore, FileDatastore): 2484 ↛ 2485line 2484 didn't jump to line 2485, because the condition on line 2484 was never true

2485 raise TypeError( 

2486 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2487 f" {type(source_datastore)}" 

2488 ) 

2489 

2490 # Stop early if "direct" transfer mode is requested. That would 

2491 # require that the URI inside the source datastore should be stored 

2492 # directly in the target datastore, which seems unlikely to be useful 

2493 # since at any moment the source datastore could delete the file. 

2494 if transfer in ("direct", "split"): 

2495 raise ValueError( 

2496 f"Can not transfer from a source datastore using {transfer} mode since" 

2497 " those files are controlled by the other datastore." 

2498 ) 

2499 

2500 # Empty existence lookup if none given. 

2501 if artifact_existence is None: 

2502 artifact_existence = {} 

2503 

2504 # We will go through the list multiple times so must convert 

2505 # generators to lists. 

2506 refs = list(refs) 

2507 

2508 if local_refs is None: 

2509 local_refs = refs 

2510 else: 

2511 local_refs = list(local_refs) 

2512 

2513 # In order to handle disassembled composites the code works 

2514 # at the records level since it can assume that internal APIs 

2515 # can be used. 

2516 # - If the record already exists in the destination this is assumed 

2517 # to be okay. 

2518 # - If there is no record but the source and destination URIs are 

2519 # identical no transfer is done but the record is added. 

2520 # - If the source record refers to an absolute URI currently assume 

2521 # that that URI should remain absolute and will be visible to the 

2522 # destination butler. May need to have a flag to indicate whether 

2523 # the dataset should be transferred. This will only happen if 

2524 # the detached Butler has had a local ingest. 

2525 

2526 # What we really want is all the records in the source datastore 

2527 # associated with these refs. Or derived ones if they don't exist 

2528 # in the source. 

2529 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2530 

2531 # The source dataset_ids are the keys in these records 

2532 source_ids = set(source_records) 

2533 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2534 

2535 # The not None check is to appease mypy 

2536 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

2537 missing_ids = requested_ids - source_ids 

2538 

2539 # Missing IDs can be okay if that datastore has allowed 

2540 # gets based on file existence. Should we transfer what we can 

2541 # or complain about it and warn? 

2542 if missing_ids and not source_datastore.trustGetRequest: 2542 ↛ 2543line 2542 didn't jump to line 2543, because the condition on line 2542 was never true

2543 raise ValueError( 

2544 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2545 ) 

2546 

2547 # Need to map these missing IDs to a DatasetRef so we can guess 

2548 # the details. 

2549 if missing_ids: 

2550 log.info( 

2551 "Number of expected datasets missing from source datastore records: %d out of %d", 

2552 len(missing_ids), 

2553 len(requested_ids), 

2554 ) 

2555 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2556 

2557 # This should be chunked in case we end up having to check 

2558 # the file store since we need some log output to show 

2559 # progress. 

2560 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2561 records = {} 

2562 for missing in missing_ids_chunk: 

2563 # Ask the source datastore where the missing artifacts 

2564 # should be. An execution butler might not know about the 

2565 # artifacts even if they are there. 

2566 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2567 records[missing] = [info for _, info in expected] 

2568 

2569 # Call the mexists helper method in case we have not already 

2570 # checked these artifacts such that artifact_existence is 

2571 # empty. This allows us to benefit from parallelism. 

2572 # datastore.mexists() itself does not give us access to the 

2573 # derived datastore record. 

2574 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2575 ref_exists = source_datastore._process_mexists_records( 

2576 id_to_ref, records, False, artifact_existence=artifact_existence 

2577 ) 

2578 

2579 # Now go through the records and propagate the ones that exist. 

2580 location_factory = source_datastore.locationFactory 

2581 for missing, record_list in records.items(): 

2582 # Skip completely if the ref does not exist. 

2583 ref = id_to_ref[missing] 

2584 if not ref_exists[ref]: 

2585 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2586 continue 

2587 # Check for file artifact to decide which parts of a 

2588 # disassembled composite do exist. If there is only a 

2589 # single record we don't even need to look because it can't 

2590 # be a composite and must exist. 

2591 if len(record_list) == 1: 

2592 dataset_records = record_list 

2593 else: 

2594 dataset_records = [ 

2595 record 

2596 for record in record_list 

2597 if artifact_existence[record.file_location(location_factory).uri] 

2598 ] 

2599 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2600 

2601 # Rely on source_records being a defaultdict. 

2602 source_records[missing].extend(dataset_records) 

2603 

2604 # See if we already have these records 

2605 target_records = self._get_stored_records_associated_with_refs(local_refs) 

2606 

2607 # The artifacts to register 

2608 artifacts = [] 

2609 

2610 # Refs that already exist 

2611 already_present = [] 

2612 

2613 # Now can transfer the artifacts 

2614 for source_ref, target_ref in zip(refs, local_refs): 

2615 if target_ref.id in target_records: 

2616 # Already have an artifact for this. 

2617 already_present.append(target_ref) 

2618 continue 

2619 

2620 # mypy needs to know these are always resolved refs 

2621 for info in source_records[source_ref.getCheckedId()]: 

2622 source_location = info.file_location(source_datastore.locationFactory) 

2623 target_location = info.file_location(self.locationFactory) 

2624 if source_location == target_location: 2624 ↛ 2628line 2624 didn't jump to line 2628, because the condition on line 2624 was never true

2625 # Either the dataset is already in the target datastore 

2626 # (which is how execution butler currently runs) or 

2627 # it is an absolute URI. 

2628 if source_location.pathInStore.isabs(): 

2629 # Just because we can see the artifact when running 

2630 # the transfer doesn't mean it will be generally 

2631 # accessible to a user of this butler. For now warn 

2632 # but assume it will be accessible. 

2633 log.warning( 

2634 "Transfer request for an outside-datastore artifact has been found at %s", 

2635 source_location, 

2636 ) 

2637 else: 

2638 # Need to transfer it to the new location. 

2639 # Assume we should always overwrite. If the artifact 

2640 # is there this might indicate that a previous transfer 

2641 # was interrupted but was not able to be rolled back 

2642 # completely (eg pre-emption) so follow Datastore default 

2643 # and overwrite. 

2644 target_location.uri.transfer_from( 

2645 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2646 ) 

2647 

2648 artifacts.append((target_ref, info)) 

2649 

2650 self._register_datasets(artifacts) 

2651 

2652 if already_present: 

2653 n_skipped = len(already_present) 

2654 log.info( 

2655 "Skipped transfer of %d dataset%s already present in datastore", 

2656 n_skipped, 

2657 "" if n_skipped == 1 else "s", 

2658 ) 

2659 
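
A sketch of transferring datasets between two FileDatastore instances, assuming hypothetical ``source_datastore``, ``target_datastore``, and ``refs`` known to the source. A shared ``artifact_existence`` cache avoids re-probing files already checked, and the "direct"/"split" modes are rejected because the source datastore still owns those files:

    from typing import Dict
    from lsst.resources import ResourcePath

    artifact_existence: Dict[ResourcePath, bool] = {}
    target_datastore.transfer_from(
        source_datastore,
        refs,
        transfer="copy",
        artifact_existence=artifact_existence,
    )
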

2660 @transactional 

2661 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2662 # Docstring inherited. 

2663 refs = list(refs) 

2664 self.bridge.forget(refs) 

2665 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

2666 

2667 def validateConfiguration( 

2668 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

2669 ) -> None: 

2670 """Validate some of the configuration for this datastore. 

2671 

2672 Parameters 

2673 ---------- 

2674 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2675 Entities to test against this configuration. Can be differing 

2676 types. 

2677 logFailures : `bool`, optional 

2678 If `True`, output a log message for every validation error 

2679 detected. 

2680 

2681 Raises 

2682 ------ 

2683 DatastoreValidationError 

2684 Raised if there is a validation problem with a configuration. 

2685 All the problems are reported in a single exception. 

2686 

2687 Notes 

2688 ----- 

2689 This method checks that all the supplied entities have valid file 

2690 templates and also have formatters defined. 

2691 """ 

2692 

2693 templateFailed = None 

2694 try: 

2695 self.templates.validateTemplates(entities, logFailures=logFailures) 

2696 except FileTemplateValidationError as e: 

2697 templateFailed = str(e) 

2698 

2699 formatterFailed = [] 

2700 for entity in entities: 

2701 try: 

2702 self.formatterFactory.getFormatterClass(entity) 

2703 except KeyError as e: 

2704 formatterFailed.append(str(e)) 

2705 if logFailures: 2705 ↛ 2700line 2705 didn't jump to line 2700, because the condition on line 2705 was never false

2706 log.critical("Formatter failure: %s", e) 

2707 

2708 if templateFailed or formatterFailed: 

2709 messages = [] 

2710 if templateFailed: 2710 ↛ 2711line 2710 didn't jump to line 2711, because the condition on line 2710 was never true

2711 messages.append(templateFailed) 

2712 if formatterFailed: 2712 ↛ 2714line 2712 didn't jump to line 2714, because the condition on line 2712 was never false

2713 messages.append(",".join(formatterFailed)) 

2714 msg = ";\n".join(messages) 

2715 raise DatastoreValidationError(msg) 

2716 

2717 def getLookupKeys(self) -> Set[LookupKey]: 

2718 # Docstring is inherited from base class 

2719 return ( 

2720 self.templates.getLookupKeys() 

2721 | self.formatterFactory.getLookupKeys() 

2722 | self.constraints.getLookupKeys() 

2723 ) 

2724 

2725 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

2726 # Docstring is inherited from base class 

2727 # The key can be valid in either formatters or templates so we can 

2728 # only check the template if it exists 

2729 if lookupKey in self.templates: 

2730 try: 

2731 self.templates[lookupKey].validateTemplate(entity) 

2732 except FileTemplateValidationError as e: 

2733 raise DatastoreValidationError(e) from e 

2734 

2735 def export( 

2736 self, 

2737 refs: Iterable[DatasetRef], 

2738 *, 

2739 directory: Optional[ResourcePathExpression] = None, 

2740 transfer: Optional[str] = "auto", 

2741 ) -> Iterable[FileDataset]: 

2742 # Docstring inherited from Datastore.export. 

2743 if transfer == "auto" and directory is None: 

2744 transfer = None 

2745 

2746 if transfer is not None and directory is None: 

2747 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2748 

2749 if transfer == "move": 

2750 raise TypeError("Can not export by moving files out of datastore.") 

2751 elif transfer == "direct": 2751 ↛ 2755line 2751 didn't jump to line 2755, because the condition on line 2751 was never true

2752 # For an export, treat this as equivalent to None. We do not 

2753 # want an import to risk using absolute URIs to datasets owned 

2754 # by another datastore. 

2755 log.info("Treating 'direct' transfer mode as in-place export.") 

2756 transfer = None 

2757 

2758 # Force the directory to be a URI object 

2759 directoryUri: Optional[ResourcePath] = None 

2760 if directory is not None: 

2761 directoryUri = ResourcePath(directory, forceDirectory=True) 

2762 

2763 if transfer is not None and directoryUri is not None: 

2764 # mypy needs the second test 

2765 if not directoryUri.exists(): 2765 ↛ 2766line 2765 didn't jump to line 2766, because the condition on line 2765 was never true

2766 raise FileNotFoundError(f"Export location {directory} does not exist") 

2767 

2768 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2769 for ref in progress.wrap(refs, "Exporting dataset files"): 

2770 fileLocations = self._get_dataset_locations_info(ref) 

2771 if not fileLocations: 

2772 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2773 # For now we can not export disassembled datasets 

2774 if len(fileLocations) > 1: 

2775 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2776 location, storedFileInfo = fileLocations[0] 

2777 

2778 pathInStore = location.pathInStore.path 

2779 if transfer is None: 

2780 # TODO: do we also need to return the readStorageClass somehow? 

2781 # We will use the path in store directly. If this is an 

2782 # absolute URI, preserve it. 

2783 if location.pathInStore.isabs(): 2783 ↛ 2784line 2783 didn't jump to line 2784, because the condition on line 2783 was never true

2784 pathInStore = str(location.uri) 

2785 elif transfer == "direct": 2785 ↛ 2787line 2785 didn't jump to line 2787, because the condition on line 2785 was never true

2786 # Use full URIs to the remote store in the export 

2787 pathInStore = str(location.uri) 

2788 else: 

2789 # mypy needs help 

2790 assert directoryUri is not None, "directoryUri must be defined to get here" 

2791 storeUri = ResourcePath(location.uri) 

2792 

2793 # if the datastore has an absolute URI to a resource, we 

2794 # have two options: 

2795 # 1. Keep the absolute URI in the exported YAML 

2796 # 2. Allocate a new name in the local datastore and transfer 

2797 # it. 

2798 # For now go with option 2 

2799 if location.pathInStore.isabs(): 2799 ↛ 2800line 2799 didn't jump to line 2800, because the condition on line 2799 was never true

2800 template = self.templates.getTemplate(ref) 

2801 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2802 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2803 

2804 exportUri = directoryUri.join(pathInStore) 

2805 exportUri.transfer_from(storeUri, transfer=transfer) 

2806 

2807 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2808 

2809 @staticmethod 

2810 def computeChecksum( 

2811 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192 

2812 ) -> Optional[str]: 

2813 """Compute the checksum of the supplied file. 

2814 

2815 Parameters 

2816 ---------- 

2817 uri : `lsst.resources.ResourcePath` 

2818 Name of resource to calculate checksum from. 

2819 algorithm : `str`, optional 

2820 Name of algorithm to use. Must be one of the algorithms supported 

2821 by :py:mod:`hashlib`. 

2822 block_size : `int` 

2823 Number of bytes to read from file at one time. 

2824 

2825 Returns 

2826 ------- 

2827 hexdigest : `str` 

2828 Hex digest of the file. 

2829 

2830 Notes 

2831 ----- 

2832 Currently returns None if the URI is for a remote resource. 

2833 """ 

2834 if algorithm not in hashlib.algorithms_guaranteed: 2834 ↛ 2835line 2834 didn't jump to line 2835, because the condition on line 2834 was never true

2835 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

2836 

2837 if not uri.isLocal: 2837 ↛ 2838line 2837 didn't jump to line 2838, because the condition on line 2837 was never true

2838 return None 

2839 

2840 hasher = hashlib.new(algorithm) 

2841 

2842 with uri.as_local() as local_uri: 

2843 with open(local_uri.ospath, "rb") as f: 

2844 for chunk in iter(lambda: f.read(block_size), b""): 

2845 hasher.update(chunk) 

2846 

2847 return hasher.hexdigest() 

2848 
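
computeChecksum() is a static method, so it can be called without a datastore instance; it returns `None` for non-local URIs. A sketch using a purely illustrative local file path:

    from lsst.resources import ResourcePath

    checksum = FileDatastore.computeChecksum(
        ResourcePath("/tmp/example.fits"),  # illustrative path; must be a local file
        algorithm="sha256",                 # any hashlib guaranteed algorithm
        block_size=8192,
    )
    print(checksum)
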

2849 def needs_expanded_data_ids( 

2850 self, 

2851 transfer: Optional[str], 

2852 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2853 ) -> bool: 

2854 # Docstring inherited. 

2855 # This _could_ also use entity to inspect whether the filename template 

2856 # involves placeholders other than the required dimensions for its 

2857 # dataset type, but that's not necessary for correctness; it just 

2858 # enables more optimizations (perhaps only in theory). 

2859 return transfer not in ("direct", None) 

2860 

2861 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2862 # Docstring inherited from the base class. 

2863 record_data = data.get(self.name) 

2864 if not record_data: 2864 ↛ 2865line 2864 didn't jump to line 2865, because the condition on line 2864 was never true

2865 return 

2866 

2867 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records.keys()) 

2868 

2869 # TODO: Verify that there are no unexpected table names in the dict? 

2870 unpacked_records = [] 

2871 for dataset_data in record_data.records.values(): 

2872 records = dataset_data.get(self._table.name) 

2873 if records: 2873 ↛ 2871line 2873 didn't jump to line 2871, because the condition on line 2873 was never false

2874 for info in records: 

2875 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

2876 unpacked_records.append(info.to_record()) 

2877 if unpacked_records: 

2878 self._table.insert(*unpacked_records, transaction=self._transaction) 

2879 

2880 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

2881 # Docstring inherited from the base class. 

2882 exported_refs = list(self._bridge.check(refs)) 

2883 ids = {ref.getCheckedId() for ref in exported_refs} 

2884 records: defaultdict[DatasetId, defaultdict[str, List[StoredDatastoreItemInfo]]] = defaultdict( 

2885 lambda: defaultdict(list), {id: defaultdict(list) for id in ids} 

2886 ) 

2887 for row in self._table.fetch(dataset_id=ids): 

2888 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

2889 records[info.dataset_id][self._table.name].append(info) 

2890 

2891 record_data = DatastoreRecordData(records=records) 

2892 return {self.name: record_data}