Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 85%

942 statements  

coverage.py v6.5.0, created at 2023-01-07 10:07 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore",) 

26 

27import hashlib 

28import logging 

29from collections import defaultdict 

30from dataclasses import dataclass 

31from typing import ( 

32 TYPE_CHECKING, 

33 Any, 

34 ClassVar, 

35 Dict, 

36 Iterable, 

37 List, 

38 Mapping, 

39 Optional, 

40 Sequence, 

41 Set, 

42 Tuple, 

43 Type, 

44 Union, 

45) 

46 

47from lsst.daf.butler import ( 

48 CompositesMap, 

49 Config, 

50 DatasetId, 

51 DatasetRef, 

52 DatasetRefURIs, 

53 DatasetType, 

54 DatasetTypeNotSupportedError, 

55 Datastore, 

56 DatastoreCacheManager, 

57 DatastoreConfig, 

58 DatastoreDisabledCacheManager, 

59 DatastoreRecordData, 

60 DatastoreValidationError, 

61 FileDataset, 

62 FileDescriptor, 

63 FileTemplates, 

64 FileTemplateValidationError, 

65 Formatter, 

66 FormatterFactory, 

67 Location, 

68 LocationFactory, 

69 Progress, 

70 StorageClass, 

71 StoredDatastoreItemInfo, 

72 StoredFileInfo, 

73 ddl, 

74) 

75from lsst.daf.butler.core.repoRelocation import replaceRoot 

76from lsst.daf.butler.core.utils import transactional 

77from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

78from lsst.resources import ResourcePath, ResourcePathExpression 

79from lsst.utils.introspection import get_class_of, get_instance_of 

80from lsst.utils.iteration import chunk_iterable 

81 

82# For VERBOSE logging usage. 

83from lsst.utils.logging import VERBOSE, getLogger 

84from lsst.utils.timer import time_this 

85from sqlalchemy import BigInteger, String 

86 

87from ..registry.interfaces import FakeDatasetRef 

88from .genericDatastore import GenericBaseDatastore 

89 

90if TYPE_CHECKING:  # coverage: line 90 didn't jump to line 91, because the condition on line 90 was never true

91 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

92 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

93 

94log = getLogger(__name__) 

95 

96 

97class _IngestPrepData(Datastore.IngestPrepData): 

98 """Helper class for FileDatastore ingest implementation. 

99 

100 Parameters 

101 ---------- 

102 datasets : `list` of `FileDataset` 

103 Files to be ingested by this datastore. 

104 """ 

105 

106 def __init__(self, datasets: List[FileDataset]): 

107 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

108 self.datasets = datasets 

109 

110 

111@dataclass(frozen=True) 

112class DatastoreFileGetInformation: 

113 """Collection of useful parameters needed to retrieve a file from 

114 a Datastore. 

115 """ 

116 

117 location: Location 

118 """The location from which to read the dataset.""" 

119 

120 formatter: Formatter 

121 """The `Formatter` to use to deserialize the dataset.""" 

122 

123 info: StoredFileInfo 

124 """Stored information about this file and its formatter.""" 

125 

126 assemblerParams: Mapping[str, Any] 

127 """Parameters to use for post-processing the retrieved dataset.""" 

128 

129 formatterParams: Mapping[str, Any] 

130 """Parameters that were understood by the associated formatter.""" 

131 

132 component: Optional[str] 

133 """The component to be retrieved (can be `None`).""" 

134 

135 readStorageClass: StorageClass 

136 """The `StorageClass` of the dataset being read.""" 

137 

138 

139class FileDatastore(GenericBaseDatastore): 

140 """Generic Datastore for file-based implementations. 

141 

142 Should always be sub-classed since key abstract methods are missing. 

143 

144 Parameters 

145 ---------- 

146 config : `DatastoreConfig` or `str` 

147 Configuration as either a `Config` object or URI to file. 

148 bridgeManager : `DatastoreRegistryBridgeManager` 

149 Object that manages the interface between `Registry` and datastores. 

150 butlerRoot : `str`, optional 

151 New datastore root to use to override the configuration value. 

152 

153 Raises 

154 ------ 

155 ValueError 

156 If root location does not exist and ``create`` is `False` in the 

157 configuration. 
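
Examples
--------
Instances are normally created for you by `Butler` from the repository
configuration rather than constructed directly. A rough, illustrative
sketch of the configuration keys read by ``__init__`` (the values below
are placeholders, not an authoritative or complete configuration):

>>> datastore_config_fragment = {
...     "root": "<butlerRoot>/datastore",
...     "create": True,
...     "records": {"table": "file_datastore_records"},
...     "checksum": False,
...     "trust_get_request": False,
... }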

158 """ 

159 

160 defaultConfigFile: ClassVar[Optional[str]] = None 

161 """Path to configuration defaults. Accessed within the ``config`` resource 

162 or relative to a search path. Can be None if no defaults specified. 

163 """ 

164 

165 root: ResourcePath 

166 """Root directory URI of this `Datastore`.""" 

167 

168 locationFactory: LocationFactory 

169 """Factory for creating locations relative to the datastore root.""" 

170 

171 formatterFactory: FormatterFactory 

172 """Factory for creating instances of formatters.""" 

173 

174 templates: FileTemplates 

175 """File templates that can be used by this `Datastore`.""" 

176 

177 composites: CompositesMap 

178 """Determines whether a dataset should be disassembled on put.""" 

179 

180 defaultConfigFile = "datastores/fileDatastore.yaml" 

181 """Path to configuration defaults. Accessed within the ``config`` resource 

182 or relative to a search path. Can be None if no defaults specified. 

183 """ 

184 

185 @classmethod 

186 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

187 """Set any filesystem-dependent config options for this Datastore to 

188 be appropriate for a new empty repository with the given root. 

189 

190 Parameters 

191 ---------- 

192 root : `str` 

193 URI to the root of the data repository. 

194 config : `Config` 

195 A `Config` to update. Only the subset understood by 

196 this component will be updated. Will not expand 

197 defaults. 

198 full : `Config` 

199 A complete config with all defaults expanded that can be 

200 converted to a `DatastoreConfig`. Read-only and will not be 

201 modified by this method. 

202 Repository-specific options that should not be obtained 

203 from defaults when Butler instances are constructed 

204 should be copied from ``full`` to ``config``. 

205 overwrite : `bool`, optional 

206 If `False`, do not modify a value in ``config`` if the value 

207 already exists. Default is always to overwrite with the provided 

208 ``root``. 

209 

210 Notes 

211 ----- 

212 If a keyword is explicitly defined in the supplied ``config`` it 

213 will not be overridden by this method if ``overwrite`` is `False`. 

214 This allows explicit values set in external configs to be retained. 
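
Examples
--------
A minimal sketch of the call shape. ``config`` and ``full`` are assumed
to be pre-existing `Config` objects for the repository; building them is
outside the scope of this example, so the call is not executed here:

>>> FileDatastore.setConfigRoot(  # doctest: +SKIP
...     "file:///data/new_repo", config, full, overwrite=True
... )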

215 """ 

216 Config.updateParameters( 

217 DatastoreConfig, 

218 config, 

219 full, 

220 toUpdate={"root": root}, 

221 toCopy=("cls", ("records", "table")), 

222 overwrite=overwrite, 

223 ) 

224 

225 @classmethod 

226 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

227 return ddl.TableSpec( 

228 fields=[ 

229 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

230 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

231 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

232 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

233 # Use empty string to indicate no component 

234 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

235 # TODO: should checksum be Base64Bytes instead? 

236 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

237 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

238 ], 

239 unique=frozenset(), 

240 indexes=[ddl.IndexSpec("path")], 

241 ) 

242 

243 def __init__( 

244 self, 

245 config: Union[DatastoreConfig, str], 

246 bridgeManager: DatastoreRegistryBridgeManager, 

247 butlerRoot: str | None = None, 

248 ): 

249 super().__init__(config, bridgeManager) 

250 if "root" not in self.config: 250 ↛ 251line 250 didn't jump to line 251, because the condition on line 250 was never true

251 raise ValueError("No root directory specified in configuration") 

252 

253 self._bridgeManager = bridgeManager 

254 

255 # Name ourselves either using an explicit name or a name 

256 # derived from the (unexpanded) root 

257 if "name" in self.config: 

258 self.name = self.config["name"] 

259 else: 

260 # We use the unexpanded root in the name to indicate that this 

261 # datastore can be moved without having to update registry. 

262 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

263 

264 # Support repository relocation in config 

265 # Existence of self.root is checked in subclass 

266 self.root = ResourcePath( 

267 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

268 ) 

269 

270 self.locationFactory = LocationFactory(self.root) 

271 self.formatterFactory = FormatterFactory() 

272 

273 # Now associate formatters with storage classes 

274 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

275 

276 # Read the file naming templates 

277 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

278 

279 # See if composites should be disassembled 

280 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

281 

282 tableName = self.config["records", "table"] 

283 try: 

284 # Storage of paths and formatters, keyed by dataset_id 

285 self._table = bridgeManager.opaque.register( 

286 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

287 ) 

288 # Interface to Registry. 

289 self._bridge = bridgeManager.register(self.name) 

290 except ReadOnlyDatabaseError: 

291 # If the database is read only and we just tried and failed to 

292 # create a table, it means someone is trying to create a read-only 

293 # butler client for an empty repo. That should be okay, as long 

294 # as they then try to get any datasets before some other client 

295 # creates the table. Chances are they're just validating

296 # configuration. 

297 pass 

298 

299 # Determine whether checksums should be used - default to False 

300 self.useChecksum = self.config.get("checksum", False) 

301 

302 # Determine whether we can fall back to configuration if a 

303 # requested dataset is not known to registry 

304 self.trustGetRequest = self.config.get("trust_get_request", False) 

305 

306 # Create a cache manager 

307 self.cacheManager: AbstractDatastoreCacheManager 

308 if "cached" in self.config: 308 ↛ 311line 308 didn't jump to line 311, because the condition on line 308 was never false

309 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

310 else: 

311 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

312 

313 # Check existence and create directory structure if necessary 

314 if not self.root.exists(): 

315 if "create" not in self.config or not self.config["create"]: 315 ↛ 316line 315 didn't jump to line 316, because the condition on line 315 was never true

316 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

317 try: 

318 self.root.mkdir() 

319 except Exception as e: 

320 raise ValueError( 

321 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

322 ) from e 

323 

324 def __str__(self) -> str: 

325 return str(self.root) 

326 

327 @property 

328 def bridge(self) -> DatastoreRegistryBridge: 

329 return self._bridge 

330 

331 def _artifact_exists(self, location: Location) -> bool: 

332 """Check that an artifact exists in this datastore at the specified 

333 location. 

334 

335 Parameters 

336 ---------- 

337 location : `Location` 

338 Expected location of the artifact associated with this datastore. 

339 

340 Returns 

341 ------- 

342 exists : `bool` 

343 True if the location can be found, false otherwise. 

344 """ 

345 log.debug("Checking if resource exists: %s", location.uri) 

346 return location.uri.exists() 

347 

348 def _delete_artifact(self, location: Location) -> None: 

349 """Delete the artifact from the datastore. 

350 

351 Parameters 

352 ---------- 

353 location : `Location` 

354 Location of the artifact associated with this datastore. 

355 """ 

356 if location.pathInStore.isabs():  # coverage: line 356 didn't jump to line 357, because the condition on line 356 was never true

357 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

358 

359 try: 

360 location.uri.remove() 

361 except FileNotFoundError: 

362 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

363 raise 

364 except Exception as e: 

365 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

366 raise 

367 log.debug("Successfully deleted file: %s", location.uri) 

368 

369 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

370 # Docstring inherited from GenericBaseDatastore 

371 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos)] 

372 self._table.insert(*records, transaction=self._transaction) 

373 

374 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

375 # Docstring inherited from GenericBaseDatastore 

376 

377 # Look for the dataset_id -- there might be multiple matches 

378 # if we have disassembled the dataset. 

379 records = self._table.fetch(dataset_id=ref.id) 

380 return [StoredFileInfo.from_record(record) for record in records] 

381 

382 def _get_stored_records_associated_with_refs( 

383 self, refs: Iterable[DatasetIdRef] 

384 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

385 """Retrieve all records associated with the provided refs. 

386 

387 Parameters 

388 ---------- 

389 refs : iterable of `DatasetIdRef` 

390 The refs for which records are to be retrieved. 

391 

392 Returns 

393 ------- 

394 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

395 The matching records indexed by the ref ID. The number of entries 

396 in the dict can be smaller than the number of requested refs. 
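
Examples
--------
Illustrative sketch assuming ``datastore`` and ``refs`` already exist.
Because the returned mapping can be sparse, missing refs are found by
key membership rather than by assuming one entry per ref:

>>> records = datastore._get_stored_records_associated_with_refs(refs)  # doctest: +SKIP
>>> missing = [ref for ref in refs if ref.id not in records]  # doctest: +SKIP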

397 """ 

398 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

399 

400 # Uniqueness is dataset_id + component so can have multiple records 

401 # per ref. 

402 records_by_ref = defaultdict(list) 

403 for record in records: 

404 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

405 return records_by_ref 

406 

407 def _refs_associated_with_artifacts( 

408 self, paths: List[Union[str, ResourcePath]] 

409 ) -> Dict[str, Set[DatasetId]]: 

410 """Return paths and associated dataset refs. 

411 

412 Parameters 

413 ---------- 

414 paths : `list` of `str` or `lsst.resources.ResourcePath` 

415 All the paths to include in search. 

416 

417 Returns 

418 ------- 

419 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

420 Mapping of each path to a set of associated database IDs. 

421 """ 

422 records = self._table.fetch(path=[str(path) for path in paths]) 

423 result = defaultdict(set) 

424 for row in records: 

425 result[row["path"]].add(row["dataset_id"]) 

426 return result 

427 

428 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]: 

429 """Return all dataset refs associated with the supplied path. 

430 

431 Parameters 

432 ---------- 

433 pathInStore : `lsst.resources.ResourcePath` 

434 Path of interest in the data store. 

435 

436 Returns 

437 ------- 

438 ids : `set` [`DatasetId`]

439 All `DatasetRef` IDs associated with this path. 

440 """ 

441 records = list(self._table.fetch(path=str(pathInStore))) 

442 ids = {r["dataset_id"] for r in records} 

443 return ids 

444 

445 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

446 # Docstring inherited from GenericBaseDatastore 

447 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

448 

449 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

450 r"""Find all the `Location`\ s of the requested dataset in the 

451 `Datastore` and the associated stored file information. 

452 

453 Parameters 

454 ---------- 

455 ref : `DatasetRef` 

456 Reference to the required `Dataset`. 

457 

458 Returns 

459 ------- 

460 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

461 Location of the dataset within the datastore and 

462 stored information about each file and its formatter. 
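
Examples
--------
Illustrative sketch assuming ``datastore`` and ``ref`` already exist; a
disassembled composite yields one entry per component artifact:

>>> for location, info in datastore._get_dataset_locations_info(ref):  # doctest: +SKIP
...     print(location.uri, info.formatter)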

463 """ 

464 # Get the file information (this will fail if no file) 

465 records = self.getStoredItemsInfo(ref) 

466 

467 # Use the path to determine the location -- we need to take 

468 # into account absolute URIs in the datastore record 

469 return [(r.file_location(self.locationFactory), r) for r in records] 

470 

471 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

472 """Check that there is only one dataset associated with the 

473 specified artifact. 

474 

475 Parameters 

476 ---------- 

477 ref : `DatasetRef` or `FakeDatasetRef` 

478 Dataset to be removed. 

479 location : `Location` 

480 The location of the artifact to be removed. 

481 

482 Returns 

483 ------- 

484 can_remove : `bool`

485 True if the artifact can be safely removed. 

486 """ 

487 # Can't ever delete absolute URIs. 

488 if location.pathInStore.isabs(): 

489 return False 

490 

491 # Get all entries associated with this path 

492 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

493 if not allRefs: 

494 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

495 

496 # Remove these refs from all the refs and if there is nothing left 

497 # then we can delete 

498 remainingRefs = allRefs - {ref.id} 

499 

500 if remainingRefs: 

501 return False 

502 return True 

503 

504 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]: 

505 """Predict the location and related file information of the requested 

506 dataset in this datastore. 

507 

508 Parameters 

509 ---------- 

510 ref : `DatasetRef` 

511 Reference to the required `Dataset`. 

512 

513 Returns 

514 ------- 

515 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

516 Expected Location of the dataset within the datastore and 

517 placeholder information about each file and its formatter. 

518 

519 Notes 

520 ----- 

521 Uses the current configuration to determine how we would expect the 

522 datastore files to have been written if we couldn't ask registry. 

523 This is safe so long as there has been no change to datastore 

524 configuration between writing the dataset and wanting to read it. 

525 Will not work for files that have been ingested without using the 

526 standard file template or default formatter. 

527 """ 

528 

529 # If we have a component ref we always need to ask the questions 

530 # of the composite. If the composite is disassembled this routine 

531 # should return all components. If the composite was not 

532 # disassembled the composite is what is stored regardless of 

533 # component request. Note that if the caller has disassembled 

534 # a composite there is no way for this guess to know that 

535 # without trying both the composite and component ref and seeing 

536 # if there is something at the component Location even without 

537 # disassembly being enabled. 

538 if ref.datasetType.isComponent(): 

539 ref = ref.makeCompositeRef() 

540 

541 # See if the ref is a composite that should be disassembled 

542 doDisassembly = self.composites.shouldBeDisassembled(ref) 

543 

544 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

545 

546 if doDisassembly: 

547 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

548 compRef = ref.makeComponentRef(component) 

549 location, formatter = self._determine_put_formatter_location(compRef) 

550 all_info.append((location, formatter, componentStorage, component)) 

551 

552 else: 

553 # Always use the composite ref if no disassembly 

554 location, formatter = self._determine_put_formatter_location(ref) 

555 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

556 

557 # Convert the list of tuples to have StoredFileInfo as second element 

558 return [ 

559 ( 

560 location, 

561 StoredFileInfo( 

562 formatter=formatter, 

563 path=location.pathInStore.path, 

564 storageClass=storageClass, 

565 component=component, 

566 checksum=None, 

567 file_size=-1, 

568 dataset_id=ref.getCheckedId(), 

569 ), 

570 ) 

571 for location, formatter, storageClass, component in all_info 

572 ] 

573 

574 def _prepare_for_get( 

575 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None 

576 ) -> List[DatastoreFileGetInformation]: 

577 """Check parameters for ``get`` and obtain formatter and 

578 location. 

579 

580 Parameters 

581 ---------- 

582 ref : `DatasetRef` 

583 Reference to the required Dataset. 

584 parameters : `dict` 

585 `StorageClass`-specific parameters that specify, for example, 

586 a slice of the dataset to be loaded. 

587 

588 Returns 

589 ------- 

590 getInfo : `list` [`DatastoreFileGetInformation`] 

591 Parameters needed to retrieve each file. 

592 """ 

593 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

594 

595 # Get file metadata and internal metadata 

596 fileLocations = self._get_dataset_locations_info(ref) 

597 if not fileLocations: 

598 if not self.trustGetRequest: 

599 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

600 # Assume the dataset is where we think it should be 

601 fileLocations = self._get_expected_dataset_locations_info(ref) 

602 

603 # The storage class we want to use eventually 

604 refStorageClass = ref.datasetType.storageClass 

605 

606 if len(fileLocations) > 1: 

607 disassembled = True 

608 

609 # If trust is involved it is possible that there will be 

610 # components listed here that do not exist in the datastore. 

611 # Explicitly check for file artifact existence and filter out any 

612 # that are missing. 

613 if self.trustGetRequest: 

614 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

615 

616 # For now complain only if we have no components at all. One 

617 # component is probably a problem but we can punt that to the 

618 # assembler. 

619 if not fileLocations:  # coverage: line 619 didn't jump to line 620, because the condition on line 619 was never true

620 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

621 

622 else: 

623 disassembled = False 

624 

625 # Is this a component request? 

626 refComponent = ref.datasetType.component() 

627 

628 fileGetInfo = [] 

629 for location, storedFileInfo in fileLocations: 

630 

631 # The storage class used to write the file 

632 writeStorageClass = storedFileInfo.storageClass 

633 

634 # If this has been disassembled we need read to match the write 

635 if disassembled: 

636 readStorageClass = writeStorageClass 

637 else: 

638 readStorageClass = refStorageClass 

639 

640 formatter = get_instance_of( 

641 storedFileInfo.formatter, 

642 FileDescriptor( 

643 location, 

644 readStorageClass=readStorageClass, 

645 storageClass=writeStorageClass, 

646 parameters=parameters, 

647 ), 

648 ref.dataId, 

649 ) 

650 

651 formatterParams, notFormatterParams = formatter.segregateParameters() 

652 

653 # Of the remaining parameters, extract the ones supported by 

654 # this StorageClass (for components not all will be handled) 

655 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

656 

657 # The ref itself could be a component if the dataset was 

658 # disassembled by butler, or we disassembled in datastore and 

659 # components came from the datastore records 

660 component = storedFileInfo.component if storedFileInfo.component else refComponent 

661 

662 fileGetInfo.append( 

663 DatastoreFileGetInformation( 

664 location, 

665 formatter, 

666 storedFileInfo, 

667 assemblerParams, 

668 formatterParams, 

669 component, 

670 readStorageClass, 

671 ) 

672 ) 

673 

674 return fileGetInfo 

675 

676 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

677 """Check the arguments for ``put`` and obtain formatter and 

678 location. 

679 

680 Parameters 

681 ---------- 

682 inMemoryDataset : `object` 

683 The dataset to store. 

684 ref : `DatasetRef` 

685 Reference to the associated Dataset. 

686 

687 Returns 

688 ------- 

689 location : `Location` 

690 The location to write the dataset. 

691 formatter : `Formatter` 

692 The `Formatter` to use to write the dataset. 

693 

694 Raises 

695 ------ 

696 TypeError 

697 Supplied object and storage class are inconsistent. 

698 DatasetTypeNotSupportedError 

699 The associated `DatasetType` is not handled by this datastore. 

700 """ 

701 self._validate_put_parameters(inMemoryDataset, ref) 

702 return self._determine_put_formatter_location(ref) 

703 

704 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

705 """Calculate the formatter and output location to use for put. 

706 

707 Parameters 

708 ---------- 

709 ref : `DatasetRef` 

710 Reference to the associated Dataset. 

711 

712 Returns 

713 ------- 

714 location : `Location` 

715 The location to write the dataset. 

716 formatter : `Formatter` 

717 The `Formatter` to use to write the dataset. 

718 """ 

719 # Work out output file name 

720 try: 

721 template = self.templates.getTemplate(ref) 

722 except KeyError as e: 

723 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

724 

725 # Validate the template to protect against filenames from different 

726 # dataIds resolving to the same name and causing overwrite confusion.

727 template.validateTemplate(ref) 

728 

729 location = self.locationFactory.fromPath(template.format(ref)) 

730 

731 # Get the formatter based on the storage class 

732 storageClass = ref.datasetType.storageClass 

733 try: 

734 formatter = self.formatterFactory.getFormatter( 

735 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

736 ) 

737 except KeyError as e: 

738 raise DatasetTypeNotSupportedError( 

739 f"Unable to find formatter for {ref} in datastore {self.name}" 

740 ) from e 

741 

742 # Now that we know the formatter, update the location 

743 location = formatter.makeUpdatedLocation(location) 

744 

745 return location, formatter 

746 

747 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

748 # Docstring inherited from base class 

749 if transfer != "auto": 

750 return transfer 

751 

752 # See if the paths are within the datastore or not 

753 inside = [self._pathInStore(d.path) is not None for d in datasets] 

754 

755 if all(inside): 

756 transfer = None 

757 elif not any(inside):  # coverage: line 757 didn't jump to line 766, because the condition on line 757 was never false

758 # Allow ResourcePath to use its own knowledge 

759 transfer = "auto" 

760 else: 

761 # This can happen when importing from a datastore that

762 # has had some datasets ingested using "direct" mode, so

763 # some of the files live outside the datastore root.

764 # Also allow ResourcePath to sort it out, but warn because

765 # those external files will not be copied into this datastore.

766 log.warning( 

767 "Some datasets are inside the datastore and some are outside. Using 'split' " 

768 "transfer mode. This assumes that the files outside the datastore are " 

769 "still accessible to the new butler since they will not be copied into " 

770 "the target datastore." 

771 ) 

772 transfer = "split" 

773 

774 return transfer 

775 

776 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]: 

777 """Return path relative to datastore root 

778 

779 Parameters 

780 ---------- 

781 path : `lsst.resources.ResourcePathExpression` 

782 Path to dataset. Can be an absolute URI. If relative, it is

783 assumed to be relative to the datastore root. The result is the

784 path within the datastore, or `None` if the path is outside it.

785 

786 Returns 

787 ------- 

788 inStore : `str` 

789 Path relative to datastore root. Returns `None` if the file is 

790 outside the root. 
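
Examples
--------
Illustrative sketch assuming ``datastore.root`` is ``file:///repo/``;
the calls are not executed here because the result depends on the
configured root:

>>> rel = datastore._pathInStore("file:///repo/data/cat.fits")  # doctest: +SKIP
>>> datastore._pathInStore("file:///elsewhere/cat.fits") is None  # doctest: +SKIP
True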

791 """ 

792 # Relative path will always be relative to datastore 

793 pathUri = ResourcePath(path, forceAbsolute=False) 

794 return pathUri.relative_to(self.root) 

795 

796 def _standardizeIngestPath( 

797 self, path: Union[str, ResourcePath], *, transfer: Optional[str] = None 

798 ) -> Union[str, ResourcePath]: 

799 """Standardize the path of a to-be-ingested file. 

800 

801 Parameters 

802 ---------- 

803 path : `str` or `lsst.resources.ResourcePath` 

804 Path of a file to be ingested. This parameter is not expected

805 to accept all of the types that can be used to construct a

806 `~lsst.resources.ResourcePath`. 

807 transfer : `str`, optional 

808 How (and whether) the dataset should be added to the datastore. 

809 See `ingest` for details of transfer modes. 

810 This implementation is provided only so 

811 `NotImplementedError` can be raised if the mode is not supported; 

812 actual transfers are deferred to `_extractIngestInfo`. 

813 

814 Returns 

815 ------- 

816 path : `str` or `lsst.resources.ResourcePath` 

817 New path in what the datastore considers standard form. If an 

818 absolute URI was given that will be returned unchanged. 

819 

820 Notes 

821 ----- 

822 Subclasses of `FileDatastore` can implement this method instead 

823 of `_prepIngest`. It should not modify the data repository or given 

824 file in any way. 

825 

826 Raises 

827 ------ 

828 NotImplementedError 

829 Raised if the datastore does not support the given transfer mode 

830 (including the case where ingest is not supported at all). 

831 FileNotFoundError 

832 Raised if one of the given files does not exist. 
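
Examples
--------
Illustrative sketch assuming ``datastore`` exists and the named file is
present on disk (relative paths are checked against the datastore root):

>>> std_path = datastore._standardizeIngestPath(  # doctest: +SKIP
...     "incoming/raw.fits", transfer="copy"
... )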

833 """ 

834 if transfer not in (None, "direct", "split") + self.root.transferModes:  # coverage: line 834 didn't jump to line 835, because the condition on line 834 was never true

835 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

836 

837 # A relative URI indicates relative to datastore root 

838 srcUri = ResourcePath(path, forceAbsolute=False) 

839 if not srcUri.isabs(): 

840 srcUri = self.root.join(path) 

841 

842 if not srcUri.exists(): 

843 raise FileNotFoundError( 

844 f"Resource at {srcUri} does not exist; note that paths to ingest " 

845 f"are assumed to be relative to {self.root} unless they are absolute." 

846 ) 

847 

848 if transfer is None: 

849 relpath = srcUri.relative_to(self.root) 

850 if not relpath: 

851 raise RuntimeError( 

852 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

853 ) 

854 

855 # Return the relative path within the datastore for internal 

856 # transfer 

857 path = relpath 

858 

859 return path 

860 

861 def _extractIngestInfo( 

862 self, 

863 path: ResourcePathExpression, 

864 ref: DatasetRef, 

865 *, 

866 formatter: Union[Formatter, Type[Formatter]], 

867 transfer: Optional[str] = None, 

868 record_validation_info: bool = True, 

869 ) -> StoredFileInfo: 

870 """Relocate (if necessary) and extract `StoredFileInfo` from a 

871 to-be-ingested file. 

872 

873 Parameters 

874 ---------- 

875 path : `lsst.resources.ResourcePathExpression` 

876 URI or path of a file to be ingested. 

877 ref : `DatasetRef` 

878 Reference for the dataset being ingested. Guaranteed to have 

879 ``dataset_id not None``.

880 formatter : `type` or `Formatter` 

881 `Formatter` subclass to use for this dataset or an instance. 

882 transfer : `str`, optional 

883 How (and whether) the dataset should be added to the datastore. 

884 See `ingest` for details of transfer modes. 

885 record_validation_info : `bool`, optional 

886 If `True`, the default, the datastore can record validation 

887 information associated with the file. If `False` the datastore 

888 will not attempt to track any information such as checksums 

889 or file sizes. This can be useful if such information is tracked 

890 in an external system or if the file is to be compressed in place. 

891 It is up to the datastore whether this parameter is relevant. 

892 

893 Returns 

894 ------- 

895 info : `StoredFileInfo` 

896 Internal datastore record for this file. This will be inserted by 

897 the caller; the `_extractIngestInfo` is only responsible for 

898 creating and populating the struct. 

899 

900 Raises 

901 ------ 

902 FileNotFoundError 

903 Raised if one of the given files does not exist. 

904 FileExistsError 

905 Raised if transfer is not `None` but the (internal) location the 

906 file would be moved to is already occupied. 

907 """ 

908 if self._transaction is None:  # coverage: line 908 didn't jump to line 909, because the condition on line 908 was never true

909 raise RuntimeError("Ingest called without transaction enabled") 

910 

911 # Create URI of the source path, do not need to force a relative 

912 # path to absolute. 

913 srcUri = ResourcePath(path, forceAbsolute=False) 

914 

915 # Track whether we have read the size of the source yet 

916 have_sized = False 

917 

918 tgtLocation: Optional[Location] 

919 if transfer is None or transfer == "split": 

920 # A relative path is assumed to be relative to the datastore 

921 # in this context 

922 if not srcUri.isabs(): 

923 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

924 else: 

925 # Work out the path in the datastore from an absolute URI 

926 # This is required to be within the datastore. 

927 pathInStore = srcUri.relative_to(self.root) 

928 if pathInStore is None and transfer is None:  # coverage: line 928 didn't jump to line 929, because the condition on line 928 was never true

929 raise RuntimeError( 

930 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

931 ) 

932 if pathInStore:  # coverage: line 932 didn't jump to line 934, because the condition on line 932 was never false

933 tgtLocation = self.locationFactory.fromPath(pathInStore) 

934 elif transfer == "split": 

935 # Outside the datastore but treat that as a direct ingest 

936 # instead. 

937 tgtLocation = None 

938 else: 

939 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

940 elif transfer == "direct":  # coverage: line 940 didn't jump to line 945, because the condition on line 940 was never true

941 # Want to store the full URI to the resource directly in 

942 # datastore. This is useful for referring to permanent archive 

943 # storage for raw data. 

944 # Trust that people know what they are doing. 

945 tgtLocation = None 

946 else: 

947 # Work out the name we want this ingested file to have 

948 # inside the datastore 

949 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

950 if not tgtLocation.uri.dirname().exists(): 

951 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

952 tgtLocation.uri.dirname().mkdir() 

953 

954 # if we are transferring from a local file to a remote location 

955 # it may be more efficient to get the size and checksum of the 

956 # local file rather than the transferred one 

957 if record_validation_info and srcUri.isLocal: 

958 size = srcUri.size() 

959 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

960 have_sized = True 

961 

962 # Transfer the resource to the destination. 

963 # Allow overwrite of an existing file. This matches the behavior 

964 # of datastore.put() in that it trusts that registry would not 

965 # be asking to overwrite unless registry thought that the 

966 # overwrite was allowed. 

967 tgtLocation.uri.transfer_from( 

968 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

969 ) 

970 

971 if tgtLocation is None:  # coverage: line 971 didn't jump to line 973, because the condition on line 971 was never true

972 # This means we are using direct mode 

973 targetUri = srcUri 

974 targetPath = str(srcUri) 

975 else: 

976 targetUri = tgtLocation.uri 

977 targetPath = tgtLocation.pathInStore.path 

978 

979 # the file should exist in the datastore now 

980 if record_validation_info: 

981 if not have_sized: 

982 size = targetUri.size() 

983 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

984 else: 

985 # Not recording any file information. 

986 size = -1 

987 checksum = None 

988 

989 return StoredFileInfo( 

990 formatter=formatter, 

991 path=targetPath, 

992 storageClass=ref.datasetType.storageClass, 

993 component=ref.datasetType.component(), 

994 file_size=size, 

995 checksum=checksum, 

996 dataset_id=ref.getCheckedId(), 

997 ) 

998 

999 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

1000 # Docstring inherited from Datastore._prepIngest. 

1001 filtered = [] 

1002 for dataset in datasets: 

1003 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

1004 if not acceptable: 

1005 continue 

1006 else: 

1007 dataset.refs = acceptable 

1008 if dataset.formatter is None: 

1009 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1010 else: 

1011 assert isinstance(dataset.formatter, (type, str)) 

1012 formatter_class = get_class_of(dataset.formatter) 

1013 if not issubclass(formatter_class, Formatter):  # coverage: line 1013 didn't jump to line 1014, because the condition on line 1013 was never true

1014 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1015 dataset.formatter = formatter_class 

1016 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1017 filtered.append(dataset) 

1018 return _IngestPrepData(filtered) 

1019 

1020 @transactional 

1021 def _finishIngest( 

1022 self, 

1023 prepData: Datastore.IngestPrepData, 

1024 *, 

1025 transfer: Optional[str] = None, 

1026 record_validation_info: bool = True, 

1027 ) -> None: 

1028 # Docstring inherited from Datastore._finishIngest. 

1029 refsAndInfos = [] 

1030 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1031 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1032 # Do ingest as if the first dataset ref is associated with the file 

1033 info = self._extractIngestInfo( 

1034 dataset.path, 

1035 dataset.refs[0], 

1036 formatter=dataset.formatter, 

1037 transfer=transfer, 

1038 record_validation_info=record_validation_info, 

1039 ) 

1040 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1041 self._register_datasets(refsAndInfos) 

1042 

1043 def _calculate_ingested_datastore_name( 

1044 self, srcUri: ResourcePath, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]] 

1045 ) -> Location: 

1046 """Given a source URI and a DatasetRef, determine the name the 

1047 dataset will have inside datastore. 

1048 

1049 Parameters 

1050 ---------- 

1051 srcUri : `lsst.resources.ResourcePath` 

1052 URI to the source dataset file. 

1053 ref : `DatasetRef` 

1054 Ref associated with the newly-ingested dataset artifact. This 

1055 is used to determine the name within the datastore. 

1056 formatter : `Formatter` or Formatter class. 

1057 Formatter to use for validation. Can be a class or an instance. 

1058 

1059 Returns 

1060 ------- 

1061 location : `Location` 

1062 Target location for the newly-ingested dataset. 
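
Examples
--------
Illustrative sketch assuming ``datastore``, ``ref`` and ``formatter``
already exist; the extension of the source file is carried over to the
predicted location:

>>> from lsst.resources import ResourcePath
>>> loc = datastore._calculate_ingested_datastore_name(  # doctest: +SKIP
...     ResourcePath("file:///incoming/raw.fits"), ref, formatter
... )
>>> loc.pathInStore  # doctest: +SKIP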

1063 """ 

1064 # Ingesting a file from outside the datastore. 

1065 # This involves a new name. 

1066 template = self.templates.getTemplate(ref) 

1067 location = self.locationFactory.fromPath(template.format(ref)) 

1068 

1069 # Get the extension 

1070 ext = srcUri.getExtension() 

1071 

1072 # Update the destination to include that extension 

1073 location.updateExtension(ext) 

1074 

1075 # Ask the formatter to validate this extension 

1076 formatter.validateExtension(location) 

1077 

1078 return location 

1079 

1080 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1081 """Write out in memory dataset to datastore. 

1082 

1083 Parameters 

1084 ---------- 

1085 inMemoryDataset : `object` 

1086 Dataset to write to datastore. 

1087 ref : `DatasetRef` 

1088 Registry information associated with this dataset. 

1089 

1090 Returns 

1091 ------- 

1092 info : `StoredFileInfo` 

1093 Information describing the artifact written to the datastore. 

1094 """ 

1095 # May need to coerce the in memory dataset to the correct 

1096 # python type. 

1097 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1098 

1099 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1100 uri = location.uri 

1101 

1102 if not uri.dirname().exists(): 

1103 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1104 uri.dirname().mkdir() 

1105 

1106 if self._transaction is None:  # coverage: line 1106 didn't jump to line 1107, because the condition on line 1106 was never true

1107 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1108 

1109 def _removeFileExists(uri: ResourcePath) -> None: 

1110 """Remove a file and do not complain if it is not there. 

1111 

1112 This is important since a formatter might fail before the file 

1113 is written and we should not confuse people by writing spurious 

1114 error messages to the log. 

1115 """ 

1116 try: 

1117 uri.remove() 

1118 except FileNotFoundError: 

1119 pass 

1120 

1121 # Register a callback to try to delete the uploaded data if 

1122 # something fails below 

1123 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1124 

1125 data_written = False 

1126 if not uri.isLocal: 

1127 # This is a remote URI. Some datasets can be serialized directly 

1128 # to bytes and sent to the remote datastore without writing a 

1129 # file. If the dataset is intended to be saved to the cache 

1130 # a file is always written and direct write to the remote 

1131 # datastore is bypassed. 

1132 if not self.cacheManager.should_be_cached(ref): 

1133 try: 

1134 serializedDataset = formatter.toBytes(inMemoryDataset) 

1135 except NotImplementedError: 

1136 # Fallback to the file writing option. 

1137 pass 

1138 except Exception as e: 

1139 raise RuntimeError( 

1140 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1141 ) from e 

1142 else: 

1143 log.debug("Writing bytes directly to %s", uri) 

1144 uri.write(serializedDataset, overwrite=True) 

1145 log.debug("Successfully wrote bytes directly to %s", uri) 

1146 data_written = True 

1147 

1148 if not data_written: 

1149 # Did not write the bytes directly to object store so instead 

1150 # write to temporary file. Always write to a temporary even if 

1151 # using a local file system -- that gives us atomic writes. 

1152 # If a process is killed as the file is being written we do not 

1153 # want a corrupt partial file to remain in the final location.

1154 # For local files write to the output directory not temporary dir. 

1155 prefix = uri.dirname() if uri.isLocal else None 

1156 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri: 

1157 # Need to configure the formatter to write to a different 

1158 # location and that needs us to overwrite internals 

1159 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1160 with formatter._updateLocation(Location(None, temporary_uri)): 

1161 try: 

1162 formatter.write(inMemoryDataset) 

1163 except Exception as e: 

1164 raise RuntimeError( 

1165 f"Failed to serialize dataset {ref} of type" 

1166 f" {type(inMemoryDataset)} to " 

1167 f"temporary location {temporary_uri}" 

1168 ) from e 

1169 

1170 # Use move for a local file since that becomes an efficient 

1171 # os.rename. For remote resources we use copy to allow the 

1172 # file to be cached afterwards. 

1173 transfer = "move" if uri.isLocal else "copy" 

1174 

1175 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True) 

1176 

1177 if transfer == "copy": 

1178 # Cache if required 

1179 self.cacheManager.move_to_cache(temporary_uri, ref) 

1180 

1181 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1182 

1183 # URI is needed to resolve what ingest case are we dealing with 

1184 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1185 

1186 def _read_artifact_into_memory( 

1187 self, 

1188 getInfo: DatastoreFileGetInformation, 

1189 ref: DatasetRef, 

1190 isComponent: bool = False, 

1191 cache_ref: Optional[DatasetRef] = None, 

1192 ) -> Any: 

1193 """Read the artifact from datastore into in memory object. 

1194 

1195 Parameters 

1196 ---------- 

1197 getInfo : `DatastoreFileGetInformation` 

1198 Information about the artifact within the datastore. 

1199 ref : `DatasetRef` 

1200 The registry information associated with this artifact. 

1201 isComponent : `bool` 

1202 Flag to indicate if a component is being read from this artifact. 

1203 cache_ref : `DatasetRef`, optional 

1204 The DatasetRef to use when looking up the file in the cache. 

1205 This ref must have the same ID as the supplied ref but can 

1206 be a parent ref or component ref to indicate to the cache whether 

1207 a composite file is being requested from the cache or a component 

1208 file. Without this the cache will default to the supplied ref but 

1209 it can get confused with read-only derived components for 

1210 disassembled composites. 

1211 

1212 Returns 

1213 ------- 

1214 inMemoryDataset : `object` 

1215 The artifact as a python object. 

1216 """ 

1217 location = getInfo.location 

1218 uri = location.uri 

1219 log.debug("Accessing data from %s", uri) 

1220 

1221 if cache_ref is None: 

1222 cache_ref = ref 

1223 if cache_ref.id != ref.id:  # coverage: line 1223 didn't jump to line 1224, because the condition on line 1223 was never true

1224 raise ValueError( 

1225 "The supplied cache dataset ref refers to a different dataset than expected:" 

1226 f" {ref.id} != {cache_ref.id}" 

1227 ) 

1228 

1229 # Cannot recalculate checksum but can compare size as a quick check 

1230 # Do not do this if the size is negative since that indicates 

1231 # we do not know. 

1232 recorded_size = getInfo.info.file_size 

1233 resource_size = uri.size() 

1234 if recorded_size >= 0 and resource_size != recorded_size:  # coverage: line 1234 didn't jump to line 1235, because the condition on line 1234 was never true

1235 raise RuntimeError( 

1236 "Integrity failure in Datastore. " 

1237 f"Size of file {uri} ({resource_size}) " 

1238 f"does not match size recorded in registry of {recorded_size}" 

1239 ) 

1240 

1241 # For the general case we have choices for how to proceed. 

1242 # 1. Always use a local file (downloading the remote resource to a 

1243 # temporary file if needed). 

1244 # 2. Use a threshold size and read into memory and use bytes. 

1245 # Use both for now with an arbitrary hand off size. 

1246 # This allows small datasets to be downloaded from remote object 

1247 # stores without requiring a temporary file. 

1248 

1249 formatter = getInfo.formatter 

1250 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1251 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1252 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1253 if cached_file is not None: 

1254 desired_uri = cached_file 

1255 msg = f" (cached version of {uri})" 

1256 else: 

1257 desired_uri = uri 

1258 msg = "" 

1259 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1260 serializedDataset = desired_uri.read() 

1261 log.debug( 

1262 "Deserializing %s from %d bytes from location %s with formatter %s", 

1263 f"component {getInfo.component}" if isComponent else "", 

1264 len(serializedDataset), 

1265 uri, 

1266 formatter.name(), 

1267 ) 

1268 try: 

1269 result = formatter.fromBytes( 

1270 serializedDataset, component=getInfo.component if isComponent else None 

1271 ) 

1272 except Exception as e: 

1273 raise ValueError( 

1274 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1275 f" ({ref.datasetType.name} from {uri}): {e}" 

1276 ) from e 

1277 else: 

1278 # Read from file. 

1279 

1280 # Have to update the Location associated with the formatter 

1281 # because formatter.read does not allow an override. 

1282 # This could be improved. 

1283 location_updated = False 

1284 msg = "" 

1285 

1286 # First check in cache for local version. 

1287 # The cache will only be relevant for remote resources but 

1288 # no harm in always asking. Context manager ensures that cache 

1289 # file is not deleted during cache expiration. 

1290 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1291 if cached_file is not None: 

1292 msg = f"(via cache read of remote file {uri})" 

1293 uri = cached_file 

1294 location_updated = True 

1295 

1296 with uri.as_local() as local_uri: 

1297 

1298 can_be_cached = False 

1299 if uri != local_uri:  # coverage: line 1299 didn't jump to line 1301, because the condition on line 1299 was never true

1300 # URI was remote and file was downloaded 

1301 cache_msg = "" 

1302 location_updated = True 

1303 

1304 if self.cacheManager.should_be_cached(cache_ref): 

1305 # In this scenario we want to ask if the downloaded 

1306 # file should be cached but we should not cache 

1307 # it until after we've used it (to ensure it can't 

1308 # be expired whilst we are using it). 

1309 can_be_cached = True 

1310 

1311 # Say that it is "likely" to be cached because 

1312 # if the formatter read fails we will not be 

1313 # caching this file. 

1314 cache_msg = " and likely cached" 

1315 

1316 msg = f"(via download to local file{cache_msg})" 

1317 

1318 # Calculate the (possibly) new location for the formatter 

1319 # to use. 

1320 newLocation = Location(*local_uri.split()) if location_updated else None 

1321 

1322 log.debug( 

1323 "Reading%s from location %s %s with formatter %s", 

1324 f" component {getInfo.component}" if isComponent else "", 

1325 uri, 

1326 msg, 

1327 formatter.name(), 

1328 ) 

1329 try: 

1330 with formatter._updateLocation(newLocation): 

1331 with time_this( 

1332 log, 

1333 msg="Reading%s from location %s %s with formatter %s", 

1334 args=( 

1335 f" component {getInfo.component}" if isComponent else "", 

1336 uri, 

1337 msg, 

1338 formatter.name(), 

1339 ), 

1340 ): 

1341 result = formatter.read(component=getInfo.component if isComponent else None) 

1342 except Exception as e: 

1343 raise ValueError( 

1344 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1345 f" ({ref.datasetType.name} from {uri}): {e}" 

1346 ) from e 

1347 

1348 # File was read successfully so can move to cache 

1349 if can_be_cached:  # coverage: line 1349 didn't jump to line 1350, because the condition on line 1349 was never true

1350 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1351 

1352 return self._post_process_get( 

1353 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent 

1354 ) 

1355 

1356 def knows(self, ref: DatasetRef) -> bool: 

1357 """Check if the dataset is known to the datastore. 

1358 

1359 Does not check for existence of any artifact. 

1360 

1361 Parameters 

1362 ---------- 

1363 ref : `DatasetRef` 

1364 Reference to the required dataset. 

1365 

1366 Returns 

1367 ------- 

1368 exists : `bool` 

1369 `True` if the dataset is known to the datastore. 
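
Examples
--------
Illustrative sketch assuming ``datastore`` and ``ref`` already exist;
only the datastore's records are consulted, not the file artifacts:

>>> if not datastore.knows(ref):  # doctest: +SKIP
...     print(f"{ref} has no record in this datastore")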

1370 """ 

1371 fileLocations = self._get_dataset_locations_info(ref) 

1372 if fileLocations: 

1373 return True 

1374 return False 

1375 

1376 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

1377 # Docstring inherited from the base class. 

1378 

1379 # The records themselves. Could be missing some entries. 

1380 records = self._get_stored_records_associated_with_refs(refs) 

1381 

1382 return {ref: ref.id in records for ref in refs} 

1383 

1384 def _process_mexists_records( 

1385 self, 

1386 id_to_ref: Dict[DatasetId, DatasetRef], 

1387 records: Dict[DatasetId, List[StoredFileInfo]], 

1388 all_required: bool, 

1389 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

1390 ) -> Dict[DatasetRef, bool]: 

1391 """Helper function for mexists that checks the given records. 

1392 

1393 Parameters 

1394 ---------- 

1395 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1396 Mapping of the dataset ID to the dataset ref itself. 

1397 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1398 Records as generally returned by 

1399 ``_get_stored_records_associated_with_refs``. 

1400 all_required : `bool` 

1401 Flag to indicate whether all artifacts associated with a

1402 dataset ID must exist for the dataset to be reported as existing.

1403 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1404 Optional mapping of datastore artifact to existence. Updated by 

1405 this method with details of all artifacts tested. Can be `None` 

1406 if the caller is not interested. 

1407 

1408 Returns 

1409 ------- 

1410 existence : `dict` of [`DatasetRef`, `bool`] 

1411 Mapping from dataset to boolean indicating existence. 
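
Examples
--------
Rough sketch of how a caller such as ``_mexists`` wires this up,
assuming ``datastore`` and ``refs`` already exist:

>>> id_to_ref = {ref.getCheckedId(): ref for ref in refs}  # doctest: +SKIP
>>> records = datastore._get_stored_records_associated_with_refs(refs)  # doctest: +SKIP
>>> existence = datastore._process_mexists_records(  # doctest: +SKIP
...     id_to_ref, records, all_required=True
... )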

1412 """ 

1413 # The URIs to be checked and a mapping of those URIs to 

1414 # the dataset ID. 

1415 uris_to_check: List[ResourcePath] = [] 

1416 location_map: Dict[ResourcePath, DatasetId] = {} 

1417 

1418 location_factory = self.locationFactory 

1419 

1420 uri_existence: Dict[ResourcePath, bool] = {} 

1421 for ref_id, infos in records.items(): 

1422 # Key is the dataset Id, value is list of StoredItemInfo 

1423 uris = [info.file_location(location_factory).uri for info in infos] 

1424 location_map.update({uri: ref_id for uri in uris}) 

1425 

1426 # Check the local cache directly for a dataset corresponding 

1427 # to the remote URI. 

1428 if self.cacheManager.file_count > 0:  # coverage: line 1428 didn't jump to line 1429, because the condition on line 1428 was never true

1429 ref = id_to_ref[ref_id] 

1430 for uri, storedFileInfo in zip(uris, infos): 

1431 check_ref = ref 

1432 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1433 check_ref = ref.makeComponentRef(component) 

1434 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1435 # Proxy for URI existence. 

1436 uri_existence[uri] = True 

1437 else: 

1438 uris_to_check.append(uri) 

1439 else: 

1440 # Check all of them. 

1441 uris_to_check.extend(uris) 

1442 

1443 if artifact_existence is not None: 

1444 # If a URI has already been checked remove it from the list 

1445 # and immediately add the status to the output dict. 

1446 filtered_uris_to_check = [] 

1447 for uri in uris_to_check: 

1448 if uri in artifact_existence: 

1449 uri_existence[uri] = artifact_existence[uri] 

1450 else: 

1451 filtered_uris_to_check.append(uri) 

1452 uris_to_check = filtered_uris_to_check 

1453 

1454 # Results. 

1455 dataset_existence: Dict[DatasetRef, bool] = {} 

1456 

1457 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1458 for uri, exists in uri_existence.items(): 

1459 dataset_id = location_map[uri] 

1460 ref = id_to_ref[dataset_id] 

1461 

1462 # Disassembled composite needs to check all locations. 

1463 # all_required indicates whether all need to exist or not. 

1464 if ref in dataset_existence: 

1465 if all_required: 

1466 exists = dataset_existence[ref] and exists 

1467 else: 

1468 exists = dataset_existence[ref] or exists 

1469 dataset_existence[ref] = exists 

1470 

1471 if artifact_existence is not None: 

1472 artifact_existence.update(uri_existence) 

1473 

1474 return dataset_existence 

1475 

1476 def mexists( 

1477 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1478 ) -> Dict[DatasetRef, bool]: 

1479 """Check the existence of multiple datasets at once. 

1480 

1481 Parameters 

1482 ---------- 

1483 refs : iterable of `DatasetRef` 

1484 The datasets to be checked. 

1485 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1486 Optional mapping of datastore artifact to existence. Updated by 

1487 this method with details of all artifacts tested. Can be `None` 

1488 if the caller is not interested. 

1489 

1490 Returns 

1491 ------- 

1492 existence : `dict` of [`DatasetRef`, `bool`] 

1493 Mapping from dataset to boolean indicating existence. 

1494 

1495 Notes 

1496 ----- 

1497 To minimize potentially costly remote existence checks, the local 

1498 cache is checked as a proxy for existence. If a file for this 

1499 `DatasetRef` does exist no check is done for the actual URI. This 

1500 could result in possibly unexpected behavior if the dataset itself 

1501 has been removed from the datastore by another process whilst it is 

1502 still in the cache. 

1503 """ 

1504 chunk_size = 10_000 

1505 dataset_existence: Dict[DatasetRef, bool] = {} 

1506 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1507 n_found_total = 0 

1508 n_checked = 0 

1509 n_chunks = 0 

1510 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1511 chunk_result = self._mexists(chunk, artifact_existence) 

1512 if log.isEnabledFor(VERBOSE): 

1513 n_results = len(chunk_result) 

1514 n_checked += n_results 

1515 # Can treat the booleans as 0, 1 integers and sum them. 

1516 n_found = sum(chunk_result.values()) 

1517 n_found_total += n_found 

1518 log.verbose( 

1519 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)", 

1520 n_chunks, 

1521 n_found, 

1522 n_results, 

1523 n_found_total, 

1524 n_checked, 

1525 ) 

1526 dataset_existence.update(chunk_result) 

1527 n_chunks += 1 

1528 

1529 return dataset_existence 

1530 
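# Illustrative usage sketch, not part of fileDatastore.py; `datastore` and
# `refs` are assumed caller-side objects. mexists() can share a URI-existence
# cache so that later bulk operations (e.g. transfer_from) do not repeat the
# remote existence checks.
artifact_existence: Dict[ResourcePath, bool] = {}
existence = datastore.mexists(refs, artifact_existence=artifact_existence)
missing = [ref for ref, found in existence.items() if not found]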

1531 def _mexists( 

1532 self, refs: Sequence[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1533 ) -> Dict[DatasetRef, bool]: 

1534 """Check the existence of multiple datasets at once. 

1535 

1536 Parameters 

1537 ---------- 

1538 refs : iterable of `DatasetRef` 

1539 The datasets to be checked. 

1540 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1541 Optional mapping of datastore artifact to existence. Updated by 

1542 this method with details of all artifacts tested. Can be `None` 

1543 if the caller is not interested. 

1544 

1545 Returns 

1546 ------- 

1547 existence : `dict` of [`DatasetRef`, `bool`] 

1548 Mapping from dataset to boolean indicating existence. 

1549 """ 

1550 # Need a mapping of dataset_id to dataset ref since the API 

1551 # works with dataset_id 

1552 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

1553 

1554 # Set of all IDs we are checking for. 

1555 requested_ids = set(id_to_ref.keys()) 

1556 

1557 # The records themselves. Could be missing some entries. 

1558 records = self._get_stored_records_associated_with_refs(refs) 

1559 

1560 dataset_existence = self._process_mexists_records( 

1561 id_to_ref, records, True, artifact_existence=artifact_existence 

1562 ) 

1563 

1564 # Set of IDs that have been handled. 

1565 handled_ids = {ref.id for ref in dataset_existence.keys()} 

1566 

1567 missing_ids = requested_ids - handled_ids 

1568 if missing_ids: 

1569 dataset_existence.update( 

1570 self._mexists_check_expected( 

1571 [id_to_ref[missing] for missing in missing_ids], artifact_existence 

1572 ) 

1573 ) 

1574 

1575 return dataset_existence 

1576 

1577 def _mexists_check_expected( 

1578 self, refs: Sequence[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1579 ) -> Dict[DatasetRef, bool]: 

1580 """Check existence of refs that are not known to datastore. 

1581 

1582 Parameters 

1583 ---------- 

1584 refs : iterable of `DatasetRef` 

1585 The datasets to be checked. These are assumed not to be known 

1586 to datastore. 

1587 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1588 Optional mapping of datastore artifact to existence. Updated by 

1589 this method with details of all artifacts tested. Can be `None` 

1590 if the caller is not interested. 

1591 

1592 Returns 

1593 ------- 

1594 existence : `dict` of [`DatasetRef`, `bool`] 

1595 Mapping from dataset to boolean indicating existence. 

1596 """ 

1597 dataset_existence: Dict[DatasetRef, bool] = {} 

1598 if not self.trustGetRequest: 

1599 # Must assume these do not exist 

1600 for ref in refs: 

1601 dataset_existence[ref] = False 

1602 else: 

1603 log.debug( 

1604 "%d datasets were not known to datastore during initial existence check.", 

1605 len(refs), 

1606 ) 

1607 

1608 # Construct data structure identical to that returned 

1609 # by _get_stored_records_associated_with_refs() but using 

1610 # guessed names. 

1611 records = {} 

1612 id_to_ref = {} 

1613 for missing_ref in refs: 

1614 expected = self._get_expected_dataset_locations_info(missing_ref) 

1615 dataset_id = missing_ref.getCheckedId() 

1616 records[dataset_id] = [info for _, info in expected] 

1617 id_to_ref[dataset_id] = missing_ref 

1618 

1619 dataset_existence.update( 

1620 self._process_mexists_records( 

1621 id_to_ref, 

1622 records, 

1623 False, 

1624 artifact_existence=artifact_existence, 

1625 ) 

1626 ) 

1627 

1628 return dataset_existence 

1629 

1630 def exists(self, ref: DatasetRef) -> bool: 

1631 """Check if the dataset exists in the datastore. 

1632 

1633 Parameters 

1634 ---------- 

1635 ref : `DatasetRef` 

1636 Reference to the required dataset. 

1637 

1638 Returns 

1639 ------- 

1640 exists : `bool` 

1641 `True` if the entity exists in the `Datastore`. 

1642 

1643 Notes 

1644 ----- 

1645 The local cache is checked as a proxy for existence in the remote 

1646 object store. It is possible that another process on a different 

1647 compute node could remove the file from the object store even 

1648 though it is present in the local cache. 

1649 """ 

1650 fileLocations = self._get_dataset_locations_info(ref) 

1651 

1652 # If we are being asked to trust that the registry might not be correct 

1653 # we ask for the expected locations and check them explicitly. 

1654 if not fileLocations: 

1655 if not self.trustGetRequest: 

1656 return False 

1657 

1658 # First check the cache. If it is not found we must check 

1659 # the datastore itself. Assume that any component in the cache 

1660 # means that the dataset does exist somewhere. 

1661 if self.cacheManager.known_to_cache(ref): 1661 ↛ 1662line 1661 didn't jump to line 1662, because the condition on line 1661 was never true

1662 return True 

1663 

1664 # When we are guessing a dataset location we can not check 

1665 # for the existence of every component since we can not 

1666 # know if every component was written. Instead we check 

1667 # for the existence of any of the expected locations. 

1668 for location, _ in self._get_expected_dataset_locations_info(ref): 

1669 if self._artifact_exists(location): 

1670 return True 

1671 return False 

1672 

1673 # All listed artifacts must exist. 

1674 for location, storedFileInfo in fileLocations: 

1675 # Checking in cache needs the component ref. 

1676 check_ref = ref 

1677 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1678 check_ref = ref.makeComponentRef(component) 

1679 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1680 continue 

1681 

1682 if not self._artifact_exists(location): 1682 ↛ 1683line 1682 didn't jump to line 1683, because the condition on line 1682 was never true

1683 return False 

1684 

1685 return True 

1686 

1687 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

1688 """Return URIs associated with dataset. 

1689 

1690 Parameters 

1691 ---------- 

1692 ref : `DatasetRef` 

1693 Reference to the required dataset. 

1694 predict : `bool`, optional 

1695 If the datastore does not know about the dataset, should it 

1696 return a predicted URI or not? 

1697 

1698 Returns 

1699 ------- 

1700 uris : `DatasetRefURIs` 

1701 The URI to the primary artifact associated with this dataset (if 

1702 the dataset was disassembled within the datastore this may be 

1703 `None`), and the URIs to any components associated with the dataset 

1704 artifact (which can be empty if there are no components). 

1705 """ 

1706 many = self.getManyURIs([ref], predict=predict, allow_missing=False) 

1707 return many[ref] 

1708 

1709 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1710 """URI to the Dataset. 

1711 

1712 Parameters 

1713 ---------- 

1714 ref : `DatasetRef` 

1715 Reference to the required Dataset. 

1716 predict : `bool` 

1717 If `True`, allow URIs to be returned of datasets that have not 

1718 been written. 

1719 

1720 Returns 

1721 ------- 

1722 uri : `lsst.resources.ResourcePath` 

1723 URI pointing to the dataset within the datastore. If the 

1724 dataset does not exist in the datastore, and if ``predict`` is 

1725 `True`, the URI will be a prediction and will include a URI 

1726 fragment "#predicted". 

1727 If the datastore does not have entities that relate well 

1728 to the concept of a URI the returned URI will be 

1729 descriptive. The returned URI is not guaranteed to be obtainable. 

1730 

1731 Raises 

1732 ------ 

1733 FileNotFoundError 

1734 Raised if a URI has been requested for a dataset that does not 

1735 exist and guessing is not allowed. 

1736 RuntimeError 

1737 Raised if a request is made for a single URI but multiple URIs 

1738 are associated with this dataset. 

1739 

1740 Notes 

1741 ----- 

1742 When a predicted URI is requested an attempt will be made to form 

1743 a reasonable URI based on file templates and the expected formatter. 

1744 """ 

1745 primary, components = self.getURIs(ref, predict) 

1746 if primary is None or components: 1746 ↛ 1747line 1746 didn't jump to line 1747, because the condition on line 1746 was never true

1747 raise RuntimeError( 

1748 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1749 ) 

1750 return primary 

1751 
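# Illustrative sketch, not part of fileDatastore.py; `datastore` and `ref`
# are assumed caller-side objects. A predicted URI carries the "#predicted"
# fragment documented above.
uri = datastore.getURI(ref, predict=True)
is_prediction = uri.geturl().endswith("#predicted")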

1752 def _predict_URIs( 

1753 self, 

1754 ref: DatasetRef, 

1755 ) -> DatasetRefURIs: 

1756 """Predict the URIs of a dataset ref. 

1757 

1758 Parameters 

1759 ---------- 

1760 ref : `DatasetRef` 

1761 Reference to the required Dataset. 

1762 

1763 Returns 

1764 ------- 

1765 uris : `DatasetRefURIs` 

1766 Primary and component URIs. URIs will contain a URI fragment 

1767 "#predicted". 

1768 """ 

1769 uris = DatasetRefURIs() 

1770 

1771 if self.composites.shouldBeDisassembled(ref): 

1772 

1773 for component, _ in ref.datasetType.storageClass.components.items(): 

1774 comp_ref = ref.makeComponentRef(component) 

1775 comp_location, _ = self._determine_put_formatter_location(comp_ref) 

1776 

1777 # Add the "#predicted" URI fragment to indicate this is a 

1778 # guess 

1779 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted") 

1780 

1781 else: 

1782 

1783 location, _ = self._determine_put_formatter_location(ref) 

1784 

1785 # Add the "#predicted" URI fragment to indicate this is a guess 

1786 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted") 

1787 

1788 return uris 

1789 

1790 def getManyURIs( 

1791 self, 

1792 refs: Iterable[DatasetRef], 

1793 predict: bool = False, 

1794 allow_missing: bool = False, 

1795 ) -> Dict[DatasetRef, DatasetRefURIs]: 

1796 # Docstring inherited 

1797 

1798 uris: Dict[DatasetRef, DatasetRefURIs] = {} 

1799 

1800 records = self._get_stored_records_associated_with_refs(refs) 

1801 records_keys = records.keys() 

1802 

1803 existing_refs = tuple(ref for ref in refs if ref.id in records_keys) 

1804 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys) 

1805 

1806 # Have to handle trustGetRequest mode by checking for the existence 

1807 # of the missing refs on disk. 

1808 if missing_refs: 

1809 dataset_existence = self._mexists_check_expected(missing_refs, None) 

1810 really_missing = set() 

1811 not_missing = set() 

1812 for ref, exists in dataset_existence.items(): 

1813 if exists: 

1814 not_missing.add(ref) 

1815 else: 

1816 really_missing.add(ref) 

1817 

1818 if not_missing: 

1819 # Need to recalculate the missing/existing split. 

1820 existing_refs = existing_refs + tuple(not_missing) 

1821 missing_refs = tuple(really_missing) 

1822 

1823 for ref in missing_refs: 

1824 # if this has never been written then we have to guess 

1825 if not predict: 

1826 if not allow_missing: 

1827 raise FileNotFoundError("Dataset {} not in this datastore.".format(ref)) 

1828 else: 

1829 uris[ref] = self._predict_URIs(ref) 

1830 

1831 for ref in existing_refs: 

1832 file_infos = records[ref.getCheckedId()] 

1833 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos] 

1834 uris[ref] = self._locations_to_URI(ref, file_locations) 

1835 

1836 return uris 

1837 

1838 def _locations_to_URI( 

1839 self, 

1840 ref: DatasetRef, 

1841 file_locations: Sequence[Tuple[Location, StoredFileInfo]], 

1842 ) -> DatasetRefURIs: 

1843 """Convert one or more file locations associated with a DatasetRef 

1844 to a DatasetRefURIs. 

1845 

1846 Parameters 

1847 ---------- 

1848 ref : `DatasetRef` 

1849 Reference to the dataset. 

1850 file_locations : Sequence[Tuple[Location, StoredFileInfo]] 

1851 Each item in the sequence is the location of the dataset within the 

1852 datastore and stored information about the file and its formatter. 

1853 If there is only one item in the sequence then it is treated as the 

1854 primary URI. If there is more than one item then they are treated 

1855 as component URIs. If there are no items then an error is raised 

1856 unless ``self.trustGetRequest`` is `True`. 

1857 

1858 Returns 

1859 ------- 

1860 uris : `DatasetRefURIs` 

1861 Represents the primary URI or component URIs described by the 

1862 inputs. 

1863 

1864 Raises 

1865 ------ 

1866 RuntimeError 

1867 If no file locations are passed in and ``self.trustGetRequest`` is 

1868 `False`. 

1869 FileNotFoundError 

1870 If a passed-in URI does not exist, and ``self.trustGetRequest`` 

1871 is `False`. 

1872 RuntimeError 

1873 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is 

1874 unexpected). 

1875 """ 

1876 

1877 guessing = False 

1878 uris = DatasetRefURIs() 

1879 

1880 if not file_locations: 

1881 if not self.trustGetRequest: 1881 ↛ 1882line 1881 didn't jump to line 1882, because the condition on line 1881 was never true

1882 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1883 file_locations = self._get_expected_dataset_locations_info(ref) 

1884 guessing = True 

1885 

1886 if len(file_locations) == 1: 

1887 # No disassembly so this is the primary URI 

1888 uris.primaryURI = file_locations[0][0].uri 

1889 if guessing and not uris.primaryURI.exists(): 1889 ↛ 1890line 1889 didn't jump to line 1890, because the condition on line 1889 was never true

1890 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist") 

1891 else: 

1892 for location, file_info in file_locations: 

1893 if file_info.component is None: 1893 ↛ 1894line 1893 didn't jump to line 1894, because the condition on line 1893 was never true

1894 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1895 if guessing and not location.uri.exists(): 1895 ↛ 1899line 1895 didn't jump to line 1899, because the condition on line 1895 was never true

1896 # If we are trusting then it is entirely possible for 

1897 # some components to be missing. In that case we skip 

1898 # to the next component. 

1899 if self.trustGetRequest: 

1900 continue 

1901 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist") 

1902 uris.componentURIs[file_info.component] = location.uri 

1903 

1904 return uris 

1905 

1906 def retrieveArtifacts( 

1907 self, 

1908 refs: Iterable[DatasetRef], 

1909 destination: ResourcePath, 

1910 transfer: str = "auto", 

1911 preserve_path: bool = True, 

1912 overwrite: bool = False, 

1913 ) -> List[ResourcePath]: 

1914 """Retrieve the file artifacts associated with the supplied refs. 

1915 

1916 Parameters 

1917 ---------- 

1918 refs : iterable of `DatasetRef` 

1919 The datasets for which file artifacts are to be retrieved. 

1920 A single ref can result in multiple files. The refs must 

1921 be resolved. 

1922 destination : `lsst.resources.ResourcePath` 

1923 Location to write the file artifacts. 

1924 transfer : `str`, optional 

1925 Method to use to transfer the artifacts. Must be one of the options 

1926 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1927 "move" is not allowed. 

1928 preserve_path : `bool`, optional 

1929 If `True` the full path of the file artifact within the datastore 

1930 is preserved. If `False` the final file component of the path 

1931 is used. 

1932 overwrite : `bool`, optional 

1933 If `True` allow transfers to overwrite existing files at the 

1934 destination. 

1935 

1936 Returns 

1937 ------- 

1938 targets : `list` of `lsst.resources.ResourcePath` 

1939 URIs of file artifacts in destination location. Order is not 

1940 preserved. 

1941 """ 

1942 if not destination.isdir(): 1942 ↛ 1943line 1942 didn't jump to line 1943, because the condition on line 1942 was never true

1943 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1944 

1945 if transfer == "move": 

1946 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1947 

1948 # Source -> Destination 

1949 # This also helps filter out duplicate DatasetRef in the request 

1950 # that will map to the same underlying file transfer. 

1951 to_transfer: Dict[ResourcePath, ResourcePath] = {} 

1952 

1953 for ref in refs: 

1954 locations = self._get_dataset_locations_info(ref) 

1955 for location, _ in locations: 

1956 source_uri = location.uri 

1957 target_path: ResourcePathExpression 

1958 if preserve_path: 

1959 target_path = location.pathInStore 

1960 if target_path.isabs(): 1960 ↛ 1963line 1960 didn't jump to line 1963, because the condition on line 1960 was never true

1961 # This is an absolute path to an external file. 

1962 # Use the full path. 

1963 target_path = target_path.relativeToPathRoot 

1964 else: 

1965 target_path = source_uri.basename() 

1966 target_uri = destination.join(target_path) 

1967 to_transfer[source_uri] = target_uri 

1968 

1969 # In theory can now parallelize the transfer 

1970 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

1971 for source_uri, target_uri in to_transfer.items(): 

1972 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1973 

1974 return list(to_transfer.values()) 

1975 
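# Illustrative sketch, not part of fileDatastore.py; `datastore` and `refs`
# are assumed caller-side objects and the scratch directory is hypothetical.
# Copy artifacts out of the datastore while keeping their relative paths.
scratch = ResourcePath("/tmp/butler-scratch/", forceDirectory=True)
targets = datastore.retrieveArtifacts(refs, scratch, transfer="copy", preserve_path=True)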

1976 def get( 

1977 self, 

1978 ref: DatasetRef, 

1979 parameters: Optional[Mapping[str, Any]] = None, 

1980 storageClass: Optional[Union[StorageClass, str]] = None, 

1981 ) -> Any: 

1982 """Load an InMemoryDataset from the store. 

1983 

1984 Parameters 

1985 ---------- 

1986 ref : `DatasetRef` 

1987 Reference to the required Dataset. 

1988 parameters : `dict` 

1989 `StorageClass`-specific parameters that specify, for example, 

1990 a slice of the dataset to be loaded. 

1991 storageClass : `StorageClass` or `str`, optional 

1992 The storage class to be used to override the Python type 

1993 returned by this method. By default the returned type matches 

1994 the dataset type definition for this dataset. Specifying a 

1995 read `StorageClass` can force a different type to be returned. 

1996 This type must be compatible with the original type. 

1997 

1998 Returns 

1999 ------- 

2000 inMemoryDataset : `object` 

2001 Requested dataset or slice thereof as an InMemoryDataset. 

2002 

2003 Raises 

2004 ------ 

2005 FileNotFoundError 

2006 Requested dataset can not be retrieved. 

2007 TypeError 

2008 Return value from formatter has unexpected type. 

2009 ValueError 

2010 Formatter failed to process the dataset. 

2011 """ 

2012 # Supplied storage class for the component being read is either 

2013 # from the ref itself or an override if we want to force 

2014 # type conversion. 

2015 if storageClass is not None: 

2016 ref = ref.overrideStorageClass(storageClass) 

2017 refStorageClass = ref.datasetType.storageClass 

2018 

2019 allGetInfo = self._prepare_for_get(ref, parameters) 

2020 refComponent = ref.datasetType.component() 

2021 

2022 # Create mapping from component name to related info 

2023 allComponents = {i.component: i for i in allGetInfo} 

2024 

2025 # By definition the dataset is disassembled if we have more 

2026 # than one record for it. 

2027 isDisassembled = len(allGetInfo) > 1 

2028 

2029 # Look for the special case where we are disassembled but the 

2030 # component is a derived component that was not written during 

2031 # disassembly. For this scenario we need to check that the 

2032 # component requested is listed as a derived component for the 

2033 # composite storage class 

2034 isDisassembledReadOnlyComponent = False 

2035 if isDisassembled and refComponent: 

2036 # The composite storage class should be accessible through 

2037 # the component dataset type 

2038 compositeStorageClass = ref.datasetType.parentStorageClass 

2039 

2040 # In the unlikely scenario where the composite storage 

2041 # class is not known, we can only assume that this is a 

2042 # normal component. If that assumption is wrong then the 

2043 # branch below that reads a persisted component will fail 

2044 # so there is no need to complain here. 

2045 if compositeStorageClass is not None: 2045 ↛ 2048line 2045 didn't jump to line 2048, because the condition on line 2045 was never false

2046 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

2047 

2048 if isDisassembled and not refComponent: 

2049 # This was a disassembled dataset spread over multiple files 

2050 # and we need to put them all back together again. 

2051 # Read into memory and then assemble 

2052 

2053 # Check that the supplied parameters are suitable for the type read 

2054 refStorageClass.validateParameters(parameters) 

2055 

2056 # We want to keep track of all the parameters that were not used 

2057 # by formatters. We assume that if any of the component formatters 

2058 # use a parameter that we do not need to apply it again in the 

2059 # assembler. 

2060 usedParams = set() 

2061 

2062 components: Dict[str, Any] = {} 

2063 for getInfo in allGetInfo: 

2064 # assemblerParams are parameters not understood by the 

2065 # associated formatter. 

2066 usedParams.update(set(getInfo.formatterParams)) 

2067 

2068 component = getInfo.component 

2069 

2070 if component is None: 2070 ↛ 2071line 2070 didn't jump to line 2071, because the condition on line 2070 was never true

2071 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

2072 

2073 # We do not want the formatter to think it's reading 

2074 # a component though because it is really reading a 

2075 # standalone dataset -- always tell reader it is not a 

2076 # component. 

2077 components[component] = self._read_artifact_into_memory( 

2078 getInfo, ref.makeComponentRef(component), isComponent=False 

2079 ) 

2080 

2081 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

2082 

2083 # Any unused parameters will have to be passed to the assembler 

2084 if parameters: 

2085 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

2086 else: 

2087 unusedParams = {} 

2088 

2089 # Process parameters 

2090 return ref.datasetType.storageClass.delegate().handleParameters( 

2091 inMemoryDataset, parameters=unusedParams 

2092 ) 

2093 

2094 elif isDisassembledReadOnlyComponent: 

2095 

2096 compositeStorageClass = ref.datasetType.parentStorageClass 

2097 if compositeStorageClass is None: 2097 ↛ 2098line 2097 didn't jump to line 2098, because the condition on line 2097 was never true

2098 raise RuntimeError( 

2099 f"Unable to retrieve derived component '{refComponent}' since" 

2100 "no composite storage class is available." 

2101 ) 

2102 

2103 if refComponent is None: 2103 ↛ 2105line 2103 didn't jump to line 2105, because the condition on line 2103 was never true

2104 # Mainly for mypy 

2105 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

2106 

2107 # Assume that every derived component can be calculated by 

2108 # forwarding the request to a single read/write component. 

2109 # Rather than guessing which rw component is the right one by 

2110 # scanning each for a derived component of the same name, 

2111 # we ask the storage class delegate directly which one is best to 

2112 # use. 

2113 compositeDelegate = compositeStorageClass.delegate() 

2114 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

2115 refComponent, set(allComponents) 

2116 ) 

2117 

2118 # Select the relevant component 

2119 rwInfo = allComponents[forwardedComponent] 

2120 

2121 # For now assume that read parameters are validated against 

2122 # the real component and not the requested component 

2123 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

2124 forwardedStorageClass.validateParameters(parameters) 

2125 

2126 # The reference to use for the caching must refer to the forwarded 

2127 # component and not the derived component. 

2128 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

2129 

2130 # Unfortunately the FileDescriptor inside the formatter will have 

2131 # the wrong write storage class so we need to create a new one 

2132 # given the immutability constraint. 

2133 writeStorageClass = rwInfo.info.storageClass 

2134 

2135 # We may need to put some thought into parameters for read 

2136 # components but for now forward them on as is 

2137 readFormatter = type(rwInfo.formatter)( 

2138 FileDescriptor( 

2139 rwInfo.location, 

2140 readStorageClass=refStorageClass, 

2141 storageClass=writeStorageClass, 

2142 parameters=parameters, 

2143 ), 

2144 ref.dataId, 

2145 ) 

2146 

2147 # The assembler can not receive any parameter requests for a 

2148 # derived component at this time since the assembler will 

2149 # see the storage class of the derived component and those 

2150 # parameters will have to be handled by the formatter on the 

2151 # forwarded storage class. 

2152 assemblerParams: Dict[str, Any] = {} 

2153 

2154 # Need to create a new info that specifies the derived 

2155 # component and associated storage class 

2156 readInfo = DatastoreFileGetInformation( 

2157 rwInfo.location, 

2158 readFormatter, 

2159 rwInfo.info, 

2160 assemblerParams, 

2161 {}, 

2162 refComponent, 

2163 refStorageClass, 

2164 ) 

2165 

2166 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

2167 

2168 else: 

2169 # Single file request or component from that composite file 

2170 for lookup in (refComponent, None): 2170 ↛ 2175line 2170 didn't jump to line 2175, because the loop on line 2170 didn't complete

2171 if lookup in allComponents: 2171 ↛ 2170line 2171 didn't jump to line 2170, because the condition on line 2171 was never false

2172 getInfo = allComponents[lookup] 

2173 break 

2174 else: 

2175 raise FileNotFoundError( 

2176 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

2177 ) 

2178 

2179 # Do not need the component itself if already disassembled 

2180 if isDisassembled: 

2181 isComponent = False 

2182 else: 

2183 isComponent = getInfo.component is not None 

2184 

2185 # For a component read of a composite we want the cache to 

2186 # be looking at the composite ref itself. 

2187 cache_ref = ref.makeCompositeRef() if isComponent else ref 

2188 

2189 # For a disassembled component we can validate parameters against 

2190 # the component storage class directly 

2191 if isDisassembled: 

2192 refStorageClass.validateParameters(parameters) 

2193 else: 

2194 # For an assembled composite this could be a derived 

2195 # component derived from a real component. The validity 

2196 # of the parameters is not clear. For now validate against 

2197 # the composite storage class 

2198 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

2199 

2200 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

2201 
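# Illustrative sketch, not part of fileDatastore.py; `datastore`, `ref` and
# the parameter/storage-class names below are assumed or hypothetical.
# Reading a subset while forcing a compatible read storage class:
subset = datastore.get(ref, parameters={"bbox": bbox}, storageClass="CompatibleStorageClass")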

2202 @transactional 

2203 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2204 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2205 

2206 Parameters 

2207 ---------- 

2208 inMemoryDataset : `object` 

2209 The dataset to store. 

2210 ref : `DatasetRef` 

2211 Reference to the associated Dataset. 

2212 

2213 Raises 

2214 ------ 

2215 TypeError 

2216 Supplied object and storage class are inconsistent. 

2217 DatasetTypeNotSupportedError 

2218 The associated `DatasetType` is not handled by this datastore. 

2219 

2220 Notes 

2221 ----- 

2222 If the datastore is configured to reject certain dataset types it 

2223 is possible that the put will fail and raise a 

2224 `DatasetTypeNotSupportedError`. The main use case for this is to 

2225 allow `ChainedDatastore` to put to multiple datastores without 

2226 requiring that every datastore accepts the dataset. 

2227 """ 

2228 

2229 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2230 # doDisassembly = True 

2231 

2232 artifacts = [] 

2233 if doDisassembly: 

2234 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2235 if components is None: 2235 ↛ 2236line 2235 didn't jump to line 2236, because the condition on line 2235 was never true

2236 raise RuntimeError( 

2237 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2238 f"with storage class {ref.datasetType.storageClass.name} " 

2239 "is configured to be disassembled, but cannot be." 

2240 ) 

2241 for component, componentInfo in components.items(): 

2242 # Don't recurse because we want to take advantage of 

2243 # bulk insert -- need a new DatasetRef that refers to the 

2244 # same dataset_id but has the component DatasetType. 

2245 # DatasetType does not refer to the types of components, 

2246 # so we construct one ourselves. 

2247 compRef = ref.makeComponentRef(component) 

2248 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2249 artifacts.append((compRef, storedInfo)) 

2250 else: 

2251 # Write the entire thing out 

2252 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2253 artifacts.append((ref, storedInfo)) 

2254 

2255 self._register_datasets(artifacts) 

2256 
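# Illustrative sketch, not part of fileDatastore.py; `datastore`, `dataset`
# and `ref` are assumed caller-side objects. put() writes a single artifact
# or, if the composites configuration requests it, one artifact per
# disassembled component before registering the records in one call.
datastore.put(dataset, ref)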

2257 @transactional 

2258 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

2259 # At this point can safely remove these datasets from the cache 

2260 # to avoid confusion later on. If they are not trashed later 

2261 # the cache will simply be refilled. 

2262 self.cacheManager.remove_from_cache(ref) 

2263 

2264 # If we are in trust mode there will be nothing to move to 

2265 # the trash table and we will have to try to delete the file 

2266 # immediately. 

2267 if self.trustGetRequest: 

2268 # Try to keep the logic below for a single file trash. 

2269 if isinstance(ref, DatasetRef): 

2270 refs = {ref} 

2271 else: 

2272 # Will recreate ref at the end of this branch. 

2273 refs = set(ref) 

2274 

2275 # Determine which datasets are known to datastore directly. 

2276 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

2277 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2278 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2279 

2280 missing = refs - existing_refs 

2281 if missing: 

2282 # Do an explicit existence check on these refs. 

2283 # We only care about the artifacts at this point and not 

2284 # the dataset existence. 

2285 artifact_existence: Dict[ResourcePath, bool] = {} 

2286 _ = self.mexists(missing, artifact_existence) 

2287 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2288 

2289 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2290 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2291 for uri in uris: 

2292 try: 

2293 uri.remove() 

2294 except Exception as e: 

2295 if ignore_errors: 

2296 log.debug("Artifact %s could not be removed: %s", uri, e) 

2297 continue 

2298 raise 

2299 

2300 # There is no point asking the code below to remove refs we 

2301 # know are missing so update it with the list of existing 

2302 # records. Try to retain one vs many logic. 

2303 if not existing_refs: 

2304 # Nothing more to do since none of the datasets were 

2305 # known to the datastore record table. 

2306 return 

2307 ref = list(existing_refs) 

2308 if len(ref) == 1: 

2309 ref = ref[0] 

2310 

2311 # Get file metadata and internal metadata 

2312 if not isinstance(ref, DatasetRef): 

2313 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2314 # Assumed to be an iterable of refs so bulk mode enabled. 

2315 try: 

2316 self.bridge.moveToTrash(ref, transaction=self._transaction) 

2317 except Exception as e: 

2318 if ignore_errors: 

2319 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2320 else: 

2321 raise 

2322 return 

2323 

2324 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2325 

2326 fileLocations = self._get_dataset_locations_info(ref) 

2327 

2328 if not fileLocations: 

2329 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2330 if ignore_errors: 

2331 log.warning(err_msg) 

2332 return 

2333 else: 

2334 raise FileNotFoundError(err_msg) 

2335 

2336 for location, storedFileInfo in fileLocations: 

2337 if not self._artifact_exists(location): 2337 ↛ 2338line 2337 didn't jump to line 2338

2338 err_msg = ( 

2339 f"Dataset is known to datastore {self.name} but " 

2340 f"associated artifact ({location.uri}) is missing" 

2341 ) 

2342 if ignore_errors: 

2343 log.warning(err_msg) 

2344 return 

2345 else: 

2346 raise FileNotFoundError(err_msg) 

2347 

2348 # Mark dataset as trashed 

2349 try: 

2350 self.bridge.moveToTrash([ref], transaction=self._transaction) 

2351 except Exception as e: 

2352 if ignore_errors: 

2353 log.warning( 

2354 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2355 "but encountered an error: %s", 

2356 ref, 

2357 self.name, 

2358 e, 

2359 ) 

2360 pass 

2361 else: 

2362 raise 

2363 

2364 @transactional 

2365 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2366 """Remove all datasets from the trash. 

2367 

2368 Parameters 

2369 ---------- 

2370 ignore_errors : `bool` 

2371 If `True` return without error even if something went wrong. 

2372 Problems could occur if another process is simultaneously trying 

2373 to delete. 

2374 """ 

2375 log.debug("Emptying trash in datastore %s", self.name) 

2376 

2377 # Context manager will empty trash iff we finish it without raising. 

2378 # It will also automatically delete the relevant rows from the 

2379 # trash table and the records table. 

2380 with self.bridge.emptyTrash( 

2381 self._table, record_class=StoredFileInfo, record_column="path" 

2382 ) as trash_data: 

2383 # Removing the artifacts themselves requires that the files are 

2384 # not also associated with refs that are not to be trashed. 

2385 # Therefore need to do a query with the file paths themselves 

2386 # and return all the refs associated with them. Can only delete 

2387 # a file if the refs to be trashed are the only refs associated 

2388 # with the file. 

2389 # This requires multiple copies of the trashed items 

2390 trashed, artifacts_to_keep = trash_data 

2391 

2392 if artifacts_to_keep is None: 

2393 # The bridge is not helping us so have to work it out 

2394 # ourselves. This is not going to be as efficient. 

2395 trashed = list(trashed) 

2396 

2397 # The instance check is for mypy since up to this point it 

2398 # does not know the type of info. 

2399 path_map = self._refs_associated_with_artifacts( 

2400 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2401 ) 

2402 

2403 for ref, info in trashed: 

2404 

2405 # Mypy needs to know this is not the base class 

2406 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2407 

2408 # Check for mypy 

2409 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2410 

2411 path_map[info.path].remove(ref.id) 

2412 if not path_map[info.path]: 2412 ↛ 2403line 2412 didn't jump to line 2403, because the condition on line 2412 was never false

2413 del path_map[info.path] 

2414 

2415 artifacts_to_keep = set(path_map) 

2416 

2417 for ref, info in trashed: 

2418 

2419 # Should not happen for this implementation but need 

2420 # to keep mypy happy. 

2421 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2422 

2423 # Mypy needs to know this is not the base class 

2424 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2425 

2426 # Check for mypy 

2427 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2428 

2429 if info.path in artifacts_to_keep: 

2430 # This is a multi-dataset artifact and we are not 

2431 # removing all associated refs. 

2432 continue 

2433 

2434 # Only trashed refs still known to datastore will be returned. 

2435 location = info.file_location(self.locationFactory) 

2436 

2437 # Point of no return for this artifact 

2438 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2439 try: 

2440 self._delete_artifact(location) 

2441 except FileNotFoundError: 

2442 # If the file itself has been deleted there is nothing 

2443 # we can do about it. It is possible that trash has 

2444 # been run in parallel in another process or someone 

2445 # decided to delete the file. It is unlikely to come 

2446 # back and so we should still continue with the removal 

2447 # of the entry from the trash table. It is also possible 

2448 # we removed it in a previous iteration if it was 

2449 # a multi-dataset artifact. The delete artifact method 

2450 # will log a debug message in this scenario. 

2451 # Distinguishing file missing before trash started and 

2452 # file already removed previously as part of this trash 

2453 # is not worth the distinction with regards to potential 

2454 # memory cost. 

2455 pass 

2456 except Exception as e: 

2457 if ignore_errors: 

2458 # Use a debug message here even though it's not 

2459 # a good situation. In some cases this can be 

2460 # caused by a race between user A and user B 

2461 # and neither of them has permissions for the 

2462 # other's files. Butler does not know about users 

2463 # and trash has no idea what collections these 

2464 # files were in (without guessing from a path). 

2465 log.debug( 

2466 "Encountered error removing artifact %s from datastore %s: %s", 

2467 location.uri, 

2468 self.name, 

2469 e, 

2470 ) 

2471 else: 

2472 raise 

2473 
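# Illustrative sketch, not part of fileDatastore.py; `datastore` and `refs`
# are assumed caller-side objects. Removal is a two-step flow: trash() marks
# the datasets (or deletes unrecorded artifacts directly in trust mode) and
# emptyTrash() deletes the remaining file artifacts.
datastore.trash(refs, ignore_errors=True)
datastore.emptyTrash(ignore_errors=True)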

2474 @transactional 

2475 def transfer_from( 

2476 self, 

2477 source_datastore: Datastore, 

2478 refs: Iterable[DatasetRef], 

2479 local_refs: Optional[Iterable[DatasetRef]] = None, 

2480 transfer: str = "auto", 

2481 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

2482 ) -> None: 

2483 # Docstring inherited 

2484 if type(self) is not type(source_datastore): 

2485 raise TypeError( 

2486 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2487 f"source datastore ({type(source_datastore)})." 

2488 ) 

2489 

2490 # Be explicit for mypy 

2491 if not isinstance(source_datastore, FileDatastore): 2491 ↛ 2492line 2491 didn't jump to line 2492, because the condition on line 2491 was never true

2492 raise TypeError( 

2493 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2494 f" {type(source_datastore)}" 

2495 ) 

2496 

2497 # Stop early if "direct" transfer mode is requested. That would 

2498 # require that the URI inside the source datastore should be stored 

2499 # directly in the target datastore, which seems unlikely to be useful 

2500 # since at any moment the source datastore could delete the file. 

2501 if transfer in ("direct", "split"): 

2502 raise ValueError( 

2503 f"Can not transfer from a source datastore using {transfer} mode since" 

2504 " those files are controlled by the other datastore." 

2505 ) 

2506 

2507 # Empty existence lookup if none given. 

2508 if artifact_existence is None: 

2509 artifact_existence = {} 

2510 

2511 # We will go through the list multiple times so must convert 

2512 # generators to lists. 

2513 refs = list(refs) 

2514 

2515 if local_refs is None: 

2516 local_refs = refs 

2517 else: 

2518 local_refs = list(local_refs) 

2519 

2520 # In order to handle disassembled composites the code works 

2521 # at the records level since it can assume that internal APIs 

2522 # can be used. 

2523 # - If the record already exists in the destination this is assumed 

2524 # to be okay. 

2525 # - If there is no record but the source and destination URIs are 

2526 # identical no transfer is done but the record is added. 

2527 # - If the source record refers to an absolute URI currently assume 

2528 # that that URI should remain absolute and will be visible to the 

2529 # destination butler. May need to have a flag to indicate whether 

2530 # the dataset should be transferred. This will only happen if 

2531 # the detached Butler has had a local ingest. 

2532 

2533 # What we really want is all the records in the source datastore 

2534 # associated with these refs. Or derived ones if they don't exist 

2535 # in the source. 

2536 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2537 

2538 # The source dataset_ids are the keys in these records 

2539 source_ids = set(source_records) 

2540 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2541 

2542 # The not None check is to appease mypy 

2543 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

2544 missing_ids = requested_ids - source_ids 

2545 

2546 # Missing IDs can be okay if that datastore has allowed 

2547 # gets based on file existence. Should we transfer what we can 

2548 # or complain about it and warn? 

2549 if missing_ids and not source_datastore.trustGetRequest: 2549 ↛ 2550line 2549 didn't jump to line 2550, because the condition on line 2549 was never true

2550 raise ValueError( 

2551 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2552 ) 

2553 

2554 # Need to map these missing IDs to a DatasetRef so we can guess 

2555 # the details. 

2556 if missing_ids: 

2557 log.info( 

2558 "Number of expected datasets missing from source datastore records: %d out of %d", 

2559 len(missing_ids), 

2560 len(requested_ids), 

2561 ) 

2562 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2563 

2564 # This should be chunked in case we end up having to check 

2565 # the file store since we need some log output to show 

2566 # progress. 

2567 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2568 records = {} 

2569 for missing in missing_ids_chunk: 

2570 # Ask the source datastore where the missing artifacts 

2571 # should be. An execution butler might not know about the 

2572 # artifacts even if they are there. 

2573 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2574 records[missing] = [info for _, info in expected] 

2575 

2576 # Call the mexist helper method in case we have not already 

2577 # checked these artifacts such that artifact_existence is 

2578 # empty. This allows us to benefit from parallelism. 

2579 # datastore.mexists() itself does not give us access to the 

2580 # derived datastore record. 

2581 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2582 ref_exists = source_datastore._process_mexists_records( 

2583 id_to_ref, records, False, artifact_existence=artifact_existence 

2584 ) 

2585 

2586 # Now go through the records and propagate the ones that exist. 

2587 location_factory = source_datastore.locationFactory 

2588 for missing, record_list in records.items(): 

2589 # Skip completely if the ref does not exist. 

2590 ref = id_to_ref[missing] 

2591 if not ref_exists[ref]: 

2592 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2593 continue 

2594 # Check for file artifact to decide which parts of a 

2595 # disassembled composite do exist. If there is only a 

2596 # single record we don't even need to look because it can't 

2597 # be a composite and must exist. 

2598 if len(record_list) == 1: 

2599 dataset_records = record_list 

2600 else: 

2601 dataset_records = [ 

2602 record 

2603 for record in record_list 

2604 if artifact_existence[record.file_location(location_factory).uri] 

2605 ] 

2606 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2607 

2608 # Rely on source_records being a defaultdict. 

2609 source_records[missing].extend(dataset_records) 

2610 

2611 # See if we already have these records 

2612 target_records = self._get_stored_records_associated_with_refs(local_refs) 

2613 

2614 # The artifacts to register 

2615 artifacts = [] 

2616 

2617 # Refs that already exist 

2618 already_present = [] 

2619 

2620 # Now can transfer the artifacts 

2621 for source_ref, target_ref in zip(refs, local_refs): 

2622 if target_ref.id in target_records: 

2623 # Already have an artifact for this. 

2624 already_present.append(target_ref) 

2625 continue 

2626 

2627 # mypy needs to know these are always resolved refs 

2628 for info in source_records[source_ref.getCheckedId()]: 

2629 source_location = info.file_location(source_datastore.locationFactory) 

2630 target_location = info.file_location(self.locationFactory) 

2631 if source_location == target_location: 2631 ↛ 2635line 2631 didn't jump to line 2635, because the condition on line 2631 was never true

2632 # Either the dataset is already in the target datastore 

2633 # (which is how execution butler currently runs) or 

2634 # it is an absolute URI. 

2635 if source_location.pathInStore.isabs(): 

2636 # Just because we can see the artifact when running 

2637 # the transfer doesn't mean it will be generally 

2638 # accessible to a user of this butler. For now warn 

2639 # but assume it will be accessible. 

2640 log.warning( 

2641 "Transfer request for an outside-datastore artifact has been found at %s", 

2642 source_location, 

2643 ) 

2644 else: 

2645 # Need to transfer it to the new location. 

2646 # Assume we should always overwrite. If the artifact 

2647 # is there this might indicate that a previous transfer 

2648 # was interrupted but was not able to be rolled back 

2649 # completely (eg pre-emption) so follow Datastore default 

2650 # and overwrite. 

2651 target_location.uri.transfer_from( 

2652 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2653 ) 

2654 

2655 artifacts.append((target_ref, info)) 

2656 

2657 self._register_datasets(artifacts) 

2658 

2659 if already_present: 

2660 n_skipped = len(already_present) 

2661 log.info( 

2662 "Skipped transfer of %d dataset%s already present in datastore", 

2663 n_skipped, 

2664 "" if n_skipped == 1 else "s", 

2665 ) 

2666 
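# Illustrative sketch, not part of fileDatastore.py; the two FileDatastore
# instances and `refs` are assumed caller-side objects. Sharing one
# artifact_existence cache lets transfer_from() reuse earlier mexists()
# checks instead of re-querying the remote store.
artifact_existence: Dict[ResourcePath, bool] = {}
source_datastore.mexists(refs, artifact_existence)
target_datastore.transfer_from(source_datastore, refs, transfer="copy", artifact_existence=artifact_existence)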

2667 @transactional 

2668 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2669 # Docstring inherited. 

2670 refs = list(refs) 

2671 self.bridge.forget(refs) 

2672 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

2673 

2674 def validateConfiguration( 

2675 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

2676 ) -> None: 

2677 """Validate some of the configuration for this datastore. 

2678 

2679 Parameters 

2680 ---------- 

2681 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2682 Entities to test against this configuration. Can be differing 

2683 types. 

2684 logFailures : `bool`, optional 

2685 If `True`, output a log message for every validation error 

2686 detected. 

2687 

2688 Raises 

2689 ------ 

2690 DatastoreValidationError 

2691 Raised if there is a validation problem with a configuration. 

2692 All the problems are reported in a single exception. 

2693 

2694 Notes 

2695 ----- 

2696 This method checks that all the supplied entities have valid file 

2697 templates and also have formatters defined. 

2698 """ 

2699 

2700 templateFailed = None 

2701 try: 

2702 self.templates.validateTemplates(entities, logFailures=logFailures) 

2703 except FileTemplateValidationError as e: 

2704 templateFailed = str(e) 

2705 

2706 formatterFailed = [] 

2707 for entity in entities: 

2708 try: 

2709 self.formatterFactory.getFormatterClass(entity) 

2710 except KeyError as e: 

2711 formatterFailed.append(str(e)) 

2712 if logFailures: 2712 ↛ 2707line 2712 didn't jump to line 2707, because the condition on line 2712 was never false

2713 log.critical("Formatter failure: %s", e) 

2714 

2715 if templateFailed or formatterFailed: 

2716 messages = [] 

2717 if templateFailed: 2717 ↛ 2718line 2717 didn't jump to line 2718, because the condition on line 2717 was never true

2718 messages.append(templateFailed) 

2719 if formatterFailed: 2719 ↛ 2721line 2719 didn't jump to line 2721, because the condition on line 2719 was never false

2720 messages.append(",".join(formatterFailed)) 

2721 msg = ";\n".join(messages) 

2722 raise DatastoreValidationError(msg) 

2723 
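# Illustrative sketch, not part of fileDatastore.py; `datastore` and
# `dataset_types` are assumed caller-side objects. Validate that file
# templates and formatters are configured before relying on them.
datastore.validateConfiguration(dataset_types, logFailures=True)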

2724 def getLookupKeys(self) -> Set[LookupKey]: 

2725 # Docstring is inherited from base class 

2726 return ( 

2727 self.templates.getLookupKeys() 

2728 | self.formatterFactory.getLookupKeys() 

2729 | self.constraints.getLookupKeys() 

2730 ) 

2731 

2732 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

2733 # Docstring is inherited from base class 

2734 # The key can be valid in either formatters or templates so we can 

2735 # only check the template if it exists 

2736 if lookupKey in self.templates: 

2737 try: 

2738 self.templates[lookupKey].validateTemplate(entity) 

2739 except FileTemplateValidationError as e: 

2740 raise DatastoreValidationError(e) from e 

2741 

2742 def export( 

2743 self, 

2744 refs: Iterable[DatasetRef], 

2745 *, 

2746 directory: Optional[ResourcePathExpression] = None, 

2747 transfer: Optional[str] = "auto", 

2748 ) -> Iterable[FileDataset]: 

2749 # Docstring inherited from Datastore.export. 

2750 if transfer == "auto" and directory is None: 

2751 transfer = None 

2752 

2753 if transfer is not None and directory is None: 

2754 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2755 

2756 if transfer == "move": 

2757 raise TypeError("Can not export by moving files out of datastore.") 

2758 elif transfer == "direct": 2758 ↛ 2762line 2758 didn't jump to line 2762, because the condition on line 2758 was never true

2759 # For an export, treat this as equivalent to None. We do not 

2760 # want an import to risk using absolute URIs to datasets owned 

2761 # by another datastore. 

2762 log.info("Treating 'direct' transfer mode as in-place export.") 

2763 transfer = None 

2764 

2765 # Force the directory to be a URI object 

2766 directoryUri: Optional[ResourcePath] = None 

2767 if directory is not None: 

2768 directoryUri = ResourcePath(directory, forceDirectory=True) 

2769 

2770 if transfer is not None and directoryUri is not None: 

2771 # mypy needs the second test 

2772 if not directoryUri.exists(): 2772 ↛ 2773line 2772 didn't jump to line 2773, because the condition on line 2772 was never true

2773 raise FileNotFoundError(f"Export location {directory} does not exist") 

2774 

2775 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2776 for ref in progress.wrap(refs, "Exporting dataset files"): 

2777 fileLocations = self._get_dataset_locations_info(ref) 

2778 if not fileLocations: 

2779 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2780 # For now we can not export disassembled datasets 

2781 if len(fileLocations) > 1: 

2782 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2783 location, storedFileInfo = fileLocations[0] 

2784 

2785 pathInStore = location.pathInStore.path 

2786 if transfer is None: 

2787 # TODO: do we also need to return the readStorageClass somehow? 

2788 # We will use the path in store directly. If this is an 

2789 # absolute URI, preserve it. 

2790 if location.pathInStore.isabs(): 2790 ↛ 2791line 2790 didn't jump to line 2791, because the condition on line 2790 was never true

2791 pathInStore = str(location.uri) 

2792 elif transfer == "direct": 2792 ↛ 2794line 2792 didn't jump to line 2794, because the condition on line 2792 was never true

2793 # Use full URIs to the remote store in the export 

2794 pathInStore = str(location.uri) 

2795 else: 

2796 # mypy needs help 

2797 assert directoryUri is not None, "directoryUri must be defined to get here" 

2798 storeUri = ResourcePath(location.uri) 

2799 

2800 # if the datastore has an absolute URI to a resource, we 

2801 # have two options: 

2802 # 1. Keep the absolute URI in the exported YAML 

2803 # 2. Allocate a new name in the local datastore and transfer 

2804 # it. 

2805 # For now go with option 2 

2806 if location.pathInStore.isabs(): 2806 ↛ 2807line 2806 didn't jump to line 2807, because the condition on line 2806 was never true

2807 template = self.templates.getTemplate(ref) 

2808 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2809 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2810 

2811 exportUri = directoryUri.join(pathInStore) 

2812 exportUri.transfer_from(storeUri, transfer=transfer) 

2813 

2814 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2815 
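# Illustrative sketch, not part of fileDatastore.py; `datastore` and `refs`
# are assumed caller-side objects and the directory is hypothetical. Export
# yields FileDataset entries describing the copied artifacts.
exported = list(datastore.export(refs, directory="/tmp/butler-export", transfer="copy"))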

2816 @staticmethod 

2817 def computeChecksum( 

2818 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192 

2819 ) -> Optional[str]: 

2820 """Compute the checksum of the supplied file. 

2821 

2822 Parameters 

2823 ---------- 

2824 uri : `lsst.resources.ResourcePath` 

2825 Name of resource to calculate checksum from. 

2826 algorithm : `str`, optional 

2827 Name of algorithm to use. Must be one of the algorithms supported 

2828 by :py:mod:`hashlib`. 

2829 block_size : `int` 

2830 Number of bytes to read from file at one time. 

2831 

2832 Returns 

2833 ------- 

2834 hexdigest : `str` 

2835 Hex digest of the file. 

2836 

2837 Notes 

2838 ----- 

2839 Currently returns None if the URI is for a remote resource. 

2840 """ 

2841 if algorithm not in hashlib.algorithms_guaranteed: 2841 ↛ 2842line 2841 didn't jump to line 2842, because the condition on line 2841 was never true

2842 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

2843 

2844 if not uri.isLocal: 2844 ↛ 2845line 2844 didn't jump to line 2845, because the condition on line 2844 was never true

2845 return None 

2846 

2847 hasher = hashlib.new(algorithm) 

2848 

2849 with uri.as_local() as local_uri: 

2850 with open(local_uri.ospath, "rb") as f: 

2851 for chunk in iter(lambda: f.read(block_size), b""): 

2852 hasher.update(chunk) 

2853 

2854 return hasher.hexdigest() 

2855 
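# Illustrative sketch, not part of fileDatastore.py; the path is
# hypothetical. Remote URIs return None, as noted above.
digest = FileDatastore.computeChecksum(ResourcePath("/tmp/example.fits"))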

2856 def needs_expanded_data_ids( 

2857 self, 

2858 transfer: Optional[str], 

2859 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2860 ) -> bool: 

2861 # Docstring inherited. 

2862 # This _could_ also use entity to inspect whether the filename template 

2863 # involves placeholders other than the required dimensions for its 

2864 # dataset type, but that's not necessary for correctness; it just 

2865 # enables more optimizations (perhaps only in theory). 

2866 return transfer not in ("direct", None) 

2867 

2868 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2869 # Docstring inherited from the base class. 

2870 record_data = data.get(self.name) 

2871 if not record_data: 2871 ↛ 2872line 2871 didn't jump to line 2872, because the condition on line 2871 was never true

2872 return 

2873 

2874 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records.keys()) 

2875 

2876 # TODO: Verify that there are no unexpected table names in the dict? 

2877 unpacked_records = [] 

2878 for dataset_data in record_data.records.values(): 

2879 records = dataset_data.get(self._table.name) 

2880 if records: 2880 ↛ 2878line 2880 didn't jump to line 2878, because the condition on line 2880 was never false

2881 for info in records: 

2882 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

2883 unpacked_records.append(info.to_record()) 

2884 if unpacked_records: 

2885 self._table.insert(*unpacked_records, transaction=self._transaction) 

2886 

2887 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

2888 # Docstring inherited from the base class. 

2889 exported_refs = list(self._bridge.check(refs)) 

2890 ids = {ref.getCheckedId() for ref in exported_refs} 

2891 records: defaultdict[DatasetId, defaultdict[str, List[StoredDatastoreItemInfo]]] = defaultdict( 

2892 lambda: defaultdict(list), {id: defaultdict(list) for id in ids} 

2893 ) 

2894 for row in self._table.fetch(dataset_id=ids): 

2895 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

2896 records[info.dataset_id][self._table.name].append(info) 

2897 

2898 record_data = DatastoreRecordData(records=records) 

2899 return {self.name: record_data}
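# Illustrative sketch, not part of fileDatastore.py; the two FileDatastore
# instances and `refs` are assumed caller-side objects. import_records()
# looks records up by datastore name, so the receiving datastore must use
# the same name, and the file artifacts must be made available separately.
record_data = source_datastore.export_records(refs)
target_datastore.import_records(record_data)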