Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 86%

967 statements  

coverage.py v7.2.5, created at 2023-05-02 09:49 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore",) 

26 

27import hashlib 

28import logging 

29from collections import defaultdict 

30from collections.abc import Callable 

31from dataclasses import dataclass 

32from typing import ( 

33 TYPE_CHECKING, 

34 Any, 

35 ClassVar, 

36 Dict, 

37 Iterable, 

38 List, 

39 Mapping, 

40 Optional, 

41 Sequence, 

42 Set, 

43 Tuple, 

44 Type, 

45 Union, 

46) 

47 

48from lsst.daf.butler import ( 

49 CompositesMap, 

50 Config, 

51 DatasetId, 

52 DatasetRef, 

53 DatasetRefURIs, 

54 DatasetType, 

55 DatasetTypeNotSupportedError, 

56 Datastore, 

57 DatastoreCacheManager, 

58 DatastoreConfig, 

59 DatastoreDisabledCacheManager, 

60 DatastoreRecordData, 

61 DatastoreValidationError, 

62 FileDataset, 

63 FileDescriptor, 

64 FileTemplates, 

65 FileTemplateValidationError, 

66 Formatter, 

67 FormatterFactory, 

68 Location, 

69 LocationFactory, 

70 Progress, 

71 StorageClass, 

72 StoredDatastoreItemInfo, 

73 StoredFileInfo, 

74 ddl, 

75) 

76from lsst.daf.butler.core.repoRelocation import replaceRoot 

77from lsst.daf.butler.core.utils import transactional 

78from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

79from lsst.resources import ResourcePath, ResourcePathExpression 

80from lsst.utils.introspection import get_class_of, get_instance_of 

81from lsst.utils.iteration import chunk_iterable 

82 

83# For VERBOSE logging usage. 

84from lsst.utils.logging import VERBOSE, getLogger 

85from lsst.utils.timer import time_this 

86from sqlalchemy import BigInteger, String 

87 

88from ..registry.interfaces import FakeDatasetRef 

89from .genericDatastore import GenericBaseDatastore 

90 

91if TYPE_CHECKING: 

92 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

93 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

94 

95log = getLogger(__name__) 

96 

97 

98class _IngestPrepData(Datastore.IngestPrepData): 

99 """Helper class for FileDatastore ingest implementation. 

100 

101 Parameters 

102 ---------- 

103 datasets : `list` of `FileDataset` 

104 Files to be ingested by this datastore. 

105 """ 

106 

107 def __init__(self, datasets: List[FileDataset]): 

108 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

109 self.datasets = datasets 

110 

111 

112@dataclass(frozen=True) 

113class DatastoreFileGetInformation: 

114 """Collection of useful parameters needed to retrieve a file from 

115 a Datastore. 

116 """ 

117 

118 location: Location 

119 """The location from which to read the dataset.""" 

120 

121 formatter: Formatter 

122 """The `Formatter` to use to deserialize the dataset.""" 

123 

124 info: StoredFileInfo 

125 """Stored information about this file and its formatter.""" 

126 

127 assemblerParams: Mapping[str, Any] 

128 """Parameters to use for post-processing the retrieved dataset.""" 

129 

130 formatterParams: Mapping[str, Any] 

131 """Parameters that were understood by the associated formatter.""" 

132 

133 component: Optional[str] 

134 """The component to be retrieved (can be `None`).""" 

135 

136 readStorageClass: StorageClass 

137 """The `StorageClass` of the dataset being read.""" 

138 

139 

140class FileDatastore(GenericBaseDatastore): 

141 """Generic Datastore for file-based implementations. 

142 

143 Should always be sub-classed since key abstract methods are missing. 

144 

145 Parameters 

146 ---------- 

147 config : `DatastoreConfig` or `str` 

148 Configuration as either a `Config` object or URI to file. 

149 bridgeManager : `DatastoreRegistryBridgeManager` 

150 Object that manages the interface between `Registry` and datastores. 

151 butlerRoot : `str`, optional 

152 New datastore root to use to override the configuration value. 

153 

154 Raises 

155 ------ 

156 ValueError 

157 If root location does not exist and ``create`` is `False` in the 

158 configuration. 

159 """ 

160 

161 defaultConfigFile: ClassVar[Optional[str]] = None 

162 """Path to configuration defaults. Accessed within the ``config`` resource 

163 or relative to a search path. Can be None if no defaults specified. 

164 """ 

165 

166 root: ResourcePath 

167 """Root directory URI of this `Datastore`.""" 

168 

169 locationFactory: LocationFactory 

170 """Factory for creating locations relative to the datastore root.""" 

171 

172 formatterFactory: FormatterFactory 

173 """Factory for creating instances of formatters.""" 

174 

175 templates: FileTemplates 

176 """File templates that can be used by this `Datastore`.""" 

177 

178 composites: CompositesMap 

179 """Determines whether a dataset should be disassembled on put.""" 

180 

181 defaultConfigFile = "datastores/fileDatastore.yaml" 

182 """Path to configuration defaults. Accessed within the ``config`` resource 

183 or relative to a search path. Can be None if no defaults specified. 

184 """ 

185 

186 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None 

187 """Callable that is used in trusted mode to retrieve registry definition 

188 of a named dataset type. 

189 """ 

190 

191 @classmethod 

192 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

193 """Set any filesystem-dependent config options for this Datastore to 

194 be appropriate for a new empty repository with the given root. 

195 

196 Parameters 

197 ---------- 

198 root : `str` 

199 URI to the root of the data repository. 

200 config : `Config` 

201 A `Config` to update. Only the subset understood by 

202 this component will be updated. Will not expand 

203 defaults. 

204 full : `Config` 

205 A complete config with all defaults expanded that can be 

206 converted to a `DatastoreConfig`. Read-only and will not be 

207 modified by this method. 

208 Repository-specific options that should not be obtained 

209 from defaults when Butler instances are constructed 

210 should be copied from ``full`` to ``config``. 

211 overwrite : `bool`, optional 

212 If `False`, do not modify a value in ``config`` if the value 

213 already exists. Default is always to overwrite with the provided 

214 ``root``. 

215 

216 Notes 

217 ----- 

218 If a keyword is explicitly defined in the supplied ``config`` it 

219 will not be overridden by this method if ``overwrite`` is `False`. 

220 This allows explicit values set in external configs to be retained. 

221 """ 

222 Config.updateParameters( 

223 DatastoreConfig, 

224 config, 

225 full, 

226 toUpdate={"root": root}, 

227 toCopy=("cls", ("records", "table")), 

228 overwrite=overwrite, 

229 ) 

230 

231 @classmethod 

232 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

233 return ddl.TableSpec( 

234 fields=[ 

235 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

236 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

237 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

238 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

239 # Use empty string to indicate no component 

240 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

241 # TODO: should checksum be Base64Bytes instead? 

242 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

243 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

244 ], 

245 unique=frozenset(), 

246 indexes=[ddl.IndexSpec("path")], 

247 ) 

248 
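# Illustrative example (not part of the module): a record matching the table
# spec above, shaped as it might be passed to the opaque-table insert.  All
# values, including the formatter class path, are made up for illustration.
_example_record = {
    "dataset_id": "00000000-0000-0000-0000-000000000000",  # primary key; type set by registry
    "path": "run/exampleDatasetType/example_file.fits",     # relative to the datastore root
    "formatter": "lsst.example.formatters.ExampleFormatter",  # hypothetical class path
    "storage_class": "Exposure",
    "component": "",        # empty string means "no component" (part of the primary key)
    "checksum": None,       # nullable; only filled when checksums are enabled
    "file_size": 123456,    # nullable BigInteger; unknown sizes are not recorded
}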

249 def __init__( 

250 self, 

251 config: Union[DatastoreConfig, str], 

252 bridgeManager: DatastoreRegistryBridgeManager, 

253 butlerRoot: str | None = None, 

254 ): 

255 super().__init__(config, bridgeManager) 

256 if "root" not in self.config: 256 ↛ 257 (line 256 didn't jump to line 257, because the condition on line 256 was never true)

257 raise ValueError("No root directory specified in configuration") 

258 

259 self._bridgeManager = bridgeManager 

260 

261 # Name ourselves either using an explicit name or a name 

262 # derived from the (unexpanded) root 

263 if "name" in self.config: 

264 self.name = self.config["name"] 

265 else: 

266 # We use the unexpanded root in the name to indicate that this 

267 # datastore can be moved without having to update registry. 

268 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

269 

270 # Support repository relocation in config 

271 # Existence of self.root is checked in subclass 

272 self.root = ResourcePath( 

273 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

274 ) 

275 

276 self.locationFactory = LocationFactory(self.root) 

277 self.formatterFactory = FormatterFactory() 

278 

279 # Now associate formatters with storage classes 

280 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

281 

282 # Read the file naming templates 

283 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

284 

285 # See if composites should be disassembled 

286 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

287 

288 tableName = self.config["records", "table"] 

289 try: 

290 # Storage of paths and formatters, keyed by dataset_id 

291 self._table = bridgeManager.opaque.register( 

292 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

293 ) 

294 # Interface to Registry. 

295 self._bridge = bridgeManager.register(self.name) 

296 except ReadOnlyDatabaseError: 

297 # If the database is read only and we just tried and failed to 

298 # create a table, it means someone is trying to create a read-only 

299 # butler client for an empty repo. That should be okay, as long 

300 # as they then try to get any datasets before some other client 

301 # creates the table. Chances are they're just validating 

302 # configuration. 

303 pass 

304 

305 # Determine whether checksums should be used - default to False 

306 self.useChecksum = self.config.get("checksum", False) 

307 

308 # Determine whether we can fall back to configuration if a 

309 # requested dataset is not known to registry 

310 self.trustGetRequest = self.config.get("trust_get_request", False) 

311 

312 # Create a cache manager 

313 self.cacheManager: AbstractDatastoreCacheManager 

314 if "cached" in self.config: 314 ↛ 317 (line 314 didn't jump to line 317, because the condition on line 314 was never false)

315 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

316 else: 

317 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

318 

319 # Check existence and create directory structure if necessary 

320 if not self.root.exists(): 

321 if "create" not in self.config or not self.config["create"]: 321 ↛ 322 (line 321 didn't jump to line 322, because the condition on line 321 was never true)

322 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

323 try: 

324 self.root.mkdir() 

325 except Exception as e: 

326 raise ValueError( 

327 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

328 ) from e 

329 

330 def __str__(self) -> str: 

331 return str(self.root) 

332 

333 @property 

334 def bridge(self) -> DatastoreRegistryBridge: 

335 return self._bridge 

336 

337 def _artifact_exists(self, location: Location) -> bool: 

338 """Check that an artifact exists in this datastore at the specified 

339 location. 

340 

341 Parameters 

342 ---------- 

343 location : `Location` 

344 Expected location of the artifact associated with this datastore. 

345 

346 Returns 

347 ------- 

348 exists : `bool` 

349 True if the location can be found, false otherwise. 

350 """ 

351 log.debug("Checking if resource exists: %s", location.uri) 

352 return location.uri.exists() 

353 

354 def _delete_artifact(self, location: Location) -> None: 

355 """Delete the artifact from the datastore. 

356 

357 Parameters 

358 ---------- 

359 location : `Location` 

360 Location of the artifact associated with this datastore. 

361 """ 

362 if location.pathInStore.isabs(): 362 ↛ 363 (line 362 didn't jump to line 363, because the condition on line 362 was never true)

363 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

364 

365 try: 

366 location.uri.remove() 

367 except FileNotFoundError: 

368 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

369 raise 

370 except Exception as e: 

371 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

372 raise 

373 log.debug("Successfully deleted file: %s", location.uri) 

374 

375 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

376 # Docstring inherited from GenericBaseDatastore 

377 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos)] 

378 self._table.insert(*records, transaction=self._transaction) 

379 

380 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

381 # Docstring inherited from GenericBaseDatastore 

382 

383 # Look for the dataset_id -- there might be multiple matches 

384 # if we have disassembled the dataset. 

385 records = self._table.fetch(dataset_id=ref.id) 

386 return [StoredFileInfo.from_record(record) for record in records] 

387 

388 def _get_stored_records_associated_with_refs( 

389 self, refs: Iterable[DatasetIdRef] 

390 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

391 """Retrieve all records associated with the provided refs. 

392 

393 Parameters 

394 ---------- 

395 refs : iterable of `DatasetIdRef` 

396 The refs for which records are to be retrieved. 

397 

398 Returns 

399 ------- 

400 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

401 The matching records indexed by the ref ID. The number of entries 

402 in the dict can be smaller than the number of requested refs. 

403 """ 

404 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

405 

406 # Uniqueness is dataset_id + component so can have multiple records 

407 # per ref. 

408 records_by_ref = defaultdict(list) 

409 for record in records: 

410 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

411 return records_by_ref 

412 

413 def _refs_associated_with_artifacts( 

414 self, paths: List[Union[str, ResourcePath]] 

415 ) -> Dict[str, Set[DatasetId]]: 

416 """Return paths and associated dataset refs. 

417 

418 Parameters 

419 ---------- 

420 paths : `list` of `str` or `lsst.resources.ResourcePath` 

421 All the paths to include in search. 

422 

423 Returns 

424 ------- 

425 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

426 Mapping of each path to a set of associated dataset IDs. 

427 """ 

428 records = self._table.fetch(path=[str(path) for path in paths]) 

429 result = defaultdict(set) 

430 for row in records: 

431 result[row["path"]].add(row["dataset_id"]) 

432 return result 

433 

434 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]: 

435 """Return all dataset refs associated with the supplied path. 

436 

437 Parameters 

438 ---------- 

439 pathInStore : `lsst.resources.ResourcePath` 

440 Path of interest in the data store. 

441 

442 Returns 

443 ------- 

444 ids : `set` [`DatasetId`] 

445 All `DatasetRef` IDs associated with this path. 

446 """ 

447 records = list(self._table.fetch(path=str(pathInStore))) 

448 ids = {r["dataset_id"] for r in records} 

449 return ids 

450 

451 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

452 # Docstring inherited from GenericBaseDatastore 

453 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

454 

455 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

456 r"""Find all the `Location`\ s of the requested dataset in the 

457 `Datastore` and the associated stored file information. 

458 

459 Parameters 

460 ---------- 

461 ref : `DatasetRef` 

462 Reference to the required `Dataset`. 

463 

464 Returns 

465 ------- 

466 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

467 Location of the dataset within the datastore and 

468 stored information about each file and its formatter. 

469 """ 

470 # Get the file information (this will fail if no file) 

471 records = self.getStoredItemsInfo(ref) 

472 

473 # Use the path to determine the location -- we need to take 

474 # into account absolute URIs in the datastore record 

475 return [(r.file_location(self.locationFactory), r) for r in records] 

476 

477 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

478 """Check that there is only one dataset associated with the 

479 specified artifact. 

480 

481 Parameters 

482 ---------- 

483 ref : `DatasetRef` or `FakeDatasetRef` 

484 Dataset to be removed. 

485 location : `Location` 

486 The location of the artifact to be removed. 

487 

488 Returns 

489 ------- 

490 can_remove : `bool` 

491 True if the artifact can be safely removed. 

492 """ 

493 # Can't ever delete absolute URIs. 

494 if location.pathInStore.isabs(): 

495 return False 

496 

497 # Get all entries associated with this path 

498 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

499 if not allRefs: 

500 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

501 

502 # Remove these refs from all the refs and if there is nothing left 

503 # then we can delete 

504 remainingRefs = allRefs - {ref.id} 

505 

506 if remainingRefs: 

507 return False 

508 return True 

509 
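# Illustrative sketch (not the datastore implementation): the reference-count
# rule applied above -- an artifact may only be deleted when the ref being
# removed is the sole dataset registered against that path.  The helper name
# and the integer IDs are made up for the example.
def _can_remove_sketch(refs_for_path: set, ref_id) -> bool:
    if not refs_for_path:
        raise RuntimeError("Datastore inconsistency: artifact has no registered refs")
    return not (refs_for_path - {ref_id})

# _can_remove_sketch({1, 2}, 1) -> False (another dataset still uses the file)
# _can_remove_sketch({1}, 1)    -> True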

510 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]: 

511 """Predict the location and related file information of the requested 

512 dataset in this datastore. 

513 

514 Parameters 

515 ---------- 

516 ref : `DatasetRef` 

517 Reference to the required `Dataset`. 

518 

519 Returns 

520 ------- 

521 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

522 Expected Location of the dataset within the datastore and 

523 placeholder information about each file and its formatter. 

524 

525 Notes 

526 ----- 

527 Uses the current configuration to determine how we would expect the 

528 datastore files to have been written if we couldn't ask registry. 

529 This is safe so long as there has been no change to datastore 

530 configuration between writing the dataset and wanting to read it. 

531 Will not work for files that have been ingested without using the 

532 standard file template or default formatter. 

533 """ 

534 

535 # If we have a component ref we always need to ask the questions 

536 # of the composite. If the composite is disassembled this routine 

537 # should return all components. If the composite was not 

538 # disassembled the composite is what is stored regardless of 

539 # component request. Note that if the caller has disassembled 

540 # a composite there is no way for this guess to know that 

541 # without trying both the composite and component ref and seeing 

542 # if there is something at the component Location even without 

543 # disassembly being enabled. 

544 if ref.datasetType.isComponent(): 

545 ref = ref.makeCompositeRef() 

546 

547 # See if the ref is a composite that should be disassembled 

548 doDisassembly = self.composites.shouldBeDisassembled(ref) 

549 

550 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

551 

552 if doDisassembly: 

553 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

554 compRef = ref.makeComponentRef(component) 

555 location, formatter = self._determine_put_formatter_location(compRef) 

556 all_info.append((location, formatter, componentStorage, component)) 

557 

558 else: 

559 # Always use the composite ref if no disassembly 

560 location, formatter = self._determine_put_formatter_location(ref) 

561 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

562 

563 # Convert the list of tuples to have StoredFileInfo as second element 

564 return [ 

565 ( 

566 location, 

567 StoredFileInfo( 

568 formatter=formatter, 

569 path=location.pathInStore.path, 

570 storageClass=storageClass, 

571 component=component, 

572 checksum=None, 

573 file_size=-1, 

574 dataset_id=ref.getCheckedId(), 

575 ), 

576 ) 

577 for location, formatter, storageClass, component in all_info 

578 ] 

579 

580 def _prepare_for_get( 

581 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None 

582 ) -> List[DatastoreFileGetInformation]: 

583 """Check parameters for ``get`` and obtain formatter and 

584 location. 

585 

586 Parameters 

587 ---------- 

588 ref : `DatasetRef` 

589 Reference to the required Dataset. 

590 parameters : `dict` 

591 `StorageClass`-specific parameters that specify, for example, 

592 a slice of the dataset to be loaded. 

593 

594 Returns 

595 ------- 

596 getInfo : `list` [`DatastoreFileGetInformation`] 

597 Parameters needed to retrieve each file. 

598 """ 

599 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

600 

601 # For trusted mode need to reset storage class. 

602 ref = self._cast_storage_class(ref) 

603 

604 # Get file metadata and internal metadata 

605 fileLocations = self._get_dataset_locations_info(ref) 

606 if not fileLocations: 

607 if not self.trustGetRequest: 

608 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

609 # Assume the dataset is where we think it should be 

610 fileLocations = self._get_expected_dataset_locations_info(ref) 

611 

612 # The storage class we want to use eventually 

613 refStorageClass = ref.datasetType.storageClass 

614 

615 if len(fileLocations) > 1: 

616 disassembled = True 

617 

618 # If trust is involved it is possible that there will be 

619 # components listed here that do not exist in the datastore. 

620 # Explicitly check for file artifact existence and filter out any 

621 # that are missing. 

622 if self.trustGetRequest: 

623 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

624 

625 # For now complain only if we have no components at all. One 

626 # component is probably a problem but we can punt that to the 

627 # assembler. 

628 if not fileLocations: 

629 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

630 

631 else: 

632 disassembled = False 

633 

634 # Is this a component request? 

635 refComponent = ref.datasetType.component() 

636 

637 fileGetInfo = [] 

638 for location, storedFileInfo in fileLocations: 

639 # The storage class used to write the file 

640 writeStorageClass = storedFileInfo.storageClass 

641 

642 # If this has been disassembled we need read to match the write 

643 if disassembled: 

644 readStorageClass = writeStorageClass 

645 else: 

646 readStorageClass = refStorageClass 

647 

648 formatter = get_instance_of( 

649 storedFileInfo.formatter, 

650 FileDescriptor( 

651 location, 

652 readStorageClass=readStorageClass, 

653 storageClass=writeStorageClass, 

654 parameters=parameters, 

655 ), 

656 ref.dataId, 

657 ) 

658 

659 formatterParams, notFormatterParams = formatter.segregateParameters() 

660 

661 # Of the remaining parameters, extract the ones supported by 

662 # this StorageClass (for components not all will be handled) 

663 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

664 

665 # The ref itself could be a component if the dataset was 

666 # disassembled by butler, or we disassembled in datastore and 

667 # components came from the datastore records 

668 component = storedFileInfo.component if storedFileInfo.component else refComponent 

669 

670 fileGetInfo.append( 

671 DatastoreFileGetInformation( 

672 location, 

673 formatter, 

674 storedFileInfo, 

675 assemblerParams, 

676 formatterParams, 

677 component, 

678 readStorageClass, 

679 ) 

680 ) 

681 

682 return fileGetInfo 

683 

684 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

685 """Check the arguments for ``put`` and obtain formatter and 

686 location. 

687 

688 Parameters 

689 ---------- 

690 inMemoryDataset : `object` 

691 The dataset to store. 

692 ref : `DatasetRef` 

693 Reference to the associated Dataset. 

694 

695 Returns 

696 ------- 

697 location : `Location` 

698 The location to write the dataset. 

699 formatter : `Formatter` 

700 The `Formatter` to use to write the dataset. 

701 

702 Raises 

703 ------ 

704 TypeError 

705 Supplied object and storage class are inconsistent. 

706 DatasetTypeNotSupportedError 

707 The associated `DatasetType` is not handled by this datastore. 

708 """ 

709 self._validate_put_parameters(inMemoryDataset, ref) 

710 return self._determine_put_formatter_location(ref) 

711 

712 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

713 """Calculate the formatter and output location to use for put. 

714 

715 Parameters 

716 ---------- 

717 ref : `DatasetRef` 

718 Reference to the associated Dataset. 

719 

720 Returns 

721 ------- 

722 location : `Location` 

723 The location to write the dataset. 

724 formatter : `Formatter` 

725 The `Formatter` to use to write the dataset. 

726 """ 

727 # Work out output file name 

728 try: 

729 template = self.templates.getTemplate(ref) 

730 except KeyError as e: 

731 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

732 

733 # Validate the template to protect against filenames from different 

734 # dataIds returning the same and causing overwrite confusion. 

735 template.validateTemplate(ref) 

736 

737 location = self.locationFactory.fromPath(template.format(ref)) 

738 

739 # Get the formatter based on the storage class 

740 storageClass = ref.datasetType.storageClass 

741 try: 

742 formatter = self.formatterFactory.getFormatter( 

743 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

744 ) 

745 except KeyError as e: 

746 raise DatasetTypeNotSupportedError( 

747 f"Unable to find formatter for {ref} in datastore {self.name}" 

748 ) from e 

749 

750 # Now that we know the formatter, update the location 

751 location = formatter.makeUpdatedLocation(location) 

752 

753 return location, formatter 

754 

755 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

756 # Docstring inherited from base class 

757 if transfer != "auto": 

758 return transfer 

759 

760 # See if the paths are within the datastore or not 

761 inside = [self._pathInStore(d.path) is not None for d in datasets] 

762 

763 if all(inside): 

764 transfer = None 

765 elif not any(inside): 765 ↛ 774 (line 765 didn't jump to line 774, because the condition on line 765 was never false)

766 # Allow ResourcePath to use its own knowledge 

767 transfer = "auto" 

768 else: 

769 # This can happen when importing from a datastore that 

770 # has had some datasets ingested using "direct" mode. 

771 # Also allow ResourcePath to sort it out but warn about it. 

774 log.warning( 

775 "Some datasets are inside the datastore and some are outside. Using 'split' " 

776 "transfer mode. This assumes that the files outside the datastore are " 

777 "still accessible to the new butler since they will not be copied into " 

778 "the target datastore." 

779 ) 

780 transfer = "split" 

781 

782 return transfer 

783 
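# Illustrative sketch of the "auto" transfer-mode resolution above, written
# against plain booleans; the function name is made up for the example.
def _resolve_auto_transfer(inside_datastore: list) -> str | None:
    if all(inside_datastore):
        return None      # everything is already inside the datastore
    if not any(inside_datastore):
        return "auto"    # let ResourcePath choose an appropriate transfer
    # Mixed: files inside the datastore stay where they are and files outside
    # are recorded by absolute URI rather than being copied in.
    return "split"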

784 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]: 

785 """Return path relative to datastore root 

786 

787 Parameters 

788 ---------- 

789 path : `lsst.resources.ResourcePathExpression` 

790 Path to the dataset. Can be an absolute URI. If relative, it is 

791 assumed to be relative to the datastore root. The path within the 

792 datastore is returned, or `None` if the path is outside it. 

793 

794 Returns 

795 ------- 

796 inStore : `str` 

797 Path relative to datastore root. Returns `None` if the file is 

798 outside the root. 

799 """ 

800 # Relative path will always be relative to datastore 

801 pathUri = ResourcePath(path, forceAbsolute=False) 

802 return pathUri.relative_to(self.root) 

803 

804 def _standardizeIngestPath( 

805 self, path: Union[str, ResourcePath], *, transfer: Optional[str] = None 

806 ) -> Union[str, ResourcePath]: 

807 """Standardize the path of a to-be-ingested file. 

808 

809 Parameters 

810 ---------- 

811 path : `str` or `lsst.resources.ResourcePath` 

812 Path of a file to be ingested. This parameter does not accept 

813 all of the types that can be used to construct a 

814 `~lsst.resources.ResourcePath`. 

815 transfer : `str`, optional 

816 How (and whether) the dataset should be added to the datastore. 

817 See `ingest` for details of transfer modes. 

818 This implementation is provided only so 

819 `NotImplementedError` can be raised if the mode is not supported; 

820 actual transfers are deferred to `_extractIngestInfo`. 

821 

822 Returns 

823 ------- 

824 path : `str` or `lsst.resources.ResourcePath` 

825 New path in what the datastore considers standard form. If an 

826 absolute URI was given that will be returned unchanged. 

827 

828 Notes 

829 ----- 

830 Subclasses of `FileDatastore` can implement this method instead 

831 of `_prepIngest`. It should not modify the data repository or given 

832 file in any way. 

833 

834 Raises 

835 ------ 

836 NotImplementedError 

837 Raised if the datastore does not support the given transfer mode 

838 (including the case where ingest is not supported at all). 

839 FileNotFoundError 

840 Raised if one of the given files does not exist. 

841 """ 

842 if transfer not in (None, "direct", "split") + self.root.transferModes: 

843 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

844 

845 # A relative URI indicates relative to datastore root 

846 srcUri = ResourcePath(path, forceAbsolute=False) 

847 if not srcUri.isabs(): 

848 srcUri = self.root.join(path) 

849 

850 if not srcUri.exists(): 

851 raise FileNotFoundError( 

852 f"Resource at {srcUri} does not exist; note that paths to ingest " 

853 f"are assumed to be relative to {self.root} unless they are absolute." 

854 ) 

855 

856 if transfer is None: 

857 relpath = srcUri.relative_to(self.root) 

858 if not relpath: 

859 raise RuntimeError( 

860 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

861 ) 

862 

863 # Return the relative path within the datastore for internal 

864 # transfer 

865 path = relpath 

866 

867 return path 

868 

869 def _extractIngestInfo( 

870 self, 

871 path: ResourcePathExpression, 

872 ref: DatasetRef, 

873 *, 

874 formatter: Union[Formatter, Type[Formatter]], 

875 transfer: Optional[str] = None, 

876 record_validation_info: bool = True, 

877 ) -> StoredFileInfo: 

878 """Relocate (if necessary) and extract `StoredFileInfo` from a 

879 to-be-ingested file. 

880 

881 Parameters 

882 ---------- 

883 path : `lsst.resources.ResourcePathExpression` 

884 URI or path of a file to be ingested. 

885 ref : `DatasetRef` 

886 Reference for the dataset being ingested. Guaranteed to have 

887 a ``dataset_id`` that is not `None`. 

888 formatter : `type` or `Formatter` 

889 `Formatter` subclass to use for this dataset or an instance. 

890 transfer : `str`, optional 

891 How (and whether) the dataset should be added to the datastore. 

892 See `ingest` for details of transfer modes. 

893 record_validation_info : `bool`, optional 

894 If `True`, the default, the datastore can record validation 

895 information associated with the file. If `False` the datastore 

896 will not attempt to track any information such as checksums 

897 or file sizes. This can be useful if such information is tracked 

898 in an external system or if the file is to be compressed in place. 

899 It is up to the datastore whether this parameter is relevant. 

900 

901 Returns 

902 ------- 

903 info : `StoredFileInfo` 

904 Internal datastore record for this file. This will be inserted by 

905 the caller; the `_extractIngestInfo` is only responsible for 

906 creating and populating the struct. 

907 

908 Raises 

909 ------ 

910 FileNotFoundError 

911 Raised if one of the given files does not exist. 

912 FileExistsError 

913 Raised if transfer is not `None` but the (internal) location the 

914 file would be moved to is already occupied. 

915 """ 

916 if self._transaction is None: 916 ↛ 917 (line 916 didn't jump to line 917, because the condition on line 916 was never true)

917 raise RuntimeError("Ingest called without transaction enabled") 

918 

919 # Create URI of the source path, do not need to force a relative 

920 # path to absolute. 

921 srcUri = ResourcePath(path, forceAbsolute=False) 

922 

923 # Track whether we have read the size of the source yet 

924 have_sized = False 

925 

926 tgtLocation: Optional[Location] 

927 if transfer is None or transfer == "split": 

928 # A relative path is assumed to be relative to the datastore 

929 # in this context 

930 if not srcUri.isabs(): 

931 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

932 else: 

933 # Work out the path in the datastore from an absolute URI 

934 # This is required to be within the datastore. 

935 pathInStore = srcUri.relative_to(self.root) 

936 if pathInStore is None and transfer is None: 936 ↛ 937 (line 936 didn't jump to line 937, because the condition on line 936 was never true)

937 raise RuntimeError( 

938 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

939 ) 

940 if pathInStore: 940 ↛ 942 (line 940 didn't jump to line 942, because the condition on line 940 was never false)

941 tgtLocation = self.locationFactory.fromPath(pathInStore) 

942 elif transfer == "split": 

943 # Outside the datastore but treat that as a direct ingest 

944 # instead. 

945 tgtLocation = None 

946 else: 

947 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

948 elif transfer == "direct": 

949 # Want to store the full URI to the resource directly in 

950 # datastore. This is useful for referring to permanent archive 

951 # storage for raw data. 

952 # Trust that people know what they are doing. 

953 tgtLocation = None 

954 else: 

955 # Work out the name we want this ingested file to have 

956 # inside the datastore 

957 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

958 if not tgtLocation.uri.dirname().exists(): 

959 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

960 tgtLocation.uri.dirname().mkdir() 

961 

962 # if we are transferring from a local file to a remote location 

963 # it may be more efficient to get the size and checksum of the 

964 # local file rather than the transferred one 

965 if record_validation_info and srcUri.isLocal: 

966 size = srcUri.size() 

967 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

968 have_sized = True 

969 

970 # Transfer the resource to the destination. 

971 # Allow overwrite of an existing file. This matches the behavior 

972 # of datastore.put() in that it trusts that registry would not 

973 # be asking to overwrite unless registry thought that the 

974 # overwrite was allowed. 

975 tgtLocation.uri.transfer_from( 

976 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

977 ) 

978 

979 if tgtLocation is None: 

980 # This means we are using direct mode 

981 targetUri = srcUri 

982 targetPath = str(srcUri) 

983 else: 

984 targetUri = tgtLocation.uri 

985 targetPath = tgtLocation.pathInStore.path 

986 

987 # the file should exist in the datastore now 

988 if record_validation_info: 

989 if not have_sized: 

990 size = targetUri.size() 

991 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

992 else: 

993 # Not recording any file information. 

994 size = -1 

995 checksum = None 

996 

997 return StoredFileInfo( 

998 formatter=formatter, 

999 path=targetPath, 

1000 storageClass=ref.datasetType.storageClass, 

1001 component=ref.datasetType.component(), 

1002 file_size=size, 

1003 checksum=checksum, 

1004 dataset_id=ref.getCheckedId(), 

1005 ) 

1006 

1007 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

1008 # Docstring inherited from Datastore._prepIngest. 

1009 filtered = [] 

1010 for dataset in datasets: 

1011 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

1012 if not acceptable: 

1013 continue 

1014 else: 

1015 dataset.refs = acceptable 

1016 if dataset.formatter is None: 

1017 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1018 else: 

1019 assert isinstance(dataset.formatter, (type, str)) 

1020 formatter_class = get_class_of(dataset.formatter) 

1021 if not issubclass(formatter_class, Formatter): 1021 ↛ 1022 (line 1021 didn't jump to line 1022, because the condition on line 1021 was never true)

1022 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1023 dataset.formatter = formatter_class 

1024 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1025 filtered.append(dataset) 

1026 return _IngestPrepData(filtered) 

1027 

1028 @transactional 

1029 def _finishIngest( 

1030 self, 

1031 prepData: Datastore.IngestPrepData, 

1032 *, 

1033 transfer: Optional[str] = None, 

1034 record_validation_info: bool = True, 

1035 ) -> None: 

1036 # Docstring inherited from Datastore._finishIngest. 

1037 refsAndInfos = [] 

1038 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1039 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1040 # Do ingest as if the first dataset ref is associated with the file 

1041 info = self._extractIngestInfo( 

1042 dataset.path, 

1043 dataset.refs[0], 

1044 formatter=dataset.formatter, 

1045 transfer=transfer, 

1046 record_validation_info=record_validation_info, 

1047 ) 

1048 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1049 self._register_datasets(refsAndInfos) 

1050 

1051 def _calculate_ingested_datastore_name( 

1052 self, 

1053 srcUri: ResourcePath, 

1054 ref: DatasetRef, 

1055 formatter: Formatter | Type[Formatter] | None = None, 

1056 ) -> Location: 

1057 """Given a source URI and a DatasetRef, determine the name the 

1058 dataset will have inside the datastore. 

1059 

1060 Parameters 

1061 ---------- 

1062 srcUri : `lsst.resources.ResourcePath` 

1063 URI to the source dataset file. 

1064 ref : `DatasetRef` 

1065 Ref associated with the newly-ingested dataset artifact. This 

1066 is used to determine the name within the datastore. 

1067 formatter : `Formatter` or `type` [`Formatter`], optional 

1068 Formatter to use for validation. Can be a class or an instance. 

1069 No validation of the file extension is performed if the 

1070 ``formatter`` is `None`. This can be used if the caller knows 

1071 that the source URI and target URI will use the same formatter. 

1072 

1073 Returns 

1074 ------- 

1075 location : `Location` 

1076 Target location for the newly-ingested dataset. 

1077 """ 

1078 # Ingesting a file from outside the datastore. 

1079 # This involves a new name. 

1080 template = self.templates.getTemplate(ref) 

1081 location = self.locationFactory.fromPath(template.format(ref)) 

1082 

1083 # Get the extension 

1084 ext = srcUri.getExtension() 

1085 

1086 # Update the destination to include that extension 

1087 location.updateExtension(ext) 

1088 

1089 # Ask the formatter to validate this extension 

1090 if formatter is not None: 

1091 formatter.validateExtension(location) 

1092 

1093 return location 

1094 

1095 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1096 """Write an in-memory dataset out to the datastore. 

1097 

1098 Parameters 

1099 ---------- 

1100 inMemoryDataset : `object` 

1101 Dataset to write to datastore. 

1102 ref : `DatasetRef` 

1103 Registry information associated with this dataset. 

1104 

1105 Returns 

1106 ------- 

1107 info : `StoredFileInfo` 

1108 Information describing the artifact written to the datastore. 

1109 """ 

1110 # May need to coerce the in memory dataset to the correct 

1111 # python type. 

1112 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1113 

1114 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1115 uri = location.uri 

1116 

1117 if not uri.dirname().exists(): 

1118 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1119 uri.dirname().mkdir() 

1120 

1121 if self._transaction is None: 1121 ↛ 1122 (line 1121 didn't jump to line 1122, because the condition on line 1121 was never true)

1122 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1123 

1124 def _removeFileExists(uri: ResourcePath) -> None: 

1125 """Remove a file and do not complain if it is not there. 

1126 

1127 This is important since a formatter might fail before the file 

1128 is written and we should not confuse people by writing spurious 

1129 error messages to the log. 

1130 """ 

1131 try: 

1132 uri.remove() 

1133 except FileNotFoundError: 

1134 pass 

1135 

1136 # Register a callback to try to delete the uploaded data if 

1137 # something fails below 

1138 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1139 

1140 data_written = False 

1141 if not uri.isLocal: 

1142 # This is a remote URI. Some datasets can be serialized directly 

1143 # to bytes and sent to the remote datastore without writing a 

1144 # file. If the dataset is intended to be saved to the cache 

1145 # a file is always written and direct write to the remote 

1146 # datastore is bypassed. 

1147 if not self.cacheManager.should_be_cached(ref): 

1148 try: 

1149 serializedDataset = formatter.toBytes(inMemoryDataset) 

1150 except NotImplementedError: 

1151 # Fallback to the file writing option. 

1152 pass 

1153 except Exception as e: 

1154 raise RuntimeError( 

1155 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1156 ) from e 

1157 else: 

1158 log.debug("Writing bytes directly to %s", uri) 

1159 uri.write(serializedDataset, overwrite=True) 

1160 log.debug("Successfully wrote bytes directly to %s", uri) 

1161 data_written = True 

1162 

1163 if not data_written: 

1164 # Did not write the bytes directly to object store so instead 

1165 # write to temporary file. Always write to a temporary even if 

1166 # using a local file system -- that gives us atomic writes. 

1167 # If a process is killed as the file is being written we do not 

1168 # want it to remain in the correct place but in corrupt state. 

1169 # For local files write to the output directory not temporary dir. 

1170 prefix = uri.dirname() if uri.isLocal else None 

1171 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri: 

1172 # Need to configure the formatter to write to a different 

1173 # location and that needs us to overwrite internals 

1174 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1175 with formatter._updateLocation(Location(None, temporary_uri)): 

1176 try: 

1177 formatter.write(inMemoryDataset) 

1178 except Exception as e: 

1179 raise RuntimeError( 

1180 f"Failed to serialize dataset {ref} of type" 

1181 f" {type(inMemoryDataset)} to " 

1182 f"temporary location {temporary_uri}" 

1183 ) from e 

1184 

1185 # Use move for a local file since that becomes an efficient 

1186 # os.rename. For remote resources we use copy to allow the 

1187 # file to be cached afterwards. 

1188 transfer = "move" if uri.isLocal else "copy" 

1189 

1190 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True) 

1191 

1192 if transfer == "copy": 

1193 # Cache if required 

1194 self.cacheManager.move_to_cache(temporary_uri, ref) 

1195 

1196 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1197 

1198 # URI is needed to resolve what ingest case are we dealing with 

1199 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1200 
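# Illustrative sketch (standard library only, not the butler code path) of the
# temporary-file pattern used above for local writes: write to a temporary
# file next to the destination, then rename into place so a partially written
# file is never left at the final location.  The function name is made up.
import os
import tempfile

def _atomic_write_sketch(path: str, data: bytes) -> None:
    fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path) or ".")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        os.replace(tmp, path)  # effectively the "move" transfer for local files
    except BaseException:
        os.unlink(tmp)
        raise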

1201 def _read_artifact_into_memory( 

1202 self, 

1203 getInfo: DatastoreFileGetInformation, 

1204 ref: DatasetRef, 

1205 isComponent: bool = False, 

1206 cache_ref: Optional[DatasetRef] = None, 

1207 ) -> Any: 

1208 """Read the artifact from the datastore into an in-memory object. 

1209 

1210 Parameters 

1211 ---------- 

1212 getInfo : `DatastoreFileGetInformation` 

1213 Information about the artifact within the datastore. 

1214 ref : `DatasetRef` 

1215 The registry information associated with this artifact. 

1216 isComponent : `bool` 

1217 Flag to indicate if a component is being read from this artifact. 

1218 cache_ref : `DatasetRef`, optional 

1219 The DatasetRef to use when looking up the file in the cache. 

1220 This ref must have the same ID as the supplied ref but can 

1221 be a parent ref or component ref to indicate to the cache whether 

1222 a composite file is being requested from the cache or a component 

1223 file. Without this the cache will default to the supplied ref but 

1224 it can get confused with read-only derived components for 

1225 disassembled composites. 

1226 

1227 Returns 

1228 ------- 

1229 inMemoryDataset : `object` 

1230 The artifact as a python object. 

1231 """ 

1232 location = getInfo.location 

1233 uri = location.uri 

1234 log.debug("Accessing data from %s", uri) 

1235 

1236 if cache_ref is None: 

1237 cache_ref = ref 

1238 if cache_ref.id != ref.id: 1238 ↛ 1239 (line 1238 didn't jump to line 1239, because the condition on line 1238 was never true)

1239 raise ValueError( 

1240 "The supplied cache dataset ref refers to a different dataset than expected:" 

1241 f" {ref.id} != {cache_ref.id}" 

1242 ) 

1243 

1244 # Cannot recalculate checksum but can compare size as a quick check 

1245 # Do not do this if the size is negative since that indicates 

1246 # we do not know. 

1247 recorded_size = getInfo.info.file_size 

1248 resource_size = uri.size() 

1249 if recorded_size >= 0 and resource_size != recorded_size: 1249 ↛ 1250 (line 1249 didn't jump to line 1250, because the condition on line 1249 was never true)

1250 raise RuntimeError( 

1251 "Integrity failure in Datastore. " 

1252 f"Size of file {uri} ({resource_size}) " 

1253 f"does not match size recorded in registry of {recorded_size}" 

1254 ) 

1255 

1256 # For the general case we have choices for how to proceed. 

1257 # 1. Always use a local file (downloading the remote resource to a 

1258 # temporary file if needed). 

1259 # 2. Use a threshold size and read into memory and use bytes. 

1260 # Use both for now with an arbitrary hand off size. 

1261 # This allows small datasets to be downloaded from remote object 

1262 # stores without requiring a temporary file. 

1263 

1264 formatter = getInfo.formatter 

1265 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1266 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1267 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1268 if cached_file is not None: 

1269 desired_uri = cached_file 

1270 msg = f" (cached version of {uri})" 

1271 else: 

1272 desired_uri = uri 

1273 msg = "" 

1274 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1275 serializedDataset = desired_uri.read() 

1276 log.debug( 

1277 "Deserializing %s from %d bytes from location %s with formatter %s", 

1278 f"component {getInfo.component}" if isComponent else "", 

1279 len(serializedDataset), 

1280 uri, 

1281 formatter.name(), 

1282 ) 

1283 try: 

1284 result = formatter.fromBytes( 

1285 serializedDataset, component=getInfo.component if isComponent else None 

1286 ) 

1287 except Exception as e: 

1288 raise ValueError( 

1289 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1290 f" ({ref.datasetType.name} from {uri}): {e}" 

1291 ) from e 

1292 else: 

1293 # Read from file. 

1294 

1295 # Have to update the Location associated with the formatter 

1296 # because formatter.read does not allow an override. 

1297 # This could be improved. 

1298 location_updated = False 

1299 msg = "" 

1300 

1301 # First check in cache for local version. 

1302 # The cache will only be relevant for remote resources but 

1303 # no harm in always asking. Context manager ensures that cache 

1304 # file is not deleted during cache expiration. 

1305 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1306 if cached_file is not None: 

1307 msg = f"(via cache read of remote file {uri})" 

1308 uri = cached_file 

1309 location_updated = True 

1310 

1311 with uri.as_local() as local_uri: 

1312 can_be_cached = False 

1313 if uri != local_uri: 1313 ↛ 1315 (line 1313 didn't jump to line 1315, because the condition on line 1313 was never true)

1314 # URI was remote and file was downloaded 

1315 cache_msg = "" 

1316 location_updated = True 

1317 

1318 if self.cacheManager.should_be_cached(cache_ref): 

1319 # In this scenario we want to ask if the downloaded 

1320 # file should be cached but we should not cache 

1321 # it until after we've used it (to ensure it can't 

1322 # be expired whilst we are using it). 

1323 can_be_cached = True 

1324 

1325 # Say that it is "likely" to be cached because 

1326 # if the formatter read fails we will not be 

1327 # caching this file. 

1328 cache_msg = " and likely cached" 

1329 

1330 msg = f"(via download to local file{cache_msg})" 

1331 

1332 # Calculate the (possibly) new location for the formatter 

1333 # to use. 

1334 newLocation = Location(*local_uri.split()) if location_updated else None 

1335 

1336 log.debug( 

1337 "Reading%s from location %s %s with formatter %s", 

1338 f" component {getInfo.component}" if isComponent else "", 

1339 uri, 

1340 msg, 

1341 formatter.name(), 

1342 ) 

1343 try: 

1344 with formatter._updateLocation(newLocation): 

1345 with time_this( 

1346 log, 

1347 msg="Reading%s from location %s %s with formatter %s", 

1348 args=( 

1349 f" component {getInfo.component}" if isComponent else "", 

1350 uri, 

1351 msg, 

1352 formatter.name(), 

1353 ), 

1354 ): 

1355 result = formatter.read(component=getInfo.component if isComponent else None) 

1356 except Exception as e: 

1357 raise ValueError( 

1358 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1359 f" ({ref.datasetType.name} from {uri}): {e}" 

1360 ) from e 

1361 

1362 # File was read successfully so can move to cache 

1363 if can_be_cached: 1363 ↛ 1364 (line 1363 didn't jump to line 1364, because the condition on line 1363 was never true)

1364 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1365 

1366 return self._post_process_get( 

1367 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent 

1368 ) 

1369 
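# Illustrative sketch of the read-path decision above: artifacts at or below
# the (tunable) size threshold whose formatter supports reading from bytes are
# fetched as bytes; everything else goes through a local file, possibly via
# the cache.  The function name is made up for the example.
def _choose_read_path(resource_size: int, can_read_bytes: bool,
                      nbytes_max: int = 10_000_000) -> str:
    if resource_size <= nbytes_max and can_read_bytes:
        return "bytes"
    return "local-file"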

1370 def knows(self, ref: DatasetRef) -> bool: 

1371 """Check if the dataset is known to the datastore. 

1372 

1373 Does not check for existence of any artifact. 

1374 

1375 Parameters 

1376 ---------- 

1377 ref : `DatasetRef` 

1378 Reference to the required dataset. 

1379 

1380 Returns 

1381 ------- 

1382 exists : `bool` 

1383 `True` if the dataset is known to the datastore. 

1384 """ 

1385 fileLocations = self._get_dataset_locations_info(ref) 

1386 if fileLocations: 

1387 return True 

1388 return False 

1389 

1390 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

1391 # Docstring inherited from the base class. 

1392 

1393 # The records themselves. Could be missing some entries. 

1394 records = self._get_stored_records_associated_with_refs(refs) 

1395 

1396 return {ref: ref.id in records for ref in refs} 

1397 

1398 def _process_mexists_records( 

1399 self, 

1400 id_to_ref: Dict[DatasetId, DatasetRef], 

1401 records: Dict[DatasetId, List[StoredFileInfo]], 

1402 all_required: bool, 

1403 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

1404 ) -> Dict[DatasetRef, bool]: 

1405 """Helper function for mexists that checks the given records. 

1406 

1407 Parameters 

1408 ---------- 

1409 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1410 Mapping of the dataset ID to the dataset ref itself. 

1411 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1412 Records as generally returned by 

1413 ``_get_stored_records_associated_with_refs``. 

1414 all_required : `bool` 

1415 Flag to indicate whether all artifacts associated with a dataset 

1416 ID must exist for the dataset to be considered to exist. 

1417 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1418 Optional mapping of datastore artifact to existence. Updated by 

1419 this method with details of all artifacts tested. Can be `None` 

1420 if the caller is not interested. 

1421 

1422 Returns 

1423 ------- 

1424 existence : `dict` of [`DatasetRef`, `bool`] 

1425 Mapping from dataset to boolean indicating existence. 

1426 """ 

1427 # The URIs to be checked and a mapping of those URIs to 

1428 # the dataset ID. 

1429 uris_to_check: List[ResourcePath] = [] 

1430 location_map: Dict[ResourcePath, DatasetId] = {} 

1431 

1432 location_factory = self.locationFactory 

1433 

1434 uri_existence: Dict[ResourcePath, bool] = {} 

1435 for ref_id, infos in records.items(): 

1436 # Key is the dataset Id, value is list of StoredItemInfo 

1437 uris = [info.file_location(location_factory).uri for info in infos] 

1438 location_map.update({uri: ref_id for uri in uris}) 

1439 

1440 # Check the local cache directly for a dataset corresponding 

1441 # to the remote URI. 

1442 if self.cacheManager.file_count > 0: 1442 ↛ 1443 (line 1442 didn't jump to line 1443, because the condition on line 1442 was never true)

1443 ref = id_to_ref[ref_id] 

1444 for uri, storedFileInfo in zip(uris, infos): 

1445 check_ref = ref 

1446 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1447 check_ref = ref.makeComponentRef(component) 

1448 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1449 # Proxy for URI existence. 

1450 uri_existence[uri] = True 

1451 else: 

1452 uris_to_check.append(uri) 

1453 else: 

1454 # Check all of them. 

1455 uris_to_check.extend(uris) 

1456 

1457 if artifact_existence is not None: 

1458 # If a URI has already been checked remove it from the list 

1459 # and immediately add the status to the output dict. 

1460 filtered_uris_to_check = [] 

1461 for uri in uris_to_check: 

1462 if uri in artifact_existence: 

1463 uri_existence[uri] = artifact_existence[uri] 

1464 else: 

1465 filtered_uris_to_check.append(uri) 

1466 uris_to_check = filtered_uris_to_check 

1467 

1468 # Results. 

1469 dataset_existence: Dict[DatasetRef, bool] = {} 

1470 

1471 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1472 for uri, exists in uri_existence.items(): 

1473 dataset_id = location_map[uri] 

1474 ref = id_to_ref[dataset_id] 

1475 

1476 # Disassembled composite needs to check all locations. 

1477 # all_required indicates whether all need to exist or not. 

1478 if ref in dataset_existence: 

1479 if all_required: 

1480 exists = dataset_existence[ref] and exists 

1481 else: 

1482 exists = dataset_existence[ref] or exists 

1483 dataset_existence[ref] = exists 

1484 

1485 if artifact_existence is not None: 

1486 artifact_existence.update(uri_existence) 

1487 

1488 return dataset_existence 

1489 
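The `all_required` flag controls how per-artifact results are folded into a single answer per dataset: with `all_required=True` every artifact of a disassembled composite must exist, otherwise any one artifact is enough. A minimal, standalone sketch of that folding logic (plain Python, no butler objects involved):

def fold_existence(per_artifact, all_required):
    # Mirrors the and/or accumulation used above for disassembled composites.
    exists = per_artifact[0]
    for artifact_exists in per_artifact[1:]:
        exists = (exists and artifact_exists) if all_required else (exists or artifact_exists)
    return exists


assert fold_existence([True, False, True], all_required=True) is False
assert fold_existence([True, False, True], all_required=False) is True
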

1490 def mexists( 

1491 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1492 ) -> Dict[DatasetRef, bool]: 

1493 """Check the existence of multiple datasets at once. 

1494 

1495 Parameters 

1496 ---------- 

1497 refs : iterable of `DatasetRef` 

1498 The datasets to be checked. 

1499 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1500 Optional mapping of datastore artifact to existence. Updated by 

1501 this method with details of all artifacts tested. Can be `None` 

1502 if the caller is not interested. 

1503 

1504 Returns 

1505 ------- 

1506 existence : `dict` of [`DatasetRef`, `bool`] 

1507 Mapping from dataset to boolean indicating existence. 

1508 

1509 Notes 

1510 ----- 

1511 To minimize potentially costly remote existence checks, the local 

1512 cache is checked as a proxy for existence. If a file for this 

1513 `DatasetRef` is found in the cache, no check is done for the actual URI. This 

1514 could result in possibly unexpected behavior if the dataset itself 

1515 has been removed from the datastore by another process whilst it is 

1516 still in the cache. 

1517 """ 

1518 chunk_size = 10_000 

1519 dataset_existence: Dict[DatasetRef, bool] = {} 

1520 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1521 n_found_total = 0 

1522 n_checked = 0 

1523 n_chunks = 0 

1524 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1525 chunk_result = self._mexists(chunk, artifact_existence) 

1526 if log.isEnabledFor(VERBOSE): 

1527 n_results = len(chunk_result) 

1528 n_checked += n_results 

1529 # Can treat the booleans as 0, 1 integers and sum them. 

1530 n_found = sum(chunk_result.values()) 

1531 n_found_total += n_found 

1532 log.verbose( 

1533 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)", 

1534 n_chunks, 

1535 n_found, 

1536 n_results, 

1537 n_found_total, 

1538 n_checked, 

1539 ) 

1540 dataset_existence.update(chunk_result) 

1541 n_chunks += 1 

1542 

1543 return dataset_existence 

1544 
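A usage sketch for `mexists`. Passing in an `artifact_existence` dict lets a caller reuse the per-URI results across later calls (for example a subsequent `transfer_from`), avoiding repeated remote checks. The `datastore` and `refs` names are placeholders for an existing `FileDatastore` and resolved refs.

from typing import Dict, List

from lsst.daf.butler import DatasetRef
from lsst.resources import ResourcePath


def find_missing(datastore, refs) -> List[DatasetRef]:
    artifact_existence: Dict[ResourcePath, bool] = {}
    existence = datastore.mexists(refs, artifact_existence=artifact_existence)
    # artifact_existence now holds the result for every URI actually probed
    # and can be handed to a later call to avoid repeating remote checks.
    return [ref for ref, exists in existence.items() if not exists]
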

1545 def _mexists( 

1546 self, refs: Sequence[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1547 ) -> Dict[DatasetRef, bool]: 

1548 """Check the existence of multiple datasets at once. 

1549 

1550 Parameters 

1551 ---------- 

1552 refs : iterable of `DatasetRef` 

1553 The datasets to be checked. 

1554 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1555 Optional mapping of datastore artifact to existence. Updated by 

1556 this method with details of all artifacts tested. Can be `None` 

1557 if the caller is not interested. 

1558 

1559 Returns 

1560 ------- 

1561 existence : `dict` of [`DatasetRef`, `bool`] 

1562 Mapping from dataset to boolean indicating existence. 

1563 """ 

1564 # Need a mapping of dataset_id to dataset ref since the API 

1565 # works with dataset_id 

1566 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

1567 

1568 # Set of all IDs we are checking for. 

1569 requested_ids = set(id_to_ref.keys()) 

1570 

1571 # The records themselves. Could be missing some entries. 

1572 records = self._get_stored_records_associated_with_refs(refs) 

1573 

1574 dataset_existence = self._process_mexists_records( 

1575 id_to_ref, records, True, artifact_existence=artifact_existence 

1576 ) 

1577 

1578 # Set of IDs that have been handled. 

1579 handled_ids = {ref.id for ref in dataset_existence.keys()} 

1580 

1581 missing_ids = requested_ids - handled_ids 

1582 if missing_ids: 

1583 dataset_existence.update( 

1584 self._mexists_check_expected( 

1585 [id_to_ref[missing] for missing in missing_ids], artifact_existence 

1586 ) 

1587 ) 

1588 

1589 return dataset_existence 

1590 

1591 def _mexists_check_expected( 

1592 self, refs: Sequence[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1593 ) -> Dict[DatasetRef, bool]: 

1594 """Check existence of refs that are not known to datastore. 

1595 

1596 Parameters 

1597 ---------- 

1598 refs : iterable of `DatasetRef` 

1599 The datasets to be checked. These are assumed not to be known 

1600 to datastore. 

1601 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1602 Optional mapping of datastore artifact to existence. Updated by 

1603 this method with details of all artifacts tested. Can be `None` 

1604 if the caller is not interested. 

1605 

1606 Returns 

1607 ------- 

1608 existence : `dict` of [`DatasetRef`, `bool`] 

1609 Mapping from dataset to boolean indicating existence. 

1610 """ 

1611 dataset_existence: Dict[DatasetRef, bool] = {} 

1612 if not self.trustGetRequest: 

1613 # Must assume these do not exist 

1614 for ref in refs: 

1615 dataset_existence[ref] = False 

1616 else: 

1617 log.debug( 

1618 "%d datasets were not known to datastore during initial existence check.", 

1619 len(refs), 

1620 ) 

1621 

1622 # Construct data structure identical to that returned 

1623 # by _get_stored_records_associated_with_refs() but using 

1624 # guessed names. 

1625 records = {} 

1626 id_to_ref = {} 

1627 for missing_ref in refs: 

1628 expected = self._get_expected_dataset_locations_info(missing_ref) 

1629 dataset_id = missing_ref.getCheckedId() 

1630 records[dataset_id] = [info for _, info in expected] 

1631 id_to_ref[dataset_id] = missing_ref 

1632 

1633 dataset_existence.update( 

1634 self._process_mexists_records( 

1635 id_to_ref, 

1636 records, 

1637 False, 

1638 artifact_existence=artifact_existence, 

1639 ) 

1640 ) 

1641 

1642 return dataset_existence 

1643 

1644 def exists(self, ref: DatasetRef) -> bool: 

1645 """Check if the dataset exists in the datastore. 

1646 

1647 Parameters 

1648 ---------- 

1649 ref : `DatasetRef` 

1650 Reference to the required dataset. 

1651 

1652 Returns 

1653 ------- 

1654 exists : `bool` 

1655 `True` if the entity exists in the `Datastore`. 

1656 

1657 Notes 

1658 ----- 

1659 The local cache is checked as a proxy for existence in the remote 

1660 object store. It is possible that another process on a different 

1661 compute node could remove the file from the object store even 

1662 though it is present in the local cache. 

1663 """ 

1664 ref = self._cast_storage_class(ref) 

1665 fileLocations = self._get_dataset_locations_info(ref) 

1666 

1667 # If we are being asked to trust that the registry might not be 

1668 # correct, we ask for the expected locations and check them explicitly. 

1669 if not fileLocations: 

1670 if not self.trustGetRequest: 

1671 return False 

1672 

1673 # First check the cache. If it is not found we must check 

1674 # the datastore itself. Assume that any component in the cache 

1675 # means that the dataset does exist somewhere. 

1676 if self.cacheManager.known_to_cache(ref): 1676 ↛ 1677line 1676 didn't jump to line 1677, because the condition on line 1676 was never true

1677 return True 

1678 

1679 # When we are guessing a dataset location we can not check 

1680 # for the existence of every component since we can not 

1681 # know if every component was written. Instead we check 

1682 # for the existence of any of the expected locations. 

1683 for location, _ in self._get_expected_dataset_locations_info(ref): 

1684 if self._artifact_exists(location): 

1685 return True 

1686 return False 

1687 

1688 # All listed artifacts must exist. 

1689 for location, storedFileInfo in fileLocations: 

1690 # Checking in cache needs the component ref. 

1691 check_ref = ref 

1692 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1693 check_ref = ref.makeComponentRef(component) 

1694 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1695 continue 

1696 

1697 if not self._artifact_exists(location): 1697 ↛ 1698line 1697 didn't jump to line 1698, because the condition on line 1697 was never true

1698 return False 

1699 

1700 return True 

1701 

1702 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

1703 """Return URIs associated with dataset. 

1704 

1705 Parameters 

1706 ---------- 

1707 ref : `DatasetRef` 

1708 Reference to the required dataset. 

1709 predict : `bool`, optional 

1710 If the datastore does not know about the dataset, should it 

1711 return a predicted URI or not? 

1712 

1713 Returns 

1714 ------- 

1715 uris : `DatasetRefURIs` 

1716 The URI to the primary artifact associated with this dataset (if 

1717 the dataset was disassembled within the datastore this may be 

1718 `None`), and the URIs to any components associated with the dataset 

1719 artifact (this mapping can be empty if there are no components). 

1720 """ 

1721 many = self.getManyURIs([ref], predict=predict, allow_missing=False) 

1722 return many[ref] 

1723 

1724 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1725 """URI to the Dataset. 

1726 

1727 Parameters 

1728 ---------- 

1729 ref : `DatasetRef` 

1730 Reference to the required Dataset. 

1731 predict : `bool` 

1732 If `True`, allow URIs to be returned of datasets that have not 

1733 been written. 

1734 

1735 Returns 

1736 ------- 

1737 uri : `lsst.resources.ResourcePath` 

1738 URI pointing to the dataset within the datastore. If the 

1739 dataset does not exist in the datastore, and if ``predict`` is 

1740 `True`, the URI will be a prediction and will include a URI 

1741 fragment "#predicted". 

1742 If the datastore does not have entities that relate well 

1743 to the concept of a URI the returned URI will be 

1744 descriptive. The returned URI is not guaranteed to be obtainable. 

1745 

1746 Raises 

1747 ------ 

1748 FileNotFoundError 

1749 Raised if a URI has been requested for a dataset that does not 

1750 exist and guessing is not allowed. 

1751 RuntimeError 

1752 Raised if a request is made for a single URI but multiple URIs 

1753 are associated with this dataset. 

1754 

1755 Notes 

1756 ----- 

1757 When a predicted URI is requested an attempt will be made to form 

1758 a reasonable URI based on file templates and the expected formatter. 

1759 """ 

1760 primary, components = self.getURIs(ref, predict) 

1761 if primary is None or components: 1761 ↛ 1762line 1761 didn't jump to line 1762, because the condition on line 1761 was never true

1762 raise RuntimeError( 

1763 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1764 ) 

1765 return primary 

1766 
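A sketch of predicted-URI usage: with `predict=True` a URI can be returned even for a dataset that has not been written yet, marked with a "#predicted" fragment. The `datastore` and `ref` names are placeholders as before.

def describe_location(datastore, ref):
    try:
        uri = datastore.getURI(ref)
        print(f"Stored at {uri}")
    except FileNotFoundError:
        # Not written yet: ask for a prediction instead. The returned URI
        # carries a "#predicted" fragment and may never come to exist.
        uri = datastore.getURI(ref, predict=True)
        print(f"Would be written to {uri}")
    return uri
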

1767 def _predict_URIs( 

1768 self, 

1769 ref: DatasetRef, 

1770 ) -> DatasetRefURIs: 

1771 """Predict the URIs of a dataset ref. 

1772 

1773 Parameters 

1774 ---------- 

1775 ref : `DatasetRef` 

1776 Reference to the required Dataset. 

1777 

1778 Returns 

1779 ------- 

1780 uris : `DatasetRefURIs` 

1781 Primary and component URIs. URIs will contain a URI fragment 

1782 "#predicted". 

1783 """ 

1784 uris = DatasetRefURIs() 

1785 

1786 if self.composites.shouldBeDisassembled(ref): 

1787 for component, _ in ref.datasetType.storageClass.components.items(): 

1788 comp_ref = ref.makeComponentRef(component) 

1789 comp_location, _ = self._determine_put_formatter_location(comp_ref) 

1790 

1791 # Add the "#predicted" URI fragment to indicate this is a 

1792 # guess 

1793 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted") 

1794 

1795 else: 

1796 location, _ = self._determine_put_formatter_location(ref) 

1797 

1798 # Add the "#predicted" URI fragment to indicate this is a guess 

1799 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted") 

1800 

1801 return uris 

1802 

1803 def getManyURIs( 

1804 self, 

1805 refs: Iterable[DatasetRef], 

1806 predict: bool = False, 

1807 allow_missing: bool = False, 

1808 ) -> Dict[DatasetRef, DatasetRefURIs]: 

1809 # Docstring inherited 

1810 

1811 uris: Dict[DatasetRef, DatasetRefURIs] = {} 

1812 

1813 records = self._get_stored_records_associated_with_refs(refs) 

1814 records_keys = records.keys() 

1815 

1816 existing_refs = tuple(ref for ref in refs if ref.id in records_keys) 

1817 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys) 

1818 

1819 # Have to handle trustGetRequest mode by checking for the existence 

1820 # of the missing refs on disk. 

1821 if missing_refs: 

1822 dataset_existence = self._mexists_check_expected(missing_refs, None) 

1823 really_missing = set() 

1824 not_missing = set() 

1825 for ref, exists in dataset_existence.items(): 

1826 if exists: 

1827 not_missing.add(ref) 

1828 else: 

1829 really_missing.add(ref) 

1830 

1831 if not_missing: 

1832 # Need to recalculate the missing/existing split. 

1833 existing_refs = existing_refs + tuple(not_missing) 

1834 missing_refs = tuple(really_missing) 

1835 

1836 for ref in missing_refs: 

1837 # if this has never been written then we have to guess 

1838 if not predict: 

1839 if not allow_missing: 

1840 raise FileNotFoundError("Dataset {} not in this datastore.".format(ref)) 

1841 else: 

1842 uris[ref] = self._predict_URIs(ref) 

1843 

1844 for ref in existing_refs: 

1845 file_infos = records[ref.getCheckedId()] 

1846 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos] 

1847 uris[ref] = self._locations_to_URI(ref, file_locations) 

1848 

1849 return uris 

1850 

1851 def _locations_to_URI( 

1852 self, 

1853 ref: DatasetRef, 

1854 file_locations: Sequence[Tuple[Location, StoredFileInfo]], 

1855 ) -> DatasetRefURIs: 

1856 """Convert one or more file locations associated with a DatasetRef 

1857 to a DatasetRefURIs. 

1858 

1859 Parameters 

1860 ---------- 

1861 ref : `DatasetRef` 

1862 Reference to the dataset. 

1863 file_locations : Sequence[Tuple[Location, StoredFileInfo]] 

1864 Each item in the sequence is the location of the dataset within the 

1865 datastore and stored information about the file and its formatter. 

1866 If there is only one item in the sequence then it is treated as the 

1867 primary URI. If there is more than one item then they are treated 

1868 as component URIs. If there are no items then an error is raised 

1869 unless ``self.trustGetRequest`` is `True`. 

1870 

1871 Returns 

1872 ------- 

1873 uris : `DatasetRefURIs` 

1874 Represents the primary URI or component URIs described by the 

1875 inputs. 

1876 

1877 Raises 

1878 ------ 

1879 RuntimeError 

1880 If no file locations are passed in and ``self.trustGetRequest`` is 

1881 `False`. 

1882 FileNotFoundError 

1883 If a passed-in URI does not exist, and ``self.trustGetRequest`` 

1884 is `False`. 

1885 RuntimeError 

1886 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is 

1887 unexpected). 

1888 """ 

1889 

1890 guessing = False 

1891 uris = DatasetRefURIs() 

1892 

1893 if not file_locations: 

1894 if not self.trustGetRequest: 1894 ↛ 1895line 1894 didn't jump to line 1895, because the condition on line 1894 was never true

1895 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1896 file_locations = self._get_expected_dataset_locations_info(ref) 

1897 guessing = True 

1898 

1899 if len(file_locations) == 1: 

1900 # No disassembly so this is the primary URI 

1901 uris.primaryURI = file_locations[0][0].uri 

1902 if guessing and not uris.primaryURI.exists(): 1902 ↛ 1903line 1902 didn't jump to line 1903, because the condition on line 1902 was never true

1903 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist") 

1904 else: 

1905 for location, file_info in file_locations: 

1906 if file_info.component is None: 1906 ↛ 1907line 1906 didn't jump to line 1907, because the condition on line 1906 was never true

1907 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1908 if guessing and not location.uri.exists(): 1908 ↛ 1912line 1908 didn't jump to line 1912, because the condition on line 1908 was never true

1909 # If we are trusting then it is entirely possible for 

1910 # some components to be missing. In that case we skip 

1911 # to the next component. 

1912 if self.trustGetRequest: 

1913 continue 

1914 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist") 

1915 uris.componentURIs[file_info.component] = location.uri 

1916 

1917 return uris 

1918 

1919 def retrieveArtifacts( 

1920 self, 

1921 refs: Iterable[DatasetRef], 

1922 destination: ResourcePath, 

1923 transfer: str = "auto", 

1924 preserve_path: bool = True, 

1925 overwrite: bool = False, 

1926 ) -> List[ResourcePath]: 

1927 """Retrieve the file artifacts associated with the supplied refs. 

1928 

1929 Parameters 

1930 ---------- 

1931 refs : iterable of `DatasetRef` 

1932 The datasets for which file artifacts are to be retrieved. 

1933 A single ref can result in multiple files. The refs must 

1934 be resolved. 

1935 destination : `lsst.resources.ResourcePath` 

1936 Location to write the file artifacts. 

1937 transfer : `str`, optional 

1938 Method to use to transfer the artifacts. Must be one of the options 

1939 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1940 "move" is not allowed. 

1941 preserve_path : `bool`, optional 

1942 If `True` the full path of the file artifact within the datastore 

1943 is preserved. If `False` the final file component of the path 

1944 is used. 

1945 overwrite : `bool`, optional 

1946 If `True` allow transfers to overwrite existing files at the 

1947 destination. 

1948 

1949 Returns 

1950 ------- 

1951 targets : `list` of `lsst.resources.ResourcePath` 

1952 URIs of file artifacts in destination location. Order is not 

1953 preserved. 

1954 """ 

1955 if not destination.isdir(): 1955 ↛ 1956line 1955 didn't jump to line 1956, because the condition on line 1955 was never true

1956 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1957 

1958 if transfer == "move": 

1959 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1960 

1961 # Source -> Destination 

1962 # This also helps filter out duplicate DatasetRef in the request 

1963 # that will map to the same underlying file transfer. 

1964 to_transfer: Dict[ResourcePath, ResourcePath] = {} 

1965 

1966 for ref in refs: 

1967 locations = self._get_dataset_locations_info(ref) 

1968 for location, _ in locations: 

1969 source_uri = location.uri 

1970 target_path: ResourcePathExpression 

1971 if preserve_path: 

1972 target_path = location.pathInStore 

1973 if target_path.isabs(): 1973 ↛ 1976line 1973 didn't jump to line 1976, because the condition on line 1973 was never true

1974 # This is an absolute path to an external file. 

1975 # Use the full path. 

1976 target_path = target_path.relativeToPathRoot 

1977 else: 

1978 target_path = source_uri.basename() 

1979 target_uri = destination.join(target_path) 

1980 to_transfer[source_uri] = target_uri 

1981 

1982 # In theory can now parallelize the transfer 

1983 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

1984 for source_uri, target_uri in to_transfer.items(): 

1985 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1986 

1987 return list(to_transfer.values()) 

1988 
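A usage sketch for `retrieveArtifacts`: copy the file artifacts backing some refs into a local directory, preserving the in-datastore paths. The directory name is illustrative; `datastore` and `refs` are placeholders as before.

from lsst.resources import ResourcePath


def extract_artifacts(datastore, refs, directory="extracted_files"):
    destination = ResourcePath(directory, forceDirectory=True)
    destination.mkdir()  # destination must be a directory that exists
    return datastore.retrieveArtifacts(
        refs,
        destination,
        transfer="copy",  # "move" is explicitly rejected by the method
        preserve_path=True,
        overwrite=False,
    )
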

1989 def get( 

1990 self, 

1991 ref: DatasetRef, 

1992 parameters: Optional[Mapping[str, Any]] = None, 

1993 storageClass: Optional[Union[StorageClass, str]] = None, 

1994 ) -> Any: 

1995 """Load an InMemoryDataset from the store. 

1996 

1997 Parameters 

1998 ---------- 

1999 ref : `DatasetRef` 

2000 Reference to the required Dataset. 

2001 parameters : `dict` 

2002 `StorageClass`-specific parameters that specify, for example, 

2003 a slice of the dataset to be loaded. 

2004 storageClass : `StorageClass` or `str`, optional 

2005 The storage class to be used to override the Python type 

2006 returned by this method. By default the returned type matches 

2007 the dataset type definition for this dataset. Specifying a 

2008 read `StorageClass` can force a different type to be returned. 

2009 This type must be compatible with the original type. 

2010 

2011 Returns 

2012 ------- 

2013 inMemoryDataset : `object` 

2014 Requested dataset or slice thereof as an InMemoryDataset. 

2015 

2016 Raises 

2017 ------ 

2018 FileNotFoundError 

2019 Requested dataset can not be retrieved. 

2020 TypeError 

2021 Return value from formatter has unexpected type. 

2022 ValueError 

2023 Formatter failed to process the dataset. 

2024 """ 

2025 # Supplied storage class for the component being read is either 

2026 # from the ref itself or from an override if we want to force 

2027 # type conversion. 

2028 if storageClass is not None: 

2029 ref = ref.overrideStorageClass(storageClass) 

2030 refStorageClass = ref.datasetType.storageClass 

2031 

2032 allGetInfo = self._prepare_for_get(ref, parameters) 

2033 refComponent = ref.datasetType.component() 

2034 

2035 # Create mapping from component name to related info 

2036 allComponents = {i.component: i for i in allGetInfo} 

2037 

2038 # By definition the dataset is disassembled if we have more 

2039 # than one record for it. 

2040 isDisassembled = len(allGetInfo) > 1 

2041 

2042 # Look for the special case where we are disassembled but the 

2043 # component is a derived component that was not written during 

2044 # disassembly. For this scenario we need to check that the 

2045 # component requested is listed as a derived component for the 

2046 # composite storage class 

2047 isDisassembledReadOnlyComponent = False 

2048 if isDisassembled and refComponent: 

2049 # The composite storage class should be accessible through 

2050 # the component dataset type 

2051 compositeStorageClass = ref.datasetType.parentStorageClass 

2052 

2053 # In the unlikely scenario where the composite storage 

2054 # class is not known, we can only assume that this is a 

2055 # normal component. If that assumption is wrong then the 

2056 # branch below that reads a persisted component will fail 

2057 # so there is no need to complain here. 

2058 if compositeStorageClass is not None: 2058 ↛ 2061line 2058 didn't jump to line 2061, because the condition on line 2058 was never false

2059 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

2060 

2061 if isDisassembled and not refComponent: 

2062 # This was a disassembled dataset spread over multiple files 

2063 # and we need to put them all back together again. 

2064 # Read into memory and then assemble 

2065 

2066 # Check that the supplied parameters are suitable for the type read 

2067 refStorageClass.validateParameters(parameters) 

2068 

2069 # We want to keep track of all the parameters that were not used 

2070 # by formatters. We assume that if any of the component formatters 

2071 # uses a parameter, we do not need to apply it again in the 

2072 # assembler. 

2073 usedParams = set() 

2074 

2075 components: Dict[str, Any] = {} 

2076 for getInfo in allGetInfo: 

2077 # assemblerParams are parameters not understood by the 

2078 # associated formatter. 

2079 usedParams.update(set(getInfo.formatterParams)) 

2080 

2081 component = getInfo.component 

2082 

2083 if component is None: 2083 ↛ 2084line 2083 didn't jump to line 2084, because the condition on line 2083 was never true

2084 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

2085 

2086 # We do not want the formatter to think it's reading 

2087 # a component though because it is really reading a 

2088 # standalone dataset -- always tell reader it is not a 

2089 # component. 

2090 components[component] = self._read_artifact_into_memory( 

2091 getInfo, ref.makeComponentRef(component), isComponent=False 

2092 ) 

2093 

2094 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

2095 

2096 # Any unused parameters will have to be passed to the assembler 

2097 if parameters: 

2098 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

2099 else: 

2100 unusedParams = {} 

2101 

2102 # Process parameters 

2103 return ref.datasetType.storageClass.delegate().handleParameters( 

2104 inMemoryDataset, parameters=unusedParams 

2105 ) 

2106 

2107 elif isDisassembledReadOnlyComponent: 

2108 compositeStorageClass = ref.datasetType.parentStorageClass 

2109 if compositeStorageClass is None: 2109 ↛ 2110line 2109 didn't jump to line 2110, because the condition on line 2109 was never true

2110 raise RuntimeError( 

2111 f"Unable to retrieve derived component '{refComponent}' since" 

2112 "no composite storage class is available." 

2113 ) 

2114 

2115 if refComponent is None: 2115 ↛ 2117line 2115 didn't jump to line 2117, because the condition on line 2115 was never true

2116 # Mainly for mypy 

2117 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

2118 

2119 # Assume that every derived component can be calculated by 

2120 # forwarding the request to a single read/write component. 

2121 # Rather than guessing which rw component is the right one by 

2122 # scanning each for a derived component of the same name, 

2123 # we ask the storage class delegate directly which one is best to 

2124 # use. 

2125 compositeDelegate = compositeStorageClass.delegate() 

2126 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

2127 refComponent, set(allComponents) 

2128 ) 

2129 

2130 # Select the relevant component 

2131 rwInfo = allComponents[forwardedComponent] 

2132 

2133 # For now assume that read parameters are validated against 

2134 # the real component and not the requested component 

2135 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

2136 forwardedStorageClass.validateParameters(parameters) 

2137 

2138 # The reference to use for the caching must refer to the forwarded 

2139 # component and not the derived component. 

2140 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

2141 

2142 # Unfortunately the FileDescriptor inside the formatter will have 

2143 # the wrong write storage class so we need to create a new one 

2144 # given the immutability constraint. 

2145 writeStorageClass = rwInfo.info.storageClass 

2146 

2147 # We may need to put some thought into parameters for read 

2148 # components but for now forward them on as is 

2149 readFormatter = type(rwInfo.formatter)( 

2150 FileDescriptor( 

2151 rwInfo.location, 

2152 readStorageClass=refStorageClass, 

2153 storageClass=writeStorageClass, 

2154 parameters=parameters, 

2155 ), 

2156 ref.dataId, 

2157 ) 

2158 

2159 # The assembler can not receive any parameter requests for a 

2160 # derived component at this time since the assembler will 

2161 # see the storage class of the derived component and those 

2162 # parameters will have to be handled by the formatter on the 

2163 # forwarded storage class. 

2164 assemblerParams: Dict[str, Any] = {} 

2165 

2166 # Need to create a new info that specifies the derived 

2167 # component and associated storage class 

2168 readInfo = DatastoreFileGetInformation( 

2169 rwInfo.location, 

2170 readFormatter, 

2171 rwInfo.info, 

2172 assemblerParams, 

2173 {}, 

2174 refComponent, 

2175 refStorageClass, 

2176 ) 

2177 

2178 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

2179 

2180 else: 

2181 # Single file request or component from that composite file 

2182 for lookup in (refComponent, None): 2182 ↛ 2187line 2182 didn't jump to line 2187, because the loop on line 2182 didn't complete

2183 if lookup in allComponents: 2183 ↛ 2182line 2183 didn't jump to line 2182, because the condition on line 2183 was never false

2184 getInfo = allComponents[lookup] 

2185 break 

2186 else: 

2187 raise FileNotFoundError( 

2188 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

2189 ) 

2190 

2191 # Do not need the component itself if already disassembled 

2192 if isDisassembled: 

2193 isComponent = False 

2194 else: 

2195 isComponent = getInfo.component is not None 

2196 

2197 # For a component read of a composite we want the cache to 

2198 # be looking at the composite ref itself. 

2199 cache_ref = ref.makeCompositeRef() if isComponent else ref 

2200 

2201 # For a disassembled component we can validate parameters against 

2202 # the component storage class directly 

2203 if isDisassembled: 

2204 refStorageClass.validateParameters(parameters) 

2205 else: 

2206 # For an assembled composite this could be a derived 

2207 # component derived from a real component. The validity 

2208 # of the parameters is not clear. For now validate against 

2209 # the composite storage class 

2210 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

2211 

2212 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

2213 
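A sketch of the read variants supported by the `get` signature above. The parameter dictionary and the storage class name shown here are purely illustrative: valid values depend on the dataset's StorageClass definition.

def read_variants(datastore, ref):
    # Plain read: returns the Python object implied by the dataset type.
    obj = datastore.get(ref)

    # Parameterised read; the key is hypothetical and must be a parameter
    # accepted by the dataset's StorageClass definition.
    subset = datastore.get(ref, parameters={"some_parameter": "some_value"})

    # Read with a storage class override to force a compatible alternative
    # Python type (the name here is illustrative only).
    converted = datastore.get(ref, storageClass="SomeCompatibleStorageClass")
    return obj, subset, converted
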

2214 @transactional 

2215 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2216 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2217 

2218 Parameters 

2219 ---------- 

2220 inMemoryDataset : `object` 

2221 The dataset to store. 

2222 ref : `DatasetRef` 

2223 Reference to the associated Dataset. 

2224 

2225 Raises 

2226 ------ 

2227 TypeError 

2228 Supplied object and storage class are inconsistent. 

2229 DatasetTypeNotSupportedError 

2230 The associated `DatasetType` is not handled by this datastore. 

2231 

2232 Notes 

2233 ----- 

2234 If the datastore is configured to reject certain dataset types it 

2235 is possible that the put will fail and raise a 

2236 `DatasetTypeNotSupportedError`. The main use case for this is to 

2237 allow `ChainedDatastore` to put to multiple datastores without 

2238 requiring that every datastore accepts the dataset. 

2239 """ 

2240 

2241 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2242 # doDisassembly = True 

2243 

2244 artifacts = [] 

2245 if doDisassembly: 

2246 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2247 if components is None: 2247 ↛ 2248line 2247 didn't jump to line 2248, because the condition on line 2247 was never true

2248 raise RuntimeError( 

2249 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2250 f"with storage class {ref.datasetType.storageClass.name} " 

2251 "is configured to be disassembled, but cannot be." 

2252 ) 

2253 for component, componentInfo in components.items(): 

2254 # Don't recurse because we want to take advantage of 

2255 # bulk insert -- need a new DatasetRef that refers to the 

2256 # same dataset_id but has the component DatasetType 

2257 # DatasetType does not refer to the types of components 

2258 # So we construct one ourselves. 

2259 compRef = ref.makeComponentRef(component) 

2260 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2261 artifacts.append((compRef, storedInfo)) 

2262 else: 

2263 # Write the entire thing out 

2264 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2265 artifacts.append((ref, storedInfo)) 

2266 

2267 self._register_datasets(artifacts) 

2268 
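A sketch of the write path: `put` writes either a single artifact or, when the composites configuration calls for disassembly, one artifact per component, all registered in a single bulk insert. Rejection by the datastore's constraints is signalled with `DatasetTypeNotSupportedError`. Placeholder names as before.

from lsst.daf.butler import DatasetTypeNotSupportedError


def store(datastore, in_memory_dataset, ref):
    try:
        datastore.put(in_memory_dataset, ref)
    except DatasetTypeNotSupportedError:
        # This datastore is configured to reject the dataset type; a
        # ChainedDatastore would simply move on to its other members.
        return False
    return datastore.exists(ref)
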

2269 @transactional 

2270 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

2271 # At this point can safely remove these datasets from the cache 

2272 # to avoid confusion later on. If they are not trashed later 

2273 # the cache will simply be refilled. 

2274 self.cacheManager.remove_from_cache(ref) 

2275 

2276 # If we are in trust mode there will be nothing to move to 

2277 # the trash table and we will have to try to delete the file 

2278 # immediately. 

2279 if self.trustGetRequest: 

2280 # Try to keep the logic below for a single file trash. 

2281 if isinstance(ref, DatasetRef): 

2282 refs = {ref} 

2283 else: 

2284 # Will recreate ref at the end of this branch. 

2285 refs = set(ref) 

2286 

2287 # Determine which datasets are known to datastore directly. 

2288 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

2289 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2290 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2291 

2292 missing = refs - existing_refs 

2293 if missing: 

2294 # Do an explicit existence check on these refs. 

2295 # We only care about the artifacts at this point and not 

2296 # the dataset existence. 

2297 artifact_existence: Dict[ResourcePath, bool] = {} 

2298 _ = self.mexists(missing, artifact_existence) 

2299 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2300 

2301 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2302 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2303 for uri in uris: 

2304 try: 

2305 uri.remove() 

2306 except Exception as e: 

2307 if ignore_errors: 

2308 log.debug("Artifact %s could not be removed: %s", uri, e) 

2309 continue 

2310 raise 

2311 

2312 # There is no point asking the code below to remove refs we 

2313 # know are missing so update it with the list of existing 

2314 # records. Try to retain one vs many logic. 

2315 if not existing_refs: 

2316 # Nothing more to do since none of the datasets were 

2317 # known to the datastore record table. 

2318 return 

2319 ref = list(existing_refs) 

2320 if len(ref) == 1: 

2321 ref = ref[0] 

2322 

2323 # Get file metadata and internal metadata 

2324 if not isinstance(ref, DatasetRef): 

2325 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2326 # Assumed to be an iterable of refs so bulk mode enabled. 

2327 try: 

2328 self.bridge.moveToTrash(ref, transaction=self._transaction) 

2329 except Exception as e: 

2330 if ignore_errors: 

2331 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2332 else: 

2333 raise 

2334 return 

2335 

2336 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2337 

2338 fileLocations = self._get_dataset_locations_info(ref) 

2339 

2340 if not fileLocations: 

2341 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2342 if ignore_errors: 

2343 log.warning(err_msg) 

2344 return 

2345 else: 

2346 raise FileNotFoundError(err_msg) 

2347 

2348 for location, storedFileInfo in fileLocations: 

2349 if not self._artifact_exists(location): 2349 ↛ 2350line 2349 didn't jump to line 2350

2350 err_msg = ( 

2351 f"Dataset is known to datastore {self.name} but " 

2352 f"associated artifact ({location.uri}) is missing" 

2353 ) 

2354 if ignore_errors: 

2355 log.warning(err_msg) 

2356 return 

2357 else: 

2358 raise FileNotFoundError(err_msg) 

2359 

2360 # Mark dataset as trashed 

2361 try: 

2362 self.bridge.moveToTrash([ref], transaction=self._transaction) 

2363 except Exception as e: 

2364 if ignore_errors: 

2365 log.warning( 

2366 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2367 "but encountered an error: %s", 

2368 ref, 

2369 self.name, 

2370 e, 

2371 ) 

2372 pass 

2373 else: 

2374 raise 

2375 

2376 @transactional 

2377 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2378 """Remove all datasets from the trash. 

2379 

2380 Parameters 

2381 ---------- 

2382 ignore_errors : `bool` 

2383 If `True` return without error even if something went wrong. 

2384 Problems could occur if another process is simultaneously trying 

2385 to delete. 

2386 """ 

2387 log.debug("Emptying trash in datastore %s", self.name) 

2388 

2389 # Context manager will empty trash iff we finish it without raising. 

2390 # It will also automatically delete the relevant rows from the 

2391 # trash table and the records table. 

2392 with self.bridge.emptyTrash( 

2393 self._table, record_class=StoredFileInfo, record_column="path" 

2394 ) as trash_data: 

2395 # Removing the artifacts themselves requires that the files are 

2396 # not also associated with refs that are not to be trashed. 

2397 # Therefore need to do a query with the file paths themselves 

2398 # and return all the refs associated with them. Can only delete 

2399 # a file if the refs to be trashed are the only refs associated 

2400 # with the file. 

2401 # This requires multiple copies of the trashed items 

2402 trashed, artifacts_to_keep = trash_data 

2403 

2404 if artifacts_to_keep is None: 

2405 # The bridge is not helping us so have to work it out 

2406 # ourselves. This is not going to be as efficient. 

2407 trashed = list(trashed) 

2408 

2409 # The instance check is for mypy since up to this point it 

2410 # does not know the type of info. 

2411 path_map = self._refs_associated_with_artifacts( 

2412 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2413 ) 

2414 

2415 for ref, info in trashed: 

2416 # Mypy needs to know this is not the base class 

2417 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2418 

2419 # Check for mypy 

2420 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2421 

2422 path_map[info.path].remove(ref.id) 

2423 if not path_map[info.path]: 2423 ↛ 2415line 2423 didn't jump to line 2415, because the condition on line 2423 was never false

2424 del path_map[info.path] 

2425 

2426 artifacts_to_keep = set(path_map) 

2427 

2428 for ref, info in trashed: 

2429 # Should not happen for this implementation but need 

2430 # to keep mypy happy. 

2431 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2432 

2433 # Mypy needs to know this is not the base class 

2434 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2435 

2436 # Check for mypy 

2437 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2438 

2439 if info.path in artifacts_to_keep: 

2440 # This is a multi-dataset artifact and we are not 

2441 # removing all associated refs. 

2442 continue 

2443 

2444 # Only trashed refs still known to datastore will be returned. 

2445 location = info.file_location(self.locationFactory) 

2446 

2447 # Point of no return for this artifact 

2448 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2449 try: 

2450 self._delete_artifact(location) 

2451 except FileNotFoundError: 

2452 # If the file itself has been deleted there is nothing 

2453 # we can do about it. It is possible that trash has 

2454 # been run in parallel in another process or someone 

2455 # decided to delete the file. It is unlikely to come 

2456 # back and so we should still continue with the removal 

2457 # of the entry from the trash table. It is also possible 

2458 # we removed it in a previous iteration if it was 

2459 # a multi-dataset artifact. The delete artifact method 

2460 # will log a debug message in this scenario. 

2461 # Distinguishing file missing before trash started and 

2462 # file already removed previously as part of this trash 

2463 # is not worth the distinction with regards to potential 

2464 # memory cost. 

2465 pass 

2466 except Exception as e: 

2467 if ignore_errors: 

2468 # Use a debug message here even though it's not 

2469 # a good situation. In some cases this can be 

2470 # caused by a race between user A and user B 

2471 # and neither of them has permissions for the 

2472 # other's files. Butler does not know about users 

2473 # and trash has no idea what collections these 

2474 # files were in (without guessing from a path). 

2475 log.debug( 

2476 "Encountered error removing artifact %s from datastore %s: %s", 

2477 location.uri, 

2478 self.name, 

2479 e, 

2480 ) 

2481 else: 

2482 raise 

2483 
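Deletion is a two-step process, sketched below: `trash` marks datasets (and immediately drops any cached copies), and a later `emptyTrash` deletes the underlying artifacts while skipping files still referenced by datasets that were not trashed. Placeholder names as before.

def purge(datastore, refs):
    # Step 1: mark the datasets as trashed (the bulk form takes an iterable)
    # and drop any cached copies.
    datastore.trash(refs, ignore_errors=False)
    # Step 2: delete the artifacts, skipping any file still needed by refs
    # that were not trashed.
    datastore.emptyTrash(ignore_errors=True)
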

2484 @transactional 

2485 def transfer_from( 

2486 self, 

2487 source_datastore: Datastore, 

2488 refs: Iterable[DatasetRef], 

2489 transfer: str = "auto", 

2490 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

2491 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

2492 # Docstring inherited 

2493 if type(self) is not type(source_datastore): 

2494 raise TypeError( 

2495 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2496 f"source datastore ({type(source_datastore)})." 

2497 ) 

2498 

2499 # Be explicit for mypy 

2500 if not isinstance(source_datastore, FileDatastore): 2500 ↛ 2501line 2500 didn't jump to line 2501, because the condition on line 2500 was never true

2501 raise TypeError( 

2502 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2503 f" {type(source_datastore)}" 

2504 ) 

2505 

2506 # Stop early if "direct" transfer mode is requested. That would 

2507 # require that the URI inside the source datastore should be stored 

2508 # directly in the target datastore, which seems unlikely to be useful 

2509 # since at any moment the source datastore could delete the file. 

2510 if transfer in ("direct", "split"): 

2511 raise ValueError( 

2512 f"Can not transfer from a source datastore using {transfer} mode since" 

2513 " those files are controlled by the other datastore." 

2514 ) 

2515 

2516 # Empty existence lookup if none given. 

2517 if artifact_existence is None: 

2518 artifact_existence = {} 

2519 

2520 # We will go through the list multiple times so must convert 

2521 # generators to lists. 

2522 refs = list(refs) 

2523 

2524 # In order to handle disassembled composites the code works 

2525 # at the records level since it can assume that internal APIs 

2526 # can be used. 

2527 # - If the record already exists in the destination this is assumed 

2528 # to be okay. 

2529 # - If there is no record but the source and destination URIs are 

2530 # identical no transfer is done but the record is added. 

2531 # - If the source record refers to an absolute URI currently assume 

2532 # that that URI should remain absolute and will be visible to the 

2533 # destination butler. May need to have a flag to indicate whether 

2534 # the dataset should be transferred. This will only happen if 

2535 # the detached Butler has had a local ingest. 

2536 

2537 # What we really want is all the records in the source datastore 

2538 # associated with these refs. Or derived ones if they don't exist 

2539 # in the source. 

2540 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2541 

2542 # The source dataset_ids are the keys in these records 

2543 source_ids = set(source_records) 

2544 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2545 

2546 # The not None check is to appease mypy 

2547 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

2548 missing_ids = requested_ids - source_ids 

2549 

2550 # Missing IDs can be okay if that datastore has allowed 

2551 # gets based on file existence. Should we transfer what we can 

2552 # or complain about it and warn? 

2553 if missing_ids and not source_datastore.trustGetRequest: 2553 ↛ 2554line 2553 didn't jump to line 2554, because the condition on line 2553 was never true

2554 raise ValueError( 

2555 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2556 ) 

2557 

2558 # Need to map these missing IDs to a DatasetRef so we can guess 

2559 # the details. 

2560 if missing_ids: 

2561 log.info( 

2562 "Number of expected datasets missing from source datastore records: %d out of %d", 

2563 len(missing_ids), 

2564 len(requested_ids), 

2565 ) 

2566 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2567 

2568 # This should be chunked in case we end up having to check 

2569 # the file store since we need some log output to show 

2570 # progress. 

2571 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2572 records = {} 

2573 for missing in missing_ids_chunk: 

2574 # Ask the source datastore where the missing artifacts 

2575 # should be. An execution butler might not know about the 

2576 # artifacts even if they are there. 

2577 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2578 records[missing] = [info for _, info in expected] 

2579 

2580 # Call the mexist helper method in case we have not already 

2581 # checked these artifacts such that artifact_existence is 

2582 # empty. This allows us to benefit from parallelism. 

2583 # datastore.mexists() itself does not give us access to the 

2584 # derived datastore record. 

2585 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2586 ref_exists = source_datastore._process_mexists_records( 

2587 id_to_ref, records, False, artifact_existence=artifact_existence 

2588 ) 

2589 

2590 # Now go through the records and propagate the ones that exist. 

2591 location_factory = source_datastore.locationFactory 

2592 for missing, record_list in records.items(): 

2593 # Skip completely if the ref does not exist. 

2594 ref = id_to_ref[missing] 

2595 if not ref_exists[ref]: 

2596 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2597 continue 

2598 # Check for file artifact to decide which parts of a 

2599 # disassembled composite do exist. If there is only a 

2600 # single record we don't even need to look because it can't 

2601 # be a composite and must exist. 

2602 if len(record_list) == 1: 

2603 dataset_records = record_list 

2604 else: 

2605 dataset_records = [ 

2606 record 

2607 for record in record_list 

2608 if artifact_existence[record.file_location(location_factory).uri] 

2609 ] 

2610 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2611 

2612 # Rely on source_records being a defaultdict. 

2613 source_records[missing].extend(dataset_records) 

2614 

2615 # See if we already have these records 

2616 target_records = self._get_stored_records_associated_with_refs(refs) 

2617 

2618 # The artifacts to register 

2619 artifacts = [] 

2620 

2621 # Refs that already exist 

2622 already_present = [] 

2623 

2624 # Refs that were rejected by this datastore. 

2625 rejected = set() 

2626 

2627 # Refs that were transferred successfully. 

2628 accepted = set() 

2629 

2630 # Record each time we have done a "direct" transfer. 

2631 direct_transfers = [] 

2632 

2633 # Now can transfer the artifacts 

2634 for ref in refs: 

2635 if not self.constraints.isAcceptable(ref): 2635 ↛ 2637line 2635 didn't jump to line 2637, because the condition on line 2635 was never true

2636 # This datastore should not be accepting this dataset. 

2637 rejected.add(ref) 

2638 continue 

2639 

2640 accepted.add(ref) 

2641 

2642 if ref.id in target_records: 

2643 # Already have an artifact for this. 

2644 already_present.append(ref) 

2645 continue 

2646 

2647 # mypy needs to know these are always resolved refs 

2648 for info in source_records[ref.getCheckedId()]: 

2649 source_location = info.file_location(source_datastore.locationFactory) 

2650 target_location = info.file_location(self.locationFactory) 

2651 if source_location == target_location and not source_location.pathInStore.isabs(): 2651 ↛ 2654line 2651 didn't jump to line 2654, because the condition on line 2651 was never true

2652 # Artifact is already in the target location. 

2653 # (which is how execution butler currently runs) 

2654 pass 

2655 else: 

2656 if target_location.pathInStore.isabs(): 

2657 # Just because we can see the artifact when running 

2658 # the transfer doesn't mean it will be generally 

2659 # accessible to a user of this butler. Need to decide 

2660 # what to do about an absolute path. 

2661 if transfer == "auto": 

2662 # For "auto" transfers we allow the absolute URI 

2663 # to be recorded in the target datastore. 

2664 direct_transfers.append(source_location) 

2665 else: 

2666 # The user is explicitly requesting a transfer 

2667 # even for an absolute URI. This requires us to 

2668 # calculate the target path. 

2669 template_ref = ref 

2670 if info.component: 2670 ↛ 2671line 2670 didn't jump to line 2671, because the condition on line 2670 was never true

2671 template_ref = ref.makeComponentRef(info.component) 

2672 target_location = self._calculate_ingested_datastore_name( 

2673 source_location.uri, 

2674 template_ref, 

2675 ) 

2676 

2677 info = info.update(path=target_location.pathInStore.path) 

2678 

2679 # Need to transfer it to the new location. 

2680 # Assume we should always overwrite. If the artifact 

2681 # is there this might indicate that a previous transfer 

2682 # was interrupted but was not able to be rolled back 

2683 # completely (eg pre-emption) so follow Datastore default 

2684 # and overwrite. 

2685 target_location.uri.transfer_from( 

2686 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2687 ) 

2688 

2689 artifacts.append((ref, info)) 

2690 

2691 if direct_transfers: 

2692 log.info( 

2693 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s", 

2694 len(direct_transfers), 

2695 "" if len(direct_transfers) == 1 else "s", 

2696 ) 

2697 

2698 self._register_datasets(artifacts) 

2699 

2700 if already_present: 

2701 n_skipped = len(already_present) 

2702 log.info( 

2703 "Skipped transfer of %d dataset%s already present in datastore", 

2704 n_skipped, 

2705 "" if n_skipped == 1 else "s", 

2706 ) 

2707 

2708 return accepted, rejected 

2709 
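A sketch of transferring datasets between two file datastores. Both sides must be `FileDatastore`s, the "direct" and "split" modes are rejected, and the returned sets report which refs were accepted versus rejected by this datastore's constraints. A shared `artifact_existence` cache (for example one filled by an earlier `mexists` call) avoids re-checking the same files. Placeholder names as before.

from typing import Dict

from lsst.resources import ResourcePath


def copy_between(source_datastore, target_datastore, refs):
    artifact_existence: Dict[ResourcePath, bool] = {}
    # Optionally prime the existence cache so transfer_from does not need to
    # re-check the same files.
    source_datastore.mexists(refs, artifact_existence=artifact_existence)
    accepted, rejected = target_datastore.transfer_from(
        source_datastore, refs, transfer="copy", artifact_existence=artifact_existence
    )
    return accepted, rejected
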

2710 @transactional 

2711 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2712 # Docstring inherited. 

2713 refs = list(refs) 

2714 self.bridge.forget(refs) 

2715 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

2716 

2717 def validateConfiguration( 

2718 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

2719 ) -> None: 

2720 """Validate some of the configuration for this datastore. 

2721 

2722 Parameters 

2723 ---------- 

2724 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2725 Entities to test against this configuration. Can be differing 

2726 types. 

2727 logFailures : `bool`, optional 

2728 If `True`, output a log message for every validation error 

2729 detected. 

2730 

2731 Raises 

2732 ------ 

2733 DatastoreValidationError 

2734 Raised if there is a validation problem with a configuration. 

2735 All the problems are reported in a single exception. 

2736 

2737 Notes 

2738 ----- 

2739 This method checks that all the supplied entities have valid file 

2740 templates and also have formatters defined. 

2741 """ 

2742 

2743 templateFailed = None 

2744 try: 

2745 self.templates.validateTemplates(entities, logFailures=logFailures) 

2746 except FileTemplateValidationError as e: 

2747 templateFailed = str(e) 

2748 

2749 formatterFailed = [] 

2750 for entity in entities: 

2751 try: 

2752 self.formatterFactory.getFormatterClass(entity) 

2753 except KeyError as e: 

2754 formatterFailed.append(str(e)) 

2755 if logFailures: 2755 ↛ 2750line 2755 didn't jump to line 2750, because the condition on line 2755 was never false

2756 log.critical("Formatter failure: %s", e) 

2757 

2758 if templateFailed or formatterFailed: 

2759 messages = [] 

2760 if templateFailed: 2760 ↛ 2761line 2760 didn't jump to line 2761, because the condition on line 2760 was never true

2761 messages.append(templateFailed) 

2762 if formatterFailed: 2762 ↛ 2764line 2762 didn't jump to line 2764, because the condition on line 2762 was never false

2763 messages.append(",".join(formatterFailed)) 

2764 msg = ";\n".join(messages) 

2765 raise DatastoreValidationError(msg) 

2766 
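A sketch of the configuration check above: pass the dataset types (or storage classes, or refs) you intend to use, and all template and formatter problems are reported together in one `DatastoreValidationError`. Placeholder names as before.

from lsst.daf.butler import DatastoreValidationError


def check_config(datastore, entities):
    try:
        datastore.validateConfiguration(entities, logFailures=True)
    except DatastoreValidationError as err:
        # Template and formatter failures are collected in a single message.
        print(f"Datastore {datastore.name} failed validation: {err}")
        return False
    return True
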

2767 def getLookupKeys(self) -> Set[LookupKey]: 

2768 # Docstring is inherited from base class 

2769 return ( 

2770 self.templates.getLookupKeys() 

2771 | self.formatterFactory.getLookupKeys() 

2772 | self.constraints.getLookupKeys() 

2773 ) 

2774 

2775 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

2776 # Docstring is inherited from base class 

2777 # The key can be valid in either formatters or templates so we can 

2778 # only check the template if it exists 

2779 if lookupKey in self.templates: 

2780 try: 

2781 self.templates[lookupKey].validateTemplate(entity) 

2782 except FileTemplateValidationError as e: 

2783 raise DatastoreValidationError(e) from e 

2784 

2785 def export( 

2786 self, 

2787 refs: Iterable[DatasetRef], 

2788 *, 

2789 directory: Optional[ResourcePathExpression] = None, 

2790 transfer: Optional[str] = "auto", 

2791 ) -> Iterable[FileDataset]: 

2792 # Docstring inherited from Datastore.export. 

2793 if transfer == "auto" and directory is None: 

2794 transfer = None 

2795 

2796 if transfer is not None and directory is None: 

2797 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2798 

2799 if transfer == "move": 

2800 raise TypeError("Can not export by moving files out of datastore.") 

2801 elif transfer == "direct": 2801 ↛ 2805line 2801 didn't jump to line 2805, because the condition on line 2801 was never true

2802 # For an export, treat this as equivalent to None. We do not 

2803 # want an import to risk using absolute URIs to datasets owned 

2804 # by another datastore. 

2805 log.info("Treating 'direct' transfer mode as in-place export.") 

2806 transfer = None 

2807 

2808 # Force the directory to be a URI object 

2809 directoryUri: Optional[ResourcePath] = None 

2810 if directory is not None: 

2811 directoryUri = ResourcePath(directory, forceDirectory=True) 

2812 

2813 if transfer is not None and directoryUri is not None: 

2814 # mypy needs the second test 

2815 if not directoryUri.exists(): 2815 ↛ 2816line 2815 didn't jump to line 2816, because the condition on line 2815 was never true

2816 raise FileNotFoundError(f"Export location {directory} does not exist") 

2817 

2818 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2819 for ref in progress.wrap(refs, "Exporting dataset files"): 

2820 fileLocations = self._get_dataset_locations_info(ref) 

2821 if not fileLocations: 

2822 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2823 # For now we can not export disassembled datasets 

2824 if len(fileLocations) > 1: 

2825 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2826 location, storedFileInfo = fileLocations[0] 

2827 

2828 pathInStore = location.pathInStore.path 

2829 if transfer is None: 

2830 # TODO: do we also need to return the readStorageClass somehow? 

2831 # We will use the path in store directly. If this is an 

2832 # absolute URI, preserve it. 

2833 if location.pathInStore.isabs(): 2833 ↛ 2834line 2833 didn't jump to line 2834, because the condition on line 2833 was never true

2834 pathInStore = str(location.uri) 

2835 elif transfer == "direct": 2835 ↛ 2837line 2835 didn't jump to line 2837, because the condition on line 2835 was never true

2836 # Use full URIs to the remote store in the export 

2837 pathInStore = str(location.uri) 

2838 else: 

2839 # mypy needs help 

2840 assert directoryUri is not None, "directoryUri must be defined to get here" 

2841 storeUri = ResourcePath(location.uri) 

2842 

2843 # if the datastore has an absolute URI to a resource, we 

2844 # have two options: 

2845 # 1. Keep the absolute URI in the exported YAML 

2846 # 2. Allocate a new name in the local datastore and transfer 

2847 # it. 

2848 # For now go with option 2 

2849 if location.pathInStore.isabs(): 2849 ↛ 2850line 2849 didn't jump to line 2850, because the condition on line 2849 was never true

2850 template = self.templates.getTemplate(ref) 

2851 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2852 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2853 

2854 exportUri = directoryUri.join(pathInStore) 

2855 exportUri.transfer_from(storeUri, transfer=transfer) 

2856 

2857 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2858 
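# A minimal sketch (not the butler API itself) of the path handling done in
# ``export`` above: a path relative to the datastore root is re-rooted under
# the chosen export directory and the bytes are copied across, mirroring
# ``exportUri.transfer_from(storeUri, transfer=transfer)``. Only
# ``lsst.resources`` is used, which this module already depends on; the file
# name is made up for the example.
import tempfile

from lsst.resources import ResourcePath

with tempfile.TemporaryDirectory() as tmpdir:
    root = ResourcePath(tmpdir, forceDirectory=True)

    # Pretend this file is tracked by the datastore with a relative
    # path-in-store of "detector_042.txt".
    store_uri = root.join("detector_042.txt")
    store_uri.write(b"pixel data placeholder")

    # Re-root the relative path under the export directory and copy.
    export_dir = ResourcePath(root.join("export"), forceDirectory=True)
    export_dir.mkdir()
    export_uri = export_dir.join("detector_042.txt")
    export_uri.transfer_from(store_uri, transfer="copy")

    assert export_uri.exists()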

2859 @staticmethod 

2860 def computeChecksum( 

2861 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192 

2862 ) -> Optional[str]: 

2863 """Compute the checksum of the supplied file. 

2864 

2865 Parameters 

2866 ---------- 

2867 uri : `lsst.resources.ResourcePath` 

2868 Name of resource to calculate checksum from. 

2869 algorithm : `str`, optional 

2870 Name of algorithm to use. Must be one of the algorithms supported 

2871 by :py:mod:`hashlib`. 

2872 block_size : `int`, optional 

2873 Number of bytes to read from file at one time. 

2874 

2875 Returns 

2876 ------- 

2877 hexdigest : `str` or `None` 

2878 Hex digest of the file. 

2879 

2880 Notes 

2881 ----- 

2882 Currently returns `None` if the URI is for a remote resource. 

2883 """ 

2884 if algorithm not in hashlib.algorithms_guaranteed: 2884 ↛ 2885 (line 2884 didn't jump to line 2885, because the condition on line 2884 was never true)

2885 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib") 

2886 

2887 if not uri.isLocal: 2887 ↛ 2888 (line 2887 didn't jump to line 2888, because the condition on line 2887 was never true)

2888 return None 

2889 

2890 hasher = hashlib.new(algorithm) 

2891 

2892 with uri.as_local() as local_uri: 

2893 with open(local_uri.ospath, "rb") as f: 

2894 for chunk in iter(lambda: f.read(block_size), b""): 

2895 hasher.update(chunk) 

2896 

2897 return hasher.hexdigest() 

2898 
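# Standalone illustration of the chunked-hashing pattern used by
# ``computeChecksum`` above: reading the file in fixed-size blocks keeps
# memory use bounded while producing the same digest as hashing the whole
# payload in one go. Pure hashlib/tempfile; no butler objects involved.
import hashlib
import tempfile

payload = b"example bytes " * 10_000
with tempfile.NamedTemporaryFile() as tmp:
    tmp.write(payload)
    tmp.flush()

    hasher = hashlib.new("blake2b")
    with open(tmp.name, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            hasher.update(chunk)

assert hasher.hexdigest() == hashlib.blake2b(payload).hexdigest()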

2899 def needs_expanded_data_ids( 

2900 self, 

2901 transfer: Optional[str], 

2902 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2903 ) -> bool: 

2904 # Docstring inherited. 

2905 # This _could_ also use entity to inspect whether the filename template 

2906 # involves placeholders other than the required dimensions for its 

2907 # dataset type, but that's not necessary for correctness; it just 

2908 # enables more optimizations (perhaps only in theory). 

2909 return transfer not in ("direct", None) 

2910 

2911 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2912 # Docstring inherited from the base class. 

2913 record_data = data.get(self.name) 

2914 if not record_data: 2914 ↛ 2915 (line 2914 didn't jump to line 2915, because the condition on line 2914 was never true)

2915 return 

2916 

2917 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records.keys()) 

2918 

2919 # TODO: Verify that there are no unexpected table names in the dict? 

2920 unpacked_records = [] 

2921 for dataset_data in record_data.records.values(): 

2922 records = dataset_data.get(self._table.name) 

2923 if records: 2923 ↛ 2921 (line 2923 didn't jump to line 2921, because the condition on line 2923 was never false)

2924 for info in records: 

2925 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

2926 unpacked_records.append(info.to_record()) 

2927 if unpacked_records: 

2928 self._table.insert(*unpacked_records, transaction=self._transaction) 

2929 
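# A small sketch, with plain dictionaries standing in for butler record
# classes, of the unpacking loop in ``import_records`` above: the
# per-dataset, per-table mapping is flattened into one list of row dicts
# suitable for a bulk insert. The table and field names are invented for the
# example.
record_data = {
    "uuid-1": {"file_datastore_records": [{"path": "a.fits"}, {"path": "a_component.fits"}]},
    "uuid-2": {"file_datastore_records": [{"path": "b.fits"}]},
}

unpacked_rows = []
for dataset_data in record_data.values():
    rows = dataset_data.get("file_datastore_records")
    if rows:
        unpacked_rows.extend(rows)

assert len(unpacked_rows) == 3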

2930 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

2931 # Docstring inherited from the base class. 

2932 exported_refs = list(self._bridge.check(refs)) 

2933 ids = {ref.getCheckedId() for ref in exported_refs} 

2934 records: defaultdict[DatasetId, defaultdict[str, List[StoredDatastoreItemInfo]]] = defaultdict( 

2935 lambda: defaultdict(list), {id: defaultdict(list) for id in ids} 

2936 ) 

2937 for row in self._table.fetch(dataset_id=ids): 

2938 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

2939 records[info.dataset_id][self._table.name].append(info) 

2940 

2941 record_data = DatastoreRecordData(records=records) 

2942 return {self.name: record_data} 

2943 
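# A toy illustration, using plain dictionaries instead of butler classes, of
# the nested mapping assembled in ``export_records`` above: dataset ID ->
# table name -> list of per-file records, wrapped under the datastore name.
# The IDs, table name, and datastore name here are invented for the example.
from collections import defaultdict

rows = [
    {"dataset_id": "uuid-1", "path": "a.fits"},
    {"dataset_id": "uuid-1", "path": "a_component.fits"},
    {"dataset_id": "uuid-2", "path": "b.fits"},
]

table_name = "file_datastore_records"
records = defaultdict(lambda: defaultdict(list))
for row in rows:
    records[row["dataset_id"]][table_name].append(row)

exported = {"FileDatastoreExample": dict(records)}
assert len(exported["FileDatastoreExample"]["uuid-1"][table_name]) == 2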

2944 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

2945 # Docstring inherited from the base class. 

2946 self._retrieve_dataset_method = method 

2947 

2948 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef: 

2949 """Update dataset reference to use the storage class from registry. 

2950 

2951 This does nothing for regular datastores, and is only enabled for 

2952 trusted mode, where we need to use the registry definition of the storage class 

2953 for some datastore methods. `set_retrieve_dataset_type_method` has to 

2954 be called beforehand. 

2955 """ 

2956 if self.trustGetRequest: 

2957 if self._retrieve_dataset_method is None: 

2958 # We could raise an exception here but unit tests do not define 

2959 # this method. 

2960 return ref 

2961 dataset_type = self._retrieve_dataset_method(ref.datasetType.name) 

2962 if dataset_type is not None: 2962 ↛ 2964 (line 2962 didn't jump to line 2964, because the condition on line 2962 was never false)

2963 ref = ref.overrideStorageClass(dataset_type.storageClass) 

2964 return ref
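# Plain-Python sketch of the injectable-lookup pattern behind
# ``set_retrieve_dataset_type_method`` and ``_cast_storage_class`` above: the
# owner stores an optional callable and, when one is set, uses its answer to
# rewrite part of the incoming value; otherwise the input is returned as-is.
# ``Ref``, ``Store``, and the storage-class names are illustrative stand-ins,
# not butler classes.
from __future__ import annotations

from collections.abc import Callable
from dataclasses import dataclass, replace


@dataclass(frozen=True)
class Ref:
    name: str
    storage_class: str


class Store:
    def __init__(self) -> None:
        self._lookup: Callable[[str], str | None] | None = None

    def set_lookup(self, method: Callable[[str], str | None] | None) -> None:
        self._lookup = method

    def cast(self, ref: Ref) -> Ref:
        if self._lookup is None:
            # No registry view configured; trust the caller's definition.
            return ref
        registry_sc = self._lookup(ref.name)
        return replace(ref, storage_class=registry_sc) if registry_sc else ref


store = Store()
store.set_lookup({"calexp": "ExposureF"}.get)
assert store.cast(Ref("calexp", "Exposure")).storage_class == "ExposureF"
assert store.cast(Ref("unknown", "Exposure")).storage_class == "Exposure"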