Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%

991 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-10-02 08:00 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Generic file-based datastore code.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("FileDatastore",) 

33 

34import contextlib 

35import hashlib 

36import logging 

37from collections import defaultdict 

38from collections.abc import Callable, Iterable, Mapping, Sequence 

39from dataclasses import dataclass 

40from typing import TYPE_CHECKING, Any, ClassVar 

41 

42from lsst.daf.butler import ( 

43 CompositesMap, 

44 Config, 

45 DatasetId, 

46 DatasetRef, 

47 DatasetRefURIs, 

48 DatasetType, 

49 DatasetTypeNotSupportedError, 

50 Datastore, 

51 DatastoreCacheManager, 

52 DatastoreConfig, 

53 DatastoreDisabledCacheManager, 

54 DatastoreRecordData, 

55 DatastoreValidationError, 

56 FileDataset, 

57 FileDescriptor, 

58 FileTemplates, 

59 FileTemplateValidationError, 

60 Formatter, 

61 FormatterFactory, 

62 Location, 

63 LocationFactory, 

64 Progress, 

65 StorageClass, 

66 StoredDatastoreItemInfo, 

67 StoredFileInfo, 

68 ddl, 

69) 

70from lsst.daf.butler.core.repoRelocation import replaceRoot 

71from lsst.daf.butler.core.utils import transactional 

72from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

73from lsst.resources import ResourcePath, ResourcePathExpression 

74from lsst.utils.introspection import get_class_of, get_instance_of 

75from lsst.utils.iteration import chunk_iterable 

76 

77# For VERBOSE logging usage. 

78from lsst.utils.logging import VERBOSE, getLogger 

79from lsst.utils.timer import time_this 

80from sqlalchemy import BigInteger, String 

81 

82from ..registry.interfaces import DatabaseInsertMode, FakeDatasetRef 

83from .genericDatastore import GenericBaseDatastore 

84 

85if TYPE_CHECKING: 

86 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

87 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

88 

89log = getLogger(__name__) 

90 

91 

92class _IngestPrepData(Datastore.IngestPrepData): 

93 """Helper class for FileDatastore ingest implementation. 

94 

95 Parameters 

96 ---------- 

97 datasets : `~collections.abc.Iterable` of `FileDataset` 

98 Files to be ingested by this datastore. 

99 """ 

100 

101 def __init__(self, datasets: Iterable[FileDataset]): 

102 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

103 self.datasets = datasets 

104 

105 

106@dataclass(frozen=True) 

107class DatastoreFileGetInformation: 

108 """Collection of useful parameters needed to retrieve a file from 

109 a Datastore. 

110 """ 

111 

112 location: Location 

113 """The location from which to read the dataset.""" 

114 

115 formatter: Formatter 

116 """The `Formatter` to use to deserialize the dataset.""" 

117 

118 info: StoredFileInfo 

119 """Stored information about this file and its formatter.""" 

120 

121 assemblerParams: Mapping[str, Any] 

122 """Parameters to use for post-processing the retrieved dataset.""" 

123 

124 formatterParams: Mapping[str, Any] 

125 """Parameters that were understood by the associated formatter.""" 

126 

127 component: str | None 

128 """The component to be retrieved (can be `None`).""" 

129 

130 readStorageClass: StorageClass 

131 """The `StorageClass` of the dataset being read.""" 

132 

133 

134class FileDatastore(GenericBaseDatastore): 

135 """Generic Datastore for file-based implementations. 

136 

137 Should always be sub-classed since key abstract methods are missing. 

138 

139 Parameters 

140 ---------- 

141 config : `DatastoreConfig` or `str` 

142 Configuration as either a `Config` object or URI to file. 

143 bridgeManager : `DatastoreRegistryBridgeManager` 

144 Object that manages the interface between `Registry` and datastores. 

145 butlerRoot : `str`, optional 

146 New datastore root to use to override the configuration value. 

147 

148 Raises 

149 ------ 

150 ValueError 

151 If root location does not exist and ``create`` is `False` in the 

152 configuration. 

153 """ 

154 

155 defaultConfigFile: ClassVar[str | None] = None 

156 """Path to configuration defaults. Accessed within the ``config`` resource 

157 or relative to a search path. Can be None if no defaults specified. 

158 """ 

159 

160 root: ResourcePath 

161 """Root directory URI of this `Datastore`.""" 

162 

163 locationFactory: LocationFactory 

164 """Factory for creating locations relative to the datastore root.""" 

165 

166 formatterFactory: FormatterFactory 

167 """Factory for creating instances of formatters.""" 

168 

169 templates: FileTemplates 

170 """File templates that can be used by this `Datastore`.""" 

171 

172 composites: CompositesMap 

173 """Determines whether a dataset should be disassembled on put.""" 

174 

175 defaultConfigFile = "datastores/fileDatastore.yaml" 

176 """Path to configuration defaults. Accessed within the ``config`` resource 

177 or relative to a search path. Can be None if no defaults specified. 

178 """ 

179 

180 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None 

181 """Callable that is used in trusted mode to retrieve registry definition 

182 of a named dataset type. 

183 """ 

184 

185 @classmethod 

186 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

187 """Set any filesystem-dependent config options for this Datastore to 

188 be appropriate for a new empty repository with the given root. 

189 

190 Parameters 

191 ---------- 

192 root : `str` 

193 URI to the root of the data repository. 

194 config : `Config` 

195 A `Config` to update. Only the subset understood by 

196 this component will be updated. Will not expand 

197 defaults. 

198 full : `Config` 

199 A complete config with all defaults expanded that can be 

200 converted to a `DatastoreConfig`. Read-only and will not be 

201 modified by this method. 

202 Repository-specific options that should not be obtained 

203 from defaults when Butler instances are constructed 

204 should be copied from ``full`` to ``config``. 

205 overwrite : `bool`, optional 

206 If `False`, do not modify a value in ``config`` if the value 

207 already exists. Default is always to overwrite with the provided 

208 ``root``. 

209 

210 Notes 

211 ----- 

212 If a keyword is explicitly defined in the supplied ``config`` it 

213 will not be overridden by this method if ``overwrite`` is `False`. 

214 This allows explicit values set in external configs to be retained. 

215 """ 

216 Config.updateParameters( 

217 DatastoreConfig, 

218 config, 

219 full, 

220 toUpdate={"root": root}, 

221 toCopy=("cls", ("records", "table")), 

222 overwrite=overwrite, 

223 ) 

224 

225 @classmethod 

226 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

227 return ddl.TableSpec( 

228 fields=[ 

229 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

230 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

231 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

232 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

233 # Use empty string to indicate no component 

234 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

235 # TODO: should checksum be Base64Bytes instead? 

236 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

237 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

238 ], 

239 unique=frozenset(), 

240 indexes=[ddl.IndexSpec("path")], 

241 ) 

242 

243 def __init__( 

244 self, 

245 config: DatastoreConfig | ResourcePathExpression, 

246 bridgeManager: DatastoreRegistryBridgeManager, 

247 butlerRoot: str | None = None, 

248 ): 

249 super().__init__(config, bridgeManager) 

250 if "root" not in self.config: 

251 raise ValueError("No root directory specified in configuration") 

252 

253 self._bridgeManager = bridgeManager 

254 

255 # Name ourselves either using an explicit name or a name 

256 # derived from the (unexpanded) root 

257 if "name" in self.config: 

258 self.name = self.config["name"] 

259 else: 

260 # We use the unexpanded root in the name to indicate that this 

261 # datastore can be moved without having to update registry. 

262 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

263 

264 # Support repository relocation in config 

265 # Existence of self.root is checked in subclass 

266 self.root = ResourcePath( 

267 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

268 ) 

269 

270 self.locationFactory = LocationFactory(self.root) 

271 self.formatterFactory = FormatterFactory() 

272 

273 # Now associate formatters with storage classes 

274 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

275 

276 # Read the file naming templates 

277 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

278 

279 # See if composites should be disassembled 

280 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

281 

282 tableName = self.config["records", "table"] 

283 try: 

284 # Storage of paths and formatters, keyed by dataset_id 

285 self._table = bridgeManager.opaque.register( 

286 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

287 ) 

288 # Interface to Registry. 

289 self._bridge = bridgeManager.register(self.name) 

290 except ReadOnlyDatabaseError: 

291 # If the database is read only and we just tried and failed to 

292 # create a table, it means someone is trying to create a read-only 

293 # butler client for an empty repo. That should be okay, as long 

294 # as they then try to get any datasets before some other client 

295 # creates the table. Chances are they'rejust validating 

296 # configuration. 

297 pass 

298 

299 # Determine whether checksums should be used - default to False 

300 self.useChecksum = self.config.get("checksum", False) 

301 

302 # Determine whether we can fall back to configuration if a 

303 # requested dataset is not known to registry 

304 self.trustGetRequest = self.config.get("trust_get_request", False) 

305 

306 # Create a cache manager 

307 self.cacheManager: AbstractDatastoreCacheManager 

308 if "cached" in self.config: 

309 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

310 else: 

311 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

312 

313 # Check existence and create directory structure if necessary 

314 if not self.root.exists(): 

315 if "create" not in self.config or not self.config["create"]: 

316 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

317 try: 

318 self.root.mkdir() 

319 except Exception as e: 

320 raise ValueError( 

321 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

322 ) from e 

323 

324 def __str__(self) -> str: 

325 return str(self.root) 

326 

327 @property 

328 def bridge(self) -> DatastoreRegistryBridge: 

329 return self._bridge 

330 

331 @property 

332 def roots(self) -> dict[str, ResourcePath | None]: 

333 # Docstring inherited. 

334 return {self.name: self.root} 

335 

336 def _artifact_exists(self, location: Location) -> bool: 

337 """Check that an artifact exists in this datastore at the specified 

338 location. 

339 

340 Parameters 

341 ---------- 

342 location : `Location` 

343 Expected location of the artifact associated with this datastore. 

344 

345 Returns 

346 ------- 

347 exists : `bool` 

348 True if the location can be found, false otherwise. 

349 """ 

350 log.debug("Checking if resource exists: %s", location.uri) 

351 return location.uri.exists() 

352 

353 def _delete_artifact(self, location: Location) -> None: 

354 """Delete the artifact from the datastore. 

355 

356 Parameters 

357 ---------- 

358 location : `Location` 

359 Location of the artifact associated with this datastore. 

360 """ 

361 if location.pathInStore.isabs(): 

362 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

363 

364 try: 

365 location.uri.remove() 

366 except FileNotFoundError: 

367 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

368 raise 

369 except Exception as e: 

370 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

371 raise 

372 log.debug("Successfully deleted file: %s", location.uri) 

373 

374 def addStoredItemInfo( 

375 self, 

376 refs: Iterable[DatasetRef], 

377 infos: Iterable[StoredFileInfo], 

378 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT, 

379 ) -> None: 

380 # Docstring inherited from GenericBaseDatastore 

381 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos, strict=True)] 

382 match insert_mode: 

383 case DatabaseInsertMode.INSERT: 

384 self._table.insert(*records, transaction=self._transaction) 

385 case DatabaseInsertMode.ENSURE: 

386 self._table.ensure(*records, transaction=self._transaction) 

387 case DatabaseInsertMode.REPLACE: 

388 self._table.replace(*records, transaction=self._transaction) 

389 case _: 

390 raise ValueError(f"Unknown insert mode of '{insert_mode}'") 

391 

392 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredFileInfo]: 

393 # Docstring inherited from GenericBaseDatastore 

394 

395 # Look for the dataset_id -- there might be multiple matches 

396 # if we have disassembled the dataset. 

397 records = self._table.fetch(dataset_id=ref.id) 

398 return [StoredFileInfo.from_record(record) for record in records] 

399 

400 def _get_stored_records_associated_with_refs( 

401 self, refs: Iterable[DatasetIdRef] 

402 ) -> dict[DatasetId, list[StoredFileInfo]]: 

403 """Retrieve all records associated with the provided refs. 

404 

405 Parameters 

406 ---------- 

407 refs : iterable of `DatasetIdRef` 

408 The refs for which records are to be retrieved. 

409 

410 Returns 

411 ------- 

412 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

413 The matching records indexed by the ref ID. The number of entries 

414 in the dict can be smaller than the number of requested refs. 

415 """ 

416 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

417 

418 # Uniqueness is dataset_id + component so can have multiple records 

419 # per ref. 

420 records_by_ref = defaultdict(list) 

421 for record in records: 

422 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

423 return records_by_ref 

424 

425 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]: 

426 """Return paths and associated dataset refs. 

427 

428 Parameters 

429 ---------- 

430 paths : `list` of `str` or `lsst.resources.ResourcePath` 

431 All the paths to include in search. 

432 

433 Returns 

434 ------- 

435 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

436 Mapping of each path to a set of associated database IDs. 

437 """ 

438 records = self._table.fetch(path=[str(path) for path in paths]) 

439 result = defaultdict(set) 

440 for row in records: 

441 result[row["path"]].add(row["dataset_id"]) 

442 return result 

443 

444 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]: 

445 """Return all dataset refs associated with the supplied path. 

446 

447 Parameters 

448 ---------- 

449 pathInStore : `lsst.resources.ResourcePath` 

450 Path of interest in the data store. 

451 

452 Returns 

453 ------- 

454 ids : `set` of `int` 

455 All `DatasetRef` IDs associated with this path. 

456 """ 

457 records = list(self._table.fetch(path=str(pathInStore))) 

458 ids = {r["dataset_id"] for r in records} 

459 return ids 

460 

461 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

462 # Docstring inherited from GenericBaseDatastore 

463 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

464 

465 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> list[tuple[Location, StoredFileInfo]]: 

466 r"""Find all the `Location`\ s of the requested dataset in the 

467 `Datastore` and the associated stored file information. 

468 

469 Parameters 

470 ---------- 

471 ref : `DatasetRef` 

472 Reference to the required `Dataset`. 

473 

474 Returns 

475 ------- 

476 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

477 Location of the dataset within the datastore and 

478 stored information about each file and its formatter. 

479 """ 

480 # Get the file information (this will fail if no file) 

481 records = self.getStoredItemsInfo(ref) 

482 

483 # Use the path to determine the location -- we need to take 

484 # into account absolute URIs in the datastore record 

485 return [(r.file_location(self.locationFactory), r) for r in records] 

486 

487 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

488 """Check that there is only one dataset associated with the 

489 specified artifact. 

490 

491 Parameters 

492 ---------- 

493 ref : `DatasetRef` or `FakeDatasetRef` 

494 Dataset to be removed. 

495 location : `Location` 

496 The location of the artifact to be removed. 

497 

498 Returns 

499 ------- 

500 can_remove : `Bool` 

501 True if the artifact can be safely removed. 

502 """ 

503 # Can't ever delete absolute URIs. 

504 if location.pathInStore.isabs(): 

505 return False 

506 

507 # Get all entries associated with this path 

508 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

509 if not allRefs: 

510 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

511 

512 # Remove these refs from all the refs and if there is nothing left 

513 # then we can delete 

514 remainingRefs = allRefs - {ref.id} 

515 

516 if remainingRefs: 

517 return False 

518 return True 

519 

520 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]: 

521 """Predict the location and related file information of the requested 

522 dataset in this datastore. 

523 

524 Parameters 

525 ---------- 

526 ref : `DatasetRef` 

527 Reference to the required `Dataset`. 

528 

529 Returns 

530 ------- 

531 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

532 Expected Location of the dataset within the datastore and 

533 placeholder information about each file and its formatter. 

534 

535 Notes 

536 ----- 

537 Uses the current configuration to determine how we would expect the 

538 datastore files to have been written if we couldn't ask registry. 

539 This is safe so long as there has been no change to datastore 

540 configuration between writing the dataset and wanting to read it. 

541 Will not work for files that have been ingested without using the 

542 standard file template or default formatter. 

543 """ 

544 # If we have a component ref we always need to ask the questions 

545 # of the composite. If the composite is disassembled this routine 

546 # should return all components. If the composite was not 

547 # disassembled the composite is what is stored regardless of 

548 # component request. Note that if the caller has disassembled 

549 # a composite there is no way for this guess to know that 

550 # without trying both the composite and component ref and seeing 

551 # if there is something at the component Location even without 

552 # disassembly being enabled. 

553 if ref.datasetType.isComponent(): 

554 ref = ref.makeCompositeRef() 

555 

556 # See if the ref is a composite that should be disassembled 

557 doDisassembly = self.composites.shouldBeDisassembled(ref) 

558 

559 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = [] 

560 

561 if doDisassembly: 

562 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

563 compRef = ref.makeComponentRef(component) 

564 location, formatter = self._determine_put_formatter_location(compRef) 

565 all_info.append((location, formatter, componentStorage, component)) 

566 

567 else: 

568 # Always use the composite ref if no disassembly 

569 location, formatter = self._determine_put_formatter_location(ref) 

570 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

571 

572 # Convert the list of tuples to have StoredFileInfo as second element 

573 return [ 

574 ( 

575 location, 

576 StoredFileInfo( 

577 formatter=formatter, 

578 path=location.pathInStore.path, 

579 storageClass=storageClass, 

580 component=component, 

581 checksum=None, 

582 file_size=-1, 

583 dataset_id=ref.id, 

584 ), 

585 ) 

586 for location, formatter, storageClass, component in all_info 

587 ] 

588 

589 def _prepare_for_get( 

590 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None 

591 ) -> list[DatastoreFileGetInformation]: 

592 """Check parameters for ``get`` and obtain formatter and 

593 location. 

594 

595 Parameters 

596 ---------- 

597 ref : `DatasetRef` 

598 Reference to the required Dataset. 

599 parameters : `dict` 

600 `StorageClass`-specific parameters that specify, for example, 

601 a slice of the dataset to be loaded. 

602 

603 Returns 

604 ------- 

605 getInfo : `list` [`DatastoreFileGetInformation`] 

606 Parameters needed to retrieve each file. 

607 """ 

608 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

609 

610 # The storage class we want to use eventually 

611 refStorageClass = ref.datasetType.storageClass 

612 

613 # For trusted mode need to reset storage class. 

614 ref = self._cast_storage_class(ref) 

615 

616 # Get file metadata and internal metadata 

617 fileLocations = self._get_dataset_locations_info(ref) 

618 if not fileLocations: 

619 if not self.trustGetRequest: 

620 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

621 # Assume the dataset is where we think it should be 

622 fileLocations = self._get_expected_dataset_locations_info(ref) 

623 

624 if len(fileLocations) > 1: 

625 disassembled = True 

626 

627 # If trust is involved it is possible that there will be 

628 # components listed here that do not exist in the datastore. 

629 # Explicitly check for file artifact existence and filter out any 

630 # that are missing. 

631 if self.trustGetRequest: 

632 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

633 

634 # For now complain only if we have no components at all. One 

635 # component is probably a problem but we can punt that to the 

636 # assembler. 

637 if not fileLocations: 

638 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

639 

640 else: 

641 disassembled = False 

642 

643 # Is this a component request? 

644 refComponent = ref.datasetType.component() 

645 

646 fileGetInfo = [] 

647 for location, storedFileInfo in fileLocations: 

648 # The storage class used to write the file 

649 writeStorageClass = storedFileInfo.storageClass 

650 

651 # If this has been disassembled we need read to match the write 

652 if disassembled: 

653 readStorageClass = writeStorageClass 

654 else: 

655 readStorageClass = refStorageClass 

656 

657 formatter = get_instance_of( 

658 storedFileInfo.formatter, 

659 FileDescriptor( 

660 location, 

661 readStorageClass=readStorageClass, 

662 storageClass=writeStorageClass, 

663 parameters=parameters, 

664 ), 

665 ref.dataId, 

666 ) 

667 

668 formatterParams, notFormatterParams = formatter.segregateParameters() 

669 

670 # Of the remaining parameters, extract the ones supported by 

671 # this StorageClass (for components not all will be handled) 

672 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

673 

674 # The ref itself could be a component if the dataset was 

675 # disassembled by butler, or we disassembled in datastore and 

676 # components came from the datastore records 

677 component = storedFileInfo.component if storedFileInfo.component else refComponent 

678 

679 fileGetInfo.append( 

680 DatastoreFileGetInformation( 

681 location, 

682 formatter, 

683 storedFileInfo, 

684 assemblerParams, 

685 formatterParams, 

686 component, 

687 readStorageClass, 

688 ) 

689 ) 

690 

691 return fileGetInfo 

692 

693 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]: 

694 """Check the arguments for ``put`` and obtain formatter and 

695 location. 

696 

697 Parameters 

698 ---------- 

699 inMemoryDataset : `object` 

700 The dataset to store. 

701 ref : `DatasetRef` 

702 Reference to the associated Dataset. 

703 

704 Returns 

705 ------- 

706 location : `Location` 

707 The location to write the dataset. 

708 formatter : `Formatter` 

709 The `Formatter` to use to write the dataset. 

710 

711 Raises 

712 ------ 

713 TypeError 

714 Supplied object and storage class are inconsistent. 

715 DatasetTypeNotSupportedError 

716 The associated `DatasetType` is not handled by this datastore. 

717 """ 

718 self._validate_put_parameters(inMemoryDataset, ref) 

719 return self._determine_put_formatter_location(ref) 

720 

721 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]: 

722 """Calculate the formatter and output location to use for put. 

723 

724 Parameters 

725 ---------- 

726 ref : `DatasetRef` 

727 Reference to the associated Dataset. 

728 

729 Returns 

730 ------- 

731 location : `Location` 

732 The location to write the dataset. 

733 formatter : `Formatter` 

734 The `Formatter` to use to write the dataset. 

735 """ 

736 # Work out output file name 

737 try: 

738 template = self.templates.getTemplate(ref) 

739 except KeyError as e: 

740 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

741 

742 # Validate the template to protect against filenames from different 

743 # dataIds returning the same and causing overwrite confusion. 

744 template.validateTemplate(ref) 

745 

746 location = self.locationFactory.fromPath(template.format(ref)) 

747 

748 # Get the formatter based on the storage class 

749 storageClass = ref.datasetType.storageClass 

750 try: 

751 formatter = self.formatterFactory.getFormatter( 

752 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

753 ) 

754 except KeyError as e: 

755 raise DatasetTypeNotSupportedError( 

756 f"Unable to find formatter for {ref} in datastore {self.name}" 

757 ) from e 

758 

759 # Now that we know the formatter, update the location 

760 location = formatter.makeUpdatedLocation(location) 

761 

762 return location, formatter 

763 

764 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None: 

765 # Docstring inherited from base class 

766 if transfer != "auto": 

767 return transfer 

768 

769 # See if the paths are within the datastore or not 

770 inside = [self._pathInStore(d.path) is not None for d in datasets] 

771 

772 if all(inside): 

773 transfer = None 

774 elif not any(inside): 

775 # Allow ResourcePath to use its own knowledge 

776 transfer = "auto" 

777 else: 

778 # This can happen when importing from a datastore that 

779 # has had some datasets ingested using "direct" mode. 

780 # Also allow ResourcePath to sort it out but warn about it. 

781 # This can happen if you are importing from a datastore 

782 # that had some direct transfer datasets. 

783 log.warning( 

784 "Some datasets are inside the datastore and some are outside. Using 'split' " 

785 "transfer mode. This assumes that the files outside the datastore are " 

786 "still accessible to the new butler since they will not be copied into " 

787 "the target datastore." 

788 ) 

789 transfer = "split" 

790 

791 return transfer 

792 

793 def _pathInStore(self, path: ResourcePathExpression) -> str | None: 

794 """Return path relative to datastore root. 

795 

796 Parameters 

797 ---------- 

798 path : `lsst.resources.ResourcePathExpression` 

799 Path to dataset. Can be absolute URI. If relative assumed to 

800 be relative to the datastore. Returns path in datastore 

801 or raises an exception if the path it outside. 

802 

803 Returns 

804 ------- 

805 inStore : `str` 

806 Path relative to datastore root. Returns `None` if the file is 

807 outside the root. 

808 """ 

809 # Relative path will always be relative to datastore 

810 pathUri = ResourcePath(path, forceAbsolute=False) 

811 return pathUri.relative_to(self.root) 

812 

813 def _standardizeIngestPath( 

814 self, path: str | ResourcePath, *, transfer: str | None = None 

815 ) -> str | ResourcePath: 

816 """Standardize the path of a to-be-ingested file. 

817 

818 Parameters 

819 ---------- 

820 path : `str` or `lsst.resources.ResourcePath` 

821 Path of a file to be ingested. This parameter is not expected 

822 to be all the types that can be used to construct a 

823 `~lsst.resources.ResourcePath`. 

824 transfer : `str`, optional 

825 How (and whether) the dataset should be added to the datastore. 

826 See `ingest` for details of transfer modes. 

827 This implementation is provided only so 

828 `NotImplementedError` can be raised if the mode is not supported; 

829 actual transfers are deferred to `_extractIngestInfo`. 

830 

831 Returns 

832 ------- 

833 path : `str` or `lsst.resources.ResourcePath` 

834 New path in what the datastore considers standard form. If an 

835 absolute URI was given that will be returned unchanged. 

836 

837 Notes 

838 ----- 

839 Subclasses of `FileDatastore` can implement this method instead 

840 of `_prepIngest`. It should not modify the data repository or given 

841 file in any way. 

842 

843 Raises 

844 ------ 

845 NotImplementedError 

846 Raised if the datastore does not support the given transfer mode 

847 (including the case where ingest is not supported at all). 

848 FileNotFoundError 

849 Raised if one of the given files does not exist. 

850 """ 

851 if transfer not in (None, "direct", "split") + self.root.transferModes: 

852 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

853 

854 # A relative URI indicates relative to datastore root 

855 srcUri = ResourcePath(path, forceAbsolute=False) 

856 if not srcUri.isabs(): 

857 srcUri = self.root.join(path) 

858 

859 if not srcUri.exists(): 

860 raise FileNotFoundError( 

861 f"Resource at {srcUri} does not exist; note that paths to ingest " 

862 f"are assumed to be relative to {self.root} unless they are absolute." 

863 ) 

864 

865 if transfer is None: 

866 relpath = srcUri.relative_to(self.root) 

867 if not relpath: 

868 raise RuntimeError( 

869 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

870 ) 

871 

872 # Return the relative path within the datastore for internal 

873 # transfer 

874 path = relpath 

875 

876 return path 

877 

878 def _extractIngestInfo( 

879 self, 

880 path: ResourcePathExpression, 

881 ref: DatasetRef, 

882 *, 

883 formatter: Formatter | type[Formatter], 

884 transfer: str | None = None, 

885 record_validation_info: bool = True, 

886 ) -> StoredFileInfo: 

887 """Relocate (if necessary) and extract `StoredFileInfo` from a 

888 to-be-ingested file. 

889 

890 Parameters 

891 ---------- 

892 path : `lsst.resources.ResourcePathExpression` 

893 URI or path of a file to be ingested. 

894 ref : `DatasetRef` 

895 Reference for the dataset being ingested. Guaranteed to have 

896 ``dataset_id not None`. 

897 formatter : `type` or `Formatter` 

898 `Formatter` subclass to use for this dataset or an instance. 

899 transfer : `str`, optional 

900 How (and whether) the dataset should be added to the datastore. 

901 See `ingest` for details of transfer modes. 

902 record_validation_info : `bool`, optional 

903 If `True`, the default, the datastore can record validation 

904 information associated with the file. If `False` the datastore 

905 will not attempt to track any information such as checksums 

906 or file sizes. This can be useful if such information is tracked 

907 in an external system or if the file is to be compressed in place. 

908 It is up to the datastore whether this parameter is relevant. 

909 

910 Returns 

911 ------- 

912 info : `StoredFileInfo` 

913 Internal datastore record for this file. This will be inserted by 

914 the caller; the `_extractIngestInfo` is only responsible for 

915 creating and populating the struct. 

916 

917 Raises 

918 ------ 

919 FileNotFoundError 

920 Raised if one of the given files does not exist. 

921 FileExistsError 

922 Raised if transfer is not `None` but the (internal) location the 

923 file would be moved to is already occupied. 

924 """ 

925 if self._transaction is None: 

926 raise RuntimeError("Ingest called without transaction enabled") 

927 

928 # Create URI of the source path, do not need to force a relative 

929 # path to absolute. 

930 srcUri = ResourcePath(path, forceAbsolute=False) 

931 

932 # Track whether we have read the size of the source yet 

933 have_sized = False 

934 

935 tgtLocation: Location | None 

936 if transfer is None or transfer == "split": 

937 # A relative path is assumed to be relative to the datastore 

938 # in this context 

939 if not srcUri.isabs(): 

940 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

941 else: 

942 # Work out the path in the datastore from an absolute URI 

943 # This is required to be within the datastore. 

944 pathInStore = srcUri.relative_to(self.root) 

945 if pathInStore is None and transfer is None: 

946 raise RuntimeError( 

947 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

948 ) 

949 if pathInStore: 

950 tgtLocation = self.locationFactory.fromPath(pathInStore) 

951 elif transfer == "split": 

952 # Outside the datastore but treat that as a direct ingest 

953 # instead. 

954 tgtLocation = None 

955 else: 

956 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

957 elif transfer == "direct": 

958 # Want to store the full URI to the resource directly in 

959 # datastore. This is useful for referring to permanent archive 

960 # storage for raw data. 

961 # Trust that people know what they are doing. 

962 tgtLocation = None 

963 else: 

964 # Work out the name we want this ingested file to have 

965 # inside the datastore 

966 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

967 if not tgtLocation.uri.dirname().exists(): 

968 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

969 tgtLocation.uri.dirname().mkdir() 

970 

971 # if we are transferring from a local file to a remote location 

972 # it may be more efficient to get the size and checksum of the 

973 # local file rather than the transferred one 

974 if record_validation_info and srcUri.isLocal: 

975 size = srcUri.size() 

976 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

977 have_sized = True 

978 

979 # Transfer the resource to the destination. 

980 # Allow overwrite of an existing file. This matches the behavior 

981 # of datastore.put() in that it trusts that registry would not 

982 # be asking to overwrite unless registry thought that the 

983 # overwrite was allowed. 

984 tgtLocation.uri.transfer_from( 

985 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

986 ) 

987 

988 if tgtLocation is None: 

989 # This means we are using direct mode 

990 targetUri = srcUri 

991 targetPath = str(srcUri) 

992 else: 

993 targetUri = tgtLocation.uri 

994 targetPath = tgtLocation.pathInStore.path 

995 

996 # the file should exist in the datastore now 

997 if record_validation_info: 

998 if not have_sized: 

999 size = targetUri.size() 

1000 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

1001 else: 

1002 # Not recording any file information. 

1003 size = -1 

1004 checksum = None 

1005 

1006 return StoredFileInfo( 

1007 formatter=formatter, 

1008 path=targetPath, 

1009 storageClass=ref.datasetType.storageClass, 

1010 component=ref.datasetType.component(), 

1011 file_size=size, 

1012 checksum=checksum, 

1013 dataset_id=ref.id, 

1014 ) 

1015 

1016 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

1017 # Docstring inherited from Datastore._prepIngest. 

1018 filtered = [] 

1019 for dataset in datasets: 

1020 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

1021 if not acceptable: 

1022 continue 

1023 else: 

1024 dataset.refs = acceptable 

1025 if dataset.formatter is None: 

1026 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1027 else: 

1028 assert isinstance(dataset.formatter, type | str) 

1029 formatter_class = get_class_of(dataset.formatter) 

1030 if not issubclass(formatter_class, Formatter): 

1031 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1032 dataset.formatter = formatter_class 

1033 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1034 filtered.append(dataset) 

1035 return _IngestPrepData(filtered) 

1036 

1037 @transactional 

1038 def _finishIngest( 

1039 self, 

1040 prepData: Datastore.IngestPrepData, 

1041 *, 

1042 transfer: str | None = None, 

1043 record_validation_info: bool = True, 

1044 ) -> None: 

1045 # Docstring inherited from Datastore._finishIngest. 

1046 refsAndInfos = [] 

1047 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1048 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1049 # Do ingest as if the first dataset ref is associated with the file 

1050 info = self._extractIngestInfo( 

1051 dataset.path, 

1052 dataset.refs[0], 

1053 formatter=dataset.formatter, 

1054 transfer=transfer, 

1055 record_validation_info=record_validation_info, 

1056 ) 

1057 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1058 

1059 # In direct mode we can allow repeated ingests of the same thing 

1060 # if we are sure that the external dataset is immutable. We use 

1061 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are 

1062 # separated. 

1063 refs_and_infos_replace = [] 

1064 refs_and_infos_insert = [] 

1065 if transfer == "direct": 

1066 for entry in refsAndInfos: 

1067 if entry[0].id.version == 5: 

1068 refs_and_infos_replace.append(entry) 

1069 else: 

1070 refs_and_infos_insert.append(entry) 

1071 else: 

1072 refs_and_infos_insert = refsAndInfos 

1073 

1074 if refs_and_infos_insert: 

1075 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT) 

1076 if refs_and_infos_replace: 

1077 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE) 

1078 

1079 def _calculate_ingested_datastore_name( 

1080 self, 

1081 srcUri: ResourcePath, 

1082 ref: DatasetRef, 

1083 formatter: Formatter | type[Formatter] | None = None, 

1084 ) -> Location: 

1085 """Given a source URI and a DatasetRef, determine the name the 

1086 dataset will have inside datastore. 

1087 

1088 Parameters 

1089 ---------- 

1090 srcUri : `lsst.resources.ResourcePath` 

1091 URI to the source dataset file. 

1092 ref : `DatasetRef` 

1093 Ref associated with the newly-ingested dataset artifact. This 

1094 is used to determine the name within the datastore. 

1095 formatter : `Formatter` or Formatter class. 

1096 Formatter to use for validation. Can be a class or an instance. 

1097 No validation of the file extension is performed if the 

1098 ``formatter`` is `None`. This can be used if the caller knows 

1099 that the source URI and target URI will use the same formatter. 

1100 

1101 Returns 

1102 ------- 

1103 location : `Location` 

1104 Target location for the newly-ingested dataset. 

1105 """ 

1106 # Ingesting a file from outside the datastore. 

1107 # This involves a new name. 

1108 template = self.templates.getTemplate(ref) 

1109 location = self.locationFactory.fromPath(template.format(ref)) 

1110 

1111 # Get the extension 

1112 ext = srcUri.getExtension() 

1113 

1114 # Update the destination to include that extension 

1115 location.updateExtension(ext) 

1116 

1117 # Ask the formatter to validate this extension 

1118 if formatter is not None: 

1119 formatter.validateExtension(location) 

1120 

1121 return location 

1122 

1123 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1124 """Write out in memory dataset to datastore. 

1125 

1126 Parameters 

1127 ---------- 

1128 inMemoryDataset : `object` 

1129 Dataset to write to datastore. 

1130 ref : `DatasetRef` 

1131 Registry information associated with this dataset. 

1132 

1133 Returns 

1134 ------- 

1135 info : `StoredFileInfo` 

1136 Information describing the artifact written to the datastore. 

1137 """ 

1138 # May need to coerce the in memory dataset to the correct 

1139 # python type, but first we need to make sure the storage class 

1140 # reflects the one defined in the data repository. 

1141 ref = self._cast_storage_class(ref) 

1142 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1143 

1144 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1145 uri = location.uri 

1146 

1147 if not uri.dirname().exists(): 

1148 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1149 uri.dirname().mkdir() 

1150 

1151 if self._transaction is None: 

1152 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1153 

1154 def _removeFileExists(uri: ResourcePath) -> None: 

1155 """Remove a file and do not complain if it is not there. 

1156 

1157 This is important since a formatter might fail before the file 

1158 is written and we should not confuse people by writing spurious 

1159 error messages to the log. 

1160 """ 

1161 with contextlib.suppress(FileNotFoundError): 

1162 uri.remove() 

1163 

1164 # Register a callback to try to delete the uploaded data if 

1165 # something fails below 

1166 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1167 

1168 data_written = False 

1169 

1170 # For remote URIs some datasets can be serialized directly 

1171 # to bytes and sent to the remote datastore without writing a 

1172 # file. If the dataset is intended to be saved to the cache 

1173 # a file is always written and direct write to the remote 

1174 # datastore is bypassed. 

1175 if not uri.isLocal and not self.cacheManager.should_be_cached(ref): 

1176 # Remote URI that is not cached so can write directly. 

1177 try: 

1178 serializedDataset = formatter.toBytes(inMemoryDataset) 

1179 except NotImplementedError: 

1180 # Fallback to the file writing option. 

1181 pass 

1182 except Exception as e: 

1183 raise RuntimeError( 

1184 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1185 ) from e 

1186 else: 

1187 log.debug("Writing bytes directly to %s", uri) 

1188 uri.write(serializedDataset, overwrite=True) 

1189 log.debug("Successfully wrote bytes directly to %s", uri) 

1190 data_written = True 

1191 

1192 if not data_written: 

1193 # Did not write the bytes directly to object store so instead 

1194 # write to temporary file. Always write to a temporary even if 

1195 # using a local file system -- that gives us atomic writes. 

1196 # If a process is killed as the file is being written we do not 

1197 # want it to remain in the correct place but in corrupt state. 

1198 # For local files write to the output directory not temporary dir. 

1199 prefix = uri.dirname() if uri.isLocal else None 

1200 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri: 

1201 # Need to configure the formatter to write to a different 

1202 # location and that needs us to overwrite internals 

1203 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1204 with formatter._updateLocation(Location(None, temporary_uri)): 

1205 try: 

1206 formatter.write(inMemoryDataset) 

1207 except Exception as e: 

1208 raise RuntimeError( 

1209 f"Failed to serialize dataset {ref} of type" 

1210 f" {type(inMemoryDataset)} to " 

1211 f"temporary location {temporary_uri}" 

1212 ) from e 

1213 

1214 # Use move for a local file since that becomes an efficient 

1215 # os.rename. For remote resources we use copy to allow the 

1216 # file to be cached afterwards. 

1217 transfer = "move" if uri.isLocal else "copy" 

1218 

1219 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True) 

1220 

1221 if transfer == "copy": 

1222 # Cache if required 

1223 self.cacheManager.move_to_cache(temporary_uri, ref) 

1224 

1225 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1226 

1227 # URI is needed to resolve what ingest case are we dealing with 

1228 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1229 

1230 def _read_artifact_into_memory( 

1231 self, 

1232 getInfo: DatastoreFileGetInformation, 

1233 ref: DatasetRef, 

1234 isComponent: bool = False, 

1235 cache_ref: DatasetRef | None = None, 

1236 ) -> Any: 

1237 """Read the artifact from datastore into in memory object. 

1238 

1239 Parameters 

1240 ---------- 

1241 getInfo : `DatastoreFileGetInformation` 

1242 Information about the artifact within the datastore. 

1243 ref : `DatasetRef` 

1244 The registry information associated with this artifact. 

1245 isComponent : `bool` 

1246 Flag to indicate if a component is being read from this artifact. 

1247 cache_ref : `DatasetRef`, optional 

1248 The DatasetRef to use when looking up the file in the cache. 

1249 This ref must have the same ID as the supplied ref but can 

1250 be a parent ref or component ref to indicate to the cache whether 

1251 a composite file is being requested from the cache or a component 

1252 file. Without this the cache will default to the supplied ref but 

1253 it can get confused with read-only derived components for 

1254 disassembled composites. 

1255 

1256 Returns 

1257 ------- 

1258 inMemoryDataset : `object` 

1259 The artifact as a python object. 

1260 """ 

1261 location = getInfo.location 

1262 uri = location.uri 

1263 log.debug("Accessing data from %s", uri) 

1264 

1265 if cache_ref is None: 

1266 cache_ref = ref 

1267 if cache_ref.id != ref.id: 

1268 raise ValueError( 

1269 "The supplied cache dataset ref refers to a different dataset than expected:" 

1270 f" {ref.id} != {cache_ref.id}" 

1271 ) 

1272 

1273 # Cannot recalculate checksum but can compare size as a quick check 

1274 # Do not do this if the size is negative since that indicates 

1275 # we do not know. 

1276 recorded_size = getInfo.info.file_size 

1277 resource_size = uri.size() 

1278 if recorded_size >= 0 and resource_size != recorded_size: 

1279 raise RuntimeError( 

1280 "Integrity failure in Datastore. " 

1281 f"Size of file {uri} ({resource_size}) " 

1282 f"does not match size recorded in registry of {recorded_size}" 

1283 ) 

1284 

1285 # For the general case we have choices for how to proceed. 

1286 # 1. Always use a local file (downloading the remote resource to a 

1287 # temporary file if needed). 

1288 # 2. Use a threshold size and read into memory and use bytes. 

1289 # Use both for now with an arbitrary hand off size. 

1290 # This allows small datasets to be downloaded from remote object 

1291 # stores without requiring a temporary file. 

1292 

1293 formatter = getInfo.formatter 

1294 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1295 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1296 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1297 if cached_file is not None: 

1298 desired_uri = cached_file 

1299 msg = f" (cached version of {uri})" 

1300 else: 

1301 desired_uri = uri 

1302 msg = "" 

1303 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1304 serializedDataset = desired_uri.read() 

1305 log.debug( 

1306 "Deserializing %s from %d bytes from location %s with formatter %s", 

1307 f"component {getInfo.component}" if isComponent else "", 

1308 len(serializedDataset), 

1309 uri, 

1310 formatter.name(), 

1311 ) 

1312 try: 

1313 result = formatter.fromBytes( 

1314 serializedDataset, component=getInfo.component if isComponent else None 

1315 ) 

1316 except Exception as e: 

1317 raise ValueError( 

1318 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1319 f" ({ref.datasetType.name} from {uri}): {e}" 

1320 ) from e 

1321 else: 

1322 # Read from file. 

1323 

1324 # Have to update the Location associated with the formatter 

1325 # because formatter.read does not allow an override. 

1326 # This could be improved. 

1327 location_updated = False 

1328 msg = "" 

1329 

1330 # First check in cache for local version. 

1331 # The cache will only be relevant for remote resources but 

1332 # no harm in always asking. Context manager ensures that cache 

1333 # file is not deleted during cache expiration. 

1334 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1335 if cached_file is not None: 

1336 msg = f"(via cache read of remote file {uri})" 

1337 uri = cached_file 

1338 location_updated = True 

1339 

1340 with uri.as_local() as local_uri: 

1341 can_be_cached = False 

1342 if uri != local_uri: 

1343 # URI was remote and file was downloaded 

1344 cache_msg = "" 

1345 location_updated = True 

1346 

1347 if self.cacheManager.should_be_cached(cache_ref): 

1348 # In this scenario we want to ask if the downloaded 

1349 # file should be cached but we should not cache 

1350 # it until after we've used it (to ensure it can't 

1351 # be expired whilst we are using it). 

1352 can_be_cached = True 

1353 

1354 # Say that it is "likely" to be cached because 

1355 # if the formatter read fails we will not be 

1356 # caching this file. 

1357 cache_msg = " and likely cached" 

1358 

1359 msg = f"(via download to local file{cache_msg})" 

1360 

1361 # Calculate the (possibly) new location for the formatter 

1362 # to use. 

1363 newLocation = Location(*local_uri.split()) if location_updated else None 

1364 

1365 log.debug( 

1366 "Reading%s from location %s %s with formatter %s", 

1367 f" component {getInfo.component}" if isComponent else "", 

1368 uri, 

1369 msg, 

1370 formatter.name(), 

1371 ) 

1372 try: 

1373 with ( 

1374 formatter._updateLocation(newLocation), 

1375 time_this( 

1376 log, 

1377 msg="Reading%s from location %s %s with formatter %s", 

1378 args=( 

1379 f" component {getInfo.component}" if isComponent else "", 

1380 uri, 

1381 msg, 

1382 formatter.name(), 

1383 ), 

1384 ), 

1385 ): 

1386 result = formatter.read(component=getInfo.component if isComponent else None) 

1387 except Exception as e: 

1388 raise ValueError( 

1389 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1390 f" ({ref.datasetType.name} from {uri}): {e}" 

1391 ) from e 

1392 

1393 # File was read successfully so can move to cache 

1394 if can_be_cached: 

1395 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1396 

1397 return self._post_process_get( 

1398 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent 

1399 ) 

1400 

1401 def knows(self, ref: DatasetRef) -> bool: 

1402 """Check if the dataset is known to the datastore. 

1403 

1404 Does not check for existence of any artifact. 

1405 

1406 Parameters 

1407 ---------- 

1408 ref : `DatasetRef` 

1409 Reference to the required dataset. 

1410 

1411 Returns 

1412 ------- 

1413 exists : `bool` 

1414 `True` if the dataset is known to the datastore. 

1415 """ 

1416 fileLocations = self._get_dataset_locations_info(ref) 

1417 if fileLocations: 

1418 return True 

1419 return False 

1420 

1421 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

1422 # Docstring inherited from the base class. 

1423 

1424 # The records themselves. Could be missing some entries. 

1425 records = self._get_stored_records_associated_with_refs(refs) 

1426 

1427 return {ref: ref.id in records for ref in refs} 

1428 

1429 def _process_mexists_records( 

1430 self, 

1431 id_to_ref: dict[DatasetId, DatasetRef], 

1432 records: dict[DatasetId, list[StoredFileInfo]], 

1433 all_required: bool, 

1434 artifact_existence: dict[ResourcePath, bool] | None = None, 

1435 ) -> dict[DatasetRef, bool]: 

1436 """Check given records for existence. 

1437 

1438 Helper function for `mexists()`. 

1439 

1440 Parameters 

1441 ---------- 

1442 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1443 Mapping of the dataset ID to the dataset ref itself. 

1444 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1445 Records as generally returned by 

1446 ``_get_stored_records_associated_with_refs``. 

1447 all_required : `bool` 

1448 Flag to indicate whether existence requires all artifacts 

1449 associated with a dataset ID to exist or not for existence. 

1450 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1451 Optional mapping of datastore artifact to existence. Updated by 

1452 this method with details of all artifacts tested. Can be `None` 

1453 if the caller is not interested. 

1454 

1455 Returns 

1456 ------- 

1457 existence : `dict` of [`DatasetRef`, `bool`] 

1458 Mapping from dataset to boolean indicating existence. 

1459 """ 

1460 # The URIs to be checked and a mapping of those URIs to 

1461 # the dataset ID. 

1462 uris_to_check: list[ResourcePath] = [] 

1463 location_map: dict[ResourcePath, DatasetId] = {} 

1464 

1465 location_factory = self.locationFactory 

1466 

1467 uri_existence: dict[ResourcePath, bool] = {} 

1468 for ref_id, infos in records.items(): 

1469 # Key is the dataset Id, value is list of StoredItemInfo 

1470 uris = [info.file_location(location_factory).uri for info in infos] 

1471 location_map.update({uri: ref_id for uri in uris}) 

1472 

1473 # Check the local cache directly for a dataset corresponding 

1474 # to the remote URI. 

1475 if self.cacheManager.file_count > 0: 

1476 ref = id_to_ref[ref_id] 

1477 for uri, storedFileInfo in zip(uris, infos, strict=True): 

1478 check_ref = ref 

1479 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1480 check_ref = ref.makeComponentRef(component) 

1481 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1482 # Proxy for URI existence. 

1483 uri_existence[uri] = True 

1484 else: 

1485 uris_to_check.append(uri) 

1486 else: 

1487 # Check all of them. 

1488 uris_to_check.extend(uris) 

1489 

1490 if artifact_existence is not None: 

1491 # If a URI has already been checked remove it from the list 

1492 # and immediately add the status to the output dict. 

1493 filtered_uris_to_check = [] 

1494 for uri in uris_to_check: 

1495 if uri in artifact_existence: 

1496 uri_existence[uri] = artifact_existence[uri] 

1497 else: 

1498 filtered_uris_to_check.append(uri) 

1499 uris_to_check = filtered_uris_to_check 

1500 

1501 # Results. 

1502 dataset_existence: dict[DatasetRef, bool] = {} 

1503 

1504 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1505 for uri, exists in uri_existence.items(): 

1506 dataset_id = location_map[uri] 

1507 ref = id_to_ref[dataset_id] 

1508 

1509 # Disassembled composite needs to check all locations. 

1510 # all_required indicates whether all need to exist or not. 

1511 if ref in dataset_existence: 

1512 if all_required: 

1513 exists = dataset_existence[ref] and exists 

1514 else: 

1515 exists = dataset_existence[ref] or exists 

1516 dataset_existence[ref] = exists 

1517 

1518 if artifact_existence is not None: 

1519 artifact_existence.update(uri_existence) 

1520 

1521 return dataset_existence 

1522 

1523 def mexists( 

1524 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1525 ) -> dict[DatasetRef, bool]: 

1526 """Check the existence of multiple datasets at once. 

1527 

1528 Parameters 

1529 ---------- 

1530 refs : iterable of `DatasetRef` 

1531 The datasets to be checked. 

1532 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1533 Optional mapping of datastore artifact to existence. Updated by 

1534 this method with details of all artifacts tested. Can be `None` 

1535 if the caller is not interested. 

1536 

1537 Returns 

1538 ------- 

1539 existence : `dict` of [`DatasetRef`, `bool`] 

1540 Mapping from dataset to boolean indicating existence. 

1541 

1542 Notes 

1543 ----- 

1544 To minimize potentially costly remote existence checks, the local 

1545 cache is checked as a proxy for existence. If a file for this 

1546 `DatasetRef` does exist no check is done for the actual URI. This 

1547 could result in possibly unexpected behavior if the dataset itself 

1548 has been removed from the datastore by another process whilst it is 

1549 still in the cache. 

1550 """ 

1551 chunk_size = 10_000 

1552 dataset_existence: dict[DatasetRef, bool] = {} 

1553 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1554 n_found_total = 0 

1555 n_checked = 0 

1556 n_chunks = 0 

1557 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1558 chunk_result = self._mexists(chunk, artifact_existence) 

1559 

1560 # The log message level and content depend on how many 

1561 # datasets we are processing. 

1562 n_results = len(chunk_result) 

1563 

1564 # Use verbose logging to ensure that messages can be seen 

1565 # easily if many refs are being checked. 

1566 log_threshold = VERBOSE 

1567 n_checked += n_results 

1568 

1569 # This sum can take some time so only do it if we know the 

1570 # result is going to be used. 

1571 n_found = 0 

1572 if log.isEnabledFor(log_threshold): 

1573 # Can treat the booleans as 0, 1 integers and sum them. 

1574 n_found = sum(chunk_result.values()) 

1575 n_found_total += n_found 

1576 

1577 # We are deliberately not trying to count the number of refs 

1578 # provided in case it's in the millions. This means there is a 

1579 # situation where the number of refs exactly matches the chunk 

1580 # size and we will switch to the multi-chunk path even though 

1581 # we only have a single chunk. 

1582 if n_results < chunk_size and n_chunks == 0: 

1583 # Single chunk will be processed so we can provide more detail. 

1584 if n_results == 1: 

1585 ref = list(chunk_result)[0] 

1586 # Use debug logging to be consistent with `exists()`. 

1587 log.debug( 

1588 "Calling mexists() with single ref that does%s exist (%s).", 

1589 "" if chunk_result[ref] else " not", 

1590 ref, 

1591 ) 

1592 else: 

1593 # Single chunk but multiple files. Summarize. 

1594 log.log( 

1595 log_threshold, 

1596 "Number of datasets found in datastore: %d out of %d datasets checked.", 

1597 n_found, 

1598 n_checked, 

1599 ) 

1600 

1601 else: 

1602 # Use incremental verbose logging when we have multiple chunks. 

1603 log.log( 

1604 log_threshold, 

1605 "Number of datasets found in datastore for chunk %d: %d out of %d checked " 

1606 "(running total from all chunks so far: %d found out of %d checked)", 

1607 n_chunks, 

1608 n_found, 

1609 n_results, 

1610 n_found_total, 

1611 n_checked, 

1612 ) 

1613 dataset_existence.update(chunk_result) 

1614 n_chunks += 1 

1615 

1616 return dataset_existence 

1617 

1618 def _mexists( 

1619 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1620 ) -> dict[DatasetRef, bool]: 

1621 """Check the existence of multiple datasets at once. 

1622 

1623 Parameters 

1624 ---------- 

1625 refs : iterable of `DatasetRef` 

1626 The datasets to be checked. 

1627 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1628 Optional mapping of datastore artifact to existence. Updated by 

1629 this method with details of all artifacts tested. Can be `None` 

1630 if the caller is not interested. 

1631 

1632 Returns 

1633 ------- 

1634 existence : `dict` of [`DatasetRef`, `bool`] 

1635 Mapping from dataset to boolean indicating existence. 

1636 """ 

1637 # Make a mapping from refs with the internal storage class to the given 

1638 # refs that may have a different one. We'll use the internal refs 

1639 # throughout this method and convert back at the very end. 

1640 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs} 

1641 

1642 # Need a mapping of dataset_id to (internal) dataset ref since some 

1643 # internal APIs work with dataset_id. 

1644 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref} 

1645 

1646 # Set of all IDs we are checking for. 

1647 requested_ids = set(id_to_ref.keys()) 

1648 

1649 # The records themselves. Could be missing some entries. 

1650 records = self._get_stored_records_associated_with_refs(id_to_ref.values()) 

1651 

1652 dataset_existence = self._process_mexists_records( 

1653 id_to_ref, records, True, artifact_existence=artifact_existence 

1654 ) 

1655 

1656 # Set of IDs that have been handled. 

1657 handled_ids = {ref.id for ref in dataset_existence} 

1658 

1659 missing_ids = requested_ids - handled_ids 

1660 if missing_ids: 

1661 dataset_existence.update( 

1662 self._mexists_check_expected( 

1663 [id_to_ref[missing] for missing in missing_ids], artifact_existence 

1664 ) 

1665 ) 

1666 

1667 return { 

1668 internal_ref_to_input_ref[internal_ref]: existence 

1669 for internal_ref, existence in dataset_existence.items() 

1670 } 

1671 

1672 def _mexists_check_expected( 

1673 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1674 ) -> dict[DatasetRef, bool]: 

1675 """Check existence of refs that are not known to datastore. 

1676 

1677 Parameters 

1678 ---------- 

1679 refs : iterable of `DatasetRef` 

1680 The datasets to be checked. These are assumed not to be known 

1681 to datastore. 

1682 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1683 Optional mapping of datastore artifact to existence. Updated by 

1684 this method with details of all artifacts tested. Can be `None` 

1685 if the caller is not interested. 

1686 

1687 Returns 

1688 ------- 

1689 existence : `dict` of [`DatasetRef`, `bool`] 

1690 Mapping from dataset to boolean indicating existence. 

1691 """ 

1692 dataset_existence: dict[DatasetRef, bool] = {} 

1693 if not self.trustGetRequest: 

1694 # Must assume these do not exist 

1695 for ref in refs: 

1696 dataset_existence[ref] = False 

1697 else: 

1698 log.debug( 

1699 "%d datasets were not known to datastore during initial existence check.", 

1700 len(refs), 

1701 ) 

1702 

1703 # Construct data structure identical to that returned 

1704 # by _get_stored_records_associated_with_refs() but using 

1705 # guessed names. 

1706 records = {} 

1707 id_to_ref = {} 

1708 for missing_ref in refs: 

1709 expected = self._get_expected_dataset_locations_info(missing_ref) 

1710 dataset_id = missing_ref.id 

1711 records[dataset_id] = [info for _, info in expected] 

1712 id_to_ref[dataset_id] = missing_ref 

1713 

1714 dataset_existence.update( 

1715 self._process_mexists_records( 

1716 id_to_ref, 

1717 records, 

1718 False, 

1719 artifact_existence=artifact_existence, 

1720 ) 

1721 ) 

1722 

1723 return dataset_existence 

1724 

1725 def exists(self, ref: DatasetRef) -> bool: 

1726 """Check if the dataset exists in the datastore. 

1727 

1728 Parameters 

1729 ---------- 

1730 ref : `DatasetRef` 

1731 Reference to the required dataset. 

1732 

1733 Returns 

1734 ------- 

1735 exists : `bool` 

1736 `True` if the entity exists in the `Datastore`. 

1737 

1738 Notes 

1739 ----- 

1740 The local cache is checked as a proxy for existence in the remote 

1741 object store. It is possible that another process on a different 

1742 compute node could remove the file from the object store even 

1743 though it is present in the local cache. 

1744 """ 

1745 ref = self._cast_storage_class(ref) 

1746 fileLocations = self._get_dataset_locations_info(ref) 

1747 

1748 # if we are being asked to trust that registry might not be correct 

1749 # we ask for the expected locations and check them explicitly 

1750 if not fileLocations: 

1751 if not self.trustGetRequest: 

1752 return False 

1753 

1754 # First check the cache. If it is not found we must check 

1755 # the datastore itself. Assume that any component in the cache 

1756 # means that the dataset does exist somewhere. 

1757 if self.cacheManager.known_to_cache(ref): 

1758 return True 

1759 

1760 # When we are guessing a dataset location we can not check 

1761 # for the existence of every component since we can not 

1762 # know if every component was written. Instead we check 

1763 # for the existence of any of the expected locations. 

1764 for location, _ in self._get_expected_dataset_locations_info(ref): 

1765 if self._artifact_exists(location): 

1766 return True 

1767 return False 

1768 

1769 # All listed artifacts must exist. 

1770 for location, storedFileInfo in fileLocations: 

1771 # Checking in cache needs the component ref. 

1772 check_ref = ref 

1773 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1774 check_ref = ref.makeComponentRef(component) 

1775 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1776 continue 

1777 

1778 if not self._artifact_exists(location): 

1779 return False 

1780 

1781 return True 

1782 

1783 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

1784 """Return URIs associated with dataset. 

1785 

1786 Parameters 

1787 ---------- 

1788 ref : `DatasetRef` 

1789 Reference to the required dataset. 

1790 predict : `bool`, optional 

1791 If the datastore does not know about the dataset, should it 

1792 return a predicted URI or not? 

1793 

1794 Returns 

1795 ------- 

1796 uris : `DatasetRefURIs` 

1797 The URI to the primary artifact associated with this dataset (if 

1798 the dataset was disassembled within the datastore this may be 

1799 `None`), and the URIs to any components associated with the dataset 

1800 artifact. (can be empty if there are no components). 

1801 """ 

1802 many = self.getManyURIs([ref], predict=predict, allow_missing=False) 

1803 return many[ref] 

1804 

1805 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1806 """URI to the Dataset. 

1807 

1808 Parameters 

1809 ---------- 

1810 ref : `DatasetRef` 

1811 Reference to the required Dataset. 

1812 predict : `bool` 

1813 If `True`, allow URIs to be returned of datasets that have not 

1814 been written. 

1815 

1816 Returns 

1817 ------- 

1818 uri : `str` 

1819 URI pointing to the dataset within the datastore. If the 

1820 dataset does not exist in the datastore, and if ``predict`` is 

1821 `True`, the URI will be a prediction and will include a URI 

1822 fragment "#predicted". 

1823 If the datastore does not have entities that relate well 

1824 to the concept of a URI the returned URI will be 

1825 descriptive. The returned URI is not guaranteed to be obtainable. 

1826 

1827 Raises 

1828 ------ 

1829 FileNotFoundError 

1830 Raised if a URI has been requested for a dataset that does not 

1831 exist and guessing is not allowed. 

1832 RuntimeError 

1833 Raised if a request is made for a single URI but multiple URIs 

1834 are associated with this dataset. 

1835 

1836 Notes 

1837 ----- 

1838 When a predicted URI is requested an attempt will be made to form 

1839 a reasonable URI based on file templates and the expected formatter. 

1840 """ 

1841 primary, components = self.getURIs(ref, predict) 

1842 if primary is None or components: 

1843 raise RuntimeError( 

1844 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1845 ) 

1846 return primary 

1847 

1848 def _predict_URIs( 

1849 self, 

1850 ref: DatasetRef, 

1851 ) -> DatasetRefURIs: 

1852 """Predict the URIs of a dataset ref. 

1853 

1854 Parameters 

1855 ---------- 

1856 ref : `DatasetRef` 

1857 Reference to the required Dataset. 

1858 

1859 Returns 

1860 ------- 

1861 URI : DatasetRefUris 

1862 Primary and component URIs. URIs will contain a URI fragment 

1863 "#predicted". 

1864 """ 

1865 uris = DatasetRefURIs() 

1866 

1867 if self.composites.shouldBeDisassembled(ref): 

1868 for component, _ in ref.datasetType.storageClass.components.items(): 

1869 comp_ref = ref.makeComponentRef(component) 

1870 comp_location, _ = self._determine_put_formatter_location(comp_ref) 

1871 

1872 # Add the "#predicted" URI fragment to indicate this is a 

1873 # guess 

1874 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted") 

1875 

1876 else: 

1877 location, _ = self._determine_put_formatter_location(ref) 

1878 

1879 # Add the "#predicted" URI fragment to indicate this is a guess 

1880 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted") 

1881 

1882 return uris 

1883 

1884 def getManyURIs( 

1885 self, 

1886 refs: Iterable[DatasetRef], 

1887 predict: bool = False, 

1888 allow_missing: bool = False, 

1889 ) -> dict[DatasetRef, DatasetRefURIs]: 

1890 # Docstring inherited 

1891 

1892 uris: dict[DatasetRef, DatasetRefURIs] = {} 

1893 

1894 records = self._get_stored_records_associated_with_refs(refs) 

1895 records_keys = records.keys() 

1896 

1897 existing_refs = tuple(ref for ref in refs if ref.id in records_keys) 

1898 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys) 

1899 

1900 # Have to handle trustGetRequest mode by checking for the existence 

1901 # of the missing refs on disk. 

1902 if missing_refs: 

1903 dataset_existence = self._mexists_check_expected(missing_refs, None) 

1904 really_missing = set() 

1905 not_missing = set() 

1906 for ref, exists in dataset_existence.items(): 

1907 if exists: 

1908 not_missing.add(ref) 

1909 else: 

1910 really_missing.add(ref) 

1911 

1912 if not_missing: 

1913 # Need to recalculate the missing/existing split. 

1914 existing_refs = existing_refs + tuple(not_missing) 

1915 missing_refs = tuple(really_missing) 

1916 

1917 for ref in missing_refs: 

1918 # if this has never been written then we have to guess 

1919 if not predict: 

1920 if not allow_missing: 

1921 raise FileNotFoundError(f"Dataset {ref} not in this datastore.") 

1922 else: 

1923 uris[ref] = self._predict_URIs(ref) 

1924 

1925 for ref in existing_refs: 

1926 file_infos = records[ref.id] 

1927 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos] 

1928 uris[ref] = self._locations_to_URI(ref, file_locations) 

1929 

1930 return uris 

1931 

1932 def _locations_to_URI( 

1933 self, 

1934 ref: DatasetRef, 

1935 file_locations: Sequence[tuple[Location, StoredFileInfo]], 

1936 ) -> DatasetRefURIs: 

1937 """Convert one or more file locations associated with a DatasetRef 

1938 to a DatasetRefURIs. 

1939 

1940 Parameters 

1941 ---------- 

1942 ref : `DatasetRef` 

1943 Reference to the dataset. 

1944 file_locations : Sequence[Tuple[Location, StoredFileInfo]] 

1945 Each item in the sequence is the location of the dataset within the 

1946 datastore and stored information about the file and its formatter. 

1947 If there is only one item in the sequence then it is treated as the 

1948 primary URI. If there is more than one item then they are treated 

1949 as component URIs. If there are no items then an error is raised 

1950 unless ``self.trustGetRequest`` is `True`. 

1951 

1952 Returns 

1953 ------- 

1954 uris: DatasetRefURIs 

1955 Represents the primary URI or component URIs described by the 

1956 inputs. 

1957 

1958 Raises 

1959 ------ 

1960 RuntimeError 

1961 If no file locations are passed in and ``self.trustGetRequest`` is 

1962 `False`. 

1963 FileNotFoundError 

1964 If the a passed-in URI does not exist, and ``self.trustGetRequest`` 

1965 is `False`. 

1966 RuntimeError 

1967 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is 

1968 unexpected). 

1969 """ 

1970 guessing = False 

1971 uris = DatasetRefURIs() 

1972 

1973 if not file_locations: 

1974 if not self.trustGetRequest: 

1975 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1976 file_locations = self._get_expected_dataset_locations_info(ref) 

1977 guessing = True 

1978 

1979 if len(file_locations) == 1: 

1980 # No disassembly so this is the primary URI 

1981 uris.primaryURI = file_locations[0][0].uri 

1982 if guessing and not uris.primaryURI.exists(): 

1983 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist") 

1984 else: 

1985 for location, file_info in file_locations: 

1986 if file_info.component is None: 

1987 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1988 if guessing and not location.uri.exists(): 

1989 # If we are trusting then it is entirely possible for 

1990 # some components to be missing. In that case we skip 

1991 # to the next component. 

1992 if self.trustGetRequest: 

1993 continue 

1994 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist") 

1995 uris.componentURIs[file_info.component] = location.uri 

1996 

1997 return uris 

1998 

1999 def retrieveArtifacts( 

2000 self, 

2001 refs: Iterable[DatasetRef], 

2002 destination: ResourcePath, 

2003 transfer: str = "auto", 

2004 preserve_path: bool = True, 

2005 overwrite: bool = False, 

2006 ) -> list[ResourcePath]: 

2007 """Retrieve the file artifacts associated with the supplied refs. 

2008 

2009 Parameters 

2010 ---------- 

2011 refs : iterable of `DatasetRef` 

2012 The datasets for which file artifacts are to be retrieved. 

2013 A single ref can result in multiple files. The refs must 

2014 be resolved. 

2015 destination : `lsst.resources.ResourcePath` 

2016 Location to write the file artifacts. 

2017 transfer : `str`, optional 

2018 Method to use to transfer the artifacts. Must be one of the options 

2019 supported by `lsst.resources.ResourcePath.transfer_from()`. 

2020 "move" is not allowed. 

2021 preserve_path : `bool`, optional 

2022 If `True` the full path of the file artifact within the datastore 

2023 is preserved. If `False` the final file component of the path 

2024 is used. 

2025 overwrite : `bool`, optional 

2026 If `True` allow transfers to overwrite existing files at the 

2027 destination. 

2028 

2029 Returns 

2030 ------- 

2031 targets : `list` of `lsst.resources.ResourcePath` 

2032 URIs of file artifacts in destination location. Order is not 

2033 preserved. 

2034 """ 

2035 if not destination.isdir(): 

2036 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

2037 

2038 if transfer == "move": 

2039 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

2040 

2041 # Source -> Destination 

2042 # This also helps filter out duplicate DatasetRef in the request 

2043 # that will map to the same underlying file transfer. 

2044 to_transfer: dict[ResourcePath, ResourcePath] = {} 

2045 

2046 for ref in refs: 

2047 locations = self._get_dataset_locations_info(ref) 

2048 for location, _ in locations: 

2049 source_uri = location.uri 

2050 target_path: ResourcePathExpression 

2051 if preserve_path: 

2052 target_path = location.pathInStore 

2053 if target_path.isabs(): 

2054 # This is an absolute path to an external file. 

2055 # Use the full path. 

2056 target_path = target_path.relativeToPathRoot 

2057 else: 

2058 target_path = source_uri.basename() 

2059 target_uri = destination.join(target_path) 

2060 to_transfer[source_uri] = target_uri 

2061 

2062 # In theory can now parallelize the transfer 

2063 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

2064 for source_uri, target_uri in to_transfer.items(): 

2065 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

2066 

2067 return list(to_transfer.values()) 

2068 

2069 def get( 

2070 self, 

2071 ref: DatasetRef, 

2072 parameters: Mapping[str, Any] | None = None, 

2073 storageClass: StorageClass | str | None = None, 

2074 ) -> Any: 

2075 """Load an InMemoryDataset from the store. 

2076 

2077 Parameters 

2078 ---------- 

2079 ref : `DatasetRef` 

2080 Reference to the required Dataset. 

2081 parameters : `dict` 

2082 `StorageClass`-specific parameters that specify, for example, 

2083 a slice of the dataset to be loaded. 

2084 storageClass : `StorageClass` or `str`, optional 

2085 The storage class to be used to override the Python type 

2086 returned by this method. By default the returned type matches 

2087 the dataset type definition for this dataset. Specifying a 

2088 read `StorageClass` can force a different type to be returned. 

2089 This type must be compatible with the original type. 

2090 

2091 Returns 

2092 ------- 

2093 inMemoryDataset : `object` 

2094 Requested dataset or slice thereof as an InMemoryDataset. 

2095 

2096 Raises 

2097 ------ 

2098 FileNotFoundError 

2099 Requested dataset can not be retrieved. 

2100 TypeError 

2101 Return value from formatter has unexpected type. 

2102 ValueError 

2103 Formatter failed to process the dataset. 

2104 """ 

2105 # Supplied storage class for the component being read is either 

2106 # from the ref itself or some an override if we want to force 

2107 # type conversion. 

2108 if storageClass is not None: 

2109 ref = ref.overrideStorageClass(storageClass) 

2110 refStorageClass = ref.datasetType.storageClass 

2111 

2112 allGetInfo = self._prepare_for_get(ref, parameters) 

2113 refComponent = ref.datasetType.component() 

2114 

2115 # Create mapping from component name to related info 

2116 allComponents = {i.component: i for i in allGetInfo} 

2117 

2118 # By definition the dataset is disassembled if we have more 

2119 # than one record for it. 

2120 isDisassembled = len(allGetInfo) > 1 

2121 

2122 # Look for the special case where we are disassembled but the 

2123 # component is a derived component that was not written during 

2124 # disassembly. For this scenario we need to check that the 

2125 # component requested is listed as a derived component for the 

2126 # composite storage class 

2127 isDisassembledReadOnlyComponent = False 

2128 if isDisassembled and refComponent: 

2129 # The composite storage class should be accessible through 

2130 # the component dataset type 

2131 compositeStorageClass = ref.datasetType.parentStorageClass 

2132 

2133 # In the unlikely scenario where the composite storage 

2134 # class is not known, we can only assume that this is a 

2135 # normal component. If that assumption is wrong then the 

2136 # branch below that reads a persisted component will fail 

2137 # so there is no need to complain here. 

2138 if compositeStorageClass is not None: 

2139 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

2140 

2141 if isDisassembled and not refComponent: 

2142 # This was a disassembled dataset spread over multiple files 

2143 # and we need to put them all back together again. 

2144 # Read into memory and then assemble 

2145 

2146 # Check that the supplied parameters are suitable for the type read 

2147 refStorageClass.validateParameters(parameters) 

2148 

2149 # We want to keep track of all the parameters that were not used 

2150 # by formatters. We assume that if any of the component formatters 

2151 # use a parameter that we do not need to apply it again in the 

2152 # assembler. 

2153 usedParams = set() 

2154 

2155 components: dict[str, Any] = {} 

2156 for getInfo in allGetInfo: 

2157 # assemblerParams are parameters not understood by the 

2158 # associated formatter. 

2159 usedParams.update(set(getInfo.formatterParams)) 

2160 

2161 component = getInfo.component 

2162 

2163 if component is None: 

2164 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

2165 

2166 # We do not want the formatter to think it's reading 

2167 # a component though because it is really reading a 

2168 # standalone dataset -- always tell reader it is not a 

2169 # component. 

2170 components[component] = self._read_artifact_into_memory( 

2171 getInfo, ref.makeComponentRef(component), isComponent=False 

2172 ) 

2173 

2174 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

2175 

2176 # Any unused parameters will have to be passed to the assembler 

2177 if parameters: 

2178 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

2179 else: 

2180 unusedParams = {} 

2181 

2182 # Process parameters 

2183 return ref.datasetType.storageClass.delegate().handleParameters( 

2184 inMemoryDataset, parameters=unusedParams 

2185 ) 

2186 

2187 elif isDisassembledReadOnlyComponent: 

2188 compositeStorageClass = ref.datasetType.parentStorageClass 

2189 if compositeStorageClass is None: 

2190 raise RuntimeError( 

2191 f"Unable to retrieve derived component '{refComponent}' since" 

2192 "no composite storage class is available." 

2193 ) 

2194 

2195 if refComponent is None: 

2196 # Mainly for mypy 

2197 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

2198 

2199 # Assume that every derived component can be calculated by 

2200 # forwarding the request to a single read/write component. 

2201 # Rather than guessing which rw component is the right one by 

2202 # scanning each for a derived component of the same name, 

2203 # we ask the storage class delegate directly which one is best to 

2204 # use. 

2205 compositeDelegate = compositeStorageClass.delegate() 

2206 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

2207 refComponent, set(allComponents) 

2208 ) 

2209 

2210 # Select the relevant component 

2211 rwInfo = allComponents[forwardedComponent] 

2212 

2213 # For now assume that read parameters are validated against 

2214 # the real component and not the requested component 

2215 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

2216 forwardedStorageClass.validateParameters(parameters) 

2217 

2218 # The reference to use for the caching must refer to the forwarded 

2219 # component and not the derived component. 

2220 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

2221 

2222 # Unfortunately the FileDescriptor inside the formatter will have 

2223 # the wrong write storage class so we need to create a new one 

2224 # given the immutability constraint. 

2225 writeStorageClass = rwInfo.info.storageClass 

2226 

2227 # We may need to put some thought into parameters for read 

2228 # components but for now forward them on as is 

2229 readFormatter = type(rwInfo.formatter)( 

2230 FileDescriptor( 

2231 rwInfo.location, 

2232 readStorageClass=refStorageClass, 

2233 storageClass=writeStorageClass, 

2234 parameters=parameters, 

2235 ), 

2236 ref.dataId, 

2237 ) 

2238 

2239 # The assembler can not receive any parameter requests for a 

2240 # derived component at this time since the assembler will 

2241 # see the storage class of the derived component and those 

2242 # parameters will have to be handled by the formatter on the 

2243 # forwarded storage class. 

2244 assemblerParams: dict[str, Any] = {} 

2245 

2246 # Need to created a new info that specifies the derived 

2247 # component and associated storage class 

2248 readInfo = DatastoreFileGetInformation( 

2249 rwInfo.location, 

2250 readFormatter, 

2251 rwInfo.info, 

2252 assemblerParams, 

2253 {}, 

2254 refComponent, 

2255 refStorageClass, 

2256 ) 

2257 

2258 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

2259 

2260 else: 

2261 # Single file request or component from that composite file 

2262 for lookup in (refComponent, None): 

2263 if lookup in allComponents: 

2264 getInfo = allComponents[lookup] 

2265 break 

2266 else: 

2267 raise FileNotFoundError( 

2268 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

2269 ) 

2270 

2271 # Do not need the component itself if already disassembled 

2272 if isDisassembled: 

2273 isComponent = False 

2274 else: 

2275 isComponent = getInfo.component is not None 

2276 

2277 # For a component read of a composite we want the cache to 

2278 # be looking at the composite ref itself. 

2279 cache_ref = ref.makeCompositeRef() if isComponent else ref 

2280 

2281 # For a disassembled component we can validate parametersagainst 

2282 # the component storage class directly 

2283 if isDisassembled: 

2284 refStorageClass.validateParameters(parameters) 

2285 else: 

2286 # For an assembled composite this could be a derived 

2287 # component derived from a real component. The validity 

2288 # of the parameters is not clear. For now validate against 

2289 # the composite storage class 

2290 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

2291 

2292 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

2293 

2294 @transactional 

2295 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2296 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2297 

2298 Parameters 

2299 ---------- 

2300 inMemoryDataset : `object` 

2301 The dataset to store. 

2302 ref : `DatasetRef` 

2303 Reference to the associated Dataset. 

2304 

2305 Raises 

2306 ------ 

2307 TypeError 

2308 Supplied object and storage class are inconsistent. 

2309 DatasetTypeNotSupportedError 

2310 The associated `DatasetType` is not handled by this datastore. 

2311 

2312 Notes 

2313 ----- 

2314 If the datastore is configured to reject certain dataset types it 

2315 is possible that the put will fail and raise a 

2316 `DatasetTypeNotSupportedError`. The main use case for this is to 

2317 allow `ChainedDatastore` to put to multiple datastores without 

2318 requiring that every datastore accepts the dataset. 

2319 """ 

2320 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2321 # doDisassembly = True 

2322 

2323 artifacts = [] 

2324 if doDisassembly: 

2325 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2326 if components is None: 

2327 raise RuntimeError( 

2328 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2329 f"with storage class {ref.datasetType.storageClass.name} " 

2330 "is configured to be disassembled, but cannot be." 

2331 ) 

2332 for component, componentInfo in components.items(): 

2333 # Don't recurse because we want to take advantage of 

2334 # bulk insert -- need a new DatasetRef that refers to the 

2335 # same dataset_id but has the component DatasetType 

2336 # DatasetType does not refer to the types of components 

2337 # So we construct one ourselves. 

2338 compRef = ref.makeComponentRef(component) 

2339 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2340 artifacts.append((compRef, storedInfo)) 

2341 else: 

2342 # Write the entire thing out 

2343 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2344 artifacts.append((ref, storedInfo)) 

2345 

2346 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT) 

2347 

2348 @transactional 

2349 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

2350 # At this point can safely remove these datasets from the cache 

2351 # to avoid confusion later on. If they are not trashed later 

2352 # the cache will simply be refilled. 

2353 self.cacheManager.remove_from_cache(ref) 

2354 

2355 # If we are in trust mode there will be nothing to move to 

2356 # the trash table and we will have to try to delete the file 

2357 # immediately. 

2358 if self.trustGetRequest: 

2359 # Try to keep the logic below for a single file trash. 

2360 if isinstance(ref, DatasetRef): 

2361 refs = {ref} 

2362 else: 

2363 # Will recreate ref at the end of this branch. 

2364 refs = set(ref) 

2365 

2366 # Determine which datasets are known to datastore directly. 

2367 id_to_ref = {ref.id: ref for ref in refs} 

2368 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2369 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2370 

2371 missing = refs - existing_refs 

2372 if missing: 

2373 # Do an explicit existence check on these refs. 

2374 # We only care about the artifacts at this point and not 

2375 # the dataset existence. 

2376 artifact_existence: dict[ResourcePath, bool] = {} 

2377 _ = self.mexists(missing, artifact_existence) 

2378 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2379 

2380 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2381 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2382 for uri in uris: 

2383 try: 

2384 uri.remove() 

2385 except Exception as e: 

2386 if ignore_errors: 

2387 log.debug("Artifact %s could not be removed: %s", uri, e) 

2388 continue 

2389 raise 

2390 

2391 # There is no point asking the code below to remove refs we 

2392 # know are missing so update it with the list of existing 

2393 # records. Try to retain one vs many logic. 

2394 if not existing_refs: 

2395 # Nothing more to do since none of the datasets were 

2396 # known to the datastore record table. 

2397 return 

2398 ref = list(existing_refs) 

2399 if len(ref) == 1: 

2400 ref = ref[0] 

2401 

2402 # Get file metadata and internal metadata 

2403 if not isinstance(ref, DatasetRef): 

2404 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2405 # Assumed to be an iterable of refs so bulk mode enabled. 

2406 try: 

2407 self.bridge.moveToTrash(ref, transaction=self._transaction) 

2408 except Exception as e: 

2409 if ignore_errors: 

2410 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2411 else: 

2412 raise 

2413 return 

2414 

2415 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2416 

2417 fileLocations = self._get_dataset_locations_info(ref) 

2418 

2419 if not fileLocations: 

2420 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2421 if ignore_errors: 

2422 log.warning(err_msg) 

2423 return 

2424 else: 

2425 raise FileNotFoundError(err_msg) 

2426 

2427 for location, _ in fileLocations: 

2428 if not self._artifact_exists(location): 

2429 err_msg = ( 

2430 f"Dataset is known to datastore {self.name} but " 

2431 f"associated artifact ({location.uri}) is missing" 

2432 ) 

2433 if ignore_errors: 

2434 log.warning(err_msg) 

2435 return 

2436 else: 

2437 raise FileNotFoundError(err_msg) 

2438 

2439 # Mark dataset as trashed 

2440 try: 

2441 self.bridge.moveToTrash([ref], transaction=self._transaction) 

2442 except Exception as e: 

2443 if ignore_errors: 

2444 log.warning( 

2445 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2446 "but encountered an error: %s", 

2447 ref, 

2448 self.name, 

2449 e, 

2450 ) 

2451 pass 

2452 else: 

2453 raise 

2454 

2455 @transactional 

2456 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2457 """Remove all datasets from the trash. 

2458 

2459 Parameters 

2460 ---------- 

2461 ignore_errors : `bool` 

2462 If `True` return without error even if something went wrong. 

2463 Problems could occur if another process is simultaneously trying 

2464 to delete. 

2465 """ 

2466 log.debug("Emptying trash in datastore %s", self.name) 

2467 

2468 # Context manager will empty trash iff we finish it without raising. 

2469 # It will also automatically delete the relevant rows from the 

2470 # trash table and the records table. 

2471 with self.bridge.emptyTrash( 

2472 self._table, record_class=StoredFileInfo, record_column="path" 

2473 ) as trash_data: 

2474 # Removing the artifacts themselves requires that the files are 

2475 # not also associated with refs that are not to be trashed. 

2476 # Therefore need to do a query with the file paths themselves 

2477 # and return all the refs associated with them. Can only delete 

2478 # a file if the refs to be trashed are the only refs associated 

2479 # with the file. 

2480 # This requires multiple copies of the trashed items 

2481 trashed, artifacts_to_keep = trash_data 

2482 

2483 if artifacts_to_keep is None: 

2484 # The bridge is not helping us so have to work it out 

2485 # ourselves. This is not going to be as efficient. 

2486 trashed = list(trashed) 

2487 

2488 # The instance check is for mypy since up to this point it 

2489 # does not know the type of info. 

2490 path_map = self._refs_associated_with_artifacts( 

2491 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2492 ) 

2493 

2494 for ref, info in trashed: 

2495 # Mypy needs to know this is not the base class 

2496 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2497 

2498 path_map[info.path].remove(ref.id) 

2499 if not path_map[info.path]: 

2500 del path_map[info.path] 

2501 

2502 artifacts_to_keep = set(path_map) 

2503 

2504 for ref, info in trashed: 

2505 # Should not happen for this implementation but need 

2506 # to keep mypy happy. 

2507 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2508 

2509 # Mypy needs to know this is not the base class 

2510 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2511 

2512 if info.path in artifacts_to_keep: 

2513 # This is a multi-dataset artifact and we are not 

2514 # removing all associated refs. 

2515 continue 

2516 

2517 # Only trashed refs still known to datastore will be returned. 

2518 location = info.file_location(self.locationFactory) 

2519 

2520 # Point of no return for this artifact 

2521 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2522 try: 

2523 self._delete_artifact(location) 

2524 except FileNotFoundError: 

2525 # If the file itself has been deleted there is nothing 

2526 # we can do about it. It is possible that trash has 

2527 # been run in parallel in another process or someone 

2528 # decided to delete the file. It is unlikely to come 

2529 # back and so we should still continue with the removal 

2530 # of the entry from the trash table. It is also possible 

2531 # we removed it in a previous iteration if it was 

2532 # a multi-dataset artifact. The delete artifact method 

2533 # will log a debug message in this scenario. 

2534 # Distinguishing file missing before trash started and 

2535 # file already removed previously as part of this trash 

2536 # is not worth the distinction with regards to potential 

2537 # memory cost. 

2538 pass 

2539 except Exception as e: 

2540 if ignore_errors: 

2541 # Use a debug message here even though it's not 

2542 # a good situation. In some cases this can be 

2543 # caused by a race between user A and user B 

2544 # and neither of them has permissions for the 

2545 # other's files. Butler does not know about users 

2546 # and trash has no idea what collections these 

2547 # files were in (without guessing from a path). 

2548 log.debug( 

2549 "Encountered error removing artifact %s from datastore %s: %s", 

2550 location.uri, 

2551 self.name, 

2552 e, 

2553 ) 

2554 else: 

2555 raise 

2556 

2557 @transactional 

2558 def transfer_from( 

2559 self, 

2560 source_datastore: Datastore, 

2561 refs: Iterable[DatasetRef], 

2562 transfer: str = "auto", 

2563 artifact_existence: dict[ResourcePath, bool] | None = None, 

2564 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

2565 # Docstring inherited 

2566 if type(self) is not type(source_datastore): 

2567 raise TypeError( 

2568 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2569 f"source datastore ({type(source_datastore)})." 

2570 ) 

2571 

2572 # Be explicit for mypy 

2573 if not isinstance(source_datastore, FileDatastore): 

2574 raise TypeError( 

2575 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2576 f" {type(source_datastore)}" 

2577 ) 

2578 

2579 # Stop early if "direct" transfer mode is requested. That would 

2580 # require that the URI inside the source datastore should be stored 

2581 # directly in the target datastore, which seems unlikely to be useful 

2582 # since at any moment the source datastore could delete the file. 

2583 if transfer in ("direct", "split"): 

2584 raise ValueError( 

2585 f"Can not transfer from a source datastore using {transfer} mode since" 

2586 " those files are controlled by the other datastore." 

2587 ) 

2588 

2589 # Empty existence lookup if none given. 

2590 if artifact_existence is None: 

2591 artifact_existence = {} 

2592 

2593 # We will go through the list multiple times so must convert 

2594 # generators to lists. 

2595 refs = list(refs) 

2596 

2597 # In order to handle disassembled composites the code works 

2598 # at the records level since it can assume that internal APIs 

2599 # can be used. 

2600 # - If the record already exists in the destination this is assumed 

2601 # to be okay. 

2602 # - If there is no record but the source and destination URIs are 

2603 # identical no transfer is done but the record is added. 

2604 # - If the source record refers to an absolute URI currently assume 

2605 # that that URI should remain absolute and will be visible to the 

2606 # destination butler. May need to have a flag to indicate whether 

2607 # the dataset should be transferred. This will only happen if 

2608 # the detached Butler has had a local ingest. 

2609 

2610 # What we really want is all the records in the source datastore 

2611 # associated with these refs. Or derived ones if they don't exist 

2612 # in the source. 

2613 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2614 

2615 # The source dataset_ids are the keys in these records 

2616 source_ids = set(source_records) 

2617 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2618 

2619 requested_ids = {ref.id for ref in refs} 

2620 missing_ids = requested_ids - source_ids 

2621 

2622 # Missing IDs can be okay if that datastore has allowed 

2623 # gets based on file existence. Should we transfer what we can 

2624 # or complain about it and warn? 

2625 if missing_ids and not source_datastore.trustGetRequest: 

2626 raise ValueError( 

2627 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2628 ) 

2629 

2630 # Need to map these missing IDs to a DatasetRef so we can guess 

2631 # the details. 

2632 if missing_ids: 

2633 log.info( 

2634 "Number of expected datasets missing from source datastore records: %d out of %d", 

2635 len(missing_ids), 

2636 len(requested_ids), 

2637 ) 

2638 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2639 

2640 # This should be chunked in case we end up having to check 

2641 # the file store since we need some log output to show 

2642 # progress. 

2643 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2644 records = {} 

2645 for missing in missing_ids_chunk: 

2646 # Ask the source datastore where the missing artifacts 

2647 # should be. An execution butler might not know about the 

2648 # artifacts even if they are there. 

2649 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2650 records[missing] = [info for _, info in expected] 

2651 

2652 # Call the mexist helper method in case we have not already 

2653 # checked these artifacts such that artifact_existence is 

2654 # empty. This allows us to benefit from parallelism. 

2655 # datastore.mexists() itself does not give us access to the 

2656 # derived datastore record. 

2657 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2658 ref_exists = source_datastore._process_mexists_records( 

2659 id_to_ref, records, False, artifact_existence=artifact_existence 

2660 ) 

2661 

2662 # Now go through the records and propagate the ones that exist. 

2663 location_factory = source_datastore.locationFactory 

2664 for missing, record_list in records.items(): 

2665 # Skip completely if the ref does not exist. 

2666 ref = id_to_ref[missing] 

2667 if not ref_exists[ref]: 

2668 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2669 continue 

2670 # Check for file artifact to decide which parts of a 

2671 # disassembled composite do exist. If there is only a 

2672 # single record we don't even need to look because it can't 

2673 # be a composite and must exist. 

2674 if len(record_list) == 1: 

2675 dataset_records = record_list 

2676 else: 

2677 dataset_records = [ 

2678 record 

2679 for record in record_list 

2680 if artifact_existence[record.file_location(location_factory).uri] 

2681 ] 

2682 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2683 

2684 # Rely on source_records being a defaultdict. 

2685 source_records[missing].extend(dataset_records) 

2686 

2687 # See if we already have these records 

2688 target_records = self._get_stored_records_associated_with_refs(refs) 

2689 

2690 # The artifacts to register 

2691 artifacts = [] 

2692 

2693 # Refs that already exist 

2694 already_present = [] 

2695 

2696 # Refs that were rejected by this datastore. 

2697 rejected = set() 

2698 

2699 # Refs that were transferred successfully. 

2700 accepted = set() 

2701 

2702 # Record each time we have done a "direct" transfer. 

2703 direct_transfers = [] 

2704 

2705 # Now can transfer the artifacts 

2706 for ref in refs: 

2707 if not self.constraints.isAcceptable(ref): 

2708 # This datastore should not be accepting this dataset. 

2709 rejected.add(ref) 

2710 continue 

2711 

2712 accepted.add(ref) 

2713 

2714 if ref.id in target_records: 

2715 # Already have an artifact for this. 

2716 already_present.append(ref) 

2717 continue 

2718 

2719 # mypy needs to know these are always resolved refs 

2720 for info in source_records[ref.id]: 

2721 source_location = info.file_location(source_datastore.locationFactory) 

2722 target_location = info.file_location(self.locationFactory) 

2723 if source_location == target_location and not source_location.pathInStore.isabs(): 

2724 # Artifact is already in the target location. 

2725 # (which is how execution butler currently runs) 

2726 pass 

2727 else: 

2728 if target_location.pathInStore.isabs(): 

2729 # Just because we can see the artifact when running 

2730 # the transfer doesn't mean it will be generally 

2731 # accessible to a user of this butler. Need to decide 

2732 # what to do about an absolute path. 

2733 if transfer == "auto": 

2734 # For "auto" transfers we allow the absolute URI 

2735 # to be recorded in the target datastore. 

2736 direct_transfers.append(source_location) 

2737 else: 

2738 # The user is explicitly requesting a transfer 

2739 # even for an absolute URI. This requires us to 

2740 # calculate the target path. 

2741 template_ref = ref 

2742 if info.component: 

2743 template_ref = ref.makeComponentRef(info.component) 

2744 target_location = self._calculate_ingested_datastore_name( 

2745 source_location.uri, 

2746 template_ref, 

2747 ) 

2748 

2749 info = info.update(path=target_location.pathInStore.path) 

2750 

2751 # Need to transfer it to the new location. 

2752 # Assume we should always overwrite. If the artifact 

2753 # is there this might indicate that a previous transfer 

2754 # was interrupted but was not able to be rolled back 

2755 # completely (eg pre-emption) so follow Datastore default 

2756 # and overwrite. 

2757 target_location.uri.transfer_from( 

2758 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2759 ) 

2760 

2761 artifacts.append((ref, info)) 

2762 

2763 if direct_transfers: 

2764 log.info( 

2765 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s", 

2766 len(direct_transfers), 

2767 "" if len(direct_transfers) == 1 else "s", 

2768 ) 

2769 

2770 # We are overwriting previous datasets that may have already 

2771 # existed. We therefore should ensure that we force the 

2772 # datastore records to agree. Note that this can potentially lead 

2773 # to difficulties if the dataset has previously been ingested 

2774 # disassembled and is somehow now assembled, or vice versa. 

2775 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE) 

2776 

2777 if already_present: 

2778 n_skipped = len(already_present) 

2779 log.info( 

2780 "Skipped transfer of %d dataset%s already present in datastore", 

2781 n_skipped, 

2782 "" if n_skipped == 1 else "s", 

2783 ) 

2784 

2785 return accepted, rejected 

2786 

2787 @transactional 

2788 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2789 # Docstring inherited. 

2790 refs = list(refs) 

2791 self.bridge.forget(refs) 

2792 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs]) 

2793 

2794 def validateConfiguration( 

2795 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

2796 ) -> None: 

2797 """Validate some of the configuration for this datastore. 

2798 

2799 Parameters 

2800 ---------- 

2801 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2802 Entities to test against this configuration. Can be differing 

2803 types. 

2804 logFailures : `bool`, optional 

2805 If `True`, output a log message for every validation error 

2806 detected. 

2807 

2808 Raises 

2809 ------ 

2810 DatastoreValidationError 

2811 Raised if there is a validation problem with a configuration. 

2812 All the problems are reported in a single exception. 

2813 

2814 Notes 

2815 ----- 

2816 This method checks that all the supplied entities have valid file 

2817 templates and also have formatters defined. 

2818 """ 

2819 templateFailed = None 

2820 try: 

2821 self.templates.validateTemplates(entities, logFailures=logFailures) 

2822 except FileTemplateValidationError as e: 

2823 templateFailed = str(e) 

2824 

2825 formatterFailed = [] 

2826 for entity in entities: 

2827 try: 

2828 self.formatterFactory.getFormatterClass(entity) 

2829 except KeyError as e: 

2830 formatterFailed.append(str(e)) 

2831 if logFailures: 

2832 log.critical("Formatter failure: %s", e) 

2833 

2834 if templateFailed or formatterFailed: 

2835 messages = [] 

2836 if templateFailed: 

2837 messages.append(templateFailed) 

2838 if formatterFailed: 

2839 messages.append(",".join(formatterFailed)) 

2840 msg = ";\n".join(messages) 

2841 raise DatastoreValidationError(msg) 

2842 

2843 def getLookupKeys(self) -> set[LookupKey]: 

2844 # Docstring is inherited from base class 

2845 return ( 

2846 self.templates.getLookupKeys() 

2847 | self.formatterFactory.getLookupKeys() 

2848 | self.constraints.getLookupKeys() 

2849 ) 

2850 

2851 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

2852 # Docstring is inherited from base class 

2853 # The key can be valid in either formatters or templates so we can 

2854 # only check the template if it exists 

2855 if lookupKey in self.templates: 

2856 try: 

2857 self.templates[lookupKey].validateTemplate(entity) 

2858 except FileTemplateValidationError as e: 

2859 raise DatastoreValidationError(e) from e 

2860 

2861 def export( 

2862 self, 

2863 refs: Iterable[DatasetRef], 

2864 *, 

2865 directory: ResourcePathExpression | None = None, 

2866 transfer: str | None = "auto", 

2867 ) -> Iterable[FileDataset]: 

2868 # Docstring inherited from Datastore.export. 

2869 if transfer == "auto" and directory is None: 

2870 transfer = None 

2871 

2872 if transfer is not None and directory is None: 

2873 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2874 

2875 if transfer == "move": 

2876 raise TypeError("Can not export by moving files out of datastore.") 

2877 elif transfer == "direct": 

2878 # For an export, treat this as equivalent to None. We do not 

2879 # want an import to risk using absolute URIs to datasets owned 

2880 # by another datastore. 

2881 log.info("Treating 'direct' transfer mode as in-place export.") 

2882 transfer = None 

2883 

2884 # Force the directory to be a URI object 

2885 directoryUri: ResourcePath | None = None 

2886 if directory is not None: 

2887 directoryUri = ResourcePath(directory, forceDirectory=True) 

2888 

2889 if transfer is not None and directoryUri is not None and not directoryUri.exists(): 

2890 # mypy needs the second test 

2891 raise FileNotFoundError(f"Export location {directory} does not exist") 

2892 

2893 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2894 for ref in progress.wrap(refs, "Exporting dataset files"): 

2895 fileLocations = self._get_dataset_locations_info(ref) 

2896 if not fileLocations: 

2897 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2898 # For now we can not export disassembled datasets 

2899 if len(fileLocations) > 1: 

2900 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2901 location, storedFileInfo = fileLocations[0] 

2902 

2903 pathInStore = location.pathInStore.path 

2904 if transfer is None: 

2905 # TODO: do we also need to return the readStorageClass somehow? 

2906 # We will use the path in store directly. If this is an 

2907 # absolute URI, preserve it. 

2908 if location.pathInStore.isabs(): 

2909 pathInStore = str(location.uri) 

2910 elif transfer == "direct": 

2911 # Use full URIs to the remote store in the export 

2912 pathInStore = str(location.uri) 

2913 else: 

2914 # mypy needs help 

2915 assert directoryUri is not None, "directoryUri must be defined to get here" 

2916 storeUri = ResourcePath(location.uri) 

2917 

2918 # if the datastore has an absolute URI to a resource, we 

2919 # have two options: 

2920 # 1. Keep the absolute URI in the exported YAML 

2921 # 2. Allocate a new name in the local datastore and transfer 

2922 # it. 

2923 # For now go with option 2 

2924 if location.pathInStore.isabs(): 

2925 template = self.templates.getTemplate(ref) 

2926 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2927 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2928 

2929 exportUri = directoryUri.join(pathInStore) 

2930 exportUri.transfer_from(storeUri, transfer=transfer) 

2931 

2932 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2933 

2934 @staticmethod 

2935 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None: 

2936 """Compute the checksum of the supplied file. 

2937 

2938 Parameters 

2939 ---------- 

2940 uri : `lsst.resources.ResourcePath` 

2941 Name of resource to calculate checksum from. 

2942 algorithm : `str`, optional 

2943 Name of algorithm to use. Must be one of the algorithms supported 

2944 by :py:class`hashlib`. 

2945 block_size : `int` 

2946 Number of bytes to read from file at one time. 

2947 

2948 Returns 

2949 ------- 

2950 hexdigest : `str` 

2951 Hex digest of the file. 

2952 

2953 Notes 

2954 ----- 

2955 Currently returns None if the URI is for a remote resource. 

2956 """ 

2957 if algorithm not in hashlib.algorithms_guaranteed: 

2958 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib") 

2959 

2960 if not uri.isLocal: 

2961 return None 

2962 

2963 hasher = hashlib.new(algorithm) 

2964 

2965 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f: 

2966 for chunk in iter(lambda: f.read(block_size), b""): 

2967 hasher.update(chunk) 

2968 

2969 return hasher.hexdigest() 

2970 

2971 def needs_expanded_data_ids( 

2972 self, 

2973 transfer: str | None, 

2974 entity: DatasetRef | DatasetType | StorageClass | None = None, 

2975 ) -> bool: 

2976 # Docstring inherited. 

2977 # This _could_ also use entity to inspect whether the filename template 

2978 # involves placeholders other than the required dimensions for its 

2979 # dataset type, but that's not necessary for correctness; it just 

2980 # enables more optimizations (perhaps only in theory). 

2981 return transfer not in ("direct", None) 

2982 

2983 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2984 # Docstring inherited from the base class. 

2985 record_data = data.get(self.name) 

2986 if not record_data: 

2987 return 

2988 

2989 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records) 

2990 

2991 # TODO: Verify that there are no unexpected table names in the dict? 

2992 unpacked_records = [] 

2993 for dataset_data in record_data.records.values(): 

2994 records = dataset_data.get(self._table.name) 

2995 if records: 

2996 for info in records: 

2997 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

2998 unpacked_records.append(info.to_record()) 

2999 if unpacked_records: 

3000 self._table.insert(*unpacked_records, transaction=self._transaction) 

3001 

3002 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

3003 # Docstring inherited from the base class. 

3004 exported_refs = list(self._bridge.check(refs)) 

3005 ids = {ref.id for ref in exported_refs} 

3006 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids} 

3007 for row in self._table.fetch(dataset_id=ids): 

3008 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

3009 dataset_records = records.setdefault(info.dataset_id, {}) 

3010 dataset_records.setdefault(self._table.name, []).append(info) 

3011 

3012 record_data = DatastoreRecordData(records=records) 

3013 return {self.name: record_data} 

3014 

3015 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

3016 # Docstring inherited from the base class. 

3017 self._retrieve_dataset_method = method 

3018 

3019 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef: 

3020 """Update dataset reference to use the storage class from registry.""" 

3021 if self._retrieve_dataset_method is None: 

3022 # We could raise an exception here but unit tests do not define 

3023 # this method. 

3024 return ref 

3025 dataset_type = self._retrieve_dataset_method(ref.datasetType.name) 

3026 if dataset_type is not None: 

3027 ref = ref.overrideStorageClass(dataset_type.storageClass) 

3028 return ref