Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%

996 statements  

coverage.py v7.3.2, created at 2023-10-12 09:44 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Generic file-based datastore code.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("FileDatastore",) 

33 

34import contextlib 

35import hashlib 

36import logging 

37from collections import defaultdict 

38from collections.abc import Callable, Iterable, Mapping, Sequence 

39from dataclasses import dataclass 

40from typing import TYPE_CHECKING, Any, ClassVar 

41 

42from lsst.daf.butler import ( 

43 Config, 

44 DatasetId, 

45 DatasetRef, 

46 DatasetType, 

47 DatasetTypeNotSupportedError, 

48 Datastore, 

49 FileDataset, 

50 FileDescriptor, 

51 Formatter, 

52 FormatterFactory, 

53 Location, 

54 LocationFactory, 

55 Progress, 

56 StorageClass, 

57 ddl, 

58) 

59from lsst.daf.butler.datastore import DatasetRefURIs, DatastoreConfig, DatastoreValidationError 

60from lsst.daf.butler.datastore.cache_manager import ( 

61 AbstractDatastoreCacheManager, 

62 DatastoreCacheManager, 

63 DatastoreDisabledCacheManager, 

64) 

65from lsst.daf.butler.datastore.composites import CompositesMap 

66from lsst.daf.butler.datastore.file_templates import FileTemplates, FileTemplateValidationError 

67from lsst.daf.butler.datastore.record_data import DatastoreRecordData 

68from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo, StoredFileInfo 

69from lsst.daf.butler.registry.interfaces import ( 

70 DatabaseInsertMode, 

71 DatastoreRegistryBridge, 

72 FakeDatasetRef, 

73 ReadOnlyDatabaseError, 

74) 

75from lsst.daf.butler.repo_relocation import replaceRoot 

76from lsst.daf.butler.utils import transactional 

77from lsst.resources import ResourcePath, ResourcePathExpression 

78from lsst.utils.introspection import get_class_of, get_instance_of 

79from lsst.utils.iteration import chunk_iterable 

80 

81# For VERBOSE logging usage. 

82from lsst.utils.logging import VERBOSE, getLogger 

83from lsst.utils.timer import time_this 

84from sqlalchemy import BigInteger, String 

85 

86from ..datastore.generic_base import GenericBaseDatastore 

87 

88if TYPE_CHECKING: 

89 from lsst.daf.butler import LookupKey 

90 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

91 

92log = getLogger(__name__) 

93 

94 

95class _IngestPrepData(Datastore.IngestPrepData): 

96 """Helper class for FileDatastore ingest implementation. 

97 

98 Parameters 

99 ---------- 

100 datasets : `~collections.abc.Iterable` of `FileDataset` 

101 Files to be ingested by this datastore. 

102 """ 

103 

104 def __init__(self, datasets: Iterable[FileDataset]): 

105 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

106 self.datasets = datasets 

107 

108 

109@dataclass(frozen=True) 

110class DatastoreFileGetInformation: 

111 """Collection of useful parameters needed to retrieve a file from 

112 a Datastore. 

113 """ 

114 

115 location: Location 

116 """The location from which to read the dataset.""" 

117 

118 formatter: Formatter 

119 """The `Formatter` to use to deserialize the dataset.""" 

120 

121 info: StoredFileInfo 

122 """Stored information about this file and its formatter.""" 

123 

124 assemblerParams: Mapping[str, Any] 

125 """Parameters to use for post-processing the retrieved dataset.""" 

126 

127 formatterParams: Mapping[str, Any] 

128 """Parameters that were understood by the associated formatter.""" 

129 

130 component: str | None 

131 """The component to be retrieved (can be `None`).""" 

132 

133 readStorageClass: StorageClass 

134 """The `StorageClass` of the dataset being read.""" 

135 

136 

137class FileDatastore(GenericBaseDatastore): 

138 """Generic Datastore for file-based implementations. 

139 

140 Should always be sub-classed since key abstract methods are missing. 

141 

142 Parameters 

143 ---------- 

144 config : `DatastoreConfig` or `str` 

145 Configuration as either a `Config` object or URI to file. 

146 bridgeManager : `DatastoreRegistryBridgeManager` 

147 Object that manages the interface between `Registry` and datastores. 

148 butlerRoot : `str`, optional 

149 New datastore root to use to override the configuration value. 

150 

151 Raises 

152 ------ 

153 ValueError 

154 If root location does not exist and ``create`` is `False` in the 

155 configuration. 

156 """ 

157 

158 defaultConfigFile: ClassVar[str | None] = None 

159 """Path to configuration defaults. Accessed within the ``config`` resource 

160 or relative to a search path. Can be None if no defaults specified. 

161 """ 

162 

163 root: ResourcePath 

164 """Root directory URI of this `Datastore`.""" 

165 

166 locationFactory: LocationFactory 

167 """Factory for creating locations relative to the datastore root.""" 

168 

169 formatterFactory: FormatterFactory 

170 """Factory for creating instances of formatters.""" 

171 

172 templates: FileTemplates 

173 """File templates that can be used by this `Datastore`.""" 

174 

175 composites: CompositesMap 

176 """Determines whether a dataset should be disassembled on put.""" 

177 

178 defaultConfigFile = "datastores/fileDatastore.yaml" 

179 """Path to configuration defaults. Accessed within the ``config`` resource 

180 or relative to a search path. Can be None if no defaults specified. 

181 """ 

182 

183 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None 

184 """Callable that is used in trusted mode to retrieve registry definition 

185 of a named dataset type. 

186 """ 

187 

188 @classmethod 

189 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

190 """Set any filesystem-dependent config options for this Datastore to 

191 be appropriate for a new empty repository with the given root. 

192 

193 Parameters 

194 ---------- 

195 root : `str` 

196 URI to the root of the data repository. 

197 config : `Config` 

198 A `Config` to update. Only the subset understood by 

199 this component will be updated. Will not expand 

200 defaults. 

201 full : `Config` 

202 A complete config with all defaults expanded that can be 

203 converted to a `DatastoreConfig`. Read-only and will not be 

204 modified by this method. 

205 Repository-specific options that should not be obtained 

206 from defaults when Butler instances are constructed 

207 should be copied from ``full`` to ``config``. 

208 overwrite : `bool`, optional 

209 If `False`, do not modify a value in ``config`` if the value 

210 already exists. Default is always to overwrite with the provided 

211 ``root``. 

212 

213 Notes 

214 ----- 

215 If a keyword is explicitly defined in the supplied ``config`` it 

216 will not be overridden by this method if ``overwrite`` is `False`. 

217 This allows explicit values set in external configs to be retained. 

218 """ 

219 Config.updateParameters( 

220 DatastoreConfig, 

221 config, 

222 full, 

223 toUpdate={"root": root}, 

224 toCopy=("cls", ("records", "table")), 

225 overwrite=overwrite, 

226 ) 

227 

228 @classmethod 

229 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

230 return ddl.TableSpec( 

231 fields=[ 

232 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

233 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

234 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

235 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

236 # Use empty string to indicate no component 

237 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

238 # TODO: should checksum be Base64Bytes instead? 

239 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

240 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

241 ], 

242 unique=frozenset(), 

243 indexes=[ddl.IndexSpec("path")], 

244 ) 

245 

246 def __init__( 

247 self, 

248 config: DatastoreConfig | ResourcePathExpression, 

249 bridgeManager: DatastoreRegistryBridgeManager, 

250 butlerRoot: str | None = None, 

251 ): 

252 super().__init__(config, bridgeManager) 

253 if "root" not in self.config: 

254 raise ValueError("No root directory specified in configuration") 

255 

256 self._bridgeManager = bridgeManager 

257 

258 # Name ourselves either using an explicit name or a name 

259 # derived from the (unexpanded) root 

260 if "name" in self.config: 

261 self.name = self.config["name"] 

262 else: 

263 # We use the unexpanded root in the name to indicate that this 

264 # datastore can be moved without having to update registry. 

265 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

266 

267 # Support repository relocation in config 

268 # Existence of self.root is checked in subclass 

269 self.root = ResourcePath( 

270 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

271 ) 

272 

273 self.locationFactory = LocationFactory(self.root) 

274 self.formatterFactory = FormatterFactory() 

275 

276 # Now associate formatters with storage classes 

277 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

278 

279 # Read the file naming templates 

280 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

281 

282 # See if composites should be disassembled 

283 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

284 

285 tableName = self.config["records", "table"] 

286 try: 

287 # Storage of paths and formatters, keyed by dataset_id 

288 self._table = bridgeManager.opaque.register( 

289 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

290 ) 

291 # Interface to Registry. 

292 self._bridge = bridgeManager.register(self.name) 

293 except ReadOnlyDatabaseError: 

294 # If the database is read only and we just tried and failed to 

295 # create a table, it means someone is trying to create a read-only 

296 # butler client for an empty repo. That should be okay, as long 

297 # as they don't then try to get any datasets before some other client 

298 # creates the table. Chances are they're just validating 

299 # configuration. 

300 pass 

301 

302 # Determine whether checksums should be used - default to False 

303 self.useChecksum = self.config.get("checksum", False) 

304 

305 # Determine whether we can fall back to configuration if a 

306 # requested dataset is not known to registry 

307 self.trustGetRequest = self.config.get("trust_get_request", False) 

308 

309 # Create a cache manager 

310 self.cacheManager: AbstractDatastoreCacheManager 

311 if "cached" in self.config: 

312 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

313 else: 

314 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

315 

316 # Check existence and create directory structure if necessary 

317 if not self.root.exists(): 

318 if "create" not in self.config or not self.config["create"]: 

319 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

320 try: 

321 self.root.mkdir() 

322 except Exception as e: 

323 raise ValueError( 

324 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

325 ) from e 

326 

327 def __str__(self) -> str: 

328 return str(self.root) 

329 

330 @property 

331 def bridge(self) -> DatastoreRegistryBridge: 

332 return self._bridge 

333 

334 @property 

335 def roots(self) -> dict[str, ResourcePath | None]: 

336 # Docstring inherited. 

337 return {self.name: self.root} 

338 

339 def _artifact_exists(self, location: Location) -> bool: 

340 """Check that an artifact exists in this datastore at the specified 

341 location. 

342 

343 Parameters 

344 ---------- 

345 location : `Location` 

346 Expected location of the artifact associated with this datastore. 

347 

348 Returns 

349 ------- 

350 exists : `bool` 

351 `True` if the location can be found, `False` otherwise. 

352 """ 

353 log.debug("Checking if resource exists: %s", location.uri) 

354 return location.uri.exists() 

355 

356 def _delete_artifact(self, location: Location) -> None: 

357 """Delete the artifact from the datastore. 

358 

359 Parameters 

360 ---------- 

361 location : `Location` 

362 Location of the artifact associated with this datastore. 

363 """ 

364 if location.pathInStore.isabs(): 

365 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

366 

367 try: 

368 location.uri.remove() 

369 except FileNotFoundError: 

370 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

371 raise 

372 except Exception as e: 

373 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

374 raise 

375 log.debug("Successfully deleted file: %s", location.uri) 

376 

377 def addStoredItemInfo( 

378 self, 

379 refs: Iterable[DatasetRef], 

380 infos: Iterable[StoredFileInfo], 

381 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT, 

382 ) -> None: 

383 # Docstring inherited from GenericBaseDatastore 

384 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos, strict=True)] 

385 match insert_mode: 

386 case DatabaseInsertMode.INSERT: 

387 self._table.insert(*records, transaction=self._transaction) 

388 case DatabaseInsertMode.ENSURE: 

389 self._table.ensure(*records, transaction=self._transaction) 

390 case DatabaseInsertMode.REPLACE: 

391 self._table.replace(*records, transaction=self._transaction) 

392 case _: 

393 raise ValueError(f"Unknown insert mode of '{insert_mode}'") 

394 

395 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredFileInfo]: 

396 # Docstring inherited from GenericBaseDatastore 

397 

398 # Look for the dataset_id -- there might be multiple matches 

399 # if we have disassembled the dataset. 

400 records = self._table.fetch(dataset_id=ref.id) 

401 return [StoredFileInfo.from_record(record) for record in records] 

402 

403 def _get_stored_records_associated_with_refs( 

404 self, refs: Iterable[DatasetIdRef] 

405 ) -> dict[DatasetId, list[StoredFileInfo]]: 

406 """Retrieve all records associated with the provided refs. 

407 

408 Parameters 

409 ---------- 

410 refs : iterable of `DatasetIdRef` 

411 The refs for which records are to be retrieved. 

412 

413 Returns 

414 ------- 

415 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

416 The matching records indexed by the ref ID. The number of entries 

417 in the dict can be smaller than the number of requested refs. 

418 """ 

419 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

420 

421 # Uniqueness is dataset_id + component so can have multiple records 

422 # per ref. 

423 records_by_ref = defaultdict(list) 

424 for record in records: 

425 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

426 return records_by_ref 
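
# Illustrative sketch (not part of this file): grouping flat records by
# dataset_id with a defaultdict, as done above. The record dicts here are
# hypothetical stand-ins for rows returned by the opaque table; a
# disassembled composite yields one record per component, hence the list.
#
#     from collections import defaultdict
#
#     rows = [
#         {"dataset_id": "a", "component": "image"},
#         {"dataset_id": "a", "component": "mask"},
#         {"dataset_id": "b", "component": None},
#     ]
#     by_ref = defaultdict(list)
#     for row in rows:
#         by_ref[row["dataset_id"]].append(row)
#     # by_ref["a"] now has two entries, by_ref["b"] one.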

427 

428 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]: 

429 """Return paths and associated dataset refs. 

430 

431 Parameters 

432 ---------- 

433 paths : `list` of `str` or `lsst.resources.ResourcePath` 

434 All the paths to include in search. 

435 

436 Returns 

437 ------- 

438 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

439 Mapping of each path to a set of associated database IDs. 

440 """ 

441 records = self._table.fetch(path=[str(path) for path in paths]) 

442 result = defaultdict(set) 

443 for row in records: 

444 result[row["path"]].add(row["dataset_id"]) 

445 return result 

446 

447 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]: 

448 """Return all dataset refs associated with the supplied path. 

449 

450 Parameters 

451 ---------- 

452 pathInStore : `lsst.resources.ResourcePath` 

453 Path of interest in the data store. 

454 

455 Returns 

456 ------- 

457 ids : `set` [`DatasetId`] 

458 All `DatasetRef` IDs associated with this path. 

459 """ 

460 records = list(self._table.fetch(path=str(pathInStore))) 

461 ids = {r["dataset_id"] for r in records} 

462 return ids 

463 

464 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

465 # Docstring inherited from GenericBaseDatastore 

466 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

467 

468 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> list[tuple[Location, StoredFileInfo]]: 

469 r"""Find all the `Location`\ s of the requested dataset in the 

470 `Datastore` and the associated stored file information. 

471 

472 Parameters 

473 ---------- 

474 ref : `DatasetRef` 

475 Reference to the required `Dataset`. 

476 

477 Returns 

478 ------- 

479 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

480 Location of the dataset within the datastore and 

481 stored information about each file and its formatter. 

482 """ 

483 # Get the file information (this will fail if no file) 

484 records = self.getStoredItemsInfo(ref) 

485 

486 # Use the path to determine the location -- we need to take 

487 # into account absolute URIs in the datastore record 

488 return [(r.file_location(self.locationFactory), r) for r in records] 

489 

490 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

491 """Check that there is only one dataset associated with the 

492 specified artifact. 

493 

494 Parameters 

495 ---------- 

496 ref : `DatasetRef` or `FakeDatasetRef` 

497 Dataset to be removed. 

498 location : `Location` 

499 The location of the artifact to be removed. 

500 

501 Returns 

502 ------- 

503 can_remove : `bool` 

504 True if the artifact can be safely removed. 

505 """ 

506 # Can't ever delete absolute URIs. 

507 if location.pathInStore.isabs(): 

508 return False 

509 

510 # Get all entries associated with this path 

511 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

512 if not allRefs: 

513 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

514 

515 # Remove this ref from the set of all refs; if nothing is left 

516 # then we can delete the artifact 

517 remainingRefs = allRefs - {ref.id} 

518 

519 if remainingRefs: 

520 return False 

521 return True 
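
# A minimal sketch (not from this file) of the set-difference test above,
# using made-up string IDs: the artifact is only removable when no other
# dataset still references the same path.
#
#     all_refs = {"uuid-1", "uuid-2"}     # ids registered for this path
#     remaining = all_refs - {"uuid-1"}   # remove the ref being deleted
#     can_remove = not remaining          # False: "uuid-2" still needs it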

522 

523 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]: 

524 """Predict the location and related file information of the requested 

525 dataset in this datastore. 

526 

527 Parameters 

528 ---------- 

529 ref : `DatasetRef` 

530 Reference to the required `Dataset`. 

531 

532 Returns 

533 ------- 

534 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

535 Expected Location of the dataset within the datastore and 

536 placeholder information about each file and its formatter. 

537 

538 Notes 

539 ----- 

540 Uses the current configuration to determine how we would expect the 

541 datastore files to have been written if we couldn't ask registry. 

542 This is safe so long as there has been no change to datastore 

543 configuration between writing the dataset and wanting to read it. 

544 Will not work for files that have been ingested without using the 

545 standard file template or default formatter. 

546 """ 

547 # If we have a component ref we always need to ask the questions 

548 # of the composite. If the composite is disassembled this routine 

549 # should return all components. If the composite was not 

550 # disassembled the composite is what is stored regardless of 

551 # component request. Note that if the caller has disassembled 

552 # a composite there is no way for this guess to know that 

553 # without trying both the composite and component ref and seeing 

554 # if there is something at the component Location even without 

555 # disassembly being enabled. 

556 if ref.datasetType.isComponent(): 

557 ref = ref.makeCompositeRef() 

558 

559 # See if the ref is a composite that should be disassembled 

560 doDisassembly = self.composites.shouldBeDisassembled(ref) 

561 

562 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = [] 

563 

564 if doDisassembly: 

565 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

566 compRef = ref.makeComponentRef(component) 

567 location, formatter = self._determine_put_formatter_location(compRef) 

568 all_info.append((location, formatter, componentStorage, component)) 

569 

570 else: 

571 # Always use the composite ref if no disassembly 

572 location, formatter = self._determine_put_formatter_location(ref) 

573 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

574 

575 # Convert the list of tuples to have StoredFileInfo as second element 

576 return [ 

577 ( 

578 location, 

579 StoredFileInfo( 

580 formatter=formatter, 

581 path=location.pathInStore.path, 

582 storageClass=storageClass, 

583 component=component, 

584 checksum=None, 

585 file_size=-1, 

586 dataset_id=ref.id, 

587 ), 

588 ) 

589 for location, formatter, storageClass, component in all_info 

590 ] 

591 

592 def _prepare_for_get( 

593 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None 

594 ) -> list[DatastoreFileGetInformation]: 

595 """Check parameters for ``get`` and obtain formatter and 

596 location. 

597 

598 Parameters 

599 ---------- 

600 ref : `DatasetRef` 

601 Reference to the required Dataset. 

602 parameters : `dict` 

603 `StorageClass`-specific parameters that specify, for example, 

604 a slice of the dataset to be loaded. 

605 

606 Returns 

607 ------- 

608 getInfo : `list` [`DatastoreFileGetInformation`] 

609 Parameters needed to retrieve each file. 

610 """ 

611 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

612 

613 # The storage class we want to use eventually 

614 refStorageClass = ref.datasetType.storageClass 

615 

616 # For trusted mode need to reset storage class. 

617 ref = self._cast_storage_class(ref) 

618 

619 # Get file metadata and internal metadata 

620 fileLocations = self._get_dataset_locations_info(ref) 

621 if not fileLocations: 

622 if not self.trustGetRequest: 

623 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

624 # Assume the dataset is where we think it should be 

625 fileLocations = self._get_expected_dataset_locations_info(ref) 

626 

627 if len(fileLocations) > 1: 

628 disassembled = True 

629 

630 # If trust is involved it is possible that there will be 

631 # components listed here that do not exist in the datastore. 

632 # Explicitly check for file artifact existence and filter out any 

633 # that are missing. 

634 if self.trustGetRequest: 

635 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

636 

637 # For now complain only if we have no components at all. One 

638 # component is probably a problem but we can punt that to the 

639 # assembler. 

640 if not fileLocations: 

641 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

642 

643 else: 

644 disassembled = False 

645 

646 # Is this a component request? 

647 refComponent = ref.datasetType.component() 

648 

649 fileGetInfo = [] 

650 for location, storedFileInfo in fileLocations: 

651 # The storage class used to write the file 

652 writeStorageClass = storedFileInfo.storageClass 

653 

654 # If this has been disassembled we need read to match the write 

655 if disassembled: 

656 readStorageClass = writeStorageClass 

657 else: 

658 readStorageClass = refStorageClass 

659 

660 formatter = get_instance_of( 

661 storedFileInfo.formatter, 

662 FileDescriptor( 

663 location, 

664 readStorageClass=readStorageClass, 

665 storageClass=writeStorageClass, 

666 parameters=parameters, 

667 ), 

668 ref.dataId, 

669 ) 

670 

671 formatterParams, notFormatterParams = formatter.segregateParameters() 

672 

673 # Of the remaining parameters, extract the ones supported by 

674 # this StorageClass (for components not all will be handled) 

675 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

676 

677 # The ref itself could be a component if the dataset was 

678 # disassembled by butler, or we disassembled in datastore and 

679 # components came from the datastore records 

680 component = storedFileInfo.component if storedFileInfo.component else refComponent 

681 

682 fileGetInfo.append( 

683 DatastoreFileGetInformation( 

684 location, 

685 formatter, 

686 storedFileInfo, 

687 assemblerParams, 

688 formatterParams, 

689 component, 

690 readStorageClass, 

691 ) 

692 ) 

693 

694 return fileGetInfo 

695 

696 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]: 

697 """Check the arguments for ``put`` and obtain formatter and 

698 location. 

699 

700 Parameters 

701 ---------- 

702 inMemoryDataset : `object` 

703 The dataset to store. 

704 ref : `DatasetRef` 

705 Reference to the associated Dataset. 

706 

707 Returns 

708 ------- 

709 location : `Location` 

710 The location to write the dataset. 

711 formatter : `Formatter` 

712 The `Formatter` to use to write the dataset. 

713 

714 Raises 

715 ------ 

716 TypeError 

717 Supplied object and storage class are inconsistent. 

718 DatasetTypeNotSupportedError 

719 The associated `DatasetType` is not handled by this datastore. 

720 """ 

721 self._validate_put_parameters(inMemoryDataset, ref) 

722 return self._determine_put_formatter_location(ref) 

723 

724 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]: 

725 """Calculate the formatter and output location to use for put. 

726 

727 Parameters 

728 ---------- 

729 ref : `DatasetRef` 

730 Reference to the associated Dataset. 

731 

732 Returns 

733 ------- 

734 location : `Location` 

735 The location to write the dataset. 

736 formatter : `Formatter` 

737 The `Formatter` to use to write the dataset. 

738 """ 

739 # Work out output file name 

740 try: 

741 template = self.templates.getTemplate(ref) 

742 except KeyError as e: 

743 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

744 

745 # Validate the template to protect against filenames from different 

746 # dataIds returning the same and causing overwrite confusion. 

747 template.validateTemplate(ref) 

748 

749 location = self.locationFactory.fromPath(template.format(ref)) 

750 

751 # Get the formatter based on the storage class 

752 storageClass = ref.datasetType.storageClass 

753 try: 

754 formatter = self.formatterFactory.getFormatter( 

755 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

756 ) 

757 except KeyError as e: 

758 raise DatasetTypeNotSupportedError( 

759 f"Unable to find formatter for {ref} in datastore {self.name}" 

760 ) from e 

761 

762 # Now that we know the formatter, update the location 

763 location = formatter.makeUpdatedLocation(location) 

764 

765 return location, formatter 

766 

767 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None: 

768 # Docstring inherited from base class 

769 if transfer != "auto": 

770 return transfer 

771 

772 # See if the paths are within the datastore or not 

773 inside = [self._pathInStore(d.path) is not None for d in datasets] 

774 

775 if all(inside): 

776 transfer = None 

777 elif not any(inside): 

778 # Allow ResourcePath to use its own knowledge 

779 transfer = "auto" 

780 else: 

781 # This can happen when importing from a datastore that 

782 # has had some datasets ingested using "direct" mode, i.e. the 

783 # source datastore contains some direct-transfer datasets. 

784 # Also allow ResourcePath to sort it out but warn about it, 

785 # since those external files will not be copied into this datastore. 

786 log.warning( 

787 "Some datasets are inside the datastore and some are outside. Using 'split' " 

788 "transfer mode. This assumes that the files outside the datastore are " 

789 "still accessible to the new butler since they will not be copied into " 

790 "the target datastore." 

791 ) 

792 transfer = "split" 

793 

794 return transfer 
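
# Sketch of the "auto" decision logic above, reduced to a plain list of
# booleans where True means the dataset path is already inside the
# datastore root (the helper name is hypothetical):
#
#     def choose_transfer(inside: list[bool]) -> str | None:
#         if all(inside):
#             return None      # everything already in place
#         if not any(inside):
#             return "auto"    # let ResourcePath pick a sensible mode
#         return "split"       # mixture of internal and external files
#
#     choose_transfer([True, True])    # -> None
#     choose_transfer([False, False])  # -> "auto"
#     choose_transfer([True, False])   # -> "split"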

795 

796 def _pathInStore(self, path: ResourcePathExpression) -> str | None: 

797 """Return path relative to datastore root. 

798 

799 Parameters 

800 ---------- 

801 path : `lsst.resources.ResourcePathExpression` 

802 Path to dataset. Can be absolute URI. If relative assumed to 

803 be relative to the datastore. Returns the path within the 

804 datastore, or `None` if the path is outside it. 

805 

806 Returns 

807 ------- 

808 inStore : `str` or `None` 

809 Path relative to datastore root. Returns `None` if the file is 

810 outside the root. 

811 """ 

812 # Relative path will always be relative to datastore 

813 pathUri = ResourcePath(path, forceAbsolute=False) 

814 return pathUri.relative_to(self.root) 
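
# Hedged usage sketch (not from this file): `ResourcePath.relative_to` is
# expected to return the relative path string when the path lies inside
# the given root and `None` otherwise, which is what `_pathInStore` relies
# on. The URIs shown are hypothetical.
#
#     from lsst.resources import ResourcePath
#
#     root = ResourcePath("file:///repo/", forceDirectory=True)
#     ResourcePath("file:///repo/a/b.fits").relative_to(root)     # "a/b.fits"
#     ResourcePath("file:///elsewhere/c.fits").relative_to(root)  # None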

815 

816 def _standardizeIngestPath( 

817 self, path: str | ResourcePath, *, transfer: str | None = None 

818 ) -> str | ResourcePath: 

819 """Standardize the path of a to-be-ingested file. 

820 

821 Parameters 

822 ---------- 

823 path : `str` or `lsst.resources.ResourcePath` 

824 Path of a file to be ingested. This parameter is not expected 

825 to accept all of the types that can be used to construct a 

826 `~lsst.resources.ResourcePath`. 

827 transfer : `str`, optional 

828 How (and whether) the dataset should be added to the datastore. 

829 See `ingest` for details of transfer modes. 

830 This implementation is provided only so 

831 `NotImplementedError` can be raised if the mode is not supported; 

832 actual transfers are deferred to `_extractIngestInfo`. 

833 

834 Returns 

835 ------- 

836 path : `str` or `lsst.resources.ResourcePath` 

837 New path in what the datastore considers standard form. If an 

838 absolute URI was given that will be returned unchanged. 

839 

840 Notes 

841 ----- 

842 Subclasses of `FileDatastore` can implement this method instead 

843 of `_prepIngest`. It should not modify the data repository or given 

844 file in any way. 

845 

846 Raises 

847 ------ 

848 NotImplementedError 

849 Raised if the datastore does not support the given transfer mode 

850 (including the case where ingest is not supported at all). 

851 FileNotFoundError 

852 Raised if one of the given files does not exist. 

853 """ 

854 if transfer not in (None, "direct", "split") + self.root.transferModes: 

855 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

856 

857 # A relative URI indicates relative to datastore root 

858 srcUri = ResourcePath(path, forceAbsolute=False) 

859 if not srcUri.isabs(): 

860 srcUri = self.root.join(path) 

861 

862 if not srcUri.exists(): 

863 raise FileNotFoundError( 

864 f"Resource at {srcUri} does not exist; note that paths to ingest " 

865 f"are assumed to be relative to {self.root} unless they are absolute." 

866 ) 

867 

868 if transfer is None: 

869 relpath = srcUri.relative_to(self.root) 

870 if not relpath: 

871 raise RuntimeError( 

872 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

873 ) 

874 

875 # Return the relative path within the datastore for internal 

876 # transfer 

877 path = relpath 

878 

879 return path 

880 

881 def _extractIngestInfo( 

882 self, 

883 path: ResourcePathExpression, 

884 ref: DatasetRef, 

885 *, 

886 formatter: Formatter | type[Formatter], 

887 transfer: str | None = None, 

888 record_validation_info: bool = True, 

889 ) -> StoredFileInfo: 

890 """Relocate (if necessary) and extract `StoredFileInfo` from a 

891 to-be-ingested file. 

892 

893 Parameters 

894 ---------- 

895 path : `lsst.resources.ResourcePathExpression` 

896 URI or path of a file to be ingested. 

897 ref : `DatasetRef` 

898 Reference for the dataset being ingested. Guaranteed to have 

899 ``dataset_id is not None``. 

900 formatter : `type` or `Formatter` 

901 `Formatter` subclass to use for this dataset or an instance. 

902 transfer : `str`, optional 

903 How (and whether) the dataset should be added to the datastore. 

904 See `ingest` for details of transfer modes. 

905 record_validation_info : `bool`, optional 

906 If `True`, the default, the datastore can record validation 

907 information associated with the file. If `False` the datastore 

908 will not attempt to track any information such as checksums 

909 or file sizes. This can be useful if such information is tracked 

910 in an external system or if the file is to be compressed in place. 

911 It is up to the datastore whether this parameter is relevant. 

912 

913 Returns 

914 ------- 

915 info : `StoredFileInfo` 

916 Internal datastore record for this file. This will be inserted by 

917 the caller; `_extractIngestInfo` is only responsible for 

918 creating and populating the struct. 

919 

920 Raises 

921 ------ 

922 FileNotFoundError 

923 Raised if one of the given files does not exist. 

924 FileExistsError 

925 Raised if transfer is not `None` but the (internal) location the 

926 file would be moved to is already occupied. 

927 """ 

928 if self._transaction is None: 

929 raise RuntimeError("Ingest called without transaction enabled") 

930 

931 # Create URI of the source path, do not need to force a relative 

932 # path to absolute. 

933 srcUri = ResourcePath(path, forceAbsolute=False) 

934 

935 # Track whether we have read the size of the source yet 

936 have_sized = False 

937 

938 tgtLocation: Location | None 

939 if transfer is None or transfer == "split": 

940 # A relative path is assumed to be relative to the datastore 

941 # in this context 

942 if not srcUri.isabs(): 

943 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

944 else: 

945 # Work out the path in the datastore from an absolute URI 

946 # This is required to be within the datastore. 

947 pathInStore = srcUri.relative_to(self.root) 

948 if pathInStore is None and transfer is None: 

949 raise RuntimeError( 

950 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

951 ) 

952 if pathInStore: 

953 tgtLocation = self.locationFactory.fromPath(pathInStore) 

954 elif transfer == "split": 

955 # Outside the datastore but treat that as a direct ingest 

956 # instead. 

957 tgtLocation = None 

958 else: 

959 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

960 elif transfer == "direct": 

961 # Want to store the full URI to the resource directly in 

962 # datastore. This is useful for referring to permanent archive 

963 # storage for raw data. 

964 # Trust that people know what they are doing. 

965 tgtLocation = None 

966 else: 

967 # Work out the name we want this ingested file to have 

968 # inside the datastore 

969 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

970 if not tgtLocation.uri.dirname().exists(): 

971 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

972 tgtLocation.uri.dirname().mkdir() 

973 

974 # if we are transferring from a local file to a remote location 

975 # it may be more efficient to get the size and checksum of the 

976 # local file rather than the transferred one 

977 if record_validation_info and srcUri.isLocal: 

978 size = srcUri.size() 

979 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

980 have_sized = True 

981 

982 # Transfer the resource to the destination. 

983 # Allow overwrite of an existing file. This matches the behavior 

984 # of datastore.put() in that it trusts that registry would not 

985 # be asking to overwrite unless registry thought that the 

986 # overwrite was allowed. 

987 tgtLocation.uri.transfer_from( 

988 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

989 ) 

990 

991 if tgtLocation is None: 

992 # This means we are using direct mode 

993 targetUri = srcUri 

994 targetPath = str(srcUri) 

995 else: 

996 targetUri = tgtLocation.uri 

997 targetPath = tgtLocation.pathInStore.path 

998 

999 # the file should exist in the datastore now 

1000 if record_validation_info: 

1001 if not have_sized: 

1002 size = targetUri.size() 

1003 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

1004 else: 

1005 # Not recording any file information. 

1006 size = -1 

1007 checksum = None 

1008 

1009 return StoredFileInfo( 

1010 formatter=formatter, 

1011 path=targetPath, 

1012 storageClass=ref.datasetType.storageClass, 

1013 component=ref.datasetType.component(), 

1014 file_size=size, 

1015 checksum=checksum, 

1016 dataset_id=ref.id, 

1017 ) 

1018 

1019 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

1020 # Docstring inherited from Datastore._prepIngest. 

1021 filtered = [] 

1022 for dataset in datasets: 

1023 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

1024 if not acceptable: 

1025 continue 

1026 else: 

1027 dataset.refs = acceptable 

1028 if dataset.formatter is None: 

1029 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1030 else: 

1031 assert isinstance(dataset.formatter, type | str) 

1032 formatter_class = get_class_of(dataset.formatter) 

1033 if not issubclass(formatter_class, Formatter): 

1034 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1035 dataset.formatter = formatter_class 

1036 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1037 filtered.append(dataset) 

1038 return _IngestPrepData(filtered) 

1039 

1040 @transactional 

1041 def _finishIngest( 

1042 self, 

1043 prepData: Datastore.IngestPrepData, 

1044 *, 

1045 transfer: str | None = None, 

1046 record_validation_info: bool = True, 

1047 ) -> None: 

1048 # Docstring inherited from Datastore._finishIngest. 

1049 refsAndInfos = [] 

1050 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1051 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1052 # Do ingest as if the first dataset ref is associated with the file 

1053 info = self._extractIngestInfo( 

1054 dataset.path, 

1055 dataset.refs[0], 

1056 formatter=dataset.formatter, 

1057 transfer=transfer, 

1058 record_validation_info=record_validation_info, 

1059 ) 

1060 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1061 

1062 # In direct mode we can allow repeated ingests of the same thing 

1063 # if we are sure that the external dataset is immutable. We use 

1064 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are 

1065 # separated. 

1066 refs_and_infos_replace = [] 

1067 refs_and_infos_insert = [] 

1068 if transfer == "direct": 

1069 for entry in refsAndInfos: 

1070 if entry[0].id.version == 5: 

1071 refs_and_infos_replace.append(entry) 

1072 else: 

1073 refs_and_infos_insert.append(entry) 

1074 else: 

1075 refs_and_infos_insert = refsAndInfos 

1076 

1077 if refs_and_infos_insert: 

1078 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT) 

1079 if refs_and_infos_replace: 

1080 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE) 
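
# Sketch of the UUID-version split used above for "direct" ingest, with
# made-up identifiers: deterministic v5 ids may safely be re-ingested via
# a REPLACE, while anything else falls back to a plain INSERT.
#
#     import uuid
#
#     deterministic = uuid.uuid5(uuid.NAMESPACE_URL, "file:///raw/exp1.fits")
#     random_id = uuid.uuid4()
#     deterministic.version  # 5 -> replace
#     random_id.version      # 4 -> insert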

1081 

1082 def _calculate_ingested_datastore_name( 

1083 self, 

1084 srcUri: ResourcePath, 

1085 ref: DatasetRef, 

1086 formatter: Formatter | type[Formatter] | None = None, 

1087 ) -> Location: 

1088 """Given a source URI and a DatasetRef, determine the name the 

1089 dataset will have inside datastore. 

1090 

1091 Parameters 

1092 ---------- 

1093 srcUri : `lsst.resources.ResourcePath` 

1094 URI to the source dataset file. 

1095 ref : `DatasetRef` 

1096 Ref associated with the newly-ingested dataset artifact. This 

1097 is used to determine the name within the datastore. 

1098 formatter : `Formatter` or `Formatter` class, optional 

1099 Formatter to use for validation. Can be a class or an instance. 

1100 No validation of the file extension is performed if the 

1101 ``formatter`` is `None`. This can be used if the caller knows 

1102 that the source URI and target URI will use the same formatter. 

1103 

1104 Returns 

1105 ------- 

1106 location : `Location` 

1107 Target location for the newly-ingested dataset. 

1108 """ 

1109 # Ingesting a file from outside the datastore. 

1110 # This involves a new name. 

1111 template = self.templates.getTemplate(ref) 

1112 location = self.locationFactory.fromPath(template.format(ref)) 

1113 

1114 # Get the extension 

1115 ext = srcUri.getExtension() 

1116 

1117 # Update the destination to include that extension 

1118 location.updateExtension(ext) 

1119 

1120 # Ask the formatter to validate this extension 

1121 if formatter is not None: 

1122 formatter.validateExtension(location) 

1123 

1124 return location 

1125 

1126 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1127 """Write out in memory dataset to datastore. 

1128 

1129 Parameters 

1130 ---------- 

1131 inMemoryDataset : `object` 

1132 Dataset to write to datastore. 

1133 ref : `DatasetRef` 

1134 Registry information associated with this dataset. 

1135 

1136 Returns 

1137 ------- 

1138 info : `StoredFileInfo` 

1139 Information describing the artifact written to the datastore. 

1140 """ 

1141 # May need to coerce the in memory dataset to the correct 

1142 # python type, but first we need to make sure the storage class 

1143 # reflects the one defined in the data repository. 

1144 ref = self._cast_storage_class(ref) 

1145 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1146 

1147 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1148 uri = location.uri 

1149 

1150 if not uri.dirname().exists(): 

1151 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1152 uri.dirname().mkdir() 

1153 

1154 if self._transaction is None: 

1155 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1156 

1157 def _removeFileExists(uri: ResourcePath) -> None: 

1158 """Remove a file and do not complain if it is not there. 

1159 

1160 This is important since a formatter might fail before the file 

1161 is written and we should not confuse people by writing spurious 

1162 error messages to the log. 

1163 """ 

1164 with contextlib.suppress(FileNotFoundError): 

1165 uri.remove() 

1166 

1167 # Register a callback to try to delete the uploaded data if 

1168 # something fails below 

1169 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1170 

1171 data_written = False 

1172 

1173 # For remote URIs some datasets can be serialized directly 

1174 # to bytes and sent to the remote datastore without writing a 

1175 # file. If the dataset is intended to be saved to the cache 

1176 # a file is always written and direct write to the remote 

1177 # datastore is bypassed. 

1178 if not uri.isLocal and not self.cacheManager.should_be_cached(ref): 

1179 # Remote URI that is not cached so can write directly. 

1180 try: 

1181 serializedDataset = formatter.toBytes(inMemoryDataset) 

1182 except NotImplementedError: 

1183 # Fallback to the file writing option. 

1184 pass 

1185 except Exception as e: 

1186 raise RuntimeError( 

1187 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1188 ) from e 

1189 else: 

1190 log.debug("Writing bytes directly to %s", uri) 

1191 uri.write(serializedDataset, overwrite=True) 

1192 log.debug("Successfully wrote bytes directly to %s", uri) 

1193 data_written = True 

1194 

1195 if not data_written: 

1196 # Did not write the bytes directly to object store so instead 

1197 # write to temporary file. Always write to a temporary even if 

1198 # using a local file system -- that gives us atomic writes. 

1199 # If a process is killed as the file is being written we do not 

1200 # want it to remain in the correct place but in corrupt state. 

1201 # For local files write to the output directory not temporary dir. 

1202 prefix = uri.dirname() if uri.isLocal else None 

1203 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri: 

1204 # Need to configure the formatter to write to a different 

1205 # location and that needs us to overwrite internals 

1206 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1207 with formatter._updateLocation(Location(None, temporary_uri)): 

1208 try: 

1209 formatter.write(inMemoryDataset) 

1210 except Exception as e: 

1211 raise RuntimeError( 

1212 f"Failed to serialize dataset {ref} of type" 

1213 f" {type(inMemoryDataset)} to " 

1214 f"temporary location {temporary_uri}" 

1215 ) from e 

1216 

1217 # Use move for a local file since that becomes an efficient 

1218 # os.rename. For remote resources we use copy to allow the 

1219 # file to be cached afterwards. 

1220 transfer = "move" if uri.isLocal else "copy" 

1221 

1222 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True) 

1223 

1224 if transfer == "copy": 

1225 # Cache if required 

1226 self.cacheManager.move_to_cache(temporary_uri, ref) 

1227 

1228 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1229 

1230 # URI is needed to resolve what ingest case are we dealing with 

1231 return self._extractIngestInfo(uri, ref, formatter=formatter) 
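
# Minimal sketch of the temporary-file write pattern used above, stripped
# of the formatter machinery; the payload and target URI are hypothetical.
# Writing to a temporary location and then moving it into place keeps a
# partially written file from ever sitting at the final URI.
#
#     from lsst.resources import ResourcePath
#
#     target = ResourcePath("file:///repo/datastore/example.bin")
#     with ResourcePath.temporary_uri(suffix=".bin", prefix=target.dirname()) as tmp:
#         tmp.write(b"payload")
#         target.transfer_from(tmp, transfer="move", overwrite=True)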

1232 

1233 def _read_artifact_into_memory( 

1234 self, 

1235 getInfo: DatastoreFileGetInformation, 

1236 ref: DatasetRef, 

1237 isComponent: bool = False, 

1238 cache_ref: DatasetRef | None = None, 

1239 ) -> Any: 

1240 """Read the artifact from datastore into in memory object. 

1241 

1242 Parameters 

1243 ---------- 

1244 getInfo : `DatastoreFileGetInformation` 

1245 Information about the artifact within the datastore. 

1246 ref : `DatasetRef` 

1247 The registry information associated with this artifact. 

1248 isComponent : `bool` 

1249 Flag to indicate if a component is being read from this artifact. 

1250 cache_ref : `DatasetRef`, optional 

1251 The DatasetRef to use when looking up the file in the cache. 

1252 This ref must have the same ID as the supplied ref but can 

1253 be a parent ref or component ref to indicate to the cache whether 

1254 a composite file is being requested from the cache or a component 

1255 file. Without this the cache will default to the supplied ref but 

1256 it can get confused with read-only derived components for 

1257 disassembled composites. 

1258 

1259 Returns 

1260 ------- 

1261 inMemoryDataset : `object` 

1262 The artifact as a python object. 

1263 """ 

1264 location = getInfo.location 

1265 uri = location.uri 

1266 log.debug("Accessing data from %s", uri) 

1267 

1268 if cache_ref is None: 

1269 cache_ref = ref 

1270 if cache_ref.id != ref.id: 

1271 raise ValueError( 

1272 "The supplied cache dataset ref refers to a different dataset than expected:" 

1273 f" {ref.id} != {cache_ref.id}" 

1274 ) 

1275 

1276 # Cannot recalculate checksum but can compare size as a quick check 

1277 # Do not do this if the size is negative since that indicates 

1278 # we do not know. 

1279 recorded_size = getInfo.info.file_size 

1280 resource_size = uri.size() 

1281 if recorded_size >= 0 and resource_size != recorded_size: 

1282 raise RuntimeError( 

1283 "Integrity failure in Datastore. " 

1284 f"Size of file {uri} ({resource_size}) " 

1285 f"does not match size recorded in registry of {recorded_size}" 

1286 ) 

1287 

1288 # For the general case we have choices for how to proceed. 

1289 # 1. Always use a local file (downloading the remote resource to a 

1290 # temporary file if needed). 

1291 # 2. Use a threshold size and read into memory and use bytes. 

1292 # Use both for now with an arbitrary hand off size. 

1293 # This allows small datasets to be downloaded from remote object 

1294 # stores without requiring a temporary file. 

1295 

1296 formatter = getInfo.formatter 

1297 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1298 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1299 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1300 if cached_file is not None: 

1301 desired_uri = cached_file 

1302 msg = f" (cached version of {uri})" 

1303 else: 

1304 desired_uri = uri 

1305 msg = "" 

1306 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1307 serializedDataset = desired_uri.read() 

1308 log.debug( 

1309 "Deserializing %s from %d bytes from location %s with formatter %s", 

1310 f"component {getInfo.component}" if isComponent else "", 

1311 len(serializedDataset), 

1312 uri, 

1313 formatter.name(), 

1314 ) 

1315 try: 

1316 result = formatter.fromBytes( 

1317 serializedDataset, component=getInfo.component if isComponent else None 

1318 ) 

1319 except Exception as e: 

1320 raise ValueError( 

1321 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1322 f" ({ref.datasetType.name} from {uri}): {e}" 

1323 ) from e 

1324 else: 

1325 # Read from file. 

1326 

1327 # Have to update the Location associated with the formatter 

1328 # because formatter.read does not allow an override. 

1329 # This could be improved. 

1330 location_updated = False 

1331 msg = "" 

1332 

1333 # First check in cache for local version. 

1334 # The cache will only be relevant for remote resources but 

1335 # no harm in always asking. Context manager ensures that cache 

1336 # file is not deleted during cache expiration. 

1337 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1338 if cached_file is not None: 

1339 msg = f"(via cache read of remote file {uri})" 

1340 uri = cached_file 

1341 location_updated = True 

1342 

1343 with uri.as_local() as local_uri: 

1344 can_be_cached = False 

1345 if uri != local_uri: 

1346 # URI was remote and file was downloaded 

1347 cache_msg = "" 

1348 location_updated = True 

1349 

1350 if self.cacheManager.should_be_cached(cache_ref): 

1351 # In this scenario we want to ask if the downloaded 

1352 # file should be cached but we should not cache 

1353 # it until after we've used it (to ensure it can't 

1354 # be expired whilst we are using it). 

1355 can_be_cached = True 

1356 

1357 # Say that it is "likely" to be cached because 

1358 # if the formatter read fails we will not be 

1359 # caching this file. 

1360 cache_msg = " and likely cached" 

1361 

1362 msg = f"(via download to local file{cache_msg})" 

1363 

1364 # Calculate the (possibly) new location for the formatter 

1365 # to use. 

1366 newLocation = Location(*local_uri.split()) if location_updated else None 

1367 

1368 log.debug( 

1369 "Reading%s from location %s %s with formatter %s", 

1370 f" component {getInfo.component}" if isComponent else "", 

1371 uri, 

1372 msg, 

1373 formatter.name(), 

1374 ) 

1375 try: 

1376 with ( 

1377 formatter._updateLocation(newLocation), 

1378 time_this( 

1379 log, 

1380 msg="Reading%s from location %s %s with formatter %s", 

1381 args=( 

1382 f" component {getInfo.component}" if isComponent else "", 

1383 uri, 

1384 msg, 

1385 formatter.name(), 

1386 ), 

1387 ), 

1388 ): 

1389 result = formatter.read(component=getInfo.component if isComponent else None) 

1390 except Exception as e: 

1391 raise ValueError( 

1392 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1393 f" ({ref.datasetType.name} from {uri}): {e}" 

1394 ) from e 

1395 

1396 # File was read successfully so can move to cache 

1397 if can_be_cached: 

1398 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1399 

1400 return self._post_process_get( 

1401 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent 

1402 ) 
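
# Sketch of the read-path choice above, reduced to the two inputs that
# drive it (resource size and whether the formatter can deserialize raw
# bytes); the helper is hypothetical and the threshold mirrors nbytes_max.
#
#     def use_direct_bytes(resource_size: int, can_read_bytes: bool,
#                          nbytes_max: int = 10_000_000) -> bool:
#         # Small artifacts are pulled straight into memory; anything larger,
#         # or a formatter without fromBytes support, goes via a local file
#         # (possibly through the cache).
#         return can_read_bytes and resource_size <= nbytes_max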

1403 

1404 def knows(self, ref: DatasetRef) -> bool: 

1405 """Check if the dataset is known to the datastore. 

1406 

1407 Does not check for existence of any artifact. 

1408 

1409 Parameters 

1410 ---------- 

1411 ref : `DatasetRef` 

1412 Reference to the required dataset. 

1413 

1414 Returns 

1415 ------- 

1416 exists : `bool` 

1417 `True` if the dataset is known to the datastore. 

1418 """ 

1419 fileLocations = self._get_dataset_locations_info(ref) 

1420 if fileLocations: 

1421 return True 

1422 return False 

1423 

1424 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

1425 # Docstring inherited from the base class. 

1426 

1427 # The records themselves. Could be missing some entries. 

1428 records = self._get_stored_records_associated_with_refs(refs) 

1429 

1430 return {ref: ref.id in records for ref in refs} 

1431 

1432 def _process_mexists_records( 

1433 self, 

1434 id_to_ref: dict[DatasetId, DatasetRef], 

1435 records: dict[DatasetId, list[StoredFileInfo]], 

1436 all_required: bool, 

1437 artifact_existence: dict[ResourcePath, bool] | None = None, 

1438 ) -> dict[DatasetRef, bool]: 

1439 """Check given records for existence. 

1440 

1441 Helper function for `mexists()`. 

1442 

1443 Parameters 

1444 ---------- 

1445 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1446 Mapping of the dataset ID to the dataset ref itself. 

1447 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1448 Records as generally returned by 

1449 ``_get_stored_records_associated_with_refs``. 

1450 all_required : `bool` 

1451 Flag to indicate whether all artifacts associated with a 

1452 dataset ID must exist for the dataset to be considered to exist. 

1453 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1454 Optional mapping of datastore artifact to existence. Updated by 

1455 this method with details of all artifacts tested. Can be `None` 

1456 if the caller is not interested. 

1457 

1458 Returns 

1459 ------- 

1460 existence : `dict` of [`DatasetRef`, `bool`] 

1461 Mapping from dataset to boolean indicating existence. 

1462 """ 

1463 # The URIs to be checked and a mapping of those URIs to 

1464 # the dataset ID. 

1465 uris_to_check: list[ResourcePath] = [] 

1466 location_map: dict[ResourcePath, DatasetId] = {} 

1467 

1468 location_factory = self.locationFactory 

1469 

1470 uri_existence: dict[ResourcePath, bool] = {} 

1471 for ref_id, infos in records.items(): 

1472 # Key is the dataset ID, value is a list of StoredFileInfo 

1473 uris = [info.file_location(location_factory).uri for info in infos] 

1474 location_map.update({uri: ref_id for uri in uris}) 

1475 

1476 # Check the local cache directly for a dataset corresponding 

1477 # to the remote URI. 

1478 if self.cacheManager.file_count > 0: 

1479 ref = id_to_ref[ref_id] 

1480 for uri, storedFileInfo in zip(uris, infos, strict=True): 

1481 check_ref = ref 

1482 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1483 check_ref = ref.makeComponentRef(component) 

1484 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1485 # Proxy for URI existence. 

1486 uri_existence[uri] = True 

1487 else: 

1488 uris_to_check.append(uri) 

1489 else: 

1490 # Check all of them. 

1491 uris_to_check.extend(uris) 

1492 

1493 if artifact_existence is not None: 

1494 # If a URI has already been checked remove it from the list 

1495 # and immediately add the status to the output dict. 

1496 filtered_uris_to_check = [] 

1497 for uri in uris_to_check: 

1498 if uri in artifact_existence: 

1499 uri_existence[uri] = artifact_existence[uri] 

1500 else: 

1501 filtered_uris_to_check.append(uri) 

1502 uris_to_check = filtered_uris_to_check 

1503 

1504 # Results. 

1505 dataset_existence: dict[DatasetRef, bool] = {} 

1506 

1507 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1508 for uri, exists in uri_existence.items(): 

1509 dataset_id = location_map[uri] 

1510 ref = id_to_ref[dataset_id] 

1511 

1512 # Disassembled composite needs to check all locations. 

1513 # all_required indicates whether all need to exist or not. 

1514 if ref in dataset_existence: 

1515 if all_required: 

1516 exists = dataset_existence[ref] and exists 

1517 else: 

1518 exists = dataset_existence[ref] or exists 

1519 dataset_existence[ref] = exists 

1520 

1521 if artifact_existence is not None: 

1522 artifact_existence.update(uri_existence) 

1523 

1524 return dataset_existence 

1525 

1526 def mexists( 

1527 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1528 ) -> dict[DatasetRef, bool]: 

1529 """Check the existence of multiple datasets at once. 

1530 

1531 Parameters 

1532 ---------- 

1533 refs : iterable of `DatasetRef` 

1534 The datasets to be checked. 

1535 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1536 Optional mapping of datastore artifact to existence. Updated by 

1537 this method with details of all artifacts tested. Can be `None` 

1538 if the caller is not interested. 

1539 

1540 Returns 

1541 ------- 

1542 existence : `dict` of [`DatasetRef`, `bool`] 

1543 Mapping from dataset to boolean indicating existence. 

1544 

1545 Notes 

1546 ----- 

1547 To minimize potentially costly remote existence checks, the local 

1548 cache is checked as a proxy for existence. If a cached file for this 

1549 `DatasetRef` exists, no check is made against the actual URI. This 

1550 could result in unexpected behavior if the dataset itself 

1551 has been removed from the datastore by another process whilst it is 

1552 still in the cache. 

1553 """ 

1554 chunk_size = 10_000 

1555 dataset_existence: dict[DatasetRef, bool] = {} 

1556 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1557 n_found_total = 0 

1558 n_checked = 0 

1559 n_chunks = 0 

1560 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1561 chunk_result = self._mexists(chunk, artifact_existence) 

1562 

1563 # The log message level and content depend on how many 

1564 # datasets we are processing. 

1565 n_results = len(chunk_result) 

1566 

1567 # Use verbose logging to ensure that messages can be seen 

1568 # easily if many refs are being checked. 

1569 log_threshold = VERBOSE 

1570 n_checked += n_results 

1571 

1572 # This sum can take some time so only do it if we know the 

1573 # result is going to be used. 

1574 n_found = 0 

1575 if log.isEnabledFor(log_threshold): 

1576 # Can treat the booleans as 0, 1 integers and sum them. 

1577 n_found = sum(chunk_result.values()) 

1578 n_found_total += n_found 

1579 

1580 # We are deliberately not trying to count the number of refs 

1581 # provided in case it's in the millions. This means there is a 

1582 # situation where the number of refs exactly matches the chunk 

1583 # size and we will switch to the multi-chunk path even though 

1584 # we only have a single chunk. 

1585 if n_results < chunk_size and n_chunks == 0: 

1586 # Single chunk will be processed so we can provide more detail. 

1587 if n_results == 1: 

1588 ref = list(chunk_result)[0] 

1589 # Use debug logging to be consistent with `exists()`. 

1590 log.debug( 

1591 "Calling mexists() with single ref that does%s exist (%s).", 

1592 "" if chunk_result[ref] else " not", 

1593 ref, 

1594 ) 

1595 else: 

1596 # Single chunk but multiple files. Summarize. 

1597 log.log( 

1598 log_threshold, 

1599 "Number of datasets found in datastore: %d out of %d datasets checked.", 

1600 n_found, 

1601 n_checked, 

1602 ) 

1603 

1604 else: 

1605 # Use incremental verbose logging when we have multiple chunks. 

1606 log.log( 

1607 log_threshold, 

1608 "Number of datasets found in datastore for chunk %d: %d out of %d checked " 

1609 "(running total from all chunks so far: %d found out of %d checked)", 

1610 n_chunks, 

1611 n_found, 

1612 n_results, 

1613 n_found_total, 

1614 n_checked, 

1615 ) 

1616 dataset_existence.update(chunk_result) 

1617 n_chunks += 1 

1618 

1619 return dataset_existence 

1620 
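# Usage sketch for mexists() above (hypothetical, for illustration only).
# Assumes `datastore` is a configured FileDatastore and `refs` is a list of
# resolved DatasetRef objects; neither name comes from this module.
#
#     artifact_existence: dict[ResourcePath, bool] = {}
#     existence = datastore.mexists(refs, artifact_existence)
#     missing = [ref for ref, found in existence.items() if not found]
#
# The artifact_existence mapping is updated in place, so it can be reused by
# a later bulk operation (e.g. transfer_from) to avoid repeating remote
# existence checks.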

1621 def _mexists( 

1622 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1623 ) -> dict[DatasetRef, bool]: 

1624 """Check the existence of multiple datasets at once. 

1625 

1626 Parameters 

1627 ---------- 

1628 refs : iterable of `DatasetRef` 

1629 The datasets to be checked. 

1630 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1631 Optional mapping of datastore artifact to existence. Updated by 

1632 this method with details of all artifacts tested. Can be `None` 

1633 if the caller is not interested. 

1634 

1635 Returns 

1636 ------- 

1637 existence : `dict` of [`DatasetRef`, `bool`] 

1638 Mapping from dataset to boolean indicating existence. 

1639 """ 

1640 # Make a mapping from refs with the internal storage class to the given 

1641 # refs that may have a different one. We'll use the internal refs 

1642 # throughout this method and convert back at the very end. 

1643 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs} 

1644 

1645 # Need a mapping of dataset_id to (internal) dataset ref since some 

1646 # internal APIs work with dataset_id. 

1647 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref} 

1648 

1649 # Set of all IDs we are checking for. 

1650 requested_ids = set(id_to_ref.keys()) 

1651 

1652 # The records themselves. Could be missing some entries. 

1653 records = self._get_stored_records_associated_with_refs(id_to_ref.values()) 

1654 

1655 dataset_existence = self._process_mexists_records( 

1656 id_to_ref, records, True, artifact_existence=artifact_existence 

1657 ) 

1658 

1659 # Set of IDs that have been handled. 

1660 handled_ids = {ref.id for ref in dataset_existence} 

1661 

1662 missing_ids = requested_ids - handled_ids 

1663 if missing_ids: 

1664 dataset_existence.update( 

1665 self._mexists_check_expected( 

1666 [id_to_ref[missing] for missing in missing_ids], artifact_existence 

1667 ) 

1668 ) 

1669 

1670 return { 

1671 internal_ref_to_input_ref[internal_ref]: existence 

1672 for internal_ref, existence in dataset_existence.items() 

1673 } 

1674 

1675 def _mexists_check_expected( 

1676 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1677 ) -> dict[DatasetRef, bool]: 

1678 """Check existence of refs that are not known to datastore. 

1679 

1680 Parameters 

1681 ---------- 

1682 refs : iterable of `DatasetRef` 

1683 The datasets to be checked. These are assumed not to be known 

1684 to datastore. 

1685 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1686 Optional mapping of datastore artifact to existence. Updated by 

1687 this method with details of all artifacts tested. Can be `None` 

1688 if the caller is not interested. 

1689 

1690 Returns 

1691 ------- 

1692 existence : `dict` of [`DatasetRef`, `bool`] 

1693 Mapping from dataset to boolean indicating existence. 

1694 """ 

1695 dataset_existence: dict[DatasetRef, bool] = {} 

1696 if not self.trustGetRequest: 

1697 # Must assume these do not exist 

1698 for ref in refs: 

1699 dataset_existence[ref] = False 

1700 else: 

1701 log.debug( 

1702 "%d datasets were not known to datastore during initial existence check.", 

1703 len(refs), 

1704 ) 

1705 

1706 # Construct data structure identical to that returned 

1707 # by _get_stored_records_associated_with_refs() but using 

1708 # guessed names. 

1709 records = {} 

1710 id_to_ref = {} 

1711 for missing_ref in refs: 

1712 expected = self._get_expected_dataset_locations_info(missing_ref) 

1713 dataset_id = missing_ref.id 

1714 records[dataset_id] = [info for _, info in expected] 

1715 id_to_ref[dataset_id] = missing_ref 

1716 

1717 dataset_existence.update( 

1718 self._process_mexists_records( 

1719 id_to_ref, 

1720 records, 

1721 False, 

1722 artifact_existence=artifact_existence, 

1723 ) 

1724 ) 

1725 

1726 return dataset_existence 

1727 

1728 def exists(self, ref: DatasetRef) -> bool: 

1729 """Check if the dataset exists in the datastore. 

1730 

1731 Parameters 

1732 ---------- 

1733 ref : `DatasetRef` 

1734 Reference to the required dataset. 

1735 

1736 Returns 

1737 ------- 

1738 exists : `bool` 

1739 `True` if the entity exists in the `Datastore`. 

1740 

1741 Notes 

1742 ----- 

1743 The local cache is checked as a proxy for existence in the remote 

1744 object store. It is possible that another process on a different 

1745 compute node could remove the file from the object store even 

1746 though it is present in the local cache. 

1747 """ 

1748 ref = self._cast_storage_class(ref) 

1749 fileLocations = self._get_dataset_locations_info(ref) 

1750 

1751 # if we are being asked to trust that registry might not be correct 

1752 # we ask for the expected locations and check them explicitly 

1753 if not fileLocations: 

1754 if not self.trustGetRequest: 

1755 return False 

1756 

1757 # First check the cache. If it is not found we must check 

1758 # the datastore itself. Assume that any component in the cache 

1759 # means that the dataset does exist somewhere. 

1760 if self.cacheManager.known_to_cache(ref): 

1761 return True 

1762 

1763 # When we are guessing a dataset location we can not check 

1764 # for the existence of every component since we can not 

1765 # know if every component was written. Instead we check 

1766 # for the existence of any of the expected locations. 

1767 for location, _ in self._get_expected_dataset_locations_info(ref): 

1768 if self._artifact_exists(location): 

1769 return True 

1770 return False 

1771 

1772 # All listed artifacts must exist. 

1773 for location, storedFileInfo in fileLocations: 

1774 # Checking in cache needs the component ref. 

1775 check_ref = ref 

1776 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1777 check_ref = ref.makeComponentRef(component) 

1778 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1779 continue 

1780 

1781 if not self._artifact_exists(location): 

1782 return False 

1783 

1784 return True 

1785 

1786 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

1787 """Return URIs associated with dataset. 

1788 

1789 Parameters 

1790 ---------- 

1791 ref : `DatasetRef` 

1792 Reference to the required dataset. 

1793 predict : `bool`, optional 

1794 If the datastore does not know about the dataset, should it 

1795 return a predicted URI or not? 

1796 

1797 Returns 

1798 ------- 

1799 uris : `DatasetRefURIs` 

1800 The URI to the primary artifact associated with this dataset (if 

1801 the dataset was disassembled within the datastore this may be 

1802 `None`), and the URIs to any components associated with the dataset 

1803 artifact (this can be empty if there are no components). 

1804 """ 

1805 many = self.getManyURIs([ref], predict=predict, allow_missing=False) 

1806 return many[ref] 

1807 

1808 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1809 """URI to the Dataset. 

1810 

1811 Parameters 

1812 ---------- 

1813 ref : `DatasetRef` 

1814 Reference to the required Dataset. 

1815 predict : `bool` 

1816 If `True`, allow URIs to be returned of datasets that have not 

1817 been written. 

1818 

1819 Returns 

1820 ------- 

1821 uri : `lsst.resources.ResourcePath` 

1822 URI pointing to the dataset within the datastore. If the 

1823 dataset does not exist in the datastore, and if ``predict`` is 

1824 `True`, the URI will be a prediction and will include a URI 

1825 fragment "#predicted". 

1826 If the datastore does not have entities that relate well 

1827 to the concept of a URI the returned URI will be 

1828 descriptive. The returned URI is not guaranteed to be obtainable. 

1829 

1830 Raises 

1831 ------ 

1832 FileNotFoundError 

1833 Raised if a URI has been requested for a dataset that does not 

1834 exist and guessing is not allowed. 

1835 RuntimeError 

1836 Raised if a request is made for a single URI but multiple URIs 

1837 are associated with this dataset. 

1838 

1839 Notes 

1840 ----- 

1841 When a predicted URI is requested an attempt will be made to form 

1842 a reasonable URI based on file templates and the expected formatter. 

1843 """ 

1844 primary, components = self.getURIs(ref, predict) 

1845 if primary is None or components: 

1846 raise RuntimeError( 

1847 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1848 ) 

1849 return primary 

1850 
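# Usage sketch for getURIs()/getURI() above (hypothetical, for illustration
# only). Assumes `datastore` is a FileDatastore and `ref` is a resolved
# DatasetRef.
#
#     primary, components = datastore.getURIs(ref, predict=True)
#     if primary is not None:
#         print(primary)  # single-artifact dataset (may carry "#predicted")
#     else:
#         for name, uri in components.items():
#             print(name, uri)  # disassembled composite
#
# getURI() is only appropriate for single-artifact datasets; it raises
# RuntimeError when component URIs are present.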

1851 def _predict_URIs( 

1852 self, 

1853 ref: DatasetRef, 

1854 ) -> DatasetRefURIs: 

1855 """Predict the URIs of a dataset ref. 

1856 

1857 Parameters 

1858 ---------- 

1859 ref : `DatasetRef` 

1860 Reference to the required Dataset. 

1861 

1862 Returns 

1863 ------- 

1864 uris : `DatasetRefURIs` 

1865 Primary and component URIs. URIs will contain a URI fragment 

1866 "#predicted". 

1867 """ 

1868 uris = DatasetRefURIs() 

1869 

1870 if self.composites.shouldBeDisassembled(ref): 

1871 for component, _ in ref.datasetType.storageClass.components.items(): 

1872 comp_ref = ref.makeComponentRef(component) 

1873 comp_location, _ = self._determine_put_formatter_location(comp_ref) 

1874 

1875 # Add the "#predicted" URI fragment to indicate this is a 

1876 # guess 

1877 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted") 

1878 

1879 else: 

1880 location, _ = self._determine_put_formatter_location(ref) 

1881 

1882 # Add the "#predicted" URI fragment to indicate this is a guess 

1883 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted") 

1884 

1885 return uris 

1886 

1887 def getManyURIs( 

1888 self, 

1889 refs: Iterable[DatasetRef], 

1890 predict: bool = False, 

1891 allow_missing: bool = False, 

1892 ) -> dict[DatasetRef, DatasetRefURIs]: 

1893 # Docstring inherited 

1894 

1895 uris: dict[DatasetRef, DatasetRefURIs] = {} 

1896 

1897 records = self._get_stored_records_associated_with_refs(refs) 

1898 records_keys = records.keys() 

1899 

1900 existing_refs = tuple(ref for ref in refs if ref.id in records_keys) 

1901 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys) 

1902 

1903 # Have to handle trustGetRequest mode by checking for the existence 

1904 # of the missing refs on disk. 

1905 if missing_refs: 

1906 dataset_existence = self._mexists_check_expected(missing_refs, None) 

1907 really_missing = set() 

1908 not_missing = set() 

1909 for ref, exists in dataset_existence.items(): 

1910 if exists: 

1911 not_missing.add(ref) 

1912 else: 

1913 really_missing.add(ref) 

1914 

1915 if not_missing: 

1916 # Need to recalculate the missing/existing split. 

1917 existing_refs = existing_refs + tuple(not_missing) 

1918 missing_refs = tuple(really_missing) 

1919 

1920 for ref in missing_refs: 

1921 # if this has never been written then we have to guess 

1922 if not predict: 

1923 if not allow_missing: 

1924 raise FileNotFoundError(f"Dataset {ref} not in this datastore.") 

1925 else: 

1926 uris[ref] = self._predict_URIs(ref) 

1927 

1928 for ref in existing_refs: 

1929 file_infos = records[ref.id] 

1930 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos] 

1931 uris[ref] = self._locations_to_URI(ref, file_locations) 

1932 

1933 return uris 

1934 

1935 def _locations_to_URI( 

1936 self, 

1937 ref: DatasetRef, 

1938 file_locations: Sequence[tuple[Location, StoredFileInfo]], 

1939 ) -> DatasetRefURIs: 

1940 """Convert one or more file locations associated with a DatasetRef 

1941 to a DatasetRefURIs. 

1942 

1943 Parameters 

1944 ---------- 

1945 ref : `DatasetRef` 

1946 Reference to the dataset. 

1947 file_locations : Sequence[Tuple[Location, StoredFileInfo]] 

1948 Each item in the sequence is the location of the dataset within the 

1949 datastore and stored information about the file and its formatter. 

1950 If there is only one item in the sequence then it is treated as the 

1951 primary URI. If there is more than one item then they are treated 

1952 as component URIs. If there are no items then an error is raised 

1953 unless ``self.trustGetRequest`` is `True`. 

1954 

1955 Returns 

1956 ------- 

1957 uris : `DatasetRefURIs` 

1958 Represents the primary URI or component URIs described by the 

1959 inputs. 

1960 

1961 Raises 

1962 ------ 

1963 RuntimeError 

1964 If no file locations are passed in and ``self.trustGetRequest`` is 

1965 `False`. 

1966 FileNotFoundError 

1967 If a passed-in URI does not exist and ``self.trustGetRequest`` 

1968 is `False`. 

1969 RuntimeError 

1970 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is 

1971 unexpected). 

1972 """ 

1973 guessing = False 

1974 uris = DatasetRefURIs() 

1975 

1976 if not file_locations: 

1977 if not self.trustGetRequest: 

1978 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1979 file_locations = self._get_expected_dataset_locations_info(ref) 

1980 guessing = True 

1981 

1982 if len(file_locations) == 1: 

1983 # No disassembly so this is the primary URI 

1984 uris.primaryURI = file_locations[0][0].uri 

1985 if guessing and not uris.primaryURI.exists(): 

1986 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist") 

1987 else: 

1988 for location, file_info in file_locations: 

1989 if file_info.component is None: 

1990 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1991 if guessing and not location.uri.exists(): 

1992 # If we are trusting then it is entirely possible for 

1993 # some components to be missing. In that case we skip 

1994 # to the next component. 

1995 if self.trustGetRequest: 

1996 continue 

1997 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist") 

1998 uris.componentURIs[file_info.component] = location.uri 

1999 

2000 return uris 

2001 

2002 def retrieveArtifacts( 

2003 self, 

2004 refs: Iterable[DatasetRef], 

2005 destination: ResourcePath, 

2006 transfer: str = "auto", 

2007 preserve_path: bool = True, 

2008 overwrite: bool = False, 

2009 ) -> list[ResourcePath]: 

2010 """Retrieve the file artifacts associated with the supplied refs. 

2011 

2012 Parameters 

2013 ---------- 

2014 refs : iterable of `DatasetRef` 

2015 The datasets for which file artifacts are to be retrieved. 

2016 A single ref can result in multiple files. The refs must 

2017 be resolved. 

2018 destination : `lsst.resources.ResourcePath` 

2019 Location to write the file artifacts. 

2020 transfer : `str`, optional 

2021 Method to use to transfer the artifacts. Must be one of the options 

2022 supported by `lsst.resources.ResourcePath.transfer_from()`. 

2023 "move" is not allowed. 

2024 preserve_path : `bool`, optional 

2025 If `True` the full path of the file artifact within the datastore 

2026 is preserved. If `False` the final file component of the path 

2027 is used. 

2028 overwrite : `bool`, optional 

2029 If `True` allow transfers to overwrite existing files at the 

2030 destination. 

2031 

2032 Returns 

2033 ------- 

2034 targets : `list` of `lsst.resources.ResourcePath` 

2035 URIs of file artifacts in destination location. Order is not 

2036 preserved. 

2037 """ 

2038 if not destination.isdir(): 

2039 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

2040 

2041 if transfer == "move": 

2042 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

2043 

2044 # Source -> Destination 

2045 # This also helps filter out duplicate DatasetRef in the request 

2046 # that will map to the same underlying file transfer. 

2047 to_transfer: dict[ResourcePath, ResourcePath] = {} 

2048 

2049 for ref in refs: 

2050 locations = self._get_dataset_locations_info(ref) 

2051 for location, _ in locations: 

2052 source_uri = location.uri 

2053 target_path: ResourcePathExpression 

2054 if preserve_path: 

2055 target_path = location.pathInStore 

2056 if target_path.isabs(): 

2057 # This is an absolute path to an external file. 

2058 # Use the full path. 

2059 target_path = target_path.relativeToPathRoot 

2060 else: 

2061 target_path = source_uri.basename() 

2062 target_uri = destination.join(target_path) 

2063 to_transfer[source_uri] = target_uri 

2064 

2065 # In theory can now parallelize the transfer 

2066 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

2067 for source_uri, target_uri in to_transfer.items(): 

2068 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

2069 

2070 return list(to_transfer.values()) 

2071 
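# Usage sketch for retrieveArtifacts() above (hypothetical, for illustration
# only). Assumes `datastore` is a FileDatastore and `refs` is an iterable of
# resolved DatasetRef objects; the destination path is an arbitrary example.
#
#     destination = ResourcePath("/tmp/retrieved/", forceDirectory=True)
#     targets = datastore.retrieveArtifacts(
#         refs, destination, transfer="copy", preserve_path=True
#     )
#
# "move" is rejected, and the order of the returned target URIs is not
# guaranteed to match the order of the input refs.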

2072 def get( 

2073 self, 

2074 ref: DatasetRef, 

2075 parameters: Mapping[str, Any] | None = None, 

2076 storageClass: StorageClass | str | None = None, 

2077 ) -> Any: 

2078 """Load an InMemoryDataset from the store. 

2079 

2080 Parameters 

2081 ---------- 

2082 ref : `DatasetRef` 

2083 Reference to the required Dataset. 

2084 parameters : `dict` 

2085 `StorageClass`-specific parameters that specify, for example, 

2086 a slice of the dataset to be loaded. 

2087 storageClass : `StorageClass` or `str`, optional 

2088 The storage class to be used to override the Python type 

2089 returned by this method. By default the returned type matches 

2090 the dataset type definition for this dataset. Specifying a 

2091 read `StorageClass` can force a different type to be returned. 

2092 This type must be compatible with the original type. 

2093 

2094 Returns 

2095 ------- 

2096 inMemoryDataset : `object` 

2097 Requested dataset or slice thereof as an InMemoryDataset. 

2098 

2099 Raises 

2100 ------ 

2101 FileNotFoundError 

2102 Requested dataset can not be retrieved. 

2103 TypeError 

2104 Return value from formatter has unexpected type. 

2105 ValueError 

2106 Formatter failed to process the dataset. 

2107 """ 

2108 # Supplied storage class for the component being read is either 

2109 # from the ref itself or from an override if we want to force 

2110 # type conversion. 

2111 if storageClass is not None: 

2112 ref = ref.overrideStorageClass(storageClass) 

2113 refStorageClass = ref.datasetType.storageClass 

2114 

2115 allGetInfo = self._prepare_for_get(ref, parameters) 

2116 refComponent = ref.datasetType.component() 

2117 

2118 # Create mapping from component name to related info 

2119 allComponents = {i.component: i for i in allGetInfo} 

2120 

2121 # By definition the dataset is disassembled if we have more 

2122 # than one record for it. 

2123 isDisassembled = len(allGetInfo) > 1 

2124 

2125 # Look for the special case where we are disassembled but the 

2126 # component is a derived component that was not written during 

2127 # disassembly. For this scenario we need to check that the 

2128 # component requested is listed as a derived component for the 

2129 # composite storage class 

2130 isDisassembledReadOnlyComponent = False 

2131 if isDisassembled and refComponent: 

2132 # The composite storage class should be accessible through 

2133 # the component dataset type 

2134 compositeStorageClass = ref.datasetType.parentStorageClass 

2135 

2136 # In the unlikely scenario where the composite storage 

2137 # class is not known, we can only assume that this is a 

2138 # normal component. If that assumption is wrong then the 

2139 # branch below that reads a persisted component will fail 

2140 # so there is no need to complain here. 

2141 if compositeStorageClass is not None: 

2142 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

2143 

2144 if isDisassembled and not refComponent: 

2145 # This was a disassembled dataset spread over multiple files 

2146 # and we need to put them all back together again. 

2147 # Read into memory and then assemble 

2148 

2149 # Check that the supplied parameters are suitable for the type read 

2150 refStorageClass.validateParameters(parameters) 

2151 

2152 # We want to keep track of all the parameters that were not used 

2153 # by formatters. We assume that if any of the component formatters 

2154 # use a parameter that we do not need to apply it again in the 

2155 # assembler. 

2156 usedParams = set() 

2157 

2158 components: dict[str, Any] = {} 

2159 for getInfo in allGetInfo: 

2160 # assemblerParams are parameters not understood by the 

2161 # associated formatter. 

2162 usedParams.update(set(getInfo.formatterParams)) 

2163 

2164 component = getInfo.component 

2165 

2166 if component is None: 

2167 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

2168 

2169 # We do not want the formatter to think it's reading 

2170 # a component though because it is really reading a 

2171 # standalone dataset -- always tell reader it is not a 

2172 # component. 

2173 components[component] = self._read_artifact_into_memory( 

2174 getInfo, ref.makeComponentRef(component), isComponent=False 

2175 ) 

2176 

2177 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

2178 

2179 # Any unused parameters will have to be passed to the assembler 

2180 if parameters: 

2181 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

2182 else: 

2183 unusedParams = {} 

2184 

2185 # Process parameters 

2186 return ref.datasetType.storageClass.delegate().handleParameters( 

2187 inMemoryDataset, parameters=unusedParams 

2188 ) 

2189 

2190 elif isDisassembledReadOnlyComponent: 

2191 compositeStorageClass = ref.datasetType.parentStorageClass 

2192 if compositeStorageClass is None: 

2193 raise RuntimeError( 

2194 f"Unable to retrieve derived component '{refComponent}' since" 

2195 "no composite storage class is available." 

2196 ) 

2197 

2198 if refComponent is None: 

2199 # Mainly for mypy 

2200 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

2201 

2202 # Assume that every derived component can be calculated by 

2203 # forwarding the request to a single read/write component. 

2204 # Rather than guessing which rw component is the right one by 

2205 # scanning each for a derived component of the same name, 

2206 # we ask the storage class delegate directly which one is best to 

2207 # use. 

2208 compositeDelegate = compositeStorageClass.delegate() 

2209 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

2210 refComponent, set(allComponents) 

2211 ) 

2212 

2213 # Select the relevant component 

2214 rwInfo = allComponents[forwardedComponent] 

2215 

2216 # For now assume that read parameters are validated against 

2217 # the real component and not the requested component 

2218 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

2219 forwardedStorageClass.validateParameters(parameters) 

2220 

2221 # The reference to use for the caching must refer to the forwarded 

2222 # component and not the derived component. 

2223 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

2224 

2225 # Unfortunately the FileDescriptor inside the formatter will have 

2226 # the wrong write storage class so we need to create a new one 

2227 # given the immutability constraint. 

2228 writeStorageClass = rwInfo.info.storageClass 

2229 

2230 # We may need to put some thought into parameters for read 

2231 # components but for now forward them on as is 

2232 readFormatter = type(rwInfo.formatter)( 

2233 FileDescriptor( 

2234 rwInfo.location, 

2235 readStorageClass=refStorageClass, 

2236 storageClass=writeStorageClass, 

2237 parameters=parameters, 

2238 ), 

2239 ref.dataId, 

2240 ) 

2241 

2242 # The assembler can not receive any parameter requests for a 

2243 # derived component at this time since the assembler will 

2244 # see the storage class of the derived component and those 

2245 # parameters will have to be handled by the formatter on the 

2246 # forwarded storage class. 

2247 assemblerParams: dict[str, Any] = {} 

2248 

2249 # Need to create a new info that specifies the derived 

2250 # component and associated storage class 

2251 readInfo = DatastoreFileGetInformation( 

2252 rwInfo.location, 

2253 readFormatter, 

2254 rwInfo.info, 

2255 assemblerParams, 

2256 {}, 

2257 refComponent, 

2258 refStorageClass, 

2259 ) 

2260 

2261 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

2262 

2263 else: 

2264 # Single file request or component from that composite file 

2265 for lookup in (refComponent, None): 

2266 if lookup in allComponents: 

2267 getInfo = allComponents[lookup] 

2268 break 

2269 else: 

2270 raise FileNotFoundError( 

2271 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

2272 ) 

2273 

2274 # Do not need the component itself if already disassembled 

2275 if isDisassembled: 

2276 isComponent = False 

2277 else: 

2278 isComponent = getInfo.component is not None 

2279 

2280 # For a component read of a composite we want the cache to 

2281 # be looking at the composite ref itself. 

2282 cache_ref = ref.makeCompositeRef() if isComponent else ref 

2283 

2284 # For a disassembled component we can validate parameters against 

2285 # the component storage class directly 

2286 if isDisassembled: 

2287 refStorageClass.validateParameters(parameters) 

2288 else: 

2289 # For an assembled composite this could be a derived 

2290 # component derived from a real component. The validity 

2291 # of the parameters is not clear. For now validate against 

2292 # the composite storage class 

2293 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

2294 

2295 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

2296 
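# Usage sketch for get() above (hypothetical, for illustration only). Assumes
# `datastore` is a FileDatastore and `ref` is a resolved DatasetRef; the
# parameter and storage class names are placeholders.
#
#     data = datastore.get(ref)
#     subset = datastore.get(ref, parameters={"some_parameter": "value"})
#     converted = datastore.get(ref, storageClass="SomeOtherStorageClass")
#
# Any storage class override must be compatible with the dataset's original
# storage class, and parameters are validated against the relevant storage
# class before the formatter result is returned.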

2297 @transactional 

2298 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2299 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2300 

2301 Parameters 

2302 ---------- 

2303 inMemoryDataset : `object` 

2304 The dataset to store. 

2305 ref : `DatasetRef` 

2306 Reference to the associated Dataset. 

2307 

2308 Raises 

2309 ------ 

2310 TypeError 

2311 Supplied object and storage class are inconsistent. 

2312 DatasetTypeNotSupportedError 

2313 The associated `DatasetType` is not handled by this datastore. 

2314 

2315 Notes 

2316 ----- 

2317 If the datastore is configured to reject certain dataset types it 

2318 is possible that the put will fail and raise a 

2319 `DatasetTypeNotSupportedError`. The main use case for this is to 

2320 allow `ChainedDatastore` to put to multiple datastores without 

2321 requiring that every datastore accepts the dataset. 

2322 """ 

2323 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2324 # doDisassembly = True 

2325 

2326 artifacts = [] 

2327 if doDisassembly: 

2328 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2329 if components is None: 

2330 raise RuntimeError( 

2331 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2332 f"with storage class {ref.datasetType.storageClass.name} " 

2333 "is configured to be disassembled, but cannot be." 

2334 ) 

2335 for component, componentInfo in components.items(): 

2336 # Don't recurse because we want to take advantage of 

2337 # bulk insert -- need a new DatasetRef that refers to the 

2338 # same dataset_id but has the component DatasetType. 

2339 # DatasetType does not refer to the types of components, 

2340 # so we construct one ourselves. 

2341 compRef = ref.makeComponentRef(component) 

2342 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2343 artifacts.append((compRef, storedInfo)) 

2344 else: 

2345 # Write the entire thing out 

2346 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2347 artifacts.append((ref, storedInfo)) 

2348 

2349 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT) 

2350 
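# Usage sketch for put() above (hypothetical, for illustration only). Assumes
# `datastore` is a FileDatastore, `ref` is a resolved DatasetRef, and
# `in_memory_dataset` is an object matching the storage class of `ref`.
#
#     try:
#         datastore.put(in_memory_dataset, ref)
#     except DatasetTypeNotSupportedError:
#         # This datastore rejects the dataset type; a ChainedDatastore
#         # would simply move on to the next datastore.
#         pass
#
# Whether the object is written as a single file or disassembled into
# per-component artifacts is controlled by the composites configuration.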

2351 @transactional 

2352 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

2353 # At this point can safely remove these datasets from the cache 

2354 # to avoid confusion later on. If they are not trashed later 

2355 # the cache will simply be refilled. 

2356 self.cacheManager.remove_from_cache(ref) 

2357 

2358 # If we are in trust mode there will be nothing to move to 

2359 # the trash table and we will have to try to delete the file 

2360 # immediately. 

2361 if self.trustGetRequest: 

2362 # Try to keep the logic below for a single file trash. 

2363 if isinstance(ref, DatasetRef): 

2364 refs = {ref} 

2365 else: 

2366 # Will recreate ref at the end of this branch. 

2367 refs = set(ref) 

2368 

2369 # Determine which datasets are known to datastore directly. 

2370 id_to_ref = {ref.id: ref for ref in refs} 

2371 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2372 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2373 

2374 missing = refs - existing_refs 

2375 if missing: 

2376 # Do an explicit existence check on these refs. 

2377 # We only care about the artifacts at this point and not 

2378 # the dataset existence. 

2379 artifact_existence: dict[ResourcePath, bool] = {} 

2380 _ = self.mexists(missing, artifact_existence) 

2381 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2382 

2383 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2384 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2385 for uri in uris: 

2386 try: 

2387 uri.remove() 

2388 except Exception as e: 

2389 if ignore_errors: 

2390 log.debug("Artifact %s could not be removed: %s", uri, e) 

2391 continue 

2392 raise 

2393 

2394 # There is no point asking the code below to remove refs we 

2395 # know are missing so update it with the list of existing 

2396 # records. Try to retain one vs many logic. 

2397 if not existing_refs: 

2398 # Nothing more to do since none of the datasets were 

2399 # known to the datastore record table. 

2400 return 

2401 ref = list(existing_refs) 

2402 if len(ref) == 1: 

2403 ref = ref[0] 

2404 

2405 # Get file metadata and internal metadata 

2406 if not isinstance(ref, DatasetRef): 

2407 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2408 # Assumed to be an iterable of refs so bulk mode enabled. 

2409 try: 

2410 self.bridge.moveToTrash(ref, transaction=self._transaction) 

2411 except Exception as e: 

2412 if ignore_errors: 

2413 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2414 else: 

2415 raise 

2416 return 

2417 

2418 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2419 

2420 fileLocations = self._get_dataset_locations_info(ref) 

2421 

2422 if not fileLocations: 

2423 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2424 if ignore_errors: 

2425 log.warning(err_msg) 

2426 return 

2427 else: 

2428 raise FileNotFoundError(err_msg) 

2429 

2430 for location, _ in fileLocations: 

2431 if not self._artifact_exists(location): 

2432 err_msg = ( 

2433 f"Dataset is known to datastore {self.name} but " 

2434 f"associated artifact ({location.uri}) is missing" 

2435 ) 

2436 if ignore_errors: 

2437 log.warning(err_msg) 

2438 return 

2439 else: 

2440 raise FileNotFoundError(err_msg) 

2441 

2442 # Mark dataset as trashed 

2443 try: 

2444 self.bridge.moveToTrash([ref], transaction=self._transaction) 

2445 except Exception as e: 

2446 if ignore_errors: 

2447 log.warning( 

2448 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2449 "but encountered an error: %s", 

2450 ref, 

2451 self.name, 

2452 e, 

2453 ) 

2454 pass 

2455 else: 

2456 raise 

2457 

2458 @transactional 

2459 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2460 """Remove all datasets from the trash. 

2461 

2462 Parameters 

2463 ---------- 

2464 ignore_errors : `bool` 

2465 If `True` return without error even if something went wrong. 

2466 Problems could occur if another process is simultaneously trying 

2467 to delete. 

2468 """ 

2469 log.debug("Emptying trash in datastore %s", self.name) 

2470 

2471 # Context manager will empty trash iff we finish it without raising. 

2472 # It will also automatically delete the relevant rows from the 

2473 # trash table and the records table. 

2474 with self.bridge.emptyTrash( 

2475 self._table, record_class=StoredFileInfo, record_column="path" 

2476 ) as trash_data: 

2477 # Removing the artifacts themselves requires that the files are 

2478 # not also associated with refs that are not to be trashed. 

2479 # Therefore need to do a query with the file paths themselves 

2480 # and return all the refs associated with them. Can only delete 

2481 # a file if the refs to be trashed are the only refs associated 

2482 # with the file. 

2483 # This requires multiple copies of the trashed items 

2484 trashed, artifacts_to_keep = trash_data 

2485 

2486 if artifacts_to_keep is None: 

2487 # The bridge is not helping us so have to work it out 

2488 # ourselves. This is not going to be as efficient. 

2489 trashed = list(trashed) 

2490 

2491 # The instance check is for mypy since up to this point it 

2492 # does not know the type of info. 

2493 path_map = self._refs_associated_with_artifacts( 

2494 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2495 ) 

2496 

2497 for ref, info in trashed: 

2498 # Mypy needs to know this is not the base class 

2499 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2500 

2501 path_map[info.path].remove(ref.id) 

2502 if not path_map[info.path]: 

2503 del path_map[info.path] 

2504 

2505 artifacts_to_keep = set(path_map) 

2506 

2507 for ref, info in trashed: 

2508 # Should not happen for this implementation but need 

2509 # to keep mypy happy. 

2510 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2511 

2512 # Mypy needs to know this is not the base class 

2513 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2514 

2515 if info.path in artifacts_to_keep: 

2516 # This is a multi-dataset artifact and we are not 

2517 # removing all associated refs. 

2518 continue 

2519 

2520 # Only trashed refs still known to datastore will be returned. 

2521 location = info.file_location(self.locationFactory) 

2522 

2523 # Point of no return for this artifact 

2524 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2525 try: 

2526 self._delete_artifact(location) 

2527 except FileNotFoundError: 

2528 # If the file itself has been deleted there is nothing 

2529 # we can do about it. It is possible that trash has 

2530 # been run in parallel in another process or someone 

2531 # decided to delete the file. It is unlikely to come 

2532 # back and so we should still continue with the removal 

2533 # of the entry from the trash table. It is also possible 

2534 # we removed it in a previous iteration if it was 

2535 # a multi-dataset artifact. The delete artifact method 

2536 # will log a debug message in this scenario. 

2537 # Distinguishing file missing before trash started and 

2538 # file already removed previously as part of this trash 

2539 # is not worth the distinction with regards to potential 

2540 # memory cost. 

2541 pass 

2542 except Exception as e: 

2543 if ignore_errors: 

2544 # Use a debug message here even though it's not 

2545 # a good situation. In some cases this can be 

2546 # caused by a race between user A and user B 

2547 # and neither of them has permissions for the 

2548 # other's files. Butler does not know about users 

2549 # and trash has no idea what collections these 

2550 # files were in (without guessing from a path). 

2551 log.debug( 

2552 "Encountered error removing artifact %s from datastore %s: %s", 

2553 location.uri, 

2554 self.name, 

2555 e, 

2556 ) 

2557 else: 

2558 raise 

2559 
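# Usage sketch for trash()/emptyTrash() above (hypothetical, for illustration
# only). Assumes `datastore` is a FileDatastore and `refs` is a list of
# resolved DatasetRef objects.
#
#     datastore.trash(refs)   # mark datasets as trashed, drop cached copies
#     datastore.emptyTrash()  # delete the file artifacts themselves
#
# Trashing and emptying are separate steps so that an artifact shared by
# several datasets is only removed once no untrashed refs still point at it.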

2560 @transactional 

2561 def transfer_from( 

2562 self, 

2563 source_datastore: Datastore, 

2564 refs: Iterable[DatasetRef], 

2565 transfer: str = "auto", 

2566 artifact_existence: dict[ResourcePath, bool] | None = None, 

2567 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

2568 # Docstring inherited 

2569 if type(self) is not type(source_datastore): 

2570 raise TypeError( 

2571 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2572 f"source datastore ({type(source_datastore)})." 

2573 ) 

2574 

2575 # Be explicit for mypy 

2576 if not isinstance(source_datastore, FileDatastore): 

2577 raise TypeError( 

2578 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2579 f" {type(source_datastore)}" 

2580 ) 

2581 

2582 # Stop early if "direct" transfer mode is requested. That would 

2583 # require that the URI inside the source datastore should be stored 

2584 # directly in the target datastore, which seems unlikely to be useful 

2585 # since at any moment the source datastore could delete the file. 

2586 if transfer in ("direct", "split"): 

2587 raise ValueError( 

2588 f"Can not transfer from a source datastore using {transfer} mode since" 

2589 " those files are controlled by the other datastore." 

2590 ) 

2591 

2592 # Empty existence lookup if none given. 

2593 if artifact_existence is None: 

2594 artifact_existence = {} 

2595 

2596 # We will go through the list multiple times so must convert 

2597 # generators to lists. 

2598 refs = list(refs) 

2599 

2600 # In order to handle disassembled composites the code works 

2601 # at the records level since it can assume that internal APIs 

2602 # can be used. 

2603 # - If the record already exists in the destination this is assumed 

2604 # to be okay. 

2605 # - If there is no record but the source and destination URIs are 

2606 # identical no transfer is done but the record is added. 

2607 # - If the source record refers to an absolute URI currently assume 

2608 # that that URI should remain absolute and will be visible to the 

2609 # destination butler. May need to have a flag to indicate whether 

2610 # the dataset should be transferred. This will only happen if 

2611 # the detached Butler has had a local ingest. 

2612 

2613 # What we really want is all the records in the source datastore 

2614 # associated with these refs. Or derived ones if they don't exist 

2615 # in the source. 

2616 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2617 

2618 # The source dataset_ids are the keys in these records 

2619 source_ids = set(source_records) 

2620 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2621 

2622 requested_ids = {ref.id for ref in refs} 

2623 missing_ids = requested_ids - source_ids 

2624 

2625 # Missing IDs can be okay if that datastore has allowed 

2626 # gets based on file existence. Should we transfer what we can 

2627 # or complain about it and warn? 

2628 if missing_ids and not source_datastore.trustGetRequest: 

2629 raise ValueError( 

2630 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2631 ) 

2632 

2633 # Need to map these missing IDs to a DatasetRef so we can guess 

2634 # the details. 

2635 if missing_ids: 

2636 log.info( 

2637 "Number of expected datasets missing from source datastore records: %d out of %d", 

2638 len(missing_ids), 

2639 len(requested_ids), 

2640 ) 

2641 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2642 

2643 # This should be chunked in case we end up having to check 

2644 # the file store since we need some log output to show 

2645 # progress. 

2646 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2647 records = {} 

2648 for missing in missing_ids_chunk: 

2649 # Ask the source datastore where the missing artifacts 

2650 # should be. An execution butler might not know about the 

2651 # artifacts even if they are there. 

2652 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2653 records[missing] = [info for _, info in expected] 

2654 

2655 # Call the mexists helper method in case we have not already 

2656 # checked these artifacts and artifact_existence is therefore 

2657 # empty. This allows us to benefit from parallelism. 

2658 # datastore.mexists() itself does not give us access to the 

2659 # derived datastore record. 

2660 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2661 ref_exists = source_datastore._process_mexists_records( 

2662 id_to_ref, records, False, artifact_existence=artifact_existence 

2663 ) 

2664 

2665 # Now go through the records and propagate the ones that exist. 

2666 location_factory = source_datastore.locationFactory 

2667 for missing, record_list in records.items(): 

2668 # Skip completely if the ref does not exist. 

2669 ref = id_to_ref[missing] 

2670 if not ref_exists[ref]: 

2671 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2672 continue 

2673 # Check for file artifact to decide which parts of a 

2674 # disassembled composite do exist. If there is only a 

2675 # single record we don't even need to look because it can't 

2676 # be a composite and must exist. 

2677 if len(record_list) == 1: 

2678 dataset_records = record_list 

2679 else: 

2680 dataset_records = [ 

2681 record 

2682 for record in record_list 

2683 if artifact_existence[record.file_location(location_factory).uri] 

2684 ] 

2685 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2686 

2687 # Rely on source_records being a defaultdict. 

2688 source_records[missing].extend(dataset_records) 

2689 

2690 # See if we already have these records 

2691 target_records = self._get_stored_records_associated_with_refs(refs) 

2692 

2693 # The artifacts to register 

2694 artifacts = [] 

2695 

2696 # Refs that already exist 

2697 already_present = [] 

2698 

2699 # Refs that were rejected by this datastore. 

2700 rejected = set() 

2701 

2702 # Refs that were transferred successfully. 

2703 accepted = set() 

2704 

2705 # Record each time we have done a "direct" transfer. 

2706 direct_transfers = [] 

2707 

2708 # Now can transfer the artifacts 

2709 for ref in refs: 

2710 if not self.constraints.isAcceptable(ref): 

2711 # This datastore should not be accepting this dataset. 

2712 rejected.add(ref) 

2713 continue 

2714 

2715 accepted.add(ref) 

2716 

2717 if ref.id in target_records: 

2718 # Already have an artifact for this. 

2719 already_present.append(ref) 

2720 continue 

2721 

2722 # mypy needs to know these are always resolved refs 

2723 for info in source_records[ref.id]: 

2724 source_location = info.file_location(source_datastore.locationFactory) 

2725 target_location = info.file_location(self.locationFactory) 

2726 if source_location == target_location and not source_location.pathInStore.isabs(): 

2727 # Artifact is already in the target location. 

2728 # (which is how execution butler currently runs) 

2729 pass 

2730 else: 

2731 if target_location.pathInStore.isabs(): 

2732 # Just because we can see the artifact when running 

2733 # the transfer doesn't mean it will be generally 

2734 # accessible to a user of this butler. Need to decide 

2735 # what to do about an absolute path. 

2736 if transfer == "auto": 

2737 # For "auto" transfers we allow the absolute URI 

2738 # to be recorded in the target datastore. 

2739 direct_transfers.append(source_location) 

2740 else: 

2741 # The user is explicitly requesting a transfer 

2742 # even for an absolute URI. This requires us to 

2743 # calculate the target path. 

2744 template_ref = ref 

2745 if info.component: 

2746 template_ref = ref.makeComponentRef(info.component) 

2747 target_location = self._calculate_ingested_datastore_name( 

2748 source_location.uri, 

2749 template_ref, 

2750 ) 

2751 

2752 info = info.update(path=target_location.pathInStore.path) 

2753 

2754 # Need to transfer it to the new location. 

2755 # Assume we should always overwrite. If the artifact 

2756 # is there this might indicate that a previous transfer 

2757 # was interrupted but was not able to be rolled back 

2758 # completely (eg pre-emption) so follow Datastore default 

2759 # and overwrite. 

2760 target_location.uri.transfer_from( 

2761 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2762 ) 

2763 

2764 artifacts.append((ref, info)) 

2765 

2766 if direct_transfers: 

2767 log.info( 

2768 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s", 

2769 len(direct_transfers), 

2770 "" if len(direct_transfers) == 1 else "s", 

2771 ) 

2772 

2773 # We are overwriting previous datasets that may have already 

2774 # existed. We therefore should ensure that we force the 

2775 # datastore records to agree. Note that this can potentially lead 

2776 # to difficulties if the dataset has previously been ingested 

2777 # disassembled and is somehow now assembled, or vice versa. 

2778 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE) 

2779 

2780 if already_present: 

2781 n_skipped = len(already_present) 

2782 log.info( 

2783 "Skipped transfer of %d dataset%s already present in datastore", 

2784 n_skipped, 

2785 "" if n_skipped == 1 else "s", 

2786 ) 

2787 

2788 return accepted, rejected 

2789 
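# Usage sketch for transfer_from() above (hypothetical, for illustration
# only). Assumes `source` and `target` are both FileDatastore instances and
# `refs` is a list of resolved DatasetRef objects.
#
#     artifact_existence: dict[ResourcePath, bool] = {}
#     accepted, rejected = target.transfer_from(
#         source, refs, transfer="copy", artifact_existence=artifact_existence
#     )
#
# "direct" and "split" modes are rejected, and refs that fail this
# datastore's constraints are returned in the `rejected` set.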

2790 @transactional 

2791 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2792 # Docstring inherited. 

2793 refs = list(refs) 

2794 self.bridge.forget(refs) 

2795 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs]) 

2796 

2797 def validateConfiguration( 

2798 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

2799 ) -> None: 

2800 """Validate some of the configuration for this datastore. 

2801 

2802 Parameters 

2803 ---------- 

2804 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2805 Entities to test against this configuration. Can be differing 

2806 types. 

2807 logFailures : `bool`, optional 

2808 If `True`, output a log message for every validation error 

2809 detected. 

2810 

2811 Raises 

2812 ------ 

2813 DatastoreValidationError 

2814 Raised if there is a validation problem with a configuration. 

2815 All the problems are reported in a single exception. 

2816 

2817 Notes 

2818 ----- 

2819 This method checks that all the supplied entities have valid file 

2820 templates and also have formatters defined. 

2821 """ 

2822 templateFailed = None 

2823 try: 

2824 self.templates.validateTemplates(entities, logFailures=logFailures) 

2825 except FileTemplateValidationError as e: 

2826 templateFailed = str(e) 

2827 

2828 formatterFailed = [] 

2829 for entity in entities: 

2830 try: 

2831 self.formatterFactory.getFormatterClass(entity) 

2832 except KeyError as e: 

2833 formatterFailed.append(str(e)) 

2834 if logFailures: 

2835 log.critical("Formatter failure: %s", e) 

2836 

2837 if templateFailed or formatterFailed: 

2838 messages = [] 

2839 if templateFailed: 

2840 messages.append(templateFailed) 

2841 if formatterFailed: 

2842 messages.append(",".join(formatterFailed)) 

2843 msg = ";\n".join(messages) 

2844 raise DatastoreValidationError(msg) 

2845 

2846 def getLookupKeys(self) -> set[LookupKey]: 

2847 # Docstring is inherited from base class 

2848 return ( 

2849 self.templates.getLookupKeys() 

2850 | self.formatterFactory.getLookupKeys() 

2851 | self.constraints.getLookupKeys() 

2852 ) 

2853 

2854 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

2855 # Docstring is inherited from base class 

2856 # The key can be valid in either formatters or templates so we can 

2857 # only check the template if it exists 

2858 if lookupKey in self.templates: 

2859 try: 

2860 self.templates[lookupKey].validateTemplate(entity) 

2861 except FileTemplateValidationError as e: 

2862 raise DatastoreValidationError(e) from e 

2863 

2864 def export( 

2865 self, 

2866 refs: Iterable[DatasetRef], 

2867 *, 

2868 directory: ResourcePathExpression | None = None, 

2869 transfer: str | None = "auto", 

2870 ) -> Iterable[FileDataset]: 

2871 # Docstring inherited from Datastore.export. 

2872 if transfer == "auto" and directory is None: 

2873 transfer = None 

2874 

2875 if transfer is not None and directory is None: 

2876 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2877 

2878 if transfer == "move": 

2879 raise TypeError("Can not export by moving files out of datastore.") 

2880 elif transfer == "direct": 

2881 # For an export, treat this as equivalent to None. We do not 

2882 # want an import to risk using absolute URIs to datasets owned 

2883 # by another datastore. 

2884 log.info("Treating 'direct' transfer mode as in-place export.") 

2885 transfer = None 

2886 

2887 # Force the directory to be a URI object 

2888 directoryUri: ResourcePath | None = None 

2889 if directory is not None: 

2890 directoryUri = ResourcePath(directory, forceDirectory=True) 

2891 

2892 if transfer is not None and directoryUri is not None and not directoryUri.exists(): 

2893 # mypy needs the second test 

2894 raise FileNotFoundError(f"Export location {directory} does not exist") 

2895 

2896 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2897 for ref in progress.wrap(refs, "Exporting dataset files"): 

2898 fileLocations = self._get_dataset_locations_info(ref) 

2899 if not fileLocations: 

2900 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2901 # For now we can not export disassembled datasets 

2902 if len(fileLocations) > 1: 

2903 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2904 location, storedFileInfo = fileLocations[0] 

2905 

2906 pathInStore = location.pathInStore.path 

2907 if transfer is None: 

2908 # TODO: do we also need to return the readStorageClass somehow? 

2909 # We will use the path in store directly. If this is an 

2910 # absolute URI, preserve it. 

2911 if location.pathInStore.isabs(): 

2912 pathInStore = str(location.uri) 

2913 elif transfer == "direct": 

2914 # Use full URIs to the remote store in the export 

2915 pathInStore = str(location.uri) 

2916 else: 

2917 # mypy needs help 

2918 assert directoryUri is not None, "directoryUri must be defined to get here" 

2919 storeUri = ResourcePath(location.uri) 

2920 

2921 # if the datastore has an absolute URI to a resource, we 

2922 # have two options: 

2923 # 1. Keep the absolute URI in the exported YAML 

2924 # 2. Allocate a new name in the local datastore and transfer 

2925 # it. 

2926 # For now go with option 2 

2927 if location.pathInStore.isabs(): 

2928 template = self.templates.getTemplate(ref) 

2929 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2930 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2931 

2932 exportUri = directoryUri.join(pathInStore) 

2933 exportUri.transfer_from(storeUri, transfer=transfer) 

2934 

2935 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2936 
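
A hedged usage sketch for the export generator above. The `datastore` and `refs` names (an iterable of `DatasetRef`) are assumptions, and the export directory must exist before a transfer mode is used:

from lsst.resources import ResourcePath

export_dir = ResourcePath("/tmp/butler_export/", forceDirectory=True)
export_dir.mkdir()  # export() raises FileNotFoundError if the directory is missing

# transfer="copy" copies each artifact into export_dir; transfer=None (or
# "auto" without a directory) records the existing in-store paths instead.
for file_dataset in datastore.export(refs, directory=export_dir, transfer="copy"):
    print(file_dataset.path, [ref.id for ref in file_dataset.refs])
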

2937 @staticmethod 

2938 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None: 

2939 """Compute the checksum of the supplied file. 

2940 

2941 Parameters 

2942 ---------- 

2943 uri : `lsst.resources.ResourcePath` 

2944 Name of resource to calculate checksum from. 

2945 algorithm : `str`, optional 

2946 Name of algorithm to use. Must be one of the algorithms supported 

2947 by the :py:mod:`hashlib` module. 

2948 block_size : `int`, optional 

2949 Number of bytes to read from file at one time. 

2950 

2951 Returns 

2952 ------- 

2953 hexdigest : `str` or `None` 

2954 Hex digest of the file. 

2955 

2956 Notes 

2957 ----- 

2958 Currently returns None if the URI is for a remote resource. 

2959 """ 

2960 if algorithm not in hashlib.algorithms_guaranteed: 

2961 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib") 

2962 

2963 if not uri.isLocal: 

2964 return None 

2965 

2966 hasher = hashlib.new(algorithm) 

2967 

2968 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f: 

2969 for chunk in iter(lambda: f.read(block_size), b""): 

2970 hasher.update(chunk) 

2971 

2972 return hasher.hexdigest() 

2973 
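
A short usage sketch for the static helper above; the file path is illustrative and must point at an existing local file (a remote URI returns `None`, as noted):

from lsst.resources import ResourcePath

from lsst.daf.butler.datastores.fileDatastore import FileDatastore

uri = ResourcePath("/tmp/example_dataset.fits")
digest = FileDatastore.computeChecksum(uri, algorithm="blake2b", block_size=1 << 20)
if digest is not None:
    print(f"blake2b checksum of {uri}: {digest}")
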

2974 def needs_expanded_data_ids( 

2975 self, 

2976 transfer: str | None, 

2977 entity: DatasetRef | DatasetType | StorageClass | None = None, 

2978 ) -> bool: 

2979 # Docstring inherited. 

2980 # This _could_ also use entity to inspect whether the filename template 

2981 # involves placeholders other than the required dimensions for its 

2982 # dataset type, but that's not necessary for correctness; it just 

2983 # enables more optimizations (perhaps only in theory). 

2984 return transfer not in ("direct", None) 

2985 
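
The rule above amounts to a small truth table; a sketch, assuming `datastore` is a configured FileDatastore instance:

# Transfers that materialise files need expanded data IDs so that file
# templates can be formatted; "direct" and in-place (None) ingest do not.
assert datastore.needs_expanded_data_ids("copy")
assert datastore.needs_expanded_data_ids("move")
assert not datastore.needs_expanded_data_ids("direct")
assert not datastore.needs_expanded_data_ids(None)
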

2986 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2987 # Docstring inherited from the base class. 

2988 record_data = data.get(self.name) 

2989 if not record_data: 

2990 return 

2991 

2992 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records) 

2993 

2994 # TODO: Verify that there are no unexpected table names in the dict? 

2995 unpacked_records = [] 

2996 for dataset_data in record_data.records.values(): 

2997 records = dataset_data.get(self._table.name) 

2998 if records: 

2999 for info in records: 

3000 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

3001 unpacked_records.append(info.to_record()) 

3002 if unpacked_records: 

3003 self._table.insert(*unpacked_records, transaction=self._transaction) 

3004 

3005 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

3006 # Docstring inherited from the base class. 

3007 exported_refs = list(self._bridge.check(refs)) 

3008 ids = {ref.id for ref in exported_refs} 

3009 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids} 

3010 for row in self._table.fetch(dataset_id=ids): 

3011 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

3012 dataset_records = records.setdefault(info.dataset_id, {}) 

3013 dataset_records.setdefault(self._table.name, []).append(info) 

3014 

3015 record_data = DatastoreRecordData(records=records) 

3016 return {self.name: record_data} 

3017 
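
Together, export_records and import_records allow datastore bookkeeping to be copied between repositories; a minimal sketch, assuming `source_datastore` and `target_datastore` are FileDatastore instances with the same name and table layout and that `refs` resolve in the source:

# Collect the stored-file records for the given refs, keyed by datastore name.
exported = source_datastore.export_records(refs)

# Re-create those records (and the dataset/location links) in the target.
target_datastore.import_records(exported)
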

3018 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

3019 # Docstring inherited from the base class. 

3020 self._retrieve_dataset_method = method 

3021 

3022 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef: 

3023 """Update dataset reference to use the storage class from registry.""" 

3024 if self._retrieve_dataset_method is None: 

3025 # We could raise an exception here, but unit tests do not set 

3026 # this method. 

3027 return ref 

3028 dataset_type = self._retrieve_dataset_method(ref.datasetType.name) 

3029 if dataset_type is not None: 

3030 ref = ref.overrideStorageClass(dataset_type.storageClass) 

3031 return ref
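
A hedged sketch of wiring up the dataset-type lookup so that _cast_storage_class can pick up the registry's storage class. The `registry` name and the wrapper function are assumptions (Butler normally performs this wiring itself):

from lsst.daf.butler import DatasetType


def _lookup_dataset_type(name: str) -> DatasetType | None:
    # Match the Callable[[str], DatasetType | None] contract: return None
    # rather than raising when the registry does not know the dataset type.
    try:
        return registry.getDatasetType(name)  # registry: an lsst.daf.butler Registry
    except Exception:
        return None


datastore.set_retrieve_dataset_type_method(_lookup_dataset_type)
# Reads through this datastore can now coerce a ref's storage class to the
# registry definition via _cast_storage_class().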