Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%

991 statements  

coverage.py v7.3.2, created at 2023-10-25 15:14 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Generic file-based datastore code.""" 

23 

24from __future__ import annotations 

25 

26__all__ = ("FileDatastore",) 

27 

28import contextlib 

29import hashlib 

30import logging 

31from collections import defaultdict 

32from collections.abc import Callable, Iterable, Mapping, Sequence 

33from dataclasses import dataclass 

34from typing import TYPE_CHECKING, Any, ClassVar 

35 

36from lsst.daf.butler import ( 

37 CompositesMap, 

38 Config, 

39 DatasetId, 

40 DatasetRef, 

41 DatasetRefURIs, 

42 DatasetType, 

43 DatasetTypeNotSupportedError, 

44 Datastore, 

45 DatastoreCacheManager, 

46 DatastoreConfig, 

47 DatastoreDisabledCacheManager, 

48 DatastoreRecordData, 

49 DatastoreValidationError, 

50 FileDataset, 

51 FileDescriptor, 

52 FileTemplates, 

53 FileTemplateValidationError, 

54 Formatter, 

55 FormatterFactory, 

56 Location, 

57 LocationFactory, 

58 Progress, 

59 StorageClass, 

60 StoredDatastoreItemInfo, 

61 StoredFileInfo, 

62 ddl, 

63) 

64from lsst.daf.butler.core.repoRelocation import replaceRoot 

65from lsst.daf.butler.core.utils import transactional 

66from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

67from lsst.resources import ResourcePath, ResourcePathExpression 

68from lsst.utils.introspection import get_class_of, get_instance_of 

69from lsst.utils.iteration import chunk_iterable 

70 

71# For VERBOSE logging usage. 

72from lsst.utils.logging import VERBOSE, getLogger 

73from lsst.utils.timer import time_this 

74from sqlalchemy import BigInteger, String 

75 

76from ..registry.interfaces import DatabaseInsertMode, FakeDatasetRef 

77from .genericDatastore import GenericBaseDatastore 

78 

79if TYPE_CHECKING: 

80 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

81 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

82 

83log = getLogger(__name__) 

84 

85 

86class _IngestPrepData(Datastore.IngestPrepData): 

87 """Helper class for FileDatastore ingest implementation. 

88 

89 Parameters 

90 ---------- 

91 datasets : `~collections.abc.Iterable` of `FileDataset` 

92 Files to be ingested by this datastore. 

93 """ 

94 

95 def __init__(self, datasets: Iterable[FileDataset]): 

96 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

97 self.datasets = datasets 

98 

99 

100@dataclass(frozen=True) 

101class DatastoreFileGetInformation: 

102 """Collection of useful parameters needed to retrieve a file from 

103 a Datastore. 

104 """ 

105 

106 location: Location 

107 """The location from which to read the dataset.""" 

108 

109 formatter: Formatter 

110 """The `Formatter` to use to deserialize the dataset.""" 

111 

112 info: StoredFileInfo 

113 """Stored information about this file and its formatter.""" 

114 

115 assemblerParams: Mapping[str, Any] 

116 """Parameters to use for post-processing the retrieved dataset.""" 

117 

118 formatterParams: Mapping[str, Any] 

119 """Parameters that were understood by the associated formatter.""" 

120 

121 component: str | None 

122 """The component to be retrieved (can be `None`).""" 

123 

124 readStorageClass: StorageClass 

125 """The `StorageClass` of the dataset being read.""" 

126 

127 

128class FileDatastore(GenericBaseDatastore): 

129 """Generic Datastore for file-based implementations. 

130 

131 Should always be sub-classed since key abstract methods are missing. 

132 

133 Parameters 

134 ---------- 

135 config : `DatastoreConfig` or `str` 

136 Configuration as either a `Config` object or URI to file. 

137 bridgeManager : `DatastoreRegistryBridgeManager` 

138 Object that manages the interface between `Registry` and datastores. 

139 butlerRoot : `str`, optional 

140 New datastore root to use to override the configuration value. 

141 

142 Raises 

143 ------ 

144 ValueError 

145 If root location does not exist and ``create`` is `False` in the 

146 configuration. 

147 """ 

148 

149 defaultConfigFile: ClassVar[str | None] = None 

150 """Path to configuration defaults. Accessed within the ``config`` resource 

151 or relative to a search path. Can be None if no defaults specified. 

152 """ 

153 

154 root: ResourcePath 

155 """Root directory URI of this `Datastore`.""" 

156 

157 locationFactory: LocationFactory 

158 """Factory for creating locations relative to the datastore root.""" 

159 

160 formatterFactory: FormatterFactory 

161 """Factory for creating instances of formatters.""" 

162 

163 templates: FileTemplates 

164 """File templates that can be used by this `Datastore`.""" 

165 

166 composites: CompositesMap 

167 """Determines whether a dataset should be disassembled on put.""" 

168 

169 defaultConfigFile = "datastores/fileDatastore.yaml" 

170 """Path to configuration defaults. Accessed within the ``config`` resource 

171 or relative to a search path. Can be None if no defaults specified. 

172 """ 

173 

174 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None 

175 """Callable that is used in trusted mode to retrieve registry definition 

176 of a named dataset type. 

177 """ 

178 

179 @classmethod 

180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

181 """Set any filesystem-dependent config options for this Datastore to 

182 be appropriate for a new empty repository with the given root. 

183 

184 Parameters 

185 ---------- 

186 root : `str` 

187 URI to the root of the data repository. 

188 config : `Config` 

189 A `Config` to update. Only the subset understood by 

190 this component will be updated. Will not expand 

191 defaults. 

192 full : `Config` 

193 A complete config with all defaults expanded that can be 

194 converted to a `DatastoreConfig`. Read-only and will not be 

195 modified by this method. 

196 Repository-specific options that should not be obtained 

197 from defaults when Butler instances are constructed 

198 should be copied from ``full`` to ``config``. 

199 overwrite : `bool`, optional 

200 If `False`, do not modify a value in ``config`` if the value 

201 already exists. Default is always to overwrite with the provided 

202 ``root``. 

203 

204 Notes 

205 ----- 

206 If a keyword is explicitly defined in the supplied ``config`` it 

207 will not be overridden by this method if ``overwrite`` is `False`. 

208 This allows explicit values set in external configs to be retained. 

209 """ 

210 Config.updateParameters( 

211 DatastoreConfig, 

212 config, 

213 full, 

214 toUpdate={"root": root}, 

215 toCopy=("cls", ("records", "table")), 

216 overwrite=overwrite, 

217 ) 

218 

219 @classmethod 

220 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

221 return ddl.TableSpec( 

222 fields=[ 

223 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

224 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

225 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

226 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

227 # Use empty string to indicate no component 

228 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

229 # TODO: should checksum be Base64Bytes instead? 

230 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

231 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

232 ], 

233 unique=frozenset(), 

234 indexes=[ddl.IndexSpec("path")], 

235 ) 

236 
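For orientation, each row in the opaque records table defined by the spec above pairs a dataset ID with path, formatter and validation columns. A rough, purely illustrative sketch of such a record follows; every value (including the formatter name) is invented, and real rows are produced by StoredFileInfo.to_record():

example_record = {
    "dataset_id": "a1b2c3d4-...-uuid",  # primary key; column type comes from the registry
    "path": "raw/exposure_001.fits",    # path relative to the datastore root
    "formatter": "mypackage.formatters.ExampleFormatter",  # hypothetical formatter name
    "storage_class": "Exposure",        # hypothetical storage class name
    "component": "",                    # empty string means "no component"
    "checksum": None,                   # only populated when checksums are enabled
    "file_size": 123456,
}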

237 def __init__( 

238 self, 

239 config: DatastoreConfig | ResourcePathExpression, 

240 bridgeManager: DatastoreRegistryBridgeManager, 

241 butlerRoot: str | None = None, 

242 ): 

243 super().__init__(config, bridgeManager) 

244 if "root" not in self.config: 

245 raise ValueError("No root directory specified in configuration") 

246 

247 self._bridgeManager = bridgeManager 

248 

249 # Name ourselves either using an explicit name or a name 

250 # derived from the (unexpanded) root 

251 if "name" in self.config: 

252 self.name = self.config["name"] 

253 else: 

254 # We use the unexpanded root in the name to indicate that this 

255 # datastore can be moved without having to update registry. 

256 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

257 

258 # Support repository relocation in config 

259 # Existence of self.root is checked in subclass 

260 self.root = ResourcePath( 

261 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

262 ) 

263 

264 self.locationFactory = LocationFactory(self.root) 

265 self.formatterFactory = FormatterFactory() 

266 

267 # Now associate formatters with storage classes 

268 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

269 

270 # Read the file naming templates 

271 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

272 

273 # See if composites should be disassembled 

274 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

275 

276 tableName = self.config["records", "table"] 

277 try: 

278 # Storage of paths and formatters, keyed by dataset_id 

279 self._table = bridgeManager.opaque.register( 

280 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

281 ) 

282 # Interface to Registry. 

283 self._bridge = bridgeManager.register(self.name) 

284 except ReadOnlyDatabaseError: 

285 # If the database is read only and we just tried and failed to 

286 # create a table, it means someone is trying to create a read-only 

287 # butler client for an empty repo. That should be okay, as long 

288 # as they do not then try to get any datasets before some other 

289 # client creates the table. Chances are they're just validating 

290 # configuration. 

291 pass 

292 

293 # Determine whether checksums should be used - default to False 

294 self.useChecksum = self.config.get("checksum", False) 

295 

296 # Determine whether we can fall back to configuration if a 

297 # requested dataset is not known to registry 

298 self.trustGetRequest = self.config.get("trust_get_request", False) 

299 

300 # Create a cache manager 

301 self.cacheManager: AbstractDatastoreCacheManager 

302 if "cached" in self.config: 

303 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

304 else: 

305 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

306 

307 # Check existence and create directory structure if necessary 

308 if not self.root.exists(): 

309 if "create" not in self.config or not self.config["create"]: 

310 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

311 try: 

312 self.root.mkdir() 

313 except Exception as e: 

314 raise ValueError( 

315 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

316 ) from e 

317 

318 def __str__(self) -> str: 

319 return str(self.root) 

320 

321 @property 

322 def bridge(self) -> DatastoreRegistryBridge: 

323 return self._bridge 

324 

325 @property 

326 def roots(self) -> dict[str, ResourcePath | None]: 

327 # Docstring inherited. 

328 return {self.name: self.root} 

329 

330 def _artifact_exists(self, location: Location) -> bool: 

331 """Check that an artifact exists in this datastore at the specified 

332 location. 

333 

334 Parameters 

335 ---------- 

336 location : `Location` 

337 Expected location of the artifact associated with this datastore. 

338 

339 Returns 

340 ------- 

341 exists : `bool` 

342 `True` if the location can be found, `False` otherwise. 

343 """ 

344 log.debug("Checking if resource exists: %s", location.uri) 

345 return location.uri.exists() 

346 

347 def _delete_artifact(self, location: Location) -> None: 

348 """Delete the artifact from the datastore. 

349 

350 Parameters 

351 ---------- 

352 location : `Location` 

353 Location of the artifact associated with this datastore. 

354 """ 

355 if location.pathInStore.isabs(): 

356 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

357 

358 try: 

359 location.uri.remove() 

360 except FileNotFoundError: 

361 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

362 raise 

363 except Exception as e: 

364 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

365 raise 

366 log.debug("Successfully deleted file: %s", location.uri) 

367 

368 def addStoredItemInfo( 

369 self, 

370 refs: Iterable[DatasetRef], 

371 infos: Iterable[StoredFileInfo], 

372 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT, 

373 ) -> None: 

374 # Docstring inherited from GenericBaseDatastore 

375 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos, strict=True)] 

376 match insert_mode: 

377 case DatabaseInsertMode.INSERT: 

378 self._table.insert(*records, transaction=self._transaction) 

379 case DatabaseInsertMode.ENSURE: 

380 self._table.ensure(*records, transaction=self._transaction) 

381 case DatabaseInsertMode.REPLACE: 

382 self._table.replace(*records, transaction=self._transaction) 

383 case _: 

384 raise ValueError(f"Unknown insert mode of '{insert_mode}'") 

385 
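A minimal usage sketch of the insert modes dispatched above (not part of the module; ``datastore``, ``refs`` and ``infos`` are assumed to already exist):

from lsst.daf.butler.registry.interfaces import DatabaseInsertMode

# Plain insert: fails if a record for this (dataset_id, component) already exists.
datastore.addStoredItemInfo(refs, infos, insert_mode=DatabaseInsertMode.INSERT)

# Idempotent alternatives; direct-mode ingest of immutable (UUIDv5) datasets
# later in this module uses REPLACE.
datastore.addStoredItemInfo(refs, infos, insert_mode=DatabaseInsertMode.ENSURE)
datastore.addStoredItemInfo(refs, infos, insert_mode=DatabaseInsertMode.REPLACE)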

386 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredFileInfo]: 

387 # Docstring inherited from GenericBaseDatastore 

388 

389 # Look for the dataset_id -- there might be multiple matches 

390 # if we have disassembled the dataset. 

391 records = self._table.fetch(dataset_id=ref.id) 

392 return [StoredFileInfo.from_record(record) for record in records] 

393 

394 def _get_stored_records_associated_with_refs( 

395 self, refs: Iterable[DatasetIdRef] 

396 ) -> dict[DatasetId, list[StoredFileInfo]]: 

397 """Retrieve all records associated with the provided refs. 

398 

399 Parameters 

400 ---------- 

401 refs : iterable of `DatasetIdRef` 

402 The refs for which records are to be retrieved. 

403 

404 Returns 

405 ------- 

406 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

407 The matching records indexed by the ref ID. The number of entries 

408 in the dict can be smaller than the number of requested refs. 

409 """ 

410 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

411 

412 # Uniqueness is dataset_id + component so can have multiple records 

413 # per ref. 

414 records_by_ref = defaultdict(list) 

415 for record in records: 

416 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

417 return records_by_ref 

418 

419 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]: 

420 """Return paths and associated dataset refs. 

421 

422 Parameters 

423 ---------- 

424 paths : `list` of `str` or `lsst.resources.ResourcePath` 

425 All the paths to include in search. 

426 

427 Returns 

428 ------- 

429 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

430 Mapping of each path to a set of associated database IDs. 

431 """ 

432 records = self._table.fetch(path=[str(path) for path in paths]) 

433 result = defaultdict(set) 

434 for row in records: 

435 result[row["path"]].add(row["dataset_id"]) 

436 return result 

437 

438 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]: 

439 """Return all dataset refs associated with the supplied path. 

440 

441 Parameters 

442 ---------- 

443 pathInStore : `lsst.resources.ResourcePath` 

444 Path of interest in the data store. 

445 

446 Returns 

447 ------- 

448 ids : `set` [`DatasetId`] 

449 All `DatasetRef` IDs associated with this path. 

450 """ 

451 records = list(self._table.fetch(path=str(pathInStore))) 

452 ids = {r["dataset_id"] for r in records} 

453 return ids 

454 

455 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

456 # Docstring inherited from GenericBaseDatastore 

457 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

458 

459 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> list[tuple[Location, StoredFileInfo]]: 

460 r"""Find all the `Location`\ s of the requested dataset in the 

461 `Datastore` and the associated stored file information. 

462 

463 Parameters 

464 ---------- 

465 ref : `DatasetRef` 

466 Reference to the required `Dataset`. 

467 

468 Returns 

469 ------- 

470 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

471 Location of the dataset within the datastore and 

472 stored information about each file and its formatter. 

473 """ 

474 # Get the file information (this will fail if no file) 

475 records = self.getStoredItemsInfo(ref) 

476 

477 # Use the path to determine the location -- we need to take 

478 # into account absolute URIs in the datastore record 

479 return [(r.file_location(self.locationFactory), r) for r in records] 

480 

481 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

482 """Check that there is only one dataset associated with the 

483 specified artifact. 

484 

485 Parameters 

486 ---------- 

487 ref : `DatasetRef` or `FakeDatasetRef` 

488 Dataset to be removed. 

489 location : `Location` 

490 The location of the artifact to be removed. 

491 

492 Returns 

493 ------- 

494 can_remove : `bool` 

495 True if the artifact can be safely removed. 

496 """ 

497 # Can't ever delete absolute URIs. 

498 if location.pathInStore.isabs(): 

499 return False 

500 

501 # Get all entries associated with this path 

502 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

503 if not allRefs: 

504 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

505 

506 # Remove these refs from all the refs and if there is nothing left 

507 # then we can delete 

508 remainingRefs = allRefs - {ref.id} 

509 

510 if remainingRefs: 

511 return False 

512 return True 

513 
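A sketch of how this guard is typically used when trimming datasets (illustrative only; ``datastore`` and ``ref`` are assumed to exist):

for location, _ in datastore._get_dataset_locations_info(ref):
    if datastore._can_remove_dataset_artifact(ref, location):
        # No other dataset shares this file, so the artifact can go.
        datastore._delete_artifact(location)
    else:
        # The file is shared (or has an absolute URI), so only the
        # datastore record for this ref should be removed.
        pass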

514 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]: 

515 """Predict the location and related file information of the requested 

516 dataset in this datastore. 

517 

518 Parameters 

519 ---------- 

520 ref : `DatasetRef` 

521 Reference to the required `Dataset`. 

522 

523 Returns 

524 ------- 

525 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

526 Expected Location of the dataset within the datastore and 

527 placeholder information about each file and its formatter. 

528 

529 Notes 

530 ----- 

531 Uses the current configuration to determine how we would expect the 

532 datastore files to have been written if we couldn't ask registry. 

533 This is safe so long as there has been no change to datastore 

534 configuration between writing the dataset and wanting to read it. 

535 Will not work for files that have been ingested without using the 

536 standard file template or default formatter. 

537 """ 

538 # If we have a component ref we always need to ask the questions 

539 # of the composite. If the composite is disassembled this routine 

540 # should return all components. If the composite was not 

541 # disassembled the composite is what is stored regardless of 

542 # component request. Note that if the caller has disassembled 

543 # a composite there is no way for this guess to know that 

544 # without trying both the composite and component ref and seeing 

545 # if there is something at the component Location even without 

546 # disassembly being enabled. 

547 if ref.datasetType.isComponent(): 

548 ref = ref.makeCompositeRef() 

549 

550 # See if the ref is a composite that should be disassembled 

551 doDisassembly = self.composites.shouldBeDisassembled(ref) 

552 

553 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = [] 

554 

555 if doDisassembly: 

556 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

557 compRef = ref.makeComponentRef(component) 

558 location, formatter = self._determine_put_formatter_location(compRef) 

559 all_info.append((location, formatter, componentStorage, component)) 

560 

561 else: 

562 # Always use the composite ref if no disassembly 

563 location, formatter = self._determine_put_formatter_location(ref) 

564 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

565 

566 # Convert the list of tuples to have StoredFileInfo as second element 

567 return [ 

568 ( 

569 location, 

570 StoredFileInfo( 

571 formatter=formatter, 

572 path=location.pathInStore.path, 

573 storageClass=storageClass, 

574 component=component, 

575 checksum=None, 

576 file_size=-1, 

577 dataset_id=ref.id, 

578 ), 

579 ) 

580 for location, formatter, storageClass, component in all_info 

581 ] 

582 

583 def _prepare_for_get( 

584 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None 

585 ) -> list[DatastoreFileGetInformation]: 

586 """Check parameters for ``get`` and obtain formatter and 

587 location. 

588 

589 Parameters 

590 ---------- 

591 ref : `DatasetRef` 

592 Reference to the required Dataset. 

593 parameters : `dict` 

594 `StorageClass`-specific parameters that specify, for example, 

595 a slice of the dataset to be loaded. 

596 

597 Returns 

598 ------- 

599 getInfo : `list` [`DatastoreFileGetInformation`] 

600 Parameters needed to retrieve each file. 

601 """ 

602 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

603 

604 # The storage class we want to use eventually 

605 refStorageClass = ref.datasetType.storageClass 

606 

607 # For trusted mode need to reset storage class. 

608 ref = self._cast_storage_class(ref) 

609 

610 # Get file metadata and internal metadata 

611 fileLocations = self._get_dataset_locations_info(ref) 

612 if not fileLocations: 

613 if not self.trustGetRequest: 

614 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

615 # Assume the dataset is where we think it should be 

616 fileLocations = self._get_expected_dataset_locations_info(ref) 

617 

618 if len(fileLocations) > 1: 

619 disassembled = True 

620 

621 # If trust is involved it is possible that there will be 

622 # components listed here that do not exist in the datastore. 

623 # Explicitly check for file artifact existence and filter out any 

624 # that are missing. 

625 if self.trustGetRequest: 

626 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

627 

628 # For now complain only if we have no components at all. One 

629 # component is probably a problem but we can punt that to the 

630 # assembler. 

631 if not fileLocations: 

632 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

633 

634 else: 

635 disassembled = False 

636 

637 # Is this a component request? 

638 refComponent = ref.datasetType.component() 

639 

640 fileGetInfo = [] 

641 for location, storedFileInfo in fileLocations: 

642 # The storage class used to write the file 

643 writeStorageClass = storedFileInfo.storageClass 

644 

645 # If this has been disassembled we need read to match the write 

646 if disassembled: 

647 readStorageClass = writeStorageClass 

648 else: 

649 readStorageClass = refStorageClass 

650 

651 formatter = get_instance_of( 

652 storedFileInfo.formatter, 

653 FileDescriptor( 

654 location, 

655 readStorageClass=readStorageClass, 

656 storageClass=writeStorageClass, 

657 parameters=parameters, 

658 ), 

659 ref.dataId, 

660 ) 

661 

662 formatterParams, notFormatterParams = formatter.segregateParameters() 

663 

664 # Of the remaining parameters, extract the ones supported by 

665 # this StorageClass (for components not all will be handled) 

666 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

667 

668 # The ref itself could be a component if the dataset was 

669 # disassembled by butler, or we disassembled in datastore and 

670 # components came from the datastore records 

671 component = storedFileInfo.component if storedFileInfo.component else refComponent 

672 

673 fileGetInfo.append( 

674 DatastoreFileGetInformation( 

675 location, 

676 formatter, 

677 storedFileInfo, 

678 assemblerParams, 

679 formatterParams, 

680 component, 

681 readStorageClass, 

682 ) 

683 ) 

684 

685 return fileGetInfo 

686 
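A sketch of how the returned records are typically consumed by a ``get`` implementation (illustrative; ``datastore`` and ``ref`` are assumed):

for getInfo in datastore._prepare_for_get(ref, parameters=None):
    print(
        getInfo.location.uri,           # where the artifact lives
        getInfo.formatter.name(),       # formatter used to deserialize it
        getInfo.component,              # component being read, if any
        getInfo.readStorageClass.name,  # storage class handed back to the caller
    )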

687 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]: 

688 """Check the arguments for ``put`` and obtain formatter and 

689 location. 

690 

691 Parameters 

692 ---------- 

693 inMemoryDataset : `object` 

694 The dataset to store. 

695 ref : `DatasetRef` 

696 Reference to the associated Dataset. 

697 

698 Returns 

699 ------- 

700 location : `Location` 

701 The location to write the dataset. 

702 formatter : `Formatter` 

703 The `Formatter` to use to write the dataset. 

704 

705 Raises 

706 ------ 

707 TypeError 

708 Supplied object and storage class are inconsistent. 

709 DatasetTypeNotSupportedError 

710 The associated `DatasetType` is not handled by this datastore. 

711 """ 

712 self._validate_put_parameters(inMemoryDataset, ref) 

713 return self._determine_put_formatter_location(ref) 

714 

715 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]: 

716 """Calculate the formatter and output location to use for put. 

717 

718 Parameters 

719 ---------- 

720 ref : `DatasetRef` 

721 Reference to the associated Dataset. 

722 

723 Returns 

724 ------- 

725 location : `Location` 

726 The location to write the dataset. 

727 formatter : `Formatter` 

728 The `Formatter` to use to write the dataset. 

729 """ 

730 # Work out output file name 

731 try: 

732 template = self.templates.getTemplate(ref) 

733 except KeyError as e: 

734 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

735 

736 # Validate the template to protect against filenames from different 

737 # dataIds returning the same and causing overwrite confusion. 

738 template.validateTemplate(ref) 

739 

740 location = self.locationFactory.fromPath(template.format(ref)) 

741 

742 # Get the formatter based on the storage class 

743 storageClass = ref.datasetType.storageClass 

744 try: 

745 formatter = self.formatterFactory.getFormatter( 

746 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

747 ) 

748 except KeyError as e: 

749 raise DatasetTypeNotSupportedError( 

750 f"Unable to find formatter for {ref} in datastore {self.name}" 

751 ) from e 

752 

753 # Now that we know the formatter, update the location 

754 location = formatter.makeUpdatedLocation(location) 

755 

756 return location, formatter 

757 

758 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None: 

759 # Docstring inherited from base class 

760 if transfer != "auto": 

761 return transfer 

762 

763 # See if the paths are within the datastore or not 

764 inside = [self._pathInStore(d.path) is not None for d in datasets] 

765 

766 if all(inside): 

767 transfer = None 

768 elif not any(inside): 

769 # Allow ResourcePath to use its own knowledge 

770 transfer = "auto" 

771 else: 

772 # This can happen when importing from a datastore that 

773 # has had some datasets ingested using "direct" mode. 

774 # Also allow ResourcePath to sort it out, but warn about it 

775 # because the files outside the datastore will not be copied 

776 # into the target datastore. 

777 log.warning( 

778 "Some datasets are inside the datastore and some are outside. Using 'split' " 

779 "transfer mode. This assumes that the files outside the datastore are " 

780 "still accessible to the new butler since they will not be copied into " 

781 "the target datastore." 

782 ) 

783 transfer = "split" 

784 

785 return transfer 

786 
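In short, ``auto`` resolves as follows (a hedged sketch; ``datastore`` and ``datasets`` are assumed to exist):

# - every path already inside the datastore root -> None (ingest in place)
# - every path outside the root                  -> "auto" (ResourcePath decides)
# - a mixture of the two                         -> "split", with a warning
resolved = datastore._overrideTransferMode(*datasets, transfer="auto")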

787 def _pathInStore(self, path: ResourcePathExpression) -> str | None: 

788 """Return path relative to datastore root. 

789 

790 Parameters 

791 ---------- 

792 path : `lsst.resources.ResourcePathExpression` 

793 Path to dataset. Can be an absolute URI. If relative, it is 

794 assumed to be relative to the datastore root. `None` is 

795 returned if the path is outside the datastore. 

796 

797 Returns 

798 ------- 

799 inStore : `str` 

800 Path relative to datastore root. Returns `None` if the file is 

801 outside the root. 

802 """ 

803 # Relative path will always be relative to datastore 

804 pathUri = ResourcePath(path, forceAbsolute=False) 

805 return pathUri.relative_to(self.root) 

806 
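Expected behaviour, per the docstring above (illustrative values; ``datastore.root`` is assumed to be ``file:///repo/``):

datastore._pathInStore("raw/file.fits")               # -> "raw/file.fits"
datastore._pathInStore("file:///repo/raw/file.fits")  # -> "raw/file.fits"
datastore._pathInStore("file:///elsewhere/file.fits") # -> None (outside the root)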

807 def _standardizeIngestPath( 

808 self, path: str | ResourcePath, *, transfer: str | None = None 

809 ) -> str | ResourcePath: 

810 """Standardize the path of a to-be-ingested file. 

811 

812 Parameters 

813 ---------- 

814 path : `str` or `lsst.resources.ResourcePath` 

815 Path of a file to be ingested. This parameter is not expected 

816 to be all the types that can be used to construct a 

817 `~lsst.resources.ResourcePath`. 

818 transfer : `str`, optional 

819 How (and whether) the dataset should be added to the datastore. 

820 See `ingest` for details of transfer modes. 

821 This implementation is provided only so 

822 `NotImplementedError` can be raised if the mode is not supported; 

823 actual transfers are deferred to `_extractIngestInfo`. 

824 

825 Returns 

826 ------- 

827 path : `str` or `lsst.resources.ResourcePath` 

828 New path in what the datastore considers standard form. If an 

829 absolute URI was given that will be returned unchanged. 

830 

831 Notes 

832 ----- 

833 Subclasses of `FileDatastore` can implement this method instead 

834 of `_prepIngest`. It should not modify the data repository or given 

835 file in any way. 

836 

837 Raises 

838 ------ 

839 NotImplementedError 

840 Raised if the datastore does not support the given transfer mode 

841 (including the case where ingest is not supported at all). 

842 FileNotFoundError 

843 Raised if one of the given files does not exist. 

844 """ 

845 if transfer not in (None, "direct", "split") + self.root.transferModes: 

846 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

847 

848 # A relative URI indicates relative to datastore root 

849 srcUri = ResourcePath(path, forceAbsolute=False) 

850 if not srcUri.isabs(): 

851 srcUri = self.root.join(path) 

852 

853 if not srcUri.exists(): 

854 raise FileNotFoundError( 

855 f"Resource at {srcUri} does not exist; note that paths to ingest " 

856 f"are assumed to be relative to {self.root} unless they are absolute." 

857 ) 

858 

859 if transfer is None: 

860 relpath = srcUri.relative_to(self.root) 

861 if not relpath: 

862 raise RuntimeError( 

863 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

864 ) 

865 

866 # Return the relative path within the datastore for internal 

867 # transfer 

868 path = relpath 

869 

870 return path 

871 

872 def _extractIngestInfo( 

873 self, 

874 path: ResourcePathExpression, 

875 ref: DatasetRef, 

876 *, 

877 formatter: Formatter | type[Formatter], 

878 transfer: str | None = None, 

879 record_validation_info: bool = True, 

880 ) -> StoredFileInfo: 

881 """Relocate (if necessary) and extract `StoredFileInfo` from a 

882 to-be-ingested file. 

883 

884 Parameters 

885 ---------- 

886 path : `lsst.resources.ResourcePathExpression` 

887 URI or path of a file to be ingested. 

888 ref : `DatasetRef` 

889 Reference for the dataset being ingested. Guaranteed to have 

890 a non-`None` ``dataset_id``. 

891 formatter : `type` or `Formatter` 

892 `Formatter` subclass to use for this dataset or an instance. 

893 transfer : `str`, optional 

894 How (and whether) the dataset should be added to the datastore. 

895 See `ingest` for details of transfer modes. 

896 record_validation_info : `bool`, optional 

897 If `True`, the default, the datastore can record validation 

898 information associated with the file. If `False` the datastore 

899 will not attempt to track any information such as checksums 

900 or file sizes. This can be useful if such information is tracked 

901 in an external system or if the file is to be compressed in place. 

902 It is up to the datastore whether this parameter is relevant. 

903 

904 Returns 

905 ------- 

906 info : `StoredFileInfo` 

907 Internal datastore record for this file. This will be inserted by 

908 the caller; the `_extractIngestInfo` is only responsible for 

909 creating and populating the struct. 

910 

911 Raises 

912 ------ 

913 FileNotFoundError 

914 Raised if one of the given files does not exist. 

915 FileExistsError 

916 Raised if transfer is not `None` but the (internal) location the 

917 file would be moved to is already occupied. 

918 """ 

919 if self._transaction is None: 

920 raise RuntimeError("Ingest called without transaction enabled") 

921 

922 # Create URI of the source path, do not need to force a relative 

923 # path to absolute. 

924 srcUri = ResourcePath(path, forceAbsolute=False) 

925 

926 # Track whether we have read the size of the source yet 

927 have_sized = False 

928 

929 tgtLocation: Location | None 

930 if transfer is None or transfer == "split": 

931 # A relative path is assumed to be relative to the datastore 

932 # in this context 

933 if not srcUri.isabs(): 

934 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

935 else: 

936 # Work out the path in the datastore from an absolute URI 

937 # This is required to be within the datastore. 

938 pathInStore = srcUri.relative_to(self.root) 

939 if pathInStore is None and transfer is None: 

940 raise RuntimeError( 

941 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

942 ) 

943 if pathInStore: 

944 tgtLocation = self.locationFactory.fromPath(pathInStore) 

945 elif transfer == "split": 

946 # Outside the datastore but treat that as a direct ingest 

947 # instead. 

948 tgtLocation = None 

949 else: 

950 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

951 elif transfer == "direct": 

952 # Want to store the full URI to the resource directly in 

953 # datastore. This is useful for referring to permanent archive 

954 # storage for raw data. 

955 # Trust that people know what they are doing. 

956 tgtLocation = None 

957 else: 

958 # Work out the name we want this ingested file to have 

959 # inside the datastore 

960 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

961 if not tgtLocation.uri.dirname().exists(): 

962 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

963 tgtLocation.uri.dirname().mkdir() 

964 

965 # if we are transferring from a local file to a remote location 

966 # it may be more efficient to get the size and checksum of the 

967 # local file rather than the transferred one 

968 if record_validation_info and srcUri.isLocal: 

969 size = srcUri.size() 

970 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

971 have_sized = True 

972 

973 # Transfer the resource to the destination. 

974 # Allow overwrite of an existing file. This matches the behavior 

975 # of datastore.put() in that it trusts that registry would not 

976 # be asking to overwrite unless registry thought that the 

977 # overwrite was allowed. 

978 tgtLocation.uri.transfer_from( 

979 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

980 ) 

981 

982 if tgtLocation is None: 

983 # This means we are using direct mode 

984 targetUri = srcUri 

985 targetPath = str(srcUri) 

986 else: 

987 targetUri = tgtLocation.uri 

988 targetPath = tgtLocation.pathInStore.path 

989 

990 # the file should exist in the datastore now 

991 if record_validation_info: 

992 if not have_sized: 

993 size = targetUri.size() 

994 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

995 else: 

996 # Not recording any file information. 

997 size = -1 

998 checksum = None 

999 

1000 return StoredFileInfo( 

1001 formatter=formatter, 

1002 path=targetPath, 

1003 storageClass=ref.datasetType.storageClass, 

1004 component=ref.datasetType.component(), 

1005 file_size=size, 

1006 checksum=checksum, 

1007 dataset_id=ref.id, 

1008 ) 

1009 

1010 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

1011 # Docstring inherited from Datastore._prepIngest. 

1012 filtered = [] 

1013 for dataset in datasets: 

1014 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

1015 if not acceptable: 

1016 continue 

1017 else: 

1018 dataset.refs = acceptable 

1019 if dataset.formatter is None: 

1020 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1021 else: 

1022 assert isinstance(dataset.formatter, type | str) 

1023 formatter_class = get_class_of(dataset.formatter) 

1024 if not issubclass(formatter_class, Formatter): 

1025 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1026 dataset.formatter = formatter_class 

1027 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1028 filtered.append(dataset) 

1029 return _IngestPrepData(filtered) 

1030 

1031 @transactional 

1032 def _finishIngest( 

1033 self, 

1034 prepData: Datastore.IngestPrepData, 

1035 *, 

1036 transfer: str | None = None, 

1037 record_validation_info: bool = True, 

1038 ) -> None: 

1039 # Docstring inherited from Datastore._finishIngest. 

1040 refsAndInfos = [] 

1041 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1042 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1043 # Do ingest as if the first dataset ref is associated with the file 

1044 info = self._extractIngestInfo( 

1045 dataset.path, 

1046 dataset.refs[0], 

1047 formatter=dataset.formatter, 

1048 transfer=transfer, 

1049 record_validation_info=record_validation_info, 

1050 ) 

1051 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1052 

1053 # In direct mode we can allow repeated ingests of the same thing 

1054 # if we are sure that the external dataset is immutable. We use 

1055 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are 

1056 # separated. 

1057 refs_and_infos_replace = [] 

1058 refs_and_infos_insert = [] 

1059 if transfer == "direct": 

1060 for entry in refsAndInfos: 

1061 if entry[0].id.version == 5: 

1062 refs_and_infos_replace.append(entry) 

1063 else: 

1064 refs_and_infos_insert.append(entry) 

1065 else: 

1066 refs_and_infos_insert = refsAndInfos 

1067 

1068 if refs_and_infos_insert: 

1069 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT) 

1070 if refs_and_infos_replace: 

1071 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE) 

1072 

1073 def _calculate_ingested_datastore_name( 

1074 self, 

1075 srcUri: ResourcePath, 

1076 ref: DatasetRef, 

1077 formatter: Formatter | type[Formatter] | None = None, 

1078 ) -> Location: 

1079 """Given a source URI and a DatasetRef, determine the name the 

1080 dataset will have inside datastore. 

1081 

1082 Parameters 

1083 ---------- 

1084 srcUri : `lsst.resources.ResourcePath` 

1085 URI to the source dataset file. 

1086 ref : `DatasetRef` 

1087 Ref associated with the newly-ingested dataset artifact. This 

1088 is used to determine the name within the datastore. 

1089 formatter : `Formatter` or Formatter class. 

1090 Formatter to use for validation. Can be a class or an instance. 

1091 No validation of the file extension is performed if the 

1092 ``formatter`` is `None`. This can be used if the caller knows 

1093 that the source URI and target URI will use the same formatter. 

1094 

1095 Returns 

1096 ------- 

1097 location : `Location` 

1098 Target location for the newly-ingested dataset. 

1099 """ 

1100 # Ingesting a file from outside the datastore. 

1101 # This involves a new name. 

1102 template = self.templates.getTemplate(ref) 

1103 location = self.locationFactory.fromPath(template.format(ref)) 

1104 

1105 # Get the extension 

1106 ext = srcUri.getExtension() 

1107 

1108 # Update the destination to include that extension 

1109 location.updateExtension(ext) 

1110 

1111 # Ask the formatter to validate this extension 

1112 if formatter is not None: 

1113 formatter.validateExtension(location) 

1114 

1115 return location 

1116 
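A sketch of the naming logic (illustrative; ``datastore`` and ``ref`` are assumed, and the paths are invented):

from lsst.resources import ResourcePath

srcUri = ResourcePath("/staging/exposure_001.fits")
# The file template gives the base name inside the datastore...
base = datastore.templates.getTemplate(ref).format(ref)
# ...and _calculate_ingested_datastore_name re-attaches the source extension.
tgt = datastore._calculate_ingested_datastore_name(srcUri, ref)
# tgt.uri ends with ".fits" and is rooted under datastore.root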

1117 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1118 """Write out in memory dataset to datastore. 

1119 

1120 Parameters 

1121 ---------- 

1122 inMemoryDataset : `object` 

1123 Dataset to write to datastore. 

1124 ref : `DatasetRef` 

1125 Registry information associated with this dataset. 

1126 

1127 Returns 

1128 ------- 

1129 info : `StoredFileInfo` 

1130 Information describing the artifact written to the datastore. 

1131 """ 

1132 # May need to coerce the in memory dataset to the correct 

1133 # python type, but first we need to make sure the storage class 

1134 # reflects the one defined in the data repository. 

1135 ref = self._cast_storage_class(ref) 

1136 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1137 

1138 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1139 uri = location.uri 

1140 

1141 if not uri.dirname().exists(): 

1142 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1143 uri.dirname().mkdir() 

1144 

1145 if self._transaction is None: 

1146 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1147 

1148 def _removeFileExists(uri: ResourcePath) -> None: 

1149 """Remove a file and do not complain if it is not there. 

1150 

1151 This is important since a formatter might fail before the file 

1152 is written and we should not confuse people by writing spurious 

1153 error messages to the log. 

1154 """ 

1155 with contextlib.suppress(FileNotFoundError): 

1156 uri.remove() 

1157 

1158 # Register a callback to try to delete the uploaded data if 

1159 # something fails below 

1160 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1161 

1162 data_written = False 

1163 

1164 # For remote URIs some datasets can be serialized directly 

1165 # to bytes and sent to the remote datastore without writing a 

1166 # file. If the dataset is intended to be saved to the cache 

1167 # a file is always written and direct write to the remote 

1168 # datastore is bypassed. 

1169 if not uri.isLocal and not self.cacheManager.should_be_cached(ref): 

1170 # Remote URI that is not cached so can write directly. 

1171 try: 

1172 serializedDataset = formatter.toBytes(inMemoryDataset) 

1173 except NotImplementedError: 

1174 # Fallback to the file writing option. 

1175 pass 

1176 except Exception as e: 

1177 raise RuntimeError( 

1178 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1179 ) from e 

1180 else: 

1181 log.debug("Writing bytes directly to %s", uri) 

1182 uri.write(serializedDataset, overwrite=True) 

1183 log.debug("Successfully wrote bytes directly to %s", uri) 

1184 data_written = True 

1185 

1186 if not data_written: 

1187 # Did not write the bytes directly to object store so instead 

1188 # write to temporary file. Always write to a temporary even if 

1189 # using a local file system -- that gives us atomic writes. 

1190 # If a process is killed as the file is being written we do not 

1191 # want it to remain in the correct place but in corrupt state. 

1192 # For local files write to the output directory not temporary dir. 

1193 prefix = uri.dirname() if uri.isLocal else None 

1194 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri: 

1195 # Need to configure the formatter to write to a different 

1196 # location and that needs us to overwrite internals 

1197 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1198 with formatter._updateLocation(Location(None, temporary_uri)): 

1199 try: 

1200 formatter.write(inMemoryDataset) 

1201 except Exception as e: 

1202 raise RuntimeError( 

1203 f"Failed to serialize dataset {ref} of type" 

1204 f" {type(inMemoryDataset)} to " 

1205 f"temporary location {temporary_uri}" 

1206 ) from e 

1207 

1208 # Use move for a local file since that becomes an efficient 

1209 # os.rename. For remote resources we use copy to allow the 

1210 # file to be cached afterwards. 

1211 transfer = "move" if uri.isLocal else "copy" 

1212 

1213 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True) 

1214 

1215 if transfer == "copy": 

1216 # Cache if required 

1217 self.cacheManager.move_to_cache(temporary_uri, ref) 

1218 

1219 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1220 

1221 # URI is needed to resolve which ingest case we are dealing with. 

1222 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1223 
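The write strategy above can be summarised with a hypothetical helper (not part of this module) that mirrors the branch on locality and cacheability:

def _choose_write_strategy(uri, ref, cache_manager) -> str:
    # Hypothetical summary helper, for orientation only.
    if not uri.isLocal and not cache_manager.should_be_cached(ref):
        # Try formatter.toBytes() + uri.write(); fall back to the
        # temporary-file path if the formatter cannot serialize to bytes.
        return "direct-bytes-with-fallback"
    # Local targets, and remote targets that should also be cached, are
    # written to a temporary file first and then moved/copied into place.
    return "temporary-file"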

1224 def _read_artifact_into_memory( 

1225 self, 

1226 getInfo: DatastoreFileGetInformation, 

1227 ref: DatasetRef, 

1228 isComponent: bool = False, 

1229 cache_ref: DatasetRef | None = None, 

1230 ) -> Any: 

1231 """Read the artifact from datastore into in memory object. 

1232 

1233 Parameters 

1234 ---------- 

1235 getInfo : `DatastoreFileGetInformation` 

1236 Information about the artifact within the datastore. 

1237 ref : `DatasetRef` 

1238 The registry information associated with this artifact. 

1239 isComponent : `bool` 

1240 Flag to indicate if a component is being read from this artifact. 

1241 cache_ref : `DatasetRef`, optional 

1242 The DatasetRef to use when looking up the file in the cache. 

1243 This ref must have the same ID as the supplied ref but can 

1244 be a parent ref or component ref to indicate to the cache whether 

1245 a composite file is being requested from the cache or a component 

1246 file. Without this the cache will default to the supplied ref but 

1247 it can get confused with read-only derived components for 

1248 disassembled composites. 

1249 

1250 Returns 

1251 ------- 

1252 inMemoryDataset : `object` 

1253 The artifact as a python object. 

1254 """ 

1255 location = getInfo.location 

1256 uri = location.uri 

1257 log.debug("Accessing data from %s", uri) 

1258 

1259 if cache_ref is None: 

1260 cache_ref = ref 

1261 if cache_ref.id != ref.id: 

1262 raise ValueError( 

1263 "The supplied cache dataset ref refers to a different dataset than expected:" 

1264 f" {ref.id} != {cache_ref.id}" 

1265 ) 

1266 

1267 # Cannot recalculate checksum but can compare size as a quick check 

1268 # Do not do this if the size is negative since that indicates 

1269 # we do not know. 

1270 recorded_size = getInfo.info.file_size 

1271 resource_size = uri.size() 

1272 if recorded_size >= 0 and resource_size != recorded_size: 

1273 raise RuntimeError( 

1274 "Integrity failure in Datastore. " 

1275 f"Size of file {uri} ({resource_size}) " 

1276 f"does not match size recorded in registry of {recorded_size}" 

1277 ) 

1278 

1279 # For the general case we have choices for how to proceed. 

1280 # 1. Always use a local file (downloading the remote resource to a 

1281 # temporary file if needed). 

1282 # 2. Use a threshold size and read into memory and use bytes. 

1283 # Use both for now with an arbitrary hand off size. 

1284 # This allows small datasets to be downloaded from remote object 

1285 # stores without requiring a temporary file. 

1286 

1287 formatter = getInfo.formatter 

1288 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1289 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1290 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1291 if cached_file is not None: 

1292 desired_uri = cached_file 

1293 msg = f" (cached version of {uri})" 

1294 else: 

1295 desired_uri = uri 

1296 msg = "" 

1297 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1298 serializedDataset = desired_uri.read() 

1299 log.debug( 

1300 "Deserializing %s from %d bytes from location %s with formatter %s", 

1301 f"component {getInfo.component}" if isComponent else "", 

1302 len(serializedDataset), 

1303 uri, 

1304 formatter.name(), 

1305 ) 

1306 try: 

1307 result = formatter.fromBytes( 

1308 serializedDataset, component=getInfo.component if isComponent else None 

1309 ) 

1310 except Exception as e: 

1311 raise ValueError( 

1312 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1313 f" ({ref.datasetType.name} from {uri}): {e}" 

1314 ) from e 

1315 else: 

1316 # Read from file. 

1317 

1318 # Have to update the Location associated with the formatter 

1319 # because formatter.read does not allow an override. 

1320 # This could be improved. 

1321 location_updated = False 

1322 msg = "" 

1323 

1324 # First check in cache for local version. 

1325 # The cache will only be relevant for remote resources but 

1326 # no harm in always asking. Context manager ensures that cache 

1327 # file is not deleted during cache expiration. 

1328 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1329 if cached_file is not None: 

1330 msg = f"(via cache read of remote file {uri})" 

1331 uri = cached_file 

1332 location_updated = True 

1333 

1334 with uri.as_local() as local_uri: 

1335 can_be_cached = False 

1336 if uri != local_uri: 

1337 # URI was remote and file was downloaded 

1338 cache_msg = "" 

1339 location_updated = True 

1340 

1341 if self.cacheManager.should_be_cached(cache_ref): 

1342 # In this scenario we want to ask if the downloaded 

1343 # file should be cached but we should not cache 

1344 # it until after we've used it (to ensure it can't 

1345 # be expired whilst we are using it). 

1346 can_be_cached = True 

1347 

1348 # Say that it is "likely" to be cached because 

1349 # if the formatter read fails we will not be 

1350 # caching this file. 

1351 cache_msg = " and likely cached" 

1352 

1353 msg = f"(via download to local file{cache_msg})" 

1354 

1355 # Calculate the (possibly) new location for the formatter 

1356 # to use. 

1357 newLocation = Location(*local_uri.split()) if location_updated else None 

1358 

1359 log.debug( 

1360 "Reading%s from location %s %s with formatter %s", 

1361 f" component {getInfo.component}" if isComponent else "", 

1362 uri, 

1363 msg, 

1364 formatter.name(), 

1365 ) 

1366 try: 

1367 with ( 

1368 formatter._updateLocation(newLocation), 

1369 time_this( 

1370 log, 

1371 msg="Reading%s from location %s %s with formatter %s", 

1372 args=( 

1373 f" component {getInfo.component}" if isComponent else "", 

1374 uri, 

1375 msg, 

1376 formatter.name(), 

1377 ), 

1378 ), 

1379 ): 

1380 result = formatter.read(component=getInfo.component if isComponent else None) 

1381 except Exception as e: 

1382 raise ValueError( 

1383 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1384 f" ({ref.datasetType.name} from {uri}): {e}" 

1385 ) from e 

1386 

1387 # File was read successfully so can move to cache 

1388 if can_be_cached: 

1389 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1390 

1391 return self._post_process_get( 

1392 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent 

1393 ) 

1394 

1395 def knows(self, ref: DatasetRef) -> bool: 

1396 """Check if the dataset is known to the datastore. 

1397 

1398 Does not check for existence of any artifact. 

1399 

1400 Parameters 

1401 ---------- 

1402 ref : `DatasetRef` 

1403 Reference to the required dataset. 

1404 

1405 Returns 

1406 ------- 

1407 exists : `bool` 

1408 `True` if the dataset is known to the datastore. 

1409 """ 

1410 fileLocations = self._get_dataset_locations_info(ref) 

1411 if fileLocations: 

1412 return True 

1413 return False 

1414 

1415 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

1416 # Docstring inherited from the base class. 

1417 

1418 # The records themselves. Could be missing some entries. 

1419 records = self._get_stored_records_associated_with_refs(refs) 

1420 

1421 return {ref: ref.id in records for ref in refs} 

1422 

1423 def _process_mexists_records( 

1424 self, 

1425 id_to_ref: dict[DatasetId, DatasetRef], 

1426 records: dict[DatasetId, list[StoredFileInfo]], 

1427 all_required: bool, 

1428 artifact_existence: dict[ResourcePath, bool] | None = None, 

1429 ) -> dict[DatasetRef, bool]: 

1430 """Check given records for existence. 

1431 

1432 Helper function for `mexists()`. 

1433 

1434 Parameters 

1435 ---------- 

1436 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1437 Mapping of the dataset ID to the dataset ref itself. 

1438 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1439 Records as generally returned by 

1440 ``_get_stored_records_associated_with_refs``. 

1441 all_required : `bool` 

1442 Flag to indicate whether existence requires all artifacts 

1443 associated with a dataset ID to exist or not for existence. 

1444 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1445 Optional mapping of datastore artifact to existence. Updated by 

1446 this method with details of all artifacts tested. Can be `None` 

1447 if the caller is not interested. 

1448 

1449 Returns 

1450 ------- 

1451 existence : `dict` of [`DatasetRef`, `bool`] 

1452 Mapping from dataset to boolean indicating existence. 

1453 """ 

1454 # The URIs to be checked and a mapping of those URIs to 

1455 # the dataset ID. 

1456 uris_to_check: list[ResourcePath] = [] 

1457 location_map: dict[ResourcePath, DatasetId] = {} 

1458 

1459 location_factory = self.locationFactory 

1460 

1461 uri_existence: dict[ResourcePath, bool] = {} 

1462 for ref_id, infos in records.items(): 

1463 # Key is the dataset ID, value is a list of StoredFileInfo. 

1464 uris = [info.file_location(location_factory).uri for info in infos] 

1465 location_map.update({uri: ref_id for uri in uris}) 

1466 

1467 # Check the local cache directly for a dataset corresponding 

1468 # to the remote URI. 

1469 if self.cacheManager.file_count > 0: 

1470 ref = id_to_ref[ref_id] 

1471 for uri, storedFileInfo in zip(uris, infos, strict=True): 

1472 check_ref = ref 

1473 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1474 check_ref = ref.makeComponentRef(component) 

1475 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1476 # Proxy for URI existence. 

1477 uri_existence[uri] = True 

1478 else: 

1479 uris_to_check.append(uri) 

1480 else: 

1481 # Check all of them. 

1482 uris_to_check.extend(uris) 

1483 

1484 if artifact_existence is not None: 

1485 # If a URI has already been checked remove it from the list 

1486 # and immediately add the status to the output dict. 

1487 filtered_uris_to_check = [] 

1488 for uri in uris_to_check: 

1489 if uri in artifact_existence: 

1490 uri_existence[uri] = artifact_existence[uri] 

1491 else: 

1492 filtered_uris_to_check.append(uri) 

1493 uris_to_check = filtered_uris_to_check 

1494 

1495 # Results. 

1496 dataset_existence: dict[DatasetRef, bool] = {} 

1497 

1498 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1499 for uri, exists in uri_existence.items(): 

1500 dataset_id = location_map[uri] 

1501 ref = id_to_ref[dataset_id] 

1502 

1503 # Disassembled composite needs to check all locations. 

1504 # all_required indicates whether all need to exist or not. 

1505 if ref in dataset_existence: 

1506 if all_required: 

1507 exists = dataset_existence[ref] and exists 

1508 else: 

1509 exists = dataset_existence[ref] or exists 

1510 dataset_existence[ref] = exists 

1511 

1512 if artifact_existence is not None: 

1513 artifact_existence.update(uri_existence) 

1514 

1515 return dataset_existence 

1516 

1517 def mexists( 

1518 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1519 ) -> dict[DatasetRef, bool]: 

1520 """Check the existence of multiple datasets at once. 

1521 

1522 Parameters 

1523 ---------- 

1524 refs : iterable of `DatasetRef` 

1525 The datasets to be checked. 

1526 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1527 Optional mapping of datastore artifact to existence. Updated by 

1528 this method with details of all artifacts tested. Can be `None` 

1529 if the caller is not interested. 

1530 

1531 Returns 

1532 ------- 

1533 existence : `dict` of [`DatasetRef`, `bool`] 

1534 Mapping from dataset to boolean indicating existence. 

1535 

1536 Notes 

1537 ----- 

1538 To minimize potentially costly remote existence checks, the local 

1539 cache is checked as a proxy for existence. If a file for this 

1540 `DatasetRef` is present in the cache, no check is made against the 

1541 actual URI. This could result in unexpected behavior if the dataset 

1542 has been removed from the datastore by another process whilst it is 

1543 still in the cache. 

1544 """ 

1545 chunk_size = 10_000 

1546 dataset_existence: dict[DatasetRef, bool] = {} 

1547 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1548 n_found_total = 0 

1549 n_checked = 0 

1550 n_chunks = 0 

1551 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1552 chunk_result = self._mexists(chunk, artifact_existence) 

1553 

1554 # The log message level and content depend on how many 

1555 # datasets we are processing. 

1556 n_results = len(chunk_result) 

1557 

1558 # Use verbose logging to ensure that messages can be seen 

1559 # easily if many refs are being checked. 

1560 log_threshold = VERBOSE 

1561 n_checked += n_results 

1562 

1563 # This sum can take some time so only do it if we know the 

1564 # result is going to be used. 

1565 n_found = 0 

1566 if log.isEnabledFor(log_threshold): 

1567 # Can treat the booleans as 0, 1 integers and sum them. 

1568 n_found = sum(chunk_result.values()) 

1569 n_found_total += n_found 

1570 

1571 # We are deliberately not trying to count the number of refs 

1572 # provided in case it's in the millions. This means there is a 

1573 # situation where the number of refs exactly matches the chunk 

1574 # size and we will switch to the multi-chunk path even though 

1575 # we only have a single chunk. 

1576 if n_results < chunk_size and n_chunks == 0: 

1577 # Single chunk will be processed so we can provide more detail. 

1578 if n_results == 1: 

1579 ref = list(chunk_result)[0] 

1580 # Use debug logging to be consistent with `exists()`. 

1581 log.debug( 

1582 "Calling mexists() with single ref that does%s exist (%s).", 

1583 "" if chunk_result[ref] else " not", 

1584 ref, 

1585 ) 

1586 else: 

1587 # Single chunk but multiple files. Summarize. 

1588 log.log( 

1589 log_threshold, 

1590 "Number of datasets found in datastore: %d out of %d datasets checked.", 

1591 n_found, 

1592 n_checked, 

1593 ) 

1594 

1595 else: 

1596 # Use incremental verbose logging when we have multiple chunks. 

1597 log.log( 

1598 log_threshold, 

1599 "Number of datasets found in datastore for chunk %d: %d out of %d checked " 

1600 "(running total from all chunks so far: %d found out of %d checked)", 

1601 n_chunks, 

1602 n_found, 

1603 n_results, 

1604 n_found_total, 

1605 n_checked, 

1606 ) 

1607 dataset_existence.update(chunk_result) 

1608 n_chunks += 1 

1609 

1610 return dataset_existence 
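
# Usage sketch (illustrative only; ``datastore`` is an assumed FileDatastore
# instance and ``refs`` an assumed iterable of resolved ``DatasetRef``):
#
#     from lsst.resources import ResourcePath
#
#     artifact_existence: dict[ResourcePath, bool] = {}
#     existence = datastore.mexists(refs, artifact_existence)
#     n_missing = sum(1 for found in existence.values() if not found)
#     # ``artifact_existence`` now caches every URI actually probed and can be
#     # passed to later calls (e.g. ``transfer_from``) to avoid re-checking.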

1611 

1612 def _mexists( 

1613 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1614 ) -> dict[DatasetRef, bool]: 

1615 """Check the existence of multiple datasets at once. 

1616 

1617 Parameters 

1618 ---------- 

1619 refs : iterable of `DatasetRef` 

1620 The datasets to be checked. 

1621 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1622 Optional mapping of datastore artifact to existence. Updated by 

1623 this method with details of all artifacts tested. Can be `None` 

1624 if the caller is not interested. 

1625 

1626 Returns 

1627 ------- 

1628 existence : `dict` of [`DatasetRef`, `bool`] 

1629 Mapping from dataset to boolean indicating existence. 

1630 """ 

1631 # Make a mapping from refs with the internal storage class to the given 

1632 # refs that may have a different one. We'll use the internal refs 

1633 # throughout this method and convert back at the very end. 

1634 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs} 

1635 

1636 # Need a mapping of dataset_id to (internal) dataset ref since some 

1637 # internal APIs work with dataset_id. 

1638 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref} 

1639 

1640 # Set of all IDs we are checking for. 

1641 requested_ids = set(id_to_ref.keys()) 

1642 

1643 # The records themselves. Could be missing some entries. 

1644 records = self._get_stored_records_associated_with_refs(id_to_ref.values()) 

1645 

1646 dataset_existence = self._process_mexists_records( 

1647 id_to_ref, records, True, artifact_existence=artifact_existence 

1648 ) 

1649 

1650 # Set of IDs that have been handled. 

1651 handled_ids = {ref.id for ref in dataset_existence} 

1652 

1653 missing_ids = requested_ids - handled_ids 

1654 if missing_ids: 

1655 dataset_existence.update( 

1656 self._mexists_check_expected( 

1657 [id_to_ref[missing] for missing in missing_ids], artifact_existence 

1658 ) 

1659 ) 

1660 

1661 return { 

1662 internal_ref_to_input_ref[internal_ref]: existence 

1663 for internal_ref, existence in dataset_existence.items() 

1664 } 

1665 

1666 def _mexists_check_expected( 

1667 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1668 ) -> dict[DatasetRef, bool]: 

1669 """Check existence of refs that are not known to datastore. 

1670 

1671 Parameters 

1672 ---------- 

1673 refs : iterable of `DatasetRef` 

1674 The datasets to be checked. These are assumed not to be known 

1675 to datastore. 

1676 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1677 Optional mapping of datastore artifact to existence. Updated by 

1678 this method with details of all artifacts tested. Can be `None` 

1679 if the caller is not interested. 

1680 

1681 Returns 

1682 ------- 

1683 existence : `dict` of [`DatasetRef`, `bool`] 

1684 Mapping from dataset to boolean indicating existence. 

1685 """ 

1686 dataset_existence: dict[DatasetRef, bool] = {} 

1687 if not self.trustGetRequest: 

1688 # Must assume these do not exist 

1689 for ref in refs: 

1690 dataset_existence[ref] = False 

1691 else: 

1692 log.debug( 

1693 "%d datasets were not known to datastore during initial existence check.", 

1694 len(refs), 

1695 ) 

1696 

1697 # Construct data structure identical to that returned 

1698 # by _get_stored_records_associated_with_refs() but using 

1699 # guessed names. 

1700 records = {} 

1701 id_to_ref = {} 

1702 for missing_ref in refs: 

1703 expected = self._get_expected_dataset_locations_info(missing_ref) 

1704 dataset_id = missing_ref.id 

1705 records[dataset_id] = [info for _, info in expected] 

1706 id_to_ref[dataset_id] = missing_ref 

1707 

1708 dataset_existence.update( 

1709 self._process_mexists_records( 

1710 id_to_ref, 

1711 records, 

1712 False, 

1713 artifact_existence=artifact_existence, 

1714 ) 

1715 ) 

1716 

1717 return dataset_existence 

1718 

1719 def exists(self, ref: DatasetRef) -> bool: 

1720 """Check if the dataset exists in the datastore. 

1721 

1722 Parameters 

1723 ---------- 

1724 ref : `DatasetRef` 

1725 Reference to the required dataset. 

1726 

1727 Returns 

1728 ------- 

1729 exists : `bool` 

1730 `True` if the entity exists in the `Datastore`. 

1731 

1732 Notes 

1733 ----- 

1734 The local cache is checked as a proxy for existence in the remote 

1735 object store. It is possible that another process on a different 

1736 compute node could remove the file from the object store even 

1737 though it is present in the local cache. 

1738 """ 

1739 ref = self._cast_storage_class(ref) 

1740 fileLocations = self._get_dataset_locations_info(ref) 

1741 

1742 # If we are being asked to trust that the registry might not be correct, 

1743 # ask for the expected locations and check them explicitly. 

1744 if not fileLocations: 

1745 if not self.trustGetRequest: 

1746 return False 

1747 

1748 # First check the cache. If it is not found we must check 

1749 # the datastore itself. Assume that any component in the cache 

1750 # means that the dataset does exist somewhere. 

1751 if self.cacheManager.known_to_cache(ref): 

1752 return True 

1753 

1754 # When we are guessing a dataset location we can not check 

1755 # for the existence of every component since we can not 

1756 # know if every component was written. Instead we check 

1757 # for the existence of any of the expected locations. 

1758 for location, _ in self._get_expected_dataset_locations_info(ref): 

1759 if self._artifact_exists(location): 

1760 return True 

1761 return False 

1762 

1763 # All listed artifacts must exist. 

1764 for location, storedFileInfo in fileLocations: 

1765 # Checking in cache needs the component ref. 

1766 check_ref = ref 

1767 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1768 check_ref = ref.makeComponentRef(component) 

1769 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1770 continue 

1771 

1772 if not self._artifact_exists(location): 

1773 return False 

1774 

1775 return True 
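
# Usage sketch (illustrative only; ``datastore`` and ``ref`` are assumed names):
#
#     if not datastore.exists(ref):
#         raise FileNotFoundError(f"Dataset {ref} not available in {datastore.name}")
#     # For many refs prefer ``mexists()``, which batches the remote checks.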

1776 

1777 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

1778 """Return URIs associated with dataset. 

1779 

1780 Parameters 

1781 ---------- 

1782 ref : `DatasetRef` 

1783 Reference to the required dataset. 

1784 predict : `bool`, optional 

1785 If the datastore does not know about the dataset, should it 

1786 return a predicted URI or not? 

1787 

1788 Returns 

1789 ------- 

1790 uris : `DatasetRefURIs` 

1791 The URI to the primary artifact associated with this dataset (if 

1792 the dataset was disassembled within the datastore this may be 

1793 `None`), and the URIs to any components associated with the dataset 

1794 artifact (this can be empty if there are no components). 

1795 """ 

1796 many = self.getManyURIs([ref], predict=predict, allow_missing=False) 

1797 return many[ref] 

1798 

1799 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1800 """URI to the Dataset. 

1801 

1802 Parameters 

1803 ---------- 

1804 ref : `DatasetRef` 

1805 Reference to the required Dataset. 

1806 predict : `bool` 

1807 If `True`, allow URIs to be returned for datasets that have not 

1808 been written. 

1809 

1810 Returns 

1811 ------- 

1812 uri : `lsst.resources.ResourcePath` 

1813 URI pointing to the dataset within the datastore. If the 

1814 dataset does not exist in the datastore, and if ``predict`` is 

1815 `True`, the URI will be a prediction and will include a URI 

1816 fragment "#predicted". 

1817 If the datastore does not have entities that relate well 

1818 to the concept of a URI the returned URI will be 

1819 descriptive. The returned URI is not guaranteed to be obtainable. 

1820 

1821 Raises 

1822 ------ 

1823 FileNotFoundError 

1824 Raised if a URI has been requested for a dataset that does not 

1825 exist and guessing is not allowed. 

1826 RuntimeError 

1827 Raised if a request is made for a single URI but multiple URIs 

1828 are associated with this dataset. 

1829 

1830 Notes 

1831 ----- 

1832 When a predicted URI is requested an attempt will be made to form 

1833 a reasonable URI based on file templates and the expected formatter. 

1834 """ 

1835 primary, components = self.getURIs(ref, predict) 

1836 if primary is None or components: 

1837 raise RuntimeError( 

1838 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1839 ) 

1840 return primary 
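
# Usage sketch (illustrative only; ``datastore`` and ``ref`` assumed):
# ``getURIs`` handles disassembled composites, while ``getURI`` is only valid
# for single-artifact datasets and raises RuntimeError otherwise.
#
#     uris = datastore.getURIs(ref, predict=True)
#     if uris.primaryURI is not None:
#         print("primary:", uris.primaryURI)
#     for component, uri in uris.componentURIs.items():
#         print(component, uri)  # predicted URIs carry a "#predicted" fragment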

1841 

1842 def _predict_URIs( 

1843 self, 

1844 ref: DatasetRef, 

1845 ) -> DatasetRefURIs: 

1846 """Predict the URIs of a dataset ref. 

1847 

1848 Parameters 

1849 ---------- 

1850 ref : `DatasetRef` 

1851 Reference to the required Dataset. 

1852 

1853 Returns 

1854 ------- 

1855 uris : `DatasetRefURIs` 

1856 Primary and component URIs. URIs will contain a URI fragment 

1857 "#predicted". 

1858 """ 

1859 uris = DatasetRefURIs() 

1860 

1861 if self.composites.shouldBeDisassembled(ref): 

1862 for component, _ in ref.datasetType.storageClass.components.items(): 

1863 comp_ref = ref.makeComponentRef(component) 

1864 comp_location, _ = self._determine_put_formatter_location(comp_ref) 

1865 

1866 # Add the "#predicted" URI fragment to indicate this is a 

1867 # guess 

1868 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted") 

1869 

1870 else: 

1871 location, _ = self._determine_put_formatter_location(ref) 

1872 

1873 # Add the "#predicted" URI fragment to indicate this is a guess 

1874 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted") 

1875 

1876 return uris 

1877 

1878 def getManyURIs( 

1879 self, 

1880 refs: Iterable[DatasetRef], 

1881 predict: bool = False, 

1882 allow_missing: bool = False, 

1883 ) -> dict[DatasetRef, DatasetRefURIs]: 

1884 # Docstring inherited 

1885 

1886 uris: dict[DatasetRef, DatasetRefURIs] = {} 

1887 

1888 records = self._get_stored_records_associated_with_refs(refs) 

1889 records_keys = records.keys() 

1890 

1891 existing_refs = tuple(ref for ref in refs if ref.id in records_keys) 

1892 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys) 

1893 

1894 # Have to handle trustGetRequest mode by checking for the existence 

1895 # of the missing refs on disk. 

1896 if missing_refs: 

1897 dataset_existence = self._mexists_check_expected(missing_refs, None) 

1898 really_missing = set() 

1899 not_missing = set() 

1900 for ref, exists in dataset_existence.items(): 

1901 if exists: 

1902 not_missing.add(ref) 

1903 else: 

1904 really_missing.add(ref) 

1905 

1906 if not_missing: 

1907 # Need to recalculate the missing/existing split. 

1908 existing_refs = existing_refs + tuple(not_missing) 

1909 missing_refs = tuple(really_missing) 

1910 

1911 for ref in missing_refs: 

1912 # if this has never been written then we have to guess 

1913 if not predict: 

1914 if not allow_missing: 

1915 raise FileNotFoundError(f"Dataset {ref} not in this datastore.") 

1916 else: 

1917 uris[ref] = self._predict_URIs(ref) 

1918 

1919 for ref in existing_refs: 

1920 file_infos = records[ref.id] 

1921 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos] 

1922 uris[ref] = self._locations_to_URI(ref, file_locations) 

1923 

1924 return uris 
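
# Bulk usage sketch (illustrative only; ``datastore`` and ``refs`` assumed):
# with ``allow_missing=True`` and ``predict=False``, refs unknown to the
# datastore are simply omitted from the result instead of raising.
#
#     many = datastore.getManyURIs(refs, predict=False, allow_missing=True)
#     found = set(many)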

1925 

1926 def _locations_to_URI( 

1927 self, 

1928 ref: DatasetRef, 

1929 file_locations: Sequence[tuple[Location, StoredFileInfo]], 

1930 ) -> DatasetRefURIs: 

1931 """Convert one or more file locations associated with a DatasetRef 

1932 to a DatasetRefURIs. 

1933 

1934 Parameters 

1935 ---------- 

1936 ref : `DatasetRef` 

1937 Reference to the dataset. 

1938 file_locations : `~collections.abc.Sequence` of `tuple` [`Location`, `StoredFileInfo`] 

1939 Each item in the sequence is the location of the dataset within the 

1940 datastore and stored information about the file and its formatter. 

1941 If there is only one item in the sequence then it is treated as the 

1942 primary URI. If there is more than one item then they are treated 

1943 as component URIs. If there are no items then an error is raised 

1944 unless ``self.trustGetRequest`` is `True`. 

1945 

1946 Returns 

1947 ------- 

1948 uris : `DatasetRefURIs` 

1949 Represents the primary URI or component URIs described by the 

1950 inputs. 

1951 

1952 Raises 

1953 ------ 

1954 RuntimeError 

1955 If no file locations are passed in and ``self.trustGetRequest`` is 

1956 `False`. 

1957 FileNotFoundError 

1958 If the a passed-in URI does not exist, and ``self.trustGetRequest`` 

1959 is `False`. 

1960 RuntimeError 

1961 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is 

1962 unexpected). 

1963 """ 

1964 guessing = False 

1965 uris = DatasetRefURIs() 

1966 

1967 if not file_locations: 

1968 if not self.trustGetRequest: 

1969 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1970 file_locations = self._get_expected_dataset_locations_info(ref) 

1971 guessing = True 

1972 

1973 if len(file_locations) == 1: 

1974 # No disassembly so this is the primary URI 

1975 uris.primaryURI = file_locations[0][0].uri 

1976 if guessing and not uris.primaryURI.exists(): 

1977 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist") 

1978 else: 

1979 for location, file_info in file_locations: 

1980 if file_info.component is None: 

1981 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1982 if guessing and not location.uri.exists(): 

1983 # If we are trusting then it is entirely possible for 

1984 # some components to be missing. In that case we skip 

1985 # to the next component. 

1986 if self.trustGetRequest: 

1987 continue 

1988 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist") 

1989 uris.componentURIs[file_info.component] = location.uri 

1990 

1991 return uris 

1992 

1993 def retrieveArtifacts( 

1994 self, 

1995 refs: Iterable[DatasetRef], 

1996 destination: ResourcePath, 

1997 transfer: str = "auto", 

1998 preserve_path: bool = True, 

1999 overwrite: bool = False, 

2000 ) -> list[ResourcePath]: 

2001 """Retrieve the file artifacts associated with the supplied refs. 

2002 

2003 Parameters 

2004 ---------- 

2005 refs : iterable of `DatasetRef` 

2006 The datasets for which file artifacts are to be retrieved. 

2007 A single ref can result in multiple files. The refs must 

2008 be resolved. 

2009 destination : `lsst.resources.ResourcePath` 

2010 Location to write the file artifacts. 

2011 transfer : `str`, optional 

2012 Method to use to transfer the artifacts. Must be one of the options 

2013 supported by `lsst.resources.ResourcePath.transfer_from()`. 

2014 "move" is not allowed. 

2015 preserve_path : `bool`, optional 

2016 If `True` the full path of the file artifact within the datastore 

2017 is preserved. If `False` the final file component of the path 

2018 is used. 

2019 overwrite : `bool`, optional 

2020 If `True` allow transfers to overwrite existing files at the 

2021 destination. 

2022 

2023 Returns 

2024 ------- 

2025 targets : `list` of `lsst.resources.ResourcePath` 

2026 URIs of file artifacts in destination location. Order is not 

2027 preserved. 

2028 """ 

2029 if not destination.isdir(): 

2030 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

2031 

2032 if transfer == "move": 

2033 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

2034 

2035 # Source -> Destination 

2036 # This also helps filter out duplicate DatasetRef in the request 

2037 # that will map to the same underlying file transfer. 

2038 to_transfer: dict[ResourcePath, ResourcePath] = {} 

2039 

2040 for ref in refs: 

2041 locations = self._get_dataset_locations_info(ref) 

2042 for location, _ in locations: 

2043 source_uri = location.uri 

2044 target_path: ResourcePathExpression 

2045 if preserve_path: 

2046 target_path = location.pathInStore 

2047 if target_path.isabs(): 

2048 # This is an absolute path to an external file. 

2049 # Use the full path. 

2050 target_path = target_path.relativeToPathRoot 

2051 else: 

2052 target_path = source_uri.basename() 

2053 target_uri = destination.join(target_path) 

2054 to_transfer[source_uri] = target_uri 

2055 

2056 # In theory can now parallelize the transfer 

2057 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

2058 for source_uri, target_uri in to_transfer.items(): 

2059 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

2060 

2061 return list(to_transfer.values()) 
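
# Usage sketch (illustrative only; ``datastore`` and ``refs`` assumed, and the
# destination path is hypothetical):
#
#     from lsst.resources import ResourcePath
#
#     destination = ResourcePath("/tmp/artifact_dump/", forceDirectory=True)
#     targets = datastore.retrieveArtifacts(
#         refs, destination, transfer="copy", preserve_path=True, overwrite=False
#     )
#     # ``targets`` lists the copied files; order is not preserved.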

2062 

2063 def get( 

2064 self, 

2065 ref: DatasetRef, 

2066 parameters: Mapping[str, Any] | None = None, 

2067 storageClass: StorageClass | str | None = None, 

2068 ) -> Any: 

2069 """Load an InMemoryDataset from the store. 

2070 

2071 Parameters 

2072 ---------- 

2073 ref : `DatasetRef` 

2074 Reference to the required Dataset. 

2075 parameters : `dict` 

2076 `StorageClass`-specific parameters that specify, for example, 

2077 a slice of the dataset to be loaded. 

2078 storageClass : `StorageClass` or `str`, optional 

2079 The storage class to be used to override the Python type 

2080 returned by this method. By default the returned type matches 

2081 the dataset type definition for this dataset. Specifying a 

2082 read `StorageClass` can force a different type to be returned. 

2083 This type must be compatible with the original type. 

2084 

2085 Returns 

2086 ------- 

2087 inMemoryDataset : `object` 

2088 Requested dataset or slice thereof as an InMemoryDataset. 

2089 

2090 Raises 

2091 ------ 

2092 FileNotFoundError 

2093 Requested dataset can not be retrieved. 

2094 TypeError 

2095 Return value from formatter has unexpected type. 

2096 ValueError 

2097 Formatter failed to process the dataset. 

2098 """ 

2099 # Supplied storage class for the component being read is either 

2100 # from the ref itself or an override if we want to force 

2101 # type conversion. 

2102 if storageClass is not None: 

2103 ref = ref.overrideStorageClass(storageClass) 

2104 refStorageClass = ref.datasetType.storageClass 

2105 

2106 allGetInfo = self._prepare_for_get(ref, parameters) 

2107 refComponent = ref.datasetType.component() 

2108 

2109 # Create mapping from component name to related info 

2110 allComponents = {i.component: i for i in allGetInfo} 

2111 

2112 # By definition the dataset is disassembled if we have more 

2113 # than one record for it. 

2114 isDisassembled = len(allGetInfo) > 1 

2115 

2116 # Look for the special case where we are disassembled but the 

2117 # component is a derived component that was not written during 

2118 # disassembly. For this scenario we need to check that the 

2119 # component requested is listed as a derived component for the 

2120 # composite storage class 

2121 isDisassembledReadOnlyComponent = False 

2122 if isDisassembled and refComponent: 

2123 # The composite storage class should be accessible through 

2124 # the component dataset type 

2125 compositeStorageClass = ref.datasetType.parentStorageClass 

2126 

2127 # In the unlikely scenario where the composite storage 

2128 # class is not known, we can only assume that this is a 

2129 # normal component. If that assumption is wrong then the 

2130 # branch below that reads a persisted component will fail 

2131 # so there is no need to complain here. 

2132 if compositeStorageClass is not None: 

2133 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

2134 

2135 if isDisassembled and not refComponent: 

2136 # This was a disassembled dataset spread over multiple files 

2137 # and we need to put them all back together again. 

2138 # Read into memory and then assemble 

2139 

2140 # Check that the supplied parameters are suitable for the type read 

2141 refStorageClass.validateParameters(parameters) 

2142 

2143 # We want to keep track of all the parameters that were not used 

2144 # by formatters. We assume that if any of the component formatters 

2145 # use a parameter that we do not need to apply it again in the 

2146 # assembler. 

2147 usedParams = set() 

2148 

2149 components: dict[str, Any] = {} 

2150 for getInfo in allGetInfo: 

2151 # assemblerParams are parameters not understood by the 

2152 # associated formatter. 

2153 usedParams.update(set(getInfo.formatterParams)) 

2154 

2155 component = getInfo.component 

2156 

2157 if component is None: 

2158 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

2159 

2160 # We do not want the formatter to think it's reading 

2161 # a component though because it is really reading a 

2162 # standalone dataset -- always tell reader it is not a 

2163 # component. 

2164 components[component] = self._read_artifact_into_memory( 

2165 getInfo, ref.makeComponentRef(component), isComponent=False 

2166 ) 

2167 

2168 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

2169 

2170 # Any unused parameters will have to be passed to the assembler 

2171 if parameters: 

2172 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

2173 else: 

2174 unusedParams = {} 

2175 

2176 # Process parameters 

2177 return ref.datasetType.storageClass.delegate().handleParameters( 

2178 inMemoryDataset, parameters=unusedParams 

2179 ) 

2180 

2181 elif isDisassembledReadOnlyComponent: 

2182 compositeStorageClass = ref.datasetType.parentStorageClass 

2183 if compositeStorageClass is None: 

2184 raise RuntimeError( 

2185 f"Unable to retrieve derived component '{refComponent}' since" 

2186 "no composite storage class is available." 

2187 ) 

2188 

2189 if refComponent is None: 

2190 # Mainly for mypy 

2191 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

2192 

2193 # Assume that every derived component can be calculated by 

2194 # forwarding the request to a single read/write component. 

2195 # Rather than guessing which rw component is the right one by 

2196 # scanning each for a derived component of the same name, 

2197 # we ask the storage class delegate directly which one is best to 

2198 # use. 

2199 compositeDelegate = compositeStorageClass.delegate() 

2200 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

2201 refComponent, set(allComponents) 

2202 ) 

2203 

2204 # Select the relevant component 

2205 rwInfo = allComponents[forwardedComponent] 

2206 

2207 # For now assume that read parameters are validated against 

2208 # the real component and not the requested component 

2209 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

2210 forwardedStorageClass.validateParameters(parameters) 

2211 

2212 # The reference to use for the caching must refer to the forwarded 

2213 # component and not the derived component. 

2214 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

2215 

2216 # Unfortunately the FileDescriptor inside the formatter will have 

2217 # the wrong write storage class so we need to create a new one 

2218 # given the immutability constraint. 

2219 writeStorageClass = rwInfo.info.storageClass 

2220 

2221 # We may need to put some thought into parameters for read 

2222 # components but for now forward them on as is 

2223 readFormatter = type(rwInfo.formatter)( 

2224 FileDescriptor( 

2225 rwInfo.location, 

2226 readStorageClass=refStorageClass, 

2227 storageClass=writeStorageClass, 

2228 parameters=parameters, 

2229 ), 

2230 ref.dataId, 

2231 ) 

2232 

2233 # The assembler can not receive any parameter requests for a 

2234 # derived component at this time since the assembler will 

2235 # see the storage class of the derived component and those 

2236 # parameters will have to be handled by the formatter on the 

2237 # forwarded storage class. 

2238 assemblerParams: dict[str, Any] = {} 

2239 

2240 # Need to create a new info that specifies the derived 

2241 # component and associated storage class 

2242 readInfo = DatastoreFileGetInformation( 

2243 rwInfo.location, 

2244 readFormatter, 

2245 rwInfo.info, 

2246 assemblerParams, 

2247 {}, 

2248 refComponent, 

2249 refStorageClass, 

2250 ) 

2251 

2252 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

2253 

2254 else: 

2255 # Single file request or component from that composite file 

2256 for lookup in (refComponent, None): 

2257 if lookup in allComponents: 

2258 getInfo = allComponents[lookup] 

2259 break 

2260 else: 

2261 raise FileNotFoundError( 

2262 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

2263 ) 

2264 

2265 # Do not need the component itself if already disassembled 

2266 if isDisassembled: 

2267 isComponent = False 

2268 else: 

2269 isComponent = getInfo.component is not None 

2270 

2271 # For a component read of a composite we want the cache to 

2272 # be looking at the composite ref itself. 

2273 cache_ref = ref.makeCompositeRef() if isComponent else ref 

2274 

2275 # For a disassembled component we can validate parameters against 

2276 # the component storage class directly 

2277 if isDisassembled: 

2278 refStorageClass.validateParameters(parameters) 

2279 else: 

2280 # For an assembled composite this could be a derived 

2281 # component derived from a real component. The validity 

2282 # of the parameters is not clear. For now validate against 

2283 # the composite storage class 

2284 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

2285 

2286 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 
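
# Usage sketch (illustrative only; ``datastore`` and ``ref`` assumed; the
# parameter name "bbox" and storage class name "ImageF" are placeholders that
# depend on the storage class definitions actually in use):
#
#     full = datastore.get(ref)
#     subset = datastore.get(ref, parameters={"bbox": bbox})
#     converted = datastore.get(ref, storageClass="ImageF")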

2287 

2288 @transactional 

2289 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2290 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2291 

2292 Parameters 

2293 ---------- 

2294 inMemoryDataset : `object` 

2295 The dataset to store. 

2296 ref : `DatasetRef` 

2297 Reference to the associated Dataset. 

2298 

2299 Raises 

2300 ------ 

2301 TypeError 

2302 Supplied object and storage class are inconsistent. 

2303 DatasetTypeNotSupportedError 

2304 The associated `DatasetType` is not handled by this datastore. 

2305 

2306 Notes 

2307 ----- 

2308 If the datastore is configured to reject certain dataset types it 

2309 is possible that the put will fail and raise a 

2310 `DatasetTypeNotSupportedError`. The main use case for this is to 

2311 allow `ChainedDatastore` to put to multiple datastores without 

2312 requiring that every datastore accepts the dataset. 

2313 """ 

2314 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2315 # doDisassembly = True 

2316 

2317 artifacts = [] 

2318 if doDisassembly: 

2319 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2320 if components is None: 

2321 raise RuntimeError( 

2322 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2323 f"with storage class {ref.datasetType.storageClass.name} " 

2324 "is configured to be disassembled, but cannot be." 

2325 ) 

2326 for component, componentInfo in components.items(): 

2327 # Don't recurse because we want to take advantage of 

2328 # bulk insert -- need a new DatasetRef that refers to the 

2329 # same dataset_id but has the component DatasetType. 

2330 # DatasetType does not refer to the types of components, 

2331 # so we construct one ourselves. 

2332 compRef = ref.makeComponentRef(component) 

2333 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2334 artifacts.append((compRef, storedInfo)) 

2335 else: 

2336 # Write the entire thing out 

2337 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2338 artifacts.append((ref, storedInfo)) 

2339 

2340 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT) 
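
# Usage sketch (illustrative only; ``datastore``, ``in_memory`` and ``ref``
# assumed): whether the object is disassembled into per-component artifacts is
# decided by the composites configuration, not by the caller.
#
#     datastore.put(in_memory, ref)
#     assert datastore.exists(ref)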

2341 

2342 @transactional 

2343 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

2344 # At this point can safely remove these datasets from the cache 

2345 # to avoid confusion later on. If they are not trashed later 

2346 # the cache will simply be refilled. 

2347 self.cacheManager.remove_from_cache(ref) 

2348 

2349 # If we are in trust mode there will be nothing to move to 

2350 # the trash table and we will have to try to delete the file 

2351 # immediately. 

2352 if self.trustGetRequest: 

2353 # Try to keep the logic below for a single file trash. 

2354 if isinstance(ref, DatasetRef): 

2355 refs = {ref} 

2356 else: 

2357 # Will recreate ref at the end of this branch. 

2358 refs = set(ref) 

2359 

2360 # Determine which datasets are known to datastore directly. 

2361 id_to_ref = {ref.id: ref for ref in refs} 

2362 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2363 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2364 

2365 missing = refs - existing_refs 

2366 if missing: 

2367 # Do an explicit existence check on these refs. 

2368 # We only care about the artifacts at this point and not 

2369 # the dataset existence. 

2370 artifact_existence: dict[ResourcePath, bool] = {} 

2371 _ = self.mexists(missing, artifact_existence) 

2372 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2373 

2374 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2375 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2376 for uri in uris: 

2377 try: 

2378 uri.remove() 

2379 except Exception as e: 

2380 if ignore_errors: 

2381 log.debug("Artifact %s could not be removed: %s", uri, e) 

2382 continue 

2383 raise 

2384 

2385 # There is no point asking the code below to remove refs we 

2386 # know are missing so update it with the list of existing 

2387 # records. Try to retain one vs many logic. 

2388 if not existing_refs: 

2389 # Nothing more to do since none of the datasets were 

2390 # known to the datastore record table. 

2391 return 

2392 ref = list(existing_refs) 

2393 if len(ref) == 1: 

2394 ref = ref[0] 

2395 

2396 # Get file metadata and internal metadata 

2397 if not isinstance(ref, DatasetRef): 

2398 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2399 # Assumed to be an iterable of refs so bulk mode enabled. 

2400 try: 

2401 self.bridge.moveToTrash(ref, transaction=self._transaction) 

2402 except Exception as e: 

2403 if ignore_errors: 

2404 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2405 else: 

2406 raise 

2407 return 

2408 

2409 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2410 

2411 fileLocations = self._get_dataset_locations_info(ref) 

2412 

2413 if not fileLocations: 

2414 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2415 if ignore_errors: 

2416 log.warning(err_msg) 

2417 return 

2418 else: 

2419 raise FileNotFoundError(err_msg) 

2420 

2421 for location, _ in fileLocations: 

2422 if not self._artifact_exists(location): 

2423 err_msg = ( 

2424 f"Dataset is known to datastore {self.name} but " 

2425 f"associated artifact ({location.uri}) is missing" 

2426 ) 

2427 if ignore_errors: 

2428 log.warning(err_msg) 

2429 return 

2430 else: 

2431 raise FileNotFoundError(err_msg) 

2432 

2433 # Mark dataset as trashed 

2434 try: 

2435 self.bridge.moveToTrash([ref], transaction=self._transaction) 

2436 except Exception as e: 

2437 if ignore_errors: 

2438 log.warning( 

2439 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2440 "but encountered an error: %s", 

2441 ref, 

2442 self.name, 

2443 e, 

2444 ) 

2445 pass 

2446 else: 

2447 raise 

2448 

2449 @transactional 

2450 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2451 """Remove all datasets from the trash. 

2452 

2453 Parameters 

2454 ---------- 

2455 ignore_errors : `bool` 

2456 If `True` return without error even if something went wrong. 

2457 Problems could occur if another process is simultaneously trying 

2458 to delete. 

2459 """ 

2460 log.debug("Emptying trash in datastore %s", self.name) 

2461 

2462 # Context manager will empty trash iff we finish it without raising. 

2463 # It will also automatically delete the relevant rows from the 

2464 # trash table and the records table. 

2465 with self.bridge.emptyTrash( 

2466 self._table, record_class=StoredFileInfo, record_column="path" 

2467 ) as trash_data: 

2468 # Removing the artifacts themselves requires that the files are 

2469 # not also associated with refs that are not to be trashed. 

2470 # Therefore need to do a query with the file paths themselves 

2471 # and return all the refs associated with them. Can only delete 

2472 # a file if the refs to be trashed are the only refs associated 

2473 # with the file. 

2474 # This requires multiple copies of the trashed items 

2475 trashed, artifacts_to_keep = trash_data 

2476 

2477 if artifacts_to_keep is None: 

2478 # The bridge is not helping us so have to work it out 

2479 # ourselves. This is not going to be as efficient. 

2480 trashed = list(trashed) 

2481 

2482 # The instance check is for mypy since up to this point it 

2483 # does not know the type of info. 

2484 path_map = self._refs_associated_with_artifacts( 

2485 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2486 ) 

2487 

2488 for ref, info in trashed: 

2489 # Mypy needs to know this is not the base class 

2490 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2491 

2492 path_map[info.path].remove(ref.id) 

2493 if not path_map[info.path]: 

2494 del path_map[info.path] 

2495 

2496 artifacts_to_keep = set(path_map) 

2497 

2498 for ref, info in trashed: 

2499 # Should not happen for this implementation but need 

2500 # to keep mypy happy. 

2501 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2502 

2503 # Mypy needs to know this is not the base class 

2504 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2505 

2506 if info.path in artifacts_to_keep: 

2507 # This is a multi-dataset artifact and we are not 

2508 # removing all associated refs. 

2509 continue 

2510 

2511 # Only trashed refs still known to datastore will be returned. 

2512 location = info.file_location(self.locationFactory) 

2513 

2514 # Point of no return for this artifact 

2515 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2516 try: 

2517 self._delete_artifact(location) 

2518 except FileNotFoundError: 

2519 # If the file itself has been deleted there is nothing 

2520 # we can do about it. It is possible that trash has 

2521 # been run in parallel in another process or someone 

2522 # decided to delete the file. It is unlikely to come 

2523 # back and so we should still continue with the removal 

2524 # of the entry from the trash table. It is also possible 

2525 # we removed it in a previous iteration if it was 

2526 # a multi-dataset artifact. The delete artifact method 

2527 # will log a debug message in this scenario. 

2528 # Distinguishing a file that was missing before trash started 

2529 # from one already removed earlier in this trash operation 

2530 # is not worth the extra bookkeeping, given the potential 

2531 # memory cost. 

2532 pass 

2533 except Exception as e: 

2534 if ignore_errors: 

2535 # Use a debug message here even though it's not 

2536 # a good situation. In some cases this can be 

2537 # caused by a race between user A and user B 

2538 # and neither of them has permissions for the 

2539 # other's files. Butler does not know about users 

2540 # and trash has no idea what collections these 

2541 # files were in (without guessing from a path). 

2542 log.debug( 

2543 "Encountered error removing artifact %s from datastore %s: %s", 

2544 location.uri, 

2545 self.name, 

2546 e, 

2547 ) 

2548 else: 

2549 raise 
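
# Usage sketch (illustrative only; ``datastore`` and ``refs`` assumed):
# ``trash()`` only marks datasets; artifacts are deleted by ``emptyTrash()``,
# and a multi-dataset artifact is kept while other refs still point at it.
#
#     datastore.trash(refs, ignore_errors=False)
#     datastore.emptyTrash(ignore_errors=True)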

2550 

2551 @transactional 

2552 def transfer_from( 

2553 self, 

2554 source_datastore: Datastore, 

2555 refs: Iterable[DatasetRef], 

2556 transfer: str = "auto", 

2557 artifact_existence: dict[ResourcePath, bool] | None = None, 

2558 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

2559 # Docstring inherited 

2560 if type(self) is not type(source_datastore): 

2561 raise TypeError( 

2562 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2563 f"source datastore ({type(source_datastore)})." 

2564 ) 

2565 

2566 # Be explicit for mypy 

2567 if not isinstance(source_datastore, FileDatastore): 

2568 raise TypeError( 

2569 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2570 f" {type(source_datastore)}" 

2571 ) 

2572 

2573 # Stop early if "direct" transfer mode is requested. That would 

2574 # require that the URI inside the source datastore should be stored 

2575 # directly in the target datastore, which seems unlikely to be useful 

2576 # since at any moment the source datastore could delete the file. 

2577 if transfer in ("direct", "split"): 

2578 raise ValueError( 

2579 f"Can not transfer from a source datastore using {transfer} mode since" 

2580 " those files are controlled by the other datastore." 

2581 ) 

2582 

2583 # Empty existence lookup if none given. 

2584 if artifact_existence is None: 

2585 artifact_existence = {} 

2586 

2587 # We will go through the list multiple times so must convert 

2588 # generators to lists. 

2589 refs = list(refs) 

2590 

2591 # In order to handle disassembled composites the code works 

2592 # at the records level since it can assume that internal APIs 

2593 # can be used. 

2594 # - If the record already exists in the destination this is assumed 

2595 # to be okay. 

2596 # - If there is no record but the source and destination URIs are 

2597 # identical no transfer is done but the record is added. 

2598 # - If the source record refers to an absolute URI currently assume 

2599 # that that URI should remain absolute and will be visible to the 

2600 # destination butler. May need to have a flag to indicate whether 

2601 # the dataset should be transferred. This will only happen if 

2602 # the detached Butler has had a local ingest. 

2603 

2604 # What we really want is all the records in the source datastore 

2605 # associated with these refs. Or derived ones if they don't exist 

2606 # in the source. 

2607 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2608 

2609 # The source dataset_ids are the keys in these records 

2610 source_ids = set(source_records) 

2611 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2612 

2613 requested_ids = {ref.id for ref in refs} 

2614 missing_ids = requested_ids - source_ids 

2615 

2616 # Missing IDs can be okay if that datastore has allowed 

2617 # gets based on file existence. Should we transfer what we can 

2618 # or complain about it and warn? 

2619 if missing_ids and not source_datastore.trustGetRequest: 

2620 raise ValueError( 

2621 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2622 ) 

2623 

2624 # Need to map these missing IDs to a DatasetRef so we can guess 

2625 # the details. 

2626 if missing_ids: 

2627 log.info( 

2628 "Number of expected datasets missing from source datastore records: %d out of %d", 

2629 len(missing_ids), 

2630 len(requested_ids), 

2631 ) 

2632 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2633 

2634 # This should be chunked in case we end up having to check 

2635 # the file store since we need some log output to show 

2636 # progress. 

2637 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2638 records = {} 

2639 for missing in missing_ids_chunk: 

2640 # Ask the source datastore where the missing artifacts 

2641 # should be. An execution butler might not know about the 

2642 # artifacts even if they are there. 

2643 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2644 records[missing] = [info for _, info in expected] 

2645 

2646 # Call the mexist helper method in case we have not already 

2647 # checked these artifacts such that artifact_existence is 

2648 # empty. This allows us to benefit from parallelism. 

2649 # datastore.mexists() itself does not give us access to the 

2650 # derived datastore record. 

2651 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2652 ref_exists = source_datastore._process_mexists_records( 

2653 id_to_ref, records, False, artifact_existence=artifact_existence 

2654 ) 

2655 

2656 # Now go through the records and propagate the ones that exist. 

2657 location_factory = source_datastore.locationFactory 

2658 for missing, record_list in records.items(): 

2659 # Skip completely if the ref does not exist. 

2660 ref = id_to_ref[missing] 

2661 if not ref_exists[ref]: 

2662 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2663 continue 

2664 # Check for file artifact to decide which parts of a 

2665 # disassembled composite do exist. If there is only a 

2666 # single record we don't even need to look because it can't 

2667 # be a composite and must exist. 

2668 if len(record_list) == 1: 

2669 dataset_records = record_list 

2670 else: 

2671 dataset_records = [ 

2672 record 

2673 for record in record_list 

2674 if artifact_existence[record.file_location(location_factory).uri] 

2675 ] 

2676 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2677 

2678 # Rely on source_records being a defaultdict. 

2679 source_records[missing].extend(dataset_records) 

2680 

2681 # See if we already have these records 

2682 target_records = self._get_stored_records_associated_with_refs(refs) 

2683 

2684 # The artifacts to register 

2685 artifacts = [] 

2686 

2687 # Refs that already exist 

2688 already_present = [] 

2689 

2690 # Refs that were rejected by this datastore. 

2691 rejected = set() 

2692 

2693 # Refs that were transferred successfully. 

2694 accepted = set() 

2695 

2696 # Record each time we have done a "direct" transfer. 

2697 direct_transfers = [] 

2698 

2699 # Now can transfer the artifacts 

2700 for ref in refs: 

2701 if not self.constraints.isAcceptable(ref): 

2702 # This datastore should not be accepting this dataset. 

2703 rejected.add(ref) 

2704 continue 

2705 

2706 accepted.add(ref) 

2707 

2708 if ref.id in target_records: 

2709 # Already have an artifact for this. 

2710 already_present.append(ref) 

2711 continue 

2712 

2713 # mypy needs to know these are always resolved refs 

2714 for info in source_records[ref.id]: 

2715 source_location = info.file_location(source_datastore.locationFactory) 

2716 target_location = info.file_location(self.locationFactory) 

2717 if source_location == target_location and not source_location.pathInStore.isabs(): 

2718 # Artifact is already in the target location. 

2719 # (which is how execution butler currently runs) 

2720 pass 

2721 else: 

2722 if target_location.pathInStore.isabs(): 

2723 # Just because we can see the artifact when running 

2724 # the transfer doesn't mean it will be generally 

2725 # accessible to a user of this butler. Need to decide 

2726 # what to do about an absolute path. 

2727 if transfer == "auto": 

2728 # For "auto" transfers we allow the absolute URI 

2729 # to be recorded in the target datastore. 

2730 direct_transfers.append(source_location) 

2731 else: 

2732 # The user is explicitly requesting a transfer 

2733 # even for an absolute URI. This requires us to 

2734 # calculate the target path. 

2735 template_ref = ref 

2736 if info.component: 

2737 template_ref = ref.makeComponentRef(info.component) 

2738 target_location = self._calculate_ingested_datastore_name( 

2739 source_location.uri, 

2740 template_ref, 

2741 ) 

2742 

2743 info = info.update(path=target_location.pathInStore.path) 

2744 

2745 # Need to transfer it to the new location. 

2746 # Assume we should always overwrite. If the artifact 

2747 # is there this might indicate that a previous transfer 

2748 # was interrupted but was not able to be rolled back 

2749 # completely (eg pre-emption) so follow Datastore default 

2750 # and overwrite. 

2751 target_location.uri.transfer_from( 

2752 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2753 ) 

2754 

2755 artifacts.append((ref, info)) 

2756 

2757 if direct_transfers: 

2758 log.info( 

2759 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s", 

2760 len(direct_transfers), 

2761 "" if len(direct_transfers) == 1 else "s", 

2762 ) 

2763 

2764 # We are overwriting previous datasets that may have already 

2765 # existed. We therefore should ensure that we force the 

2766 # datastore records to agree. Note that this can potentially lead 

2767 # to difficulties if the dataset has previously been ingested 

2768 # disassembled and is somehow now assembled, or vice versa. 

2769 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE) 

2770 

2771 if already_present: 

2772 n_skipped = len(already_present) 

2773 log.info( 

2774 "Skipped transfer of %d dataset%s already present in datastore", 

2775 n_skipped, 

2776 "" if n_skipped == 1 else "s", 

2777 ) 

2778 

2779 return accepted, rejected 
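
# Usage sketch (illustrative only; ``source_store``, ``target_store`` and
# ``refs`` assumed): both datastores must be FileDatastore instances, and the
# "direct"/"split" modes are rejected because those artifacts remain under the
# source datastore's control.
#
#     accepted, rejected = target_store.transfer_from(source_store, refs, transfer="copy")
#     # ``rejected`` holds refs this datastore's constraints refused to accept.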

2780 

2781 @transactional 

2782 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2783 # Docstring inherited. 

2784 refs = list(refs) 

2785 self.bridge.forget(refs) 

2786 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs]) 

2787 

2788 def validateConfiguration( 

2789 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

2790 ) -> None: 

2791 """Validate some of the configuration for this datastore. 

2792 

2793 Parameters 

2794 ---------- 

2795 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2796 Entities to test against this configuration. Can be differing 

2797 types. 

2798 logFailures : `bool`, optional 

2799 If `True`, output a log message for every validation error 

2800 detected. 

2801 

2802 Raises 

2803 ------ 

2804 DatastoreValidationError 

2805 Raised if there is a validation problem with a configuration. 

2806 All the problems are reported in a single exception. 

2807 

2808 Notes 

2809 ----- 

2810 This method checks that all the supplied entities have valid file 

2811 templates and also have formatters defined. 

2812 """ 

2813 templateFailed = None 

2814 try: 

2815 self.templates.validateTemplates(entities, logFailures=logFailures) 

2816 except FileTemplateValidationError as e: 

2817 templateFailed = str(e) 

2818 

2819 formatterFailed = [] 

2820 for entity in entities: 

2821 try: 

2822 self.formatterFactory.getFormatterClass(entity) 

2823 except KeyError as e: 

2824 formatterFailed.append(str(e)) 

2825 if logFailures: 

2826 log.critical("Formatter failure: %s", e) 

2827 

2828 if templateFailed or formatterFailed: 

2829 messages = [] 

2830 if templateFailed: 

2831 messages.append(templateFailed) 

2832 if formatterFailed: 

2833 messages.append(",".join(formatterFailed)) 

2834 msg = ";\n".join(messages) 

2835 raise DatastoreValidationError(msg) 
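
# Usage sketch (illustrative only; ``datastore`` and a ``DatasetType`` instance
# ``dataset_type`` assumed): validate templates and formatters up front rather
# than failing later on put.
#
#     try:
#         datastore.validateConfiguration([dataset_type], logFailures=True)
#     except DatastoreValidationError as err:
#         print(f"Configuration problem: {err}")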

2836 

2837 def getLookupKeys(self) -> set[LookupKey]: 

2838 # Docstring is inherited from base class 

2839 return ( 

2840 self.templates.getLookupKeys() 

2841 | self.formatterFactory.getLookupKeys() 

2842 | self.constraints.getLookupKeys() 

2843 ) 

2844 

2845 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

2846 # Docstring is inherited from base class 

2847 # The key can be valid in either formatters or templates so we can 

2848 # only check the template if it exists 

2849 if lookupKey in self.templates: 

2850 try: 

2851 self.templates[lookupKey].validateTemplate(entity) 

2852 except FileTemplateValidationError as e: 

2853 raise DatastoreValidationError(e) from e 

2854 

2855 def export( 

2856 self, 

2857 refs: Iterable[DatasetRef], 

2858 *, 

2859 directory: ResourcePathExpression | None = None, 

2860 transfer: str | None = "auto", 

2861 ) -> Iterable[FileDataset]: 

2862 # Docstring inherited from Datastore.export. 

2863 if transfer == "auto" and directory is None: 

2864 transfer = None 

2865 

2866 if transfer is not None and directory is None: 

2867 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2868 

2869 if transfer == "move": 

2870 raise TypeError("Can not export by moving files out of datastore.") 

2871 elif transfer == "direct": 

2872 # For an export, treat this as equivalent to None. We do not 

2873 # want an import to risk using absolute URIs to datasets owned 

2874 # by another datastore. 

2875 log.info("Treating 'direct' transfer mode as in-place export.") 

2876 transfer = None 

2877 

2878 # Force the directory to be a URI object 

2879 directoryUri: ResourcePath | None = None 

2880 if directory is not None: 

2881 directoryUri = ResourcePath(directory, forceDirectory=True) 

2882 

2883 if transfer is not None and directoryUri is not None and not directoryUri.exists(): 

2884 # mypy needs the second test 

2885 raise FileNotFoundError(f"Export location {directory} does not exist") 

2886 

2887 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2888 for ref in progress.wrap(refs, "Exporting dataset files"): 

2889 fileLocations = self._get_dataset_locations_info(ref) 

2890 if not fileLocations: 

2891 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2892 # For now we can not export disassembled datasets 

2893 if len(fileLocations) > 1: 

2894 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2895 location, storedFileInfo = fileLocations[0] 

2896 

2897 pathInStore = location.pathInStore.path 

2898 if transfer is None: 

2899 # TODO: do we also need to return the readStorageClass somehow? 

2900 # We will use the path in store directly. If this is an 

2901 # absolute URI, preserve it. 

2902 if location.pathInStore.isabs(): 

2903 pathInStore = str(location.uri) 

2904 elif transfer == "direct": 

2905 # Use full URIs to the remote store in the export 

2906 pathInStore = str(location.uri) 

2907 else: 

2908 # mypy needs help 

2909 assert directoryUri is not None, "directoryUri must be defined to get here" 

2910 storeUri = ResourcePath(location.uri) 

2911 

2912 # if the datastore has an absolute URI to a resource, we 

2913 # have two options: 

2914 # 1. Keep the absolute URI in the exported YAML 

2915 # 2. Allocate a new name in the local datastore and transfer 

2916 # it. 

2917 # For now go with option 2 

2918 if location.pathInStore.isabs(): 

2919 template = self.templates.getTemplate(ref) 

2920 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2921 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2922 

2923 exportUri = directoryUri.join(pathInStore) 

2924 exportUri.transfer_from(storeUri, transfer=transfer) 

2925 

2926 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 
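
# Usage sketch (illustrative only; ``datastore`` and ``refs`` assumed, and the
# export directory is hypothetical): the directory must already exist when a
# transfer mode is given, and disassembled datasets currently cannot be
# exported.
#
#     from lsst.resources import ResourcePath
#
#     export_dir = ResourcePath("/tmp/butler_export/", forceDirectory=True)
#     file_datasets = list(datastore.export(refs, directory=export_dir, transfer="copy"))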

2927 

2928 @staticmethod 

2929 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None: 

2930 """Compute the checksum of the supplied file. 

2931 

2932 Parameters 

2933 ---------- 

2934 uri : `lsst.resources.ResourcePath` 

2935 Name of resource to calculate checksum from. 

2936 algorithm : `str`, optional 

2937 Name of algorithm to use. Must be one of the algorithms supported 

2938 by :py:mod:`hashlib`. 

2939 block_size : `int` 

2940 Number of bytes to read from file at one time. 

2941 

2942 Returns 

2943 ------- 

2944 hexdigest : `str` 

2945 Hex digest of the file. 

2946 

2947 Notes 

2948 ----- 

2949 Currently returns `None` if the URI is for a remote resource. 

2950 """ 

2951 if algorithm not in hashlib.algorithms_guaranteed: 

2952 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib") 

2953 

2954 if not uri.isLocal: 

2955 return None 

2956 

2957 hasher = hashlib.new(algorithm) 

2958 

2959 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f: 

2960 for chunk in iter(lambda: f.read(block_size), b""): 

2961 hasher.update(chunk) 

2962 

2963 return hasher.hexdigest() 
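
# Usage sketch (illustrative only; the file path is hypothetical):
# ``computeChecksum`` is a staticmethod, so no datastore instance is needed,
# and it returns `None` for non-local URIs.
#
#     from lsst.resources import ResourcePath
#
#     digest = FileDatastore.computeChecksum(
#         ResourcePath("/tmp/example.fits"), algorithm="sha256"
#     )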

2964 

2965 def needs_expanded_data_ids( 

2966 self, 

2967 transfer: str | None, 

2968 entity: DatasetRef | DatasetType | StorageClass | None = None, 

2969 ) -> bool: 

2970 # Docstring inherited. 

2971 # This _could_ also use entity to inspect whether the filename template 

2972 # involves placeholders other than the required dimensions for its 

2973 # dataset type, but that's not necessary for correctness; it just 

2974 # enables more optimizations (perhaps only in theory). 

2975 return transfer not in ("direct", None) 

2976 

2977 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2978 # Docstring inherited from the base class. 

2979 record_data = data.get(self.name) 

2980 if not record_data: 

2981 return 

2982 

2983 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records) 

2984 

2985 # TODO: Verify that there are no unexpected table names in the dict? 

2986 unpacked_records = [] 

2987 for dataset_data in record_data.records.values(): 

2988 records = dataset_data.get(self._table.name) 

2989 if records: 

2990 for info in records: 

2991 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

2992 unpacked_records.append(info.to_record()) 

2993 if unpacked_records: 

2994 self._table.insert(*unpacked_records, transaction=self._transaction) 

2995 

2996 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

2997 # Docstring inherited from the base class. 

2998 exported_refs = list(self._bridge.check(refs)) 

2999 ids = {ref.id for ref in exported_refs} 

3000 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids} 

3001 for row in self._table.fetch(dataset_id=ids): 

3002 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

3003 dataset_records = records.setdefault(info.dataset_id, {}) 

3004 dataset_records.setdefault(self._table.name, []).append(info) 

3005 

3006 record_data = DatastoreRecordData(records=records) 

3007 return {self.name: record_data} 

3008 

3009 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

3010 # Docstring inherited from the base class. 

3011 self._retrieve_dataset_method = method 

3012 

3013 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef: 

3014 """Update dataset reference to use the storage class from registry.""" 

3015 if self._retrieve_dataset_method is None: 

3016 # We could raise an exception here but unit tests do not define 

3017 # this method. 

3018 return ref 

3019 dataset_type = self._retrieve_dataset_method(ref.datasetType.name) 

3020 if dataset_type is not None: 

3021 ref = ref.overrideStorageClass(dataset_type.storageClass) 

3022 return ref