Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%

909 statements  

coverage.py v7.4.0, created at 2024-01-16 10:44 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Generic file-based datastore code.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("FileDatastore",) 

33 

34import contextlib 

35import hashlib 

36import logging 

37from collections import defaultdict 

38from collections.abc import Callable, Iterable, Mapping, Sequence 

39from typing import TYPE_CHECKING, Any, ClassVar, cast 

40 

41from lsst.daf.butler import ( 

42 Config, 

43 DatasetId, 

44 DatasetRef, 

45 DatasetType, 

46 DatasetTypeNotSupportedError, 

47 FileDataset, 

48 FileDescriptor, 

49 Formatter, 

50 FormatterFactory, 

51 Location, 

52 LocationFactory, 

53 Progress, 

54 StorageClass, 

55 ddl, 

56) 

57from lsst.daf.butler.datastore import ( 

58 DatasetRefURIs, 

59 Datastore, 

60 DatastoreConfig, 

61 DatastoreOpaqueTable, 

62 DatastoreValidationError, 

63) 

64from lsst.daf.butler.datastore.cache_manager import ( 

65 AbstractDatastoreCacheManager, 

66 DatastoreCacheManager, 

67 DatastoreDisabledCacheManager, 

68) 

69from lsst.daf.butler.datastore.composites import CompositesMap 

70from lsst.daf.butler.datastore.file_templates import FileTemplates, FileTemplateValidationError 

71from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore 

72from lsst.daf.butler.datastore.record_data import DatastoreRecordData 

73from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo, StoredFileInfo 

74from lsst.daf.butler.datastores.file_datastore.get import ( 

75 DatasetLocationInformation, 

76 DatastoreFileGetInformation, 

77 generate_datastore_get_information, 

78 get_dataset_as_python_object_from_get_info, 

79) 

80from lsst.daf.butler.datastores.fileDatastoreClient import ( 

81 FileDatastoreGetPayload, 

82 FileDatastoreGetPayloadFileInfo, 

83) 

84from lsst.daf.butler.registry.interfaces import ( 

85 DatabaseInsertMode, 

86 DatastoreRegistryBridge, 

87 FakeDatasetRef, 

88 ReadOnlyDatabaseError, 

89) 

90from lsst.daf.butler.repo_relocation import replaceRoot 

91from lsst.daf.butler.utils import transactional 

92from lsst.resources import ResourcePath, ResourcePathExpression 

93from lsst.utils.introspection import get_class_of 

94from lsst.utils.iteration import chunk_iterable 

95 

96# For VERBOSE logging usage. 

97from lsst.utils.logging import VERBOSE, getLogger 

98from sqlalchemy import BigInteger, String 

99 

100if TYPE_CHECKING: 

101 from lsst.daf.butler import LookupKey 

102 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

103 

104log = getLogger(__name__) 

105 

106 

107class _IngestPrepData(Datastore.IngestPrepData): 

108 """Helper class for FileDatastore ingest implementation. 

109 

110 Parameters 

111 ---------- 

112 datasets : `~collections.abc.Iterable` of `FileDataset` 

113 Files to be ingested by this datastore. 

114 """ 

115 

116 def __init__(self, datasets: Iterable[FileDataset]): 

117 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

118 self.datasets = datasets 

119 

120 

121class FileDatastore(GenericBaseDatastore[StoredFileInfo]): 

122 """Generic Datastore for file-based implementations. 

123 

124 Should always be sub-classed since key abstract methods are missing. 

125 

126 Parameters 

127 ---------- 

128 config : `DatastoreConfig` or `str` 

129 Configuration as either a `Config` object or URI to file. 

130 bridgeManager : `DatastoreRegistryBridgeManager` 

131 Object that manages the interface between `Registry` and datastores. 

132 butlerRoot : `str`, optional 

133 New datastore root to use to override the configuration value. 

134 

135 Raises 

136 ------ 

137 ValueError 

138 If root location does not exist and ``create`` is `False` in the 

139 configuration. 

140 """ 

141 

142 defaultConfigFile: ClassVar[str | None] = None 

143 """Path to configuration defaults. Accessed within the ``config`` resource 

144 or relative to a search path. Can be None if no defaults specified. 

145 """ 

146 

147 root: ResourcePath 

148 """Root directory URI of this `Datastore`.""" 

149 

150 locationFactory: LocationFactory 

151 """Factory for creating locations relative to the datastore root.""" 

152 

153 formatterFactory: FormatterFactory 

154 """Factory for creating instances of formatters.""" 

155 

156 templates: FileTemplates 

157 """File templates that can be used by this `Datastore`.""" 

158 

159 composites: CompositesMap 

160 """Determines whether a dataset should be disassembled on put.""" 

161 

162 defaultConfigFile = "datastores/fileDatastore.yaml" 

163 """Path to configuration defaults. Accessed within the ``config`` resource 

164 or relative to a search path. Can be None if no defaults specified. 

165 """ 

166 

167 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None 

168 """Callable that is used in trusted mode to retrieve registry definition 

169 of a named dataset type. 

170 """ 

171 

172 @classmethod 

173 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

174 """Set any filesystem-dependent config options for this Datastore to 

175 be appropriate for a new empty repository with the given root. 

176 

177 Parameters 

178 ---------- 

179 root : `str` 

180 URI to the root of the data repository. 

181 config : `Config` 

182 A `Config` to update. Only the subset understood by 

183 this component will be updated. Will not expand 

184 defaults. 

185 full : `Config` 

186 A complete config with all defaults expanded that can be 

187 converted to a `DatastoreConfig`. Read-only and will not be 

188 modified by this method. 

189 Repository-specific options that should not be obtained 

190 from defaults when Butler instances are constructed 

191 should be copied from ``full`` to ``config``. 

192 overwrite : `bool`, optional 

193 If `False`, do not modify a value in ``config`` if the value 

194 already exists. Default is always to overwrite with the provided 

195 ``root``. 

196 

197 Notes 

198 ----- 

199 If a keyword is explicitly defined in the supplied ``config`` it 

200 will not be overridden by this method if ``overwrite`` is `False`. 

201 This allows explicit values set in external configs to be retained. 

202 """ 

203 Config.updateParameters( 

204 DatastoreConfig, 

205 config, 

206 full, 

207 toUpdate={"root": root}, 

208 toCopy=("cls", ("records", "table")), 

209 overwrite=overwrite, 

210 ) 

211 

212 @classmethod 

213 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

214 return ddl.TableSpec( 

215 fields=[ 

216 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

217 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

218 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

219 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

220 # Use empty string to indicate no component 

221 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

222 # TODO: should checksum be Base64Bytes instead? 

223 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

224 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

225 ], 

226 unique=frozenset(), 

227 indexes=[ddl.IndexSpec("path")], 

228 ) 

229 

230 def __init__( 

231 self, 

232 config: DatastoreConfig | ResourcePathExpression, 

233 bridgeManager: DatastoreRegistryBridgeManager, 

234 butlerRoot: str | None = None, 

235 ): 

236 super().__init__(config, bridgeManager) 

237 if "root" not in self.config: 

238 raise ValueError("No root directory specified in configuration") 

239 

240 # Name ourselves either using an explicit name or a name 

241 # derived from the (unexpanded) root 

242 if "name" in self.config: 

243 self.name = self.config["name"] 

244 else: 

245 # We use the unexpanded root in the name to indicate that this 

246 # datastore can be moved without having to update registry. 

247 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

248 

249 # Support repository relocation in config 

250 # Existence of self.root is checked in subclass 

251 self.root = ResourcePath( 

252 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

253 ) 

254 

255 self.locationFactory = LocationFactory(self.root) 

256 self.formatterFactory = FormatterFactory() 

257 

258 # Now associate formatters with storage classes 

259 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

260 

261 # Read the file naming templates 

262 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

263 

264 # See if composites should be disassembled 

265 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

266 

267 self._opaque_table_name = self.config["records", "table"] 

268 try: 

269 # Storage of paths and formatters, keyed by dataset_id 

270 self._table = bridgeManager.opaque.register( 

271 self._opaque_table_name, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

272 ) 

273 # Interface to Registry. 

274 self._bridge = bridgeManager.register(self.name) 

275 except ReadOnlyDatabaseError: 

276 # If the database is read only and we just tried and failed to 

277 # create a table, it means someone is trying to create a read-only 

278 # butler client for an empty repo. That should be okay, as long 

279 # as they then try to get any datasets before some other client 

280 # creates the table. Chances are they're just validating 

281 # configuration. 

282 pass 

283 

284 # Determine whether checksums should be used - default to False 

285 self.useChecksum = self.config.get("checksum", False) 

286 

287 # Determine whether we can fall back to configuration if a 

288 # requested dataset is not known to registry 

289 self.trustGetRequest = self.config.get("trust_get_request", False) 

290 

291 # Create a cache manager 

292 self.cacheManager: AbstractDatastoreCacheManager 

293 if "cached" in self.config: 

294 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

295 else: 

296 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

297 

298 # Check existence and create directory structure if necessary 

299 if not self.root.exists(): 

300 if "create" not in self.config or not self.config["create"]: 

301 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

302 try: 

303 self.root.mkdir() 

304 except Exception as e: 

305 raise ValueError( 

306 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

307 ) from e 

308 

309 def __str__(self) -> str: 

310 return str(self.root) 

311 

312 @property 

313 def bridge(self) -> DatastoreRegistryBridge: 

314 return self._bridge 

315 

316 @property 

317 def roots(self) -> dict[str, ResourcePath | None]: 

318 # Docstring inherited. 

319 return {self.name: self.root} 

320 

321 def _artifact_exists(self, location: Location) -> bool: 

322 """Check that an artifact exists in this datastore at the specified 

323 location. 

324 

325 Parameters 

326 ---------- 

327 location : `Location` 

328 Expected location of the artifact associated with this datastore. 

329 

330 Returns 

331 ------- 

332 exists : `bool` 

333 True if the location can be found, false otherwise. 

334 """ 

335 log.debug("Checking if resource exists: %s", location.uri) 

336 return location.uri.exists() 

337 

338 def _delete_artifact(self, location: Location) -> None: 

339 """Delete the artifact from the datastore. 

340 

341 Parameters 

342 ---------- 

343 location : `Location` 

344 Location of the artifact associated with this datastore. 

345 """ 

346 if location.pathInStore.isabs(): 

347 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

348 

349 try: 

350 location.uri.remove() 

351 except FileNotFoundError: 

352 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

353 raise 

354 except Exception as e: 

355 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

356 raise 

357 log.debug("Successfully deleted file: %s", location.uri) 

358 

359 def addStoredItemInfo( 

360 self, 

361 refs: Iterable[DatasetRef], 

362 infos: Iterable[StoredFileInfo], 

363 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT, 

364 ) -> None: 

365 """Record internal storage information associated with one or more 

366 datasets. 

367 

368 Parameters 

369 ---------- 

370 refs : sequence of `DatasetRef` 

371 The datasets that have been stored. 

372 infos : sequence of `StoredDatastoreItemInfo` 

373 Metadata associated with the stored datasets. 

374 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode` 

375 Mode to use to insert the new records into the table. The 

376 options are ``INSERT`` (error if pre-existing), ``REPLACE`` 

377 (replace content with new values), and ``ENSURE`` (skip if the row 

378 already exists). 

379 """ 

380 records = [ 

381 info.rebase(ref).to_record(dataset_id=ref.id) for ref, info in zip(refs, infos, strict=True) 

382 ] 

383 match insert_mode: 

384 case DatabaseInsertMode.INSERT: 

385 self._table.insert(*records, transaction=self._transaction) 

386 case DatabaseInsertMode.ENSURE: 

387 self._table.ensure(*records, transaction=self._transaction) 

388 case DatabaseInsertMode.REPLACE: 

389 self._table.replace(*records, transaction=self._transaction) 

390 case _: 

391 raise ValueError(f"Unknown insert mode of '{insert_mode}'") 

392 

393 def getStoredItemsInfo( 

394 self, ref: DatasetIdRef, ignore_datastore_records: bool = False 

395 ) -> list[StoredFileInfo]: 

396 """Retrieve information associated with files stored in this 

397 `Datastore` associated with this dataset ref. 

398 

399 Parameters 

400 ---------- 

401 ref : `DatasetRef` 

402 The dataset that is to be queried. 

403 ignore_datastore_records : `bool` 

404 If `True` then do not use datastore records stored in refs. 

405 

406 Returns 

407 ------- 

408 items : `~collections.abc.Iterable` [`StoredDatastoreItemInfo`] 

409 Stored information about the files and associated formatters 

410 associated with this dataset. Only one file will be returned 

411 if the dataset has not been disassembled. Can return an empty 

412 list if no matching datasets can be found. 

413 """ 

414 # Try to get them from the ref first. 

415 if ref._datastore_records is not None and not ignore_datastore_records: 

416 if (ref_records := ref._datastore_records.get(self._table.name)) is not None: 

417 # Need to make sure they have correct type. 

418 for record in ref_records: 

419 if not isinstance(record, StoredFileInfo): 

420 raise TypeError(f"Datastore record has unexpected type {record.__class__.__name__}") 

421 return cast(list[StoredFileInfo], ref_records) 

422 

423 # Look for the dataset_id -- there might be multiple matches 

424 # if we have disassembled the dataset. 

425 records = self._table.fetch(dataset_id=ref.id) 

426 return [StoredFileInfo.from_record(record) for record in records] 

427 

428 def _register_datasets( 

429 self, 

430 refsAndInfos: Iterable[tuple[DatasetRef, StoredFileInfo]], 

431 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT, 

432 ) -> None: 

433 """Update registry to indicate that one or more datasets have been 

434 stored. 

435 

436 Parameters 

437 ---------- 

438 refsAndInfos : sequence of `tuple` [`DatasetRef`,

439 `StoredDatastoreItemInfo`] 

440 Datasets to register and the internal datastore metadata associated 

441 with them. 

442 insert_mode : `DatabaseInsertMode`, optional

443 Indicate whether the new records should be new ("insert", default),

444 allowed to exist already ("ensure"), or replaced if already present

445 ("replace").

446 """ 

447 expandedRefs: list[DatasetRef] = [] 

448 expandedItemInfos: list[StoredFileInfo] = [] 

449 

450 for ref, itemInfo in refsAndInfos: 

451 expandedRefs.append(ref) 

452 expandedItemInfos.append(itemInfo) 

453 

454 # Dataset location only cares about registry ID so if we have 

455 # disassembled in datastore we have to deduplicate. Since they 

456 # will have different datasetTypes we can't use a set 

457 registryRefs = {r.id: r for r in expandedRefs} 

458 if insert_mode == DatabaseInsertMode.INSERT: 

459 self.bridge.insert(registryRefs.values()) 

460 else: 

461 # There are only two columns and all that matters is the 

462 # dataset ID. 

463 self.bridge.ensure(registryRefs.values()) 

464 self.addStoredItemInfo(expandedRefs, expandedItemInfos, insert_mode=insert_mode) 

465 

466 def _get_stored_records_associated_with_refs( 

467 self, refs: Iterable[DatasetIdRef], ignore_datastore_records: bool = False 

468 ) -> dict[DatasetId, list[StoredFileInfo]]: 

469 """Retrieve all records associated with the provided refs. 

470 

471 Parameters 

472 ---------- 

473 refs : iterable of `DatasetIdRef` 

474 The refs for which records are to be retrieved. 

475 ignore_datastore_records : `bool` 

476 If `True` then do not use datastore records stored in refs. 

477 

478 Returns 

479 ------- 

480 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

481 The matching records indexed by the ref ID. The number of entries 

482 in the dict can be smaller than the number of requested refs. 

483 """ 

484 # Check datastore records in refs first. 

485 records_by_ref: defaultdict[DatasetId, list[StoredFileInfo]] = defaultdict(list) 

486 refs_with_no_records = [] 

487 for ref in refs: 

488 if ignore_datastore_records or ref._datastore_records is None: 

489 refs_with_no_records.append(ref) 

490 else: 

491 if (ref_records := ref._datastore_records.get(self._table.name)) is not None: 

492 # Need to make sure they have correct type. 

493 for ref_record in ref_records: 

494 if not isinstance(ref_record, StoredFileInfo): 

495 raise TypeError( 

496 f"Datastore record has unexpected type {ref_record.__class__.__name__}" 

497 ) 

498 records_by_ref[ref.id].append(ref_record) 

499 

500 # If there were any refs without datastore records, check opaque table. 

501 records = self._table.fetch(dataset_id=[ref.id for ref in refs_with_no_records]) 

502 

503 # Uniqueness is dataset_id + component so can have multiple records 

504 # per ref. 

505 for record in records: 

506 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

507 return records_by_ref 

508 

509 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]: 

510 """Return paths and associated dataset refs. 

511 

512 Parameters 

513 ---------- 

514 paths : `list` of `str` or `lsst.resources.ResourcePath` 

515 All the paths to include in search. 

516 

517 Returns 

518 ------- 

519 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

520 Mapping of each path to a set of associated database IDs. 

521 """ 

522 records = self._table.fetch(path=[str(path) for path in paths]) 

523 result = defaultdict(set) 

524 for row in records: 

525 result[row["path"]].add(row["dataset_id"]) 

526 return result 

527 

528 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]: 

529 """Return all dataset refs associated with the supplied path. 

530 

531 Parameters 

532 ---------- 

533 pathInStore : `lsst.resources.ResourcePath` 

534 Path of interest in the data store. 

535 

536 Returns 

537 ------- 

538 ids : `set` [`DatasetId`]

539 All `DatasetRef` IDs associated with this path. 

540 """ 

541 records = list(self._table.fetch(path=str(pathInStore))) 

542 ids = {r["dataset_id"] for r in records} 

543 return ids 

544 

545 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

546 """Remove information about the file associated with this dataset. 

547 

548 Parameters 

549 ---------- 

550 ref : `DatasetRef` 

551 The dataset that has been removed. 

552 """ 

553 # Note that this method is actually not used by this implementation, 

554 # we depend on bridge to delete opaque records. But there are some 

555 # tests that check that this method works, so we keep it for now. 

556 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

557 

558 def _get_dataset_locations_info( 

559 self, ref: DatasetIdRef, ignore_datastore_records: bool = False 

560 ) -> list[DatasetLocationInformation]: 

561 r"""Find all the `Location`\ s of the requested dataset in the 

562 `Datastore` and the associated stored file information. 

563 

564 Parameters 

565 ---------- 

566 ref : `DatasetRef` 

567 Reference to the required `Dataset`. 

568 ignore_datastore_records : `bool` 

569 If `True` then do not use datastore records stored in refs. 

570 

571 Returns 

572 ------- 

573 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

574 Location of the dataset within the datastore and 

575 stored information about each file and its formatter. 

576 """ 

577 # Get the file information (this will fail if no file) 

578 records = self.getStoredItemsInfo(ref, ignore_datastore_records) 

579 

580 # Use the path to determine the location -- we need to take 

581 # into account absolute URIs in the datastore record 

582 return [(r.file_location(self.locationFactory), r) for r in records] 

583 

584 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

585 """Check that there is only one dataset associated with the 

586 specified artifact. 

587 

588 Parameters 

589 ---------- 

590 ref : `DatasetRef` or `FakeDatasetRef` 

591 Dataset to be removed. 

592 location : `Location` 

593 The location of the artifact to be removed. 

594 

595 Returns 

596 ------- 

597 can_remove : `bool`

598 `True` if the artifact can be safely removed.

599 """ 

600 # Can't ever delete absolute URIs. 

601 if location.pathInStore.isabs(): 

602 return False 

603 

604 # Get all entries associated with this path 

605 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

606 if not allRefs: 

607 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

608 

609 # Remove these refs from all the refs and if there is nothing left 

610 # then we can delete 

611 remainingRefs = allRefs - {ref.id} 

612 

613 if remainingRefs: 

614 return False 

615 return True 

616 

617 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]: 

618 """Predict the location and related file information of the requested 

619 dataset in this datastore. 

620 

621 Parameters 

622 ---------- 

623 ref : `DatasetRef` 

624 Reference to the required `Dataset`. 

625 

626 Returns 

627 ------- 

628 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

629 Expected Location of the dataset within the datastore and 

630 placeholder information about each file and its formatter. 

631 

632 Notes 

633 ----- 

634 Uses the current configuration to determine how we would expect the 

635 datastore files to have been written if we couldn't ask registry. 

636 This is safe so long as there has been no change to datastore 

637 configuration between writing the dataset and wanting to read it. 

638 Will not work for files that have been ingested without using the 

639 standard file template or default formatter. 

640 """ 

641 # If we have a component ref we always need to ask the questions 

642 # of the composite. If the composite is disassembled this routine 

643 # should return all components. If the composite was not 

644 # disassembled the composite is what is stored regardless of 

645 # component request. Note that if the caller has disassembled 

646 # a composite there is no way for this guess to know that 

647 # without trying both the composite and component ref and seeing 

648 # if there is something at the component Location even without 

649 # disassembly being enabled. 

650 if ref.datasetType.isComponent(): 

651 ref = ref.makeCompositeRef() 

652 

653 # See if the ref is a composite that should be disassembled 

654 doDisassembly = self.composites.shouldBeDisassembled(ref) 

655 

656 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = [] 

657 

658 if doDisassembly: 

659 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

660 compRef = ref.makeComponentRef(component) 

661 location, formatter = self._determine_put_formatter_location(compRef) 

662 all_info.append((location, formatter, componentStorage, component)) 

663 

664 else: 

665 # Always use the composite ref if no disassembly 

666 location, formatter = self._determine_put_formatter_location(ref) 

667 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

668 

669 # Convert the list of tuples to have StoredFileInfo as second element 

670 return [ 

671 ( 

672 location, 

673 StoredFileInfo( 

674 formatter=formatter, 

675 path=location.pathInStore.path, 

676 storageClass=storageClass, 

677 component=component, 

678 checksum=None, 

679 file_size=-1, 

680 ), 

681 ) 

682 for location, formatter, storageClass, component in all_info 

683 ] 

684 

685 def _prepare_for_direct_get( 

686 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None 

687 ) -> list[DatastoreFileGetInformation]: 

688 """Check parameters for ``get`` and obtain formatter and 

689 location. 

690 

691 Parameters 

692 ---------- 

693 ref : `DatasetRef` 

694 Reference to the required Dataset. 

695 parameters : `dict` 

696 `StorageClass`-specific parameters that specify, for example, 

697 a slice of the dataset to be loaded. 

698 

699 Returns 

700 ------- 

701 getInfo : `list` [`DatastoreFileGetInformation`] 

702 Parameters needed to retrieve each file. 

703 """ 

704 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

705 

706 # The storage class we want to use eventually 

707 refStorageClass = ref.datasetType.storageClass 

708 

709 # For trusted mode need to reset storage class. 

710 ref = self._cast_storage_class(ref) 

711 

712 # Get file metadata and internal metadata 

713 fileLocations = self._get_dataset_locations_info(ref) 

714 if not fileLocations: 

715 if not self.trustGetRequest: 

716 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

717 # Assume the dataset is where we think it should be 

718 fileLocations = self._get_expected_dataset_locations_info(ref) 

719 

720 if len(fileLocations) > 1: 

721 # If trust is involved it is possible that there will be 

722 # components listed here that do not exist in the datastore. 

723 # Explicitly check for file artifact existence and filter out any 

724 # that are missing. 

725 if self.trustGetRequest: 

726 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

727 

728 # For now complain only if we have no components at all. One 

729 # component is probably a problem but we can punt that to the 

730 # assembler. 

731 if not fileLocations: 

732 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

733 

734 return generate_datastore_get_information( 

735 fileLocations, 

736 readStorageClass=refStorageClass, 

737 ref=ref, 

738 parameters=parameters, 

739 ) 

740 

741 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]: 

742 """Check the arguments for ``put`` and obtain formatter and 

743 location. 

744 

745 Parameters 

746 ---------- 

747 inMemoryDataset : `object` 

748 The dataset to store. 

749 ref : `DatasetRef` 

750 Reference to the associated Dataset. 

751 

752 Returns 

753 ------- 

754 location : `Location` 

755 The location to write the dataset. 

756 formatter : `Formatter` 

757 The `Formatter` to use to write the dataset. 

758 

759 Raises 

760 ------ 

761 TypeError 

762 Supplied object and storage class are inconsistent. 

763 DatasetTypeNotSupportedError 

764 The associated `DatasetType` is not handled by this datastore. 

765 """ 

766 self._validate_put_parameters(inMemoryDataset, ref) 

767 return self._determine_put_formatter_location(ref) 

768 

769 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]: 

770 """Calculate the formatter and output location to use for put. 

771 

772 Parameters 

773 ---------- 

774 ref : `DatasetRef` 

775 Reference to the associated Dataset. 

776 

777 Returns 

778 ------- 

779 location : `Location` 

780 The location to write the dataset. 

781 formatter : `Formatter` 

782 The `Formatter` to use to write the dataset. 

783 """ 

784 # Work out output file name 

785 try: 

786 template = self.templates.getTemplate(ref) 

787 except KeyError as e: 

788 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

789 

790 # Validate the template to protect against filenames from different 

791 # dataIds returning the same and causing overwrite confusion. 

792 template.validateTemplate(ref) 

793 

794 location = self.locationFactory.fromPath(template.format(ref)) 

795 

796 # Get the formatter based on the storage class 

797 storageClass = ref.datasetType.storageClass 

798 try: 

799 formatter = self.formatterFactory.getFormatter( 

800 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

801 ) 

802 except KeyError as e: 

803 raise DatasetTypeNotSupportedError( 

804 f"Unable to find formatter for {ref} in datastore {self.name}" 

805 ) from e 

806 

807 # Now that we know the formatter, update the location 

808 location = formatter.makeUpdatedLocation(location) 

809 

810 return location, formatter 

811 

812 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None: 

813 # Docstring inherited from base class 

814 if transfer != "auto": 

815 return transfer 

816 

817 # See if the paths are within the datastore or not 

818 inside = [self._pathInStore(d.path) is not None for d in datasets] 

819 

820 if all(inside): 

821 transfer = None 

822 elif not any(inside): 

823 # Allow ResourcePath to use its own knowledge 

824 transfer = "auto" 

825 else: 

826 # This can happen when importing from a datastore that

827 # has had some datasets ingested using "direct" mode.

828 # Allow ResourcePath to sort it out, but warn about it

829 # because the files outside the datastore will only be

830 # referenced rather than copied into the target datastore.

831 log.warning( 

832 "Some datasets are inside the datastore and some are outside. Using 'split' " 

833 "transfer mode. This assumes that the files outside the datastore are " 

834 "still accessible to the new butler since they will not be copied into " 

835 "the target datastore." 

836 ) 

837 transfer = "split" 

838 

839 return transfer 

840 

841 def _pathInStore(self, path: ResourcePathExpression) -> str | None: 

842 """Return path relative to datastore root. 

843 

844 Parameters 

845 ---------- 

846 path : `lsst.resources.ResourcePathExpression` 

847 Path to dataset. Can be an absolute URI. If relative, it is

848 assumed to be relative to the datastore root. Paths outside

849 the datastore result in `None` being returned.

850 

851 Returns 

852 ------- 

853 inStore : `str` 

854 Path relative to datastore root. Returns `None` if the file is 

855 outside the root. 

856 """ 

857 # Relative path will always be relative to datastore 

858 pathUri = ResourcePath(path, forceAbsolute=False) 

859 return pathUri.relative_to(self.root) 

860 

861 def _standardizeIngestPath( 

862 self, path: str | ResourcePath, *, transfer: str | None = None 

863 ) -> str | ResourcePath: 

864 """Standardize the path of a to-be-ingested file. 

865 

866 Parameters 

867 ---------- 

868 path : `str` or `lsst.resources.ResourcePath` 

869 Path of a file to be ingested. This parameter does not accept

870 every type that can be used to construct a

871 `~lsst.resources.ResourcePath`.

872 transfer : `str`, optional 

873 How (and whether) the dataset should be added to the datastore. 

874 See `ingest` for details of transfer modes. 

875 This implementation is provided only so 

876 `NotImplementedError` can be raised if the mode is not supported; 

877 actual transfers are deferred to `_extractIngestInfo`. 

878 

879 Returns 

880 ------- 

881 path : `str` or `lsst.resources.ResourcePath` 

882 New path in what the datastore considers standard form. If an 

883 absolute URI was given, it will be returned unchanged.

884 

885 Notes 

886 ----- 

887 Subclasses of `FileDatastore` can implement this method instead 

888 of `_prepIngest`. It should not modify the data repository or given 

889 file in any way. 

890 

891 Raises 

892 ------ 

893 NotImplementedError 

894 Raised if the datastore does not support the given transfer mode 

895 (including the case where ingest is not supported at all). 

896 FileNotFoundError 

897 Raised if one of the given files does not exist. 

898 """ 

899 if transfer not in (None, "direct", "split") + self.root.transferModes: 

900 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

901 

902 # A relative URI indicates relative to datastore root 

903 srcUri = ResourcePath(path, forceAbsolute=False) 

904 if not srcUri.isabs(): 

905 srcUri = self.root.join(path) 

906 

907 if not srcUri.exists(): 

908 raise FileNotFoundError( 

909 f"Resource at {srcUri} does not exist; note that paths to ingest " 

910 f"are assumed to be relative to {self.root} unless they are absolute." 

911 ) 

912 

913 if transfer is None: 

914 relpath = srcUri.relative_to(self.root) 

915 if not relpath: 

916 raise RuntimeError( 

917 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

918 ) 

919 

920 # Return the relative path within the datastore for internal 

921 # transfer 

922 path = relpath 

923 

924 return path 

925 

926 def _extractIngestInfo( 

927 self, 

928 path: ResourcePathExpression, 

929 ref: DatasetRef, 

930 *, 

931 formatter: Formatter | type[Formatter], 

932 transfer: str | None = None, 

933 record_validation_info: bool = True, 

934 ) -> StoredFileInfo: 

935 """Relocate (if necessary) and extract `StoredFileInfo` from a 

936 to-be-ingested file. 

937 

938 Parameters 

939 ---------- 

940 path : `lsst.resources.ResourcePathExpression` 

941 URI or path of a file to be ingested. 

942 ref : `DatasetRef` 

943 Reference for the dataset being ingested. Guaranteed to have 

944 a ``dataset_id`` that is not `None`.

945 formatter : `type` or `Formatter` 

946 `Formatter` subclass to use for this dataset or an instance. 

947 transfer : `str`, optional 

948 How (and whether) the dataset should be added to the datastore. 

949 See `ingest` for details of transfer modes. 

950 record_validation_info : `bool`, optional 

951 If `True`, the default, the datastore can record validation 

952 information associated with the file. If `False` the datastore 

953 will not attempt to track any information such as checksums 

954 or file sizes. This can be useful if such information is tracked 

955 in an external system or if the file is to be compressed in place. 

956 It is up to the datastore whether this parameter is relevant. 

957 

958 Returns 

959 ------- 

960 info : `StoredFileInfo` 

961 Internal datastore record for this file. This will be inserted by 

962 the caller; the `_extractIngestInfo` is only responsible for 

963 creating and populating the struct. 

964 

965 Raises 

966 ------ 

967 FileNotFoundError 

968 Raised if one of the given files does not exist. 

969 FileExistsError 

970 Raised if transfer is not `None` but the (internal) location the 

971 file would be moved to is already occupied. 

972 """ 

973 if self._transaction is None: 

974 raise RuntimeError("Ingest called without transaction enabled") 

975 

976 # Create URI of the source path, do not need to force a relative 

977 # path to absolute. 

978 srcUri = ResourcePath(path, forceAbsolute=False) 

979 

980 # Track whether we have read the size of the source yet 

981 have_sized = False 

982 

983 tgtLocation: Location | None 

984 if transfer is None or transfer == "split": 

985 # A relative path is assumed to be relative to the datastore 

986 # in this context 

987 if not srcUri.isabs(): 

988 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

989 else: 

990 # Work out the path in the datastore from an absolute URI 

991 # This is required to be within the datastore. 

992 pathInStore = srcUri.relative_to(self.root) 

993 if pathInStore is None and transfer is None: 

994 raise RuntimeError( 

995 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

996 ) 

997 if pathInStore: 

998 tgtLocation = self.locationFactory.fromPath(pathInStore) 

999 elif transfer == "split": 

1000 # Outside the datastore but treat that as a direct ingest 

1001 # instead. 

1002 tgtLocation = None 

1003 else: 

1004 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

1005 elif transfer == "direct": 

1006 # Want to store the full URI to the resource directly in 

1007 # datastore. This is useful for referring to permanent archive 

1008 # storage for raw data. 

1009 # Trust that people know what they are doing. 

1010 tgtLocation = None 

1011 else: 

1012 # Work out the name we want this ingested file to have 

1013 # inside the datastore 

1014 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

1015 if not tgtLocation.uri.dirname().exists(): 

1016 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

1017 tgtLocation.uri.dirname().mkdir() 

1018 

1019 # if we are transferring from a local file to a remote location 

1020 # it may be more efficient to get the size and checksum of the 

1021 # local file rather than the transferred one 

1022 if record_validation_info and srcUri.isLocal: 

1023 size = srcUri.size() 

1024 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

1025 have_sized = True 

1026 

1027 # Transfer the resource to the destination. 

1028 # Allow overwrite of an existing file. This matches the behavior 

1029 # of datastore.put() in that it trusts that registry would not 

1030 # be asking to overwrite unless registry thought that the 

1031 # overwrite was allowed. 

1032 tgtLocation.uri.transfer_from( 

1033 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

1034 ) 

1035 

1036 if tgtLocation is None: 

1037 # This means we are using direct mode 

1038 targetUri = srcUri 

1039 targetPath = str(srcUri) 

1040 else: 

1041 targetUri = tgtLocation.uri 

1042 targetPath = tgtLocation.pathInStore.path 

1043 

1044 # the file should exist in the datastore now 

1045 if record_validation_info: 

1046 if not have_sized: 

1047 size = targetUri.size() 

1048 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

1049 else: 

1050 # Not recording any file information. 

1051 size = -1 

1052 checksum = None 

1053 

1054 return StoredFileInfo( 

1055 formatter=formatter, 

1056 path=targetPath, 

1057 storageClass=ref.datasetType.storageClass, 

1058 component=ref.datasetType.component(), 

1059 file_size=size, 

1060 checksum=checksum, 

1061 ) 

1062 

1063 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

1064 # Docstring inherited from Datastore._prepIngest. 

1065 filtered = [] 

1066 for dataset in datasets: 

1067 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

1068 if not acceptable: 

1069 continue 

1070 else: 

1071 dataset.refs = acceptable 

1072 if dataset.formatter is None: 

1073 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1074 else: 

1075 assert isinstance(dataset.formatter, type | str) 

1076 formatter_class = get_class_of(dataset.formatter) 

1077 if not issubclass(formatter_class, Formatter): 

1078 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1079 dataset.formatter = formatter_class 

1080 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1081 filtered.append(dataset) 

1082 return _IngestPrepData(filtered) 

1083 

1084 @transactional 

1085 def _finishIngest( 

1086 self, 

1087 prepData: Datastore.IngestPrepData, 

1088 *, 

1089 transfer: str | None = None, 

1090 record_validation_info: bool = True, 

1091 ) -> None: 

1092 # Docstring inherited from Datastore._finishIngest. 

1093 refsAndInfos = [] 

1094 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1095 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1096 # Do ingest as if the first dataset ref is associated with the file 

1097 info = self._extractIngestInfo( 

1098 dataset.path, 

1099 dataset.refs[0], 

1100 formatter=dataset.formatter, 

1101 transfer=transfer, 

1102 record_validation_info=record_validation_info, 

1103 ) 

1104 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1105 

1106 # In direct mode we can allow repeated ingests of the same thing 

1107 # if we are sure that the external dataset is immutable. We use 

1108 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are 

1109 # separated. 

1110 refs_and_infos_replace = [] 

1111 refs_and_infos_insert = [] 

1112 if transfer == "direct": 

1113 for entry in refsAndInfos: 

1114 if entry[0].id.version == 5: 

1115 refs_and_infos_replace.append(entry) 

1116 else: 

1117 refs_and_infos_insert.append(entry) 

1118 else: 

1119 refs_and_infos_insert = refsAndInfos 

1120 

1121 if refs_and_infos_insert: 

1122 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT) 

1123 if refs_and_infos_replace: 

1124 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE) 

1125 

1126 def _calculate_ingested_datastore_name( 

1127 self, 

1128 srcUri: ResourcePath, 

1129 ref: DatasetRef, 

1130 formatter: Formatter | type[Formatter] | None = None, 

1131 ) -> Location: 

1132 """Given a source URI and a DatasetRef, determine the name the 

1133 dataset will have inside datastore. 

1134 

1135 Parameters 

1136 ---------- 

1137 srcUri : `lsst.resources.ResourcePath` 

1138 URI to the source dataset file. 

1139 ref : `DatasetRef` 

1140 Ref associated with the newly-ingested dataset artifact. This 

1141 is used to determine the name within the datastore. 

1142 formatter : `Formatter` or `Formatter` class, optional

1143 Formatter to use for validation. Can be a class or an instance. 

1144 No validation of the file extension is performed if the 

1145 ``formatter`` is `None`. This can be used if the caller knows 

1146 that the source URI and target URI will use the same formatter. 

1147 

1148 Returns 

1149 ------- 

1150 location : `Location` 

1151 Target location for the newly-ingested dataset. 

1152 """ 

1153 # Ingesting a file from outside the datastore. 

1154 # This involves a new name. 

1155 template = self.templates.getTemplate(ref) 

1156 location = self.locationFactory.fromPath(template.format(ref)) 

1157 

1158 # Get the extension 

1159 ext = srcUri.getExtension() 

1160 

1161 # Update the destination to include that extension 

1162 location.updateExtension(ext) 

1163 

1164 # Ask the formatter to validate this extension 

1165 if formatter is not None: 

1166 formatter.validateExtension(location) 

1167 

1168 return location 

1169 

1170 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1171 """Write out in memory dataset to datastore. 

1172 

1173 Parameters 

1174 ---------- 

1175 inMemoryDataset : `object` 

1176 Dataset to write to datastore. 

1177 ref : `DatasetRef` 

1178 Registry information associated with this dataset. 

1179 

1180 Returns 

1181 ------- 

1182 info : `StoredFileInfo` 

1183 Information describing the artifact written to the datastore. 

1184 """ 

1185 # May need to coerce the in memory dataset to the correct 

1186 # python type, but first we need to make sure the storage class 

1187 # reflects the one defined in the data repository. 

1188 ref = self._cast_storage_class(ref) 

1189 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1190 

1191 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1192 uri = location.uri 

1193 

1194 if not uri.dirname().exists(): 

1195 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1196 uri.dirname().mkdir() 

1197 

1198 if self._transaction is None: 

1199 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1200 

1201 def _removeFileExists(uri: ResourcePath) -> None: 

1202 """Remove a file and do not complain if it is not there. 

1203 

1204 This is important since a formatter might fail before the file 

1205 is written and we should not confuse people by writing spurious 

1206 error messages to the log. 

1207 """ 

1208 with contextlib.suppress(FileNotFoundError): 

1209 uri.remove() 

1210 

1211 # Register a callback to try to delete the uploaded data if 

1212 # something fails below 

1213 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1214 

1215 data_written = False 

1216 

1217 # For remote URIs some datasets can be serialized directly 

1218 # to bytes and sent to the remote datastore without writing a 

1219 # file. If the dataset is intended to be saved to the cache 

1220 # a file is always written and direct write to the remote 

1221 # datastore is bypassed. 

1222 if not uri.isLocal and not self.cacheManager.should_be_cached(ref): 

1223 # Remote URI that is not cached so can write directly. 

1224 try: 

1225 serializedDataset = formatter.toBytes(inMemoryDataset) 

1226 except NotImplementedError: 

1227 # Fallback to the file writing option. 

1228 pass 

1229 except Exception as e: 

1230 raise RuntimeError( 

1231 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1232 ) from e 

1233 else: 

1234 log.debug("Writing bytes directly to %s", uri) 

1235 uri.write(serializedDataset, overwrite=True) 

1236 log.debug("Successfully wrote bytes directly to %s", uri) 

1237 data_written = True 

1238 

1239 if not data_written: 

1240 # Did not write the bytes directly to object store so instead 

1241 # write to temporary file. Always write to a temporary even if 

1242 # using a local file system -- that gives us atomic writes. 

1243 # If a process is killed as the file is being written we do not 

1244 # want it to remain in the correct place but in corrupt state. 

1245 # For local files write to the output directory not temporary dir. 

1246 prefix = uri.dirname() if uri.isLocal else None 

1247 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri: 

1248 # Need to configure the formatter to write to a different 

1249 # location and that needs us to overwrite internals 

1250 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1251 with formatter._updateLocation(Location(None, temporary_uri)): 

1252 try: 

1253 formatter.write(inMemoryDataset) 

1254 except Exception as e: 

1255 raise RuntimeError( 

1256 f"Failed to serialize dataset {ref} of type" 

1257 f" {type(inMemoryDataset)} to " 

1258 f"temporary location {temporary_uri}" 

1259 ) from e 

1260 

1261 # Use move for a local file since that becomes an efficient 

1262 # os.rename. For remote resources we use copy to allow the 

1263 # file to be cached afterwards. 

1264 transfer = "move" if uri.isLocal else "copy" 

1265 

1266 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True) 

1267 

1268 if transfer == "copy": 

1269 # Cache if required 

1270 self.cacheManager.move_to_cache(temporary_uri, ref) 

1271 

1272 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1273 

1274 # URI is needed to resolve what ingest case are we dealing with 

1275 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1276 

1277 def knows(self, ref: DatasetRef) -> bool: 

1278 """Check if the dataset is known to the datastore. 

1279 

1280 Does not check for existence of any artifact. 

1281 

1282 Parameters 

1283 ---------- 

1284 ref : `DatasetRef` 

1285 Reference to the required dataset. 

1286 

1287 Returns 

1288 ------- 

1289 exists : `bool` 

1290 `True` if the dataset is known to the datastore. 

1291 """ 

1292 # We cannot trust datastore records from ref, as many unit tests delete 

1293 # datasets and check their existence. 

1294 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True) 

1295 if fileLocations: 

1296 return True 

1297 return False 

1298 

1299 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

1300 # Docstring inherited from the base class. 

1301 

1302 # The records themselves. Could be missing some entries. 

1303 records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) 

1304 

1305 return {ref: ref.id in records for ref in refs} 

1306 

1307 def _process_mexists_records( 

1308 self, 

1309 id_to_ref: dict[DatasetId, DatasetRef], 

1310 records: dict[DatasetId, list[StoredFileInfo]], 

1311 all_required: bool, 

1312 artifact_existence: dict[ResourcePath, bool] | None = None, 

1313 ) -> dict[DatasetRef, bool]: 

1314 """Check given records for existence. 

1315 

1316 Helper function for `mexists()`. 

1317 

1318 Parameters 

1319 ---------- 

1320 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1321 Mapping of the dataset ID to the dataset ref itself. 

1322 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1323 Records as generally returned by 

1324 ``_get_stored_records_associated_with_refs``. 

1325 all_required : `bool` 

1326 Flag indicating whether all artifacts associated with a

1327 dataset ID must exist for that dataset to be reported as existing.

1328 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1329 Optional mapping of datastore artifact to existence. Updated by 

1330 this method with details of all artifacts tested. Can be `None` 

1331 if the caller is not interested. 

1332 

1333 Returns 

1334 ------- 

1335 existence : `dict` of [`DatasetRef`, `bool`] 

1336 Mapping from dataset to boolean indicating existence. 

1337 """ 

1338 # The URIs to be checked and a mapping of those URIs to 

1339 # the dataset ID. 

1340 uris_to_check: list[ResourcePath] = [] 

1341 location_map: dict[ResourcePath, DatasetId] = {} 

1342 

1343 location_factory = self.locationFactory 

1344 

1345 uri_existence: dict[ResourcePath, bool] = {} 

1346 for ref_id, infos in records.items(): 

1347 # Key is the dataset Id, value is list of StoredItemInfo 

1348 uris = [info.file_location(location_factory).uri for info in infos] 

1349 location_map.update({uri: ref_id for uri in uris}) 

1350 

1351 # Check the local cache directly for a dataset corresponding 

1352 # to the remote URI. 

1353 if self.cacheManager.file_count > 0: 

1354 ref = id_to_ref[ref_id] 

1355 for uri, storedFileInfo in zip(uris, infos, strict=True): 

1356 check_ref = ref 

1357 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1358 check_ref = ref.makeComponentRef(component) 

1359 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1360 # Proxy for URI existence. 

1361 uri_existence[uri] = True 

1362 else: 

1363 uris_to_check.append(uri) 

1364 else: 

1365 # Check all of them. 

1366 uris_to_check.extend(uris) 

1367 

1368 if artifact_existence is not None: 

1369 # If a URI has already been checked remove it from the list 

1370 # and immediately add the status to the output dict. 

1371 filtered_uris_to_check = [] 

1372 for uri in uris_to_check: 

1373 if uri in artifact_existence: 

1374 uri_existence[uri] = artifact_existence[uri] 

1375 else: 

1376 filtered_uris_to_check.append(uri) 

1377 uris_to_check = filtered_uris_to_check 

1378 

1379 # Results. 

1380 dataset_existence: dict[DatasetRef, bool] = {} 

1381 

1382 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1383 for uri, exists in uri_existence.items(): 

1384 dataset_id = location_map[uri] 

1385 ref = id_to_ref[dataset_id] 

1386 

1387 # Disassembled composite needs to check all locations. 

1388 # all_required indicates whether all need to exist or not. 

1389 if ref in dataset_existence: 

1390 if all_required: 

1391 exists = dataset_existence[ref] and exists 

1392 else: 

1393 exists = dataset_existence[ref] or exists 

1394 dataset_existence[ref] = exists 

1395 

1396 if artifact_existence is not None: 

1397 artifact_existence.update(uri_existence) 

1398 

1399 return dataset_existence 

1400 

1401 def mexists( 

1402 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1403 ) -> dict[DatasetRef, bool]: 

1404 """Check the existence of multiple datasets at once. 

1405 

1406 Parameters 

1407 ---------- 

1408 refs : iterable of `DatasetRef` 

1409 The datasets to be checked. 

1410 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1411 Optional mapping of datastore artifact to existence. Updated by 

1412 this method with details of all artifacts tested. Can be `None` 

1413 if the caller is not interested. 

1414 

1415 Returns 

1416 ------- 

1417 existence : `dict` of [`DatasetRef`, `bool`] 

1418 Mapping from dataset to boolean indicating existence. 

1419 

1420 Notes 

1421 ----- 

1422 To minimize potentially costly remote existence checks, the local 

1423 cache is checked as a proxy for existence. If a file for this 

1424 `DatasetRef` does exist no check is done for the actual URI. This 

1425 could result in possibly unexpected behavior if the dataset itself 

1426 has been removed from the datastore by another process whilst it is 

1427 still in the cache. 

1428 """ 

1429 chunk_size = 10_000 

1430 dataset_existence: dict[DatasetRef, bool] = {} 

1431 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1432 n_found_total = 0 

1433 n_checked = 0 

1434 n_chunks = 0 

1435 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1436 chunk_result = self._mexists(chunk, artifact_existence) 

1437 

1438 # The log message level and content depend on how many 

1439 # datasets we are processing. 

1440 n_results = len(chunk_result) 

1441 

1442 # Use verbose logging to ensure that messages can be seen 

1443 # easily if many refs are being checked. 

1444 log_threshold = VERBOSE 

1445 n_checked += n_results 

1446 

1447 # This sum can take some time so only do it if we know the 

1448 # result is going to be used. 

1449 n_found = 0 

1450 if log.isEnabledFor(log_threshold): 

1451 # Can treat the booleans as 0, 1 integers and sum them. 

1452 n_found = sum(chunk_result.values()) 

1453 n_found_total += n_found 

1454 

1455 # We are deliberately not trying to count the number of refs 

1456 # provided in case it's in the millions. This means there is a 

1457 # situation where the number of refs exactly matches the chunk 

1458 # size and we will switch to the multi-chunk path even though 

1459 # we only have a single chunk. 

1460 if n_results < chunk_size and n_chunks == 0: 

1461 # Single chunk will be processed so we can provide more detail. 

1462 if n_results == 1: 

1463 ref = list(chunk_result)[0] 

1464 # Use debug logging to be consistent with `exists()`. 

1465 log.debug( 

1466 "Calling mexists() with single ref that does%s exist (%s).", 

1467 "" if chunk_result[ref] else " not", 

1468 ref, 

1469 ) 

1470 else: 

1471 # Single chunk but multiple files. Summarize. 

1472 log.log( 

1473 log_threshold, 

1474 "Number of datasets found in datastore: %d out of %d datasets checked.", 

1475 n_found, 

1476 n_checked, 

1477 ) 

1478 

1479 else: 

1480 # Use incremental verbose logging when we have multiple chunks. 

1481 log.log( 

1482 log_threshold, 

1483 "Number of datasets found in datastore for chunk %d: %d out of %d checked " 

1484 "(running total from all chunks so far: %d found out of %d checked)", 

1485 n_chunks, 

1486 n_found, 

1487 n_results, 

1488 n_found_total, 

1489 n_checked, 

1490 ) 

1491 dataset_existence.update(chunk_result) 

1492 n_chunks += 1 

1493 

1494 return dataset_existence 

1495 
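# [Editor's sketch -- not part of the original source.] A minimal illustration
# of bulk existence checking with mexists(); `datastore` and `refs` are
# placeholder arguments (a configured FileDatastore and resolved DatasetRefs).

def find_missing(datastore: "FileDatastore", refs: list[DatasetRef]) -> list[DatasetRef]:
    """Return the refs whose file artifacts are missing from the datastore."""
    # Passing the same dict to later calls avoids re-checking URIs that have
    # already been tested.
    artifact_existence: dict[ResourcePath, bool] = {}
    existence = datastore.mexists(refs, artifact_existence)
    return [ref for ref, found in existence.items() if not found]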

1496 def _mexists( 

1497 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1498 ) -> dict[DatasetRef, bool]: 

1499 """Check the existence of multiple datasets at once. 

1500 

1501 Parameters 

1502 ---------- 

1503 refs : iterable of `DatasetRef` 

1504 The datasets to be checked. 

1505 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1506 Optional mapping of datastore artifact to existence. Updated by 

1507 this method with details of all artifacts tested. Can be `None` 

1508 if the caller is not interested. 

1509 

1510 Returns 

1511 ------- 

1512 existence : `dict` of [`DatasetRef`, `bool`] 

1513 Mapping from dataset to boolean indicating existence. 

1514 """ 

1515 # Make a mapping from refs with the internal storage class to the given 

1516 # refs that may have a different one. We'll use the internal refs 

1517 # throughout this method and convert back at the very end. 

1518 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs} 

1519 

1520 # Need a mapping of dataset_id to (internal) dataset ref since some 

1521 # internal APIs work with dataset_id. 

1522 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref} 

1523 

1524 # Set of all IDs we are checking for. 

1525 requested_ids = set(id_to_ref.keys()) 

1526 

1527 # The records themselves. Could be missing some entries. 

1528 records = self._get_stored_records_associated_with_refs( 

1529 id_to_ref.values(), ignore_datastore_records=True 

1530 ) 

1531 

1532 dataset_existence = self._process_mexists_records( 

1533 id_to_ref, records, True, artifact_existence=artifact_existence 

1534 ) 

1535 

1536 # Set of IDs that have been handled. 

1537 handled_ids = {ref.id for ref in dataset_existence} 

1538 

1539 missing_ids = requested_ids - handled_ids 

1540 if missing_ids: 

1541 dataset_existence.update( 

1542 self._mexists_check_expected( 

1543 [id_to_ref[missing] for missing in missing_ids], artifact_existence 

1544 ) 

1545 ) 

1546 

1547 return { 

1548 internal_ref_to_input_ref[internal_ref]: existence 

1549 for internal_ref, existence in dataset_existence.items() 

1550 } 

1551 

1552 def _mexists_check_expected( 

1553 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1554 ) -> dict[DatasetRef, bool]: 

1555 """Check existence of refs that are not known to datastore. 

1556 

1557 Parameters 

1558 ---------- 

1559 refs : iterable of `DatasetRef` 

1560 The datasets to be checked. These are assumed not to be known 

1561 to datastore. 

1562 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1563 Optional mapping of datastore artifact to existence. Updated by 

1564 this method with details of all artifacts tested. Can be `None` 

1565 if the caller is not interested. 

1566 

1567 Returns 

1568 ------- 

1569 existence : `dict` of [`DatasetRef`, `bool`] 

1570 Mapping from dataset to boolean indicating existence. 

1571 """ 

1572 dataset_existence: dict[DatasetRef, bool] = {} 

1573 if not self.trustGetRequest: 

1574 # Must assume these do not exist 

1575 for ref in refs: 

1576 dataset_existence[ref] = False 

1577 else: 

1578 log.debug( 

1579 "%d datasets were not known to datastore during initial existence check.", 

1580 len(refs), 

1581 ) 

1582 

1583 # Construct data structure identical to that returned 

1584 # by _get_stored_records_associated_with_refs() but using 

1585 # guessed names. 

1586 records = {} 

1587 id_to_ref = {} 

1588 for missing_ref in refs: 

1589 expected = self._get_expected_dataset_locations_info(missing_ref) 

1590 dataset_id = missing_ref.id 

1591 records[dataset_id] = [info for _, info in expected] 

1592 id_to_ref[dataset_id] = missing_ref 

1593 

1594 dataset_existence.update( 

1595 self._process_mexists_records( 

1596 id_to_ref, 

1597 records, 

1598 False, 

1599 artifact_existence=artifact_existence, 

1600 ) 

1601 ) 

1602 

1603 return dataset_existence 

1604 

1605 def exists(self, ref: DatasetRef) -> bool: 

1606 """Check if the dataset exists in the datastore. 

1607 

1608 Parameters 

1609 ---------- 

1610 ref : `DatasetRef` 

1611 Reference to the required dataset. 

1612 

1613 Returns 

1614 ------- 

1615 exists : `bool` 

1616 `True` if the entity exists in the `Datastore`. 

1617 

1618 Notes 

1619 ----- 

1620 The local cache is checked as a proxy for existence in the remote 

1621 object store. It is possible that another process on a different 

1622 compute node could remove the file from the object store even 

1623 though it is present in the local cache. 

1624 """ 

1625 ref = self._cast_storage_class(ref) 

1626 # We cannot trust datastore records from ref, as many unit tests delete 

1627 # datasets and check their existence. 

1628 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True) 

1629 

1630 # If we are being asked to trust that the registry might not be 

1631 # correct, we ask for the expected locations and check them explicitly. 

1633 if not self.trustGetRequest: 

1634 return False 

1635 

1636 # First check the cache. If it is not found we must check 

1637 # the datastore itself. Assume that any component in the cache 

1638 # means that the dataset does exist somewhere. 

1639 if self.cacheManager.known_to_cache(ref): 

1640 return True 

1641 

1642 # When we are guessing a dataset location we can not check 

1643 # for the existence of every component since we can not 

1644 # know if every component was written. Instead we check 

1645 # for the existence of any of the expected locations. 

1646 for location, _ in self._get_expected_dataset_locations_info(ref): 

1647 if self._artifact_exists(location): 

1648 return True 

1649 return False 

1650 

1651 # All listed artifacts must exist. 

1652 for location, storedFileInfo in fileLocations: 

1653 # Checking in cache needs the component ref. 

1654 check_ref = ref 

1655 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1656 check_ref = ref.makeComponentRef(component) 

1657 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1658 continue 

1659 

1660 if not self._artifact_exists(location): 

1661 return False 

1662 

1663 return True 

1664 
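# [Editor's sketch -- not part of the original source.] Single-ref existence
# check; `datastore` and `ref` are placeholders. A True result relies on the
# local cache as a proxy, so it does not guarantee the remote artifact still
# exists if another process removed it after it was cached.

def artifact_is_available(datastore: "FileDatastore", ref: DatasetRef) -> bool:
    # exists() is fine for a handful of refs; prefer mexists() for large
    # batches so that remote checks can be grouped.
    return datastore.exists(ref)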

1665 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

1666 """Return URIs associated with dataset. 

1667 

1668 Parameters 

1669 ---------- 

1670 ref : `DatasetRef` 

1671 Reference to the required dataset. 

1672 predict : `bool`, optional 

1673 If the datastore does not know about the dataset, controls whether 

1674 it should return a predicted URI or not. 

1675 

1676 Returns 

1677 ------- 

1678 uris : `DatasetRefURIs` 

1679 The URI to the primary artifact associated with this dataset (if 

1680 the dataset was disassembled within the datastore this may be 

1681 `None`), and the URIs to any components associated with the dataset 

1682 artifact (this can be empty if there are no components). 

1683 """ 

1684 many = self.getManyURIs([ref], predict=predict, allow_missing=False) 

1685 return many[ref] 

1686 

1687 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1688 """URI to the Dataset. 

1689 

1690 Parameters 

1691 ---------- 

1692 ref : `DatasetRef` 

1693 Reference to the required Dataset. 

1694 predict : `bool` 

1695 If `True`, allow URIs to be returned of datasets that have not 

1696 been written. 

1697 

1698 Returns 

1699 ------- 

1700 uri : `lsst.resources.ResourcePath` 

1701 URI pointing to the dataset within the datastore. If the 

1702 dataset does not exist in the datastore, and if ``predict`` is 

1703 `True`, the URI will be a prediction and will include a URI 

1704 fragment "#predicted". 

1705 If the datastore does not have entities that relate well 

1706 to the concept of a URI the returned URI will be 

1707 descriptive. The returned URI is not guaranteed to be obtainable. 

1708 

1709 Raises 

1710 ------ 

1711 FileNotFoundError 

1712 Raised if a URI has been requested for a dataset that does not 

1713 exist and guessing is not allowed. 

1714 RuntimeError 

1715 Raised if a request is made for a single URI but multiple URIs 

1716 are associated with this dataset. 

1717 

1718 Notes 

1719 ----- 

1720 When a predicted URI is requested an attempt will be made to form 

1721 a reasonable URI based on file templates and the expected formatter. 

1722 """ 

1723 primary, components = self.getURIs(ref, predict) 

1724 if primary is None or components: 

1725 raise RuntimeError( 

1726 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1727 ) 

1728 return primary 

1729 
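# [Editor's sketch -- not part of the original source.] Looking up URIs,
# including predicted ones; `datastore` and `ref` are placeholders.

def describe_location(datastore: "FileDatastore", ref: DatasetRef) -> None:
    uris = datastore.getURIs(ref, predict=True)
    if uris.primaryURI is not None:
        # Predicted locations carry a "#predicted" URI fragment.
        print("primary:", uris.primaryURI)
    for component, uri in uris.componentURIs.items():
        print(f"component {component}:", uri)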

1730 def _predict_URIs( 

1731 self, 

1732 ref: DatasetRef, 

1733 ) -> DatasetRefURIs: 

1734 """Predict the URIs of a dataset ref. 

1735 

1736 Parameters 

1737 ---------- 

1738 ref : `DatasetRef` 

1739 Reference to the required Dataset. 

1740 

1741 Returns 

1742 ------- 

1743 uris : `DatasetRefURIs` 

1744 Primary and component URIs. URIs will contain a URI fragment 

1745 "#predicted". 

1746 """ 

1747 uris = DatasetRefURIs() 

1748 

1749 if self.composites.shouldBeDisassembled(ref): 

1750 for component, _ in ref.datasetType.storageClass.components.items(): 

1751 comp_ref = ref.makeComponentRef(component) 

1752 comp_location, _ = self._determine_put_formatter_location(comp_ref) 

1753 

1754 # Add the "#predicted" URI fragment to indicate this is a 

1755 # guess 

1756 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted") 

1757 

1758 else: 

1759 location, _ = self._determine_put_formatter_location(ref) 

1760 

1761 # Add the "#predicted" URI fragment to indicate this is a guess 

1762 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted") 

1763 

1764 return uris 

1765 

1766 def getManyURIs( 

1767 self, 

1768 refs: Iterable[DatasetRef], 

1769 predict: bool = False, 

1770 allow_missing: bool = False, 

1771 ) -> dict[DatasetRef, DatasetRefURIs]: 

1772 # Docstring inherited 

1773 

1774 uris: dict[DatasetRef, DatasetRefURIs] = {} 

1775 

1776 records = self._get_stored_records_associated_with_refs(refs) 

1777 records_keys = records.keys() 

1778 

1779 existing_refs = tuple(ref for ref in refs if ref.id in records_keys) 

1780 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys) 

1781 

1782 # Have to handle trustGetRequest mode by checking for the existence 

1783 # of the missing refs on disk. 

1784 if missing_refs: 

1785 dataset_existence = self._mexists_check_expected(missing_refs, None) 

1786 really_missing = set() 

1787 not_missing = set() 

1788 for ref, exists in dataset_existence.items(): 

1789 if exists: 

1790 not_missing.add(ref) 

1791 else: 

1792 really_missing.add(ref) 

1793 

1794 if not_missing: 

1795 # Need to recalculate the missing/existing split. 

1796 existing_refs = existing_refs + tuple(not_missing) 

1797 missing_refs = tuple(really_missing) 

1798 

1799 for ref in missing_refs: 

1800 # if this has never been written then we have to guess 

1801 if not predict: 

1802 if not allow_missing: 

1803 raise FileNotFoundError(f"Dataset {ref} not in this datastore.") 

1804 else: 

1805 uris[ref] = self._predict_URIs(ref) 

1806 

1807 for ref in existing_refs: 

1808 file_infos = records[ref.id] 

1809 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos] 

1810 uris[ref] = self._locations_to_URI(ref, file_locations) 

1811 

1812 return uris 

1813 
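# [Editor's sketch -- not part of the original source.] Bulk URI lookup using
# allow_missing so unknown refs are skipped instead of raising
# FileNotFoundError; `datastore` and `refs` are placeholders.

def known_primary_uris(
    datastore: "FileDatastore", refs: list[DatasetRef]
) -> dict[DatasetRef, ResourcePath]:
    many = datastore.getManyURIs(refs, predict=False, allow_missing=True)
    return {ref: uris.primaryURI for ref, uris in many.items() if uris.primaryURI is not None}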

1814 def _locations_to_URI( 

1815 self, 

1816 ref: DatasetRef, 

1817 file_locations: Sequence[tuple[Location, StoredFileInfo]], 

1818 ) -> DatasetRefURIs: 

1819 """Convert one or more file locations associated with a DatasetRef 

1820 to a DatasetRefURIs. 

1821 

1822 Parameters 

1823 ---------- 

1824 ref : `DatasetRef` 

1825 Reference to the dataset. 

1826 file_locations : Sequence[Tuple[Location, StoredFileInfo]] 

1827 Each item in the sequence is the location of the dataset within the 

1828 datastore and stored information about the file and its formatter. 

1829 If there is only one item in the sequence then it is treated as the 

1830 primary URI. If there is more than one item then they are treated 

1831 as component URIs. If there are no items then an error is raised 

1832 unless ``self.trustGetRequest`` is `True`. 

1833 

1834 Returns 

1835 ------- 

1836 uris : `DatasetRefURIs` 

1837 Represents the primary URI or component URIs described by the 

1838 inputs. 

1839 

1840 Raises 

1841 ------ 

1842 RuntimeError 

1843 If no file locations are passed in and ``self.trustGetRequest`` is 

1844 `False`. 

1845 FileNotFoundError 

1846 If a passed-in URI does not exist, and ``self.trustGetRequest`` 

1847 is `False`. 

1848 RuntimeError 

1849 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is 

1850 unexpected). 

1851 """ 

1852 guessing = False 

1853 uris = DatasetRefURIs() 

1854 

1855 if not file_locations: 

1856 if not self.trustGetRequest: 

1857 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1858 file_locations = self._get_expected_dataset_locations_info(ref) 

1859 guessing = True 

1860 

1861 if len(file_locations) == 1: 

1862 # No disassembly so this is the primary URI 

1863 uris.primaryURI = file_locations[0][0].uri 

1864 if guessing and not uris.primaryURI.exists(): 

1865 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist") 

1866 else: 

1867 for location, file_info in file_locations: 

1868 if file_info.component is None: 

1869 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1870 if guessing and not location.uri.exists(): 

1871 # If we are trusting then it is entirely possible for 

1872 # some components to be missing. In that case we skip 

1873 # to the next component. 

1874 if self.trustGetRequest: 

1875 continue 

1876 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist") 

1877 uris.componentURIs[file_info.component] = location.uri 

1878 

1879 return uris 

1880 

1881 def retrieveArtifacts( 

1882 self, 

1883 refs: Iterable[DatasetRef], 

1884 destination: ResourcePath, 

1885 transfer: str = "auto", 

1886 preserve_path: bool = True, 

1887 overwrite: bool = False, 

1888 ) -> list[ResourcePath]: 

1889 """Retrieve the file artifacts associated with the supplied refs. 

1890 

1891 Parameters 

1892 ---------- 

1893 refs : iterable of `DatasetRef` 

1894 The datasets for which file artifacts are to be retrieved. 

1895 A single ref can result in multiple files. The refs must 

1896 be resolved. 

1897 destination : `lsst.resources.ResourcePath` 

1898 Location to write the file artifacts. 

1899 transfer : `str`, optional 

1900 Method to use to transfer the artifacts. Must be one of the options 

1901 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1902 "move" is not allowed. 

1903 preserve_path : `bool`, optional 

1904 If `True` the full path of the file artifact within the datastore 

1905 is preserved. If `False` the final file component of the path 

1906 is used. 

1907 overwrite : `bool`, optional 

1908 If `True` allow transfers to overwrite existing files at the 

1909 destination. 

1910 

1911 Returns 

1912 ------- 

1913 targets : `list` of `lsst.resources.ResourcePath` 

1914 URIs of file artifacts in destination location. Order is not 

1915 preserved. 

1916 """ 

1917 if not destination.isdir(): 

1918 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1919 

1920 if transfer == "move": 

1921 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1922 

1923 # Source -> Destination 

1924 # This also helps filter out duplicate DatasetRef in the request 

1925 # that will map to the same underlying file transfer. 

1926 to_transfer: dict[ResourcePath, ResourcePath] = {} 

1927 

1928 for ref in refs: 

1929 locations = self._get_dataset_locations_info(ref) 

1930 for location, _ in locations: 

1931 source_uri = location.uri 

1932 target_path: ResourcePathExpression 

1933 if preserve_path: 

1934 target_path = location.pathInStore 

1935 if target_path.isabs(): 

1936 # This is an absolute path to an external file. 

1937 # Use the full path. 

1938 target_path = target_path.relativeToPathRoot 

1939 else: 

1940 target_path = source_uri.basename() 

1941 target_uri = destination.join(target_path) 

1942 to_transfer[source_uri] = target_uri 

1943 

1944 # In theory can now parallelize the transfer 

1945 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

1946 for source_uri, target_uri in to_transfer.items(): 

1947 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1948 

1949 return list(to_transfer.values()) 

1950 
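# [Editor's sketch -- not part of the original source.] Copying file artifacts
# into a local directory; `datastore`, `refs`, and the destination path are
# placeholders.

def copy_artifacts(datastore: "FileDatastore", refs: list[DatasetRef]) -> list[ResourcePath]:
    destination = ResourcePath("/tmp/butler_artifacts/", forceDirectory=True)
    return datastore.retrieveArtifacts(
        refs,
        destination,
        transfer="copy",      # "move" is rejected by this method
        preserve_path=True,   # keep the datastore-relative paths
        overwrite=False,
    )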

1951 def get( 

1952 self, 

1953 ref: DatasetRef, 

1954 parameters: Mapping[str, Any] | None = None, 

1955 storageClass: StorageClass | str | None = None, 

1956 ) -> Any: 

1957 """Load an InMemoryDataset from the store. 

1958 

1959 Parameters 

1960 ---------- 

1961 ref : `DatasetRef` 

1962 Reference to the required Dataset. 

1963 parameters : `dict` 

1964 `StorageClass`-specific parameters that specify, for example, 

1965 a slice of the dataset to be loaded. 

1966 storageClass : `StorageClass` or `str`, optional 

1967 The storage class to be used to override the Python type 

1968 returned by this method. By default the returned type matches 

1969 the dataset type definition for this dataset. Specifying a 

1970 read `StorageClass` can force a different type to be returned. 

1971 This type must be compatible with the original type. 

1972 

1973 Returns 

1974 ------- 

1975 inMemoryDataset : `object` 

1976 Requested dataset or slice thereof as an InMemoryDataset. 

1977 

1978 Raises 

1979 ------ 

1980 FileNotFoundError 

1981 Requested dataset can not be retrieved. 

1982 TypeError 

1983 Return value from formatter has unexpected type. 

1984 ValueError 

1985 Formatter failed to process the dataset. 

1986 """ 

1987 # Supplied storage class for the component being read is either 

1988 # from the ref itself or an override if we want to force 

1989 # type conversion. 

1990 if storageClass is not None: 

1991 ref = ref.overrideStorageClass(storageClass) 

1992 

1993 allGetInfo = self._prepare_for_direct_get(ref, parameters) 

1994 return get_dataset_as_python_object_from_get_info( 

1995 allGetInfo, ref=ref, parameters=parameters, cache_manager=self.cacheManager 

1996 ) 

1997 
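# [Editor's sketch -- not part of the original source.] Reading a dataset with
# an optional read-time storage class override; `datastore`, `ref`, and the
# storage class name are placeholders and must be compatible with the dataset.

def read_as(datastore: "FileDatastore", ref: DatasetRef, storage_class: str | None = None) -> Any:
    # StorageClass-specific read parameters (e.g. a subset selection) are
    # omitted here for brevity.
    return datastore.get(ref, storageClass=storage_class)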

1998 def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload: 

1999 # Docstring inherited 

2000 

2001 # 1 hour. Chosen somewhat arbitrarily -- this is long enough that the 

2002 # client should have time to download a large file with retries if 

2003 # needed, but short enough that it will become obvious quickly that 

2004 # these URLs expire. 

2005 # From a strictly technical standpoint there is no reason this 

2006 # shouldn't be a day or more, but there seems to be a political issue 

2007 # where people think there is a risk of end users posting presigned 

2008 # URLs for people without access rights to download. 

2009 url_expiration_time_seconds = 1 * 60 * 60 

2010 

2011 def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo: 

2012 location, file_info = info 

2013 return FileDatastoreGetPayloadFileInfo( 

2014 url=location.uri.generate_presigned_get_url( 

2015 expiration_time_seconds=url_expiration_time_seconds 

2016 ), 

2017 datastoreRecords=file_info.to_simple(), 

2018 ) 

2019 

2020 return FileDatastoreGetPayload( 

2021 datastore_type="file", 

2022 dataset_ref=ref.to_simple(), 

2023 file_info=[to_file_info_payload(info) for info in self._get_dataset_locations_info(ref)], 

2024 ) 

2025 

2026 @transactional 

2027 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2028 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2029 

2030 Parameters 

2031 ---------- 

2032 inMemoryDataset : `object` 

2033 The dataset to store. 

2034 ref : `DatasetRef` 

2035 Reference to the associated Dataset. 

2036 

2037 Raises 

2038 ------ 

2039 TypeError 

2040 Supplied object and storage class are inconsistent. 

2041 DatasetTypeNotSupportedError 

2042 The associated `DatasetType` is not handled by this datastore. 

2043 

2044 Notes 

2045 ----- 

2046 If the datastore is configured to reject certain dataset types it 

2047 is possible that the put will fail and raise a 

2048 `DatasetTypeNotSupportedError`. The main use case for this is to 

2049 allow `ChainedDatastore` to put to multiple datastores without 

2050 requiring that every datastore accepts the dataset. 

2051 """ 

2052 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2053 # doDisassembly = True 

2054 

2055 artifacts = [] 

2056 if doDisassembly: 

2057 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2058 if components is None: 

2059 raise RuntimeError( 

2060 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2061 f"with storage class {ref.datasetType.storageClass.name} " 

2062 "is configured to be disassembled, but cannot be." 

2063 ) 

2064 for component, componentInfo in components.items(): 

2065 # Don't recurse because we want to take advantage of 

2066 # bulk insert -- we need a new DatasetRef that refers to the 

2067 # same dataset_id but has the component DatasetType. 

2068 # DatasetType does not refer to the types of its components, 

2069 # so we construct the component ref ourselves. 

2070 compRef = ref.makeComponentRef(component) 

2071 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2072 artifacts.append((compRef, storedInfo)) 

2073 else: 

2074 # Write the entire thing out 

2075 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2076 artifacts.append((ref, storedInfo)) 

2077 

2078 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT) 

2079 
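# [Editor's sketch -- not part of the original source.] Writing an in-memory
# object; whether it is disassembled into per-component artifacts is decided
# by the datastore's composites configuration, not by the caller.
# `datastore`, `obj`, and `ref` are placeholders.

def store_object(datastore: "FileDatastore", obj: Any, ref: DatasetRef) -> bool:
    try:
        datastore.put(obj, ref)
    except DatasetTypeNotSupportedError:
        # A datastore configured to reject this dataset type refuses the put;
        # ChainedDatastore relies on this to route datasets between stores.
        return False
    return True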

2080 @transactional 

2081 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]: 

2082 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2083 # doDisassembly = True 

2084 

2085 artifacts = [] 

2086 if doDisassembly: 

2087 components = ref.datasetType.storageClass.delegate().disassemble(in_memory_dataset) 

2088 if components is None: 

2089 raise RuntimeError( 

2090 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2091 f"with storage class {ref.datasetType.storageClass.name} " 

2092 "is configured to be disassembled, but cannot be." 

2093 ) 

2094 for component, componentInfo in components.items(): 

2095 # Don't recurse because we want to take advantage of 

2096 # bulk insert -- we need a new DatasetRef that refers to the 

2097 # same dataset_id but has the component DatasetType. 

2098 # DatasetType does not refer to the types of its components, 

2099 # so we construct the component ref ourselves. 

2100 compRef = ref.makeComponentRef(component) 

2101 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2102 artifacts.append((compRef, storedInfo)) 

2103 else: 

2104 # Write the entire thing out 

2105 storedInfo = self._write_in_memory_to_artifact(in_memory_dataset, ref) 

2106 artifacts.append((ref, storedInfo)) 

2107 

2108 ref_records = {self._opaque_table_name: [info for _, info in artifacts]} 

2109 ref = ref.replace(datastore_records=ref_records) 

2110 return {self.name: ref} 

2111 

2112 @transactional 

2113 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

2114 # At this point can safely remove these datasets from the cache 

2115 # to avoid confusion later on. If they are not trashed later 

2116 # the cache will simply be refilled. 

2117 self.cacheManager.remove_from_cache(ref) 

2118 

2119 # If we are in trust mode there will be nothing to move to 

2120 # the trash table and we will have to try to delete the file 

2121 # immediately. 

2122 if self.trustGetRequest: 

2123 # Try to keep the logic below for a single file trash. 

2124 if isinstance(ref, DatasetRef): 

2125 refs = {ref} 

2126 else: 

2127 # Will recreate ref at the end of this branch. 

2128 refs = set(ref) 

2129 

2130 # Determine which datasets are known to datastore directly. 

2131 id_to_ref = {ref.id: ref for ref in refs} 

2132 existing_ids = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) 

2133 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2134 

2135 missing = refs - existing_refs 

2136 if missing: 

2137 # Do an explicit existence check on these refs. 

2138 # We only care about the artifacts at this point and not 

2139 # the dataset existence. 

2140 artifact_existence: dict[ResourcePath, bool] = {} 

2141 _ = self.mexists(missing, artifact_existence) 

2142 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2143 

2144 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2145 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2146 for uri in uris: 

2147 try: 

2148 uri.remove() 

2149 except Exception as e: 

2150 if ignore_errors: 

2151 log.debug("Artifact %s could not be removed: %s", uri, e) 

2152 continue 

2153 raise 

2154 

2155 # There is no point asking the code below to remove refs we 

2156 # know are missing so update it with the list of existing 

2157 # records. Try to retain one vs many logic. 

2158 if not existing_refs: 

2159 # Nothing more to do since none of the datasets were 

2160 # known to the datastore record table. 

2161 return 

2162 ref = list(existing_refs) 

2163 if len(ref) == 1: 

2164 ref = ref[0] 

2165 

2166 # Get file metadata and internal metadata 

2167 if not isinstance(ref, DatasetRef): 

2168 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2169 # Assumed to be an iterable of refs so bulk mode enabled. 

2170 try: 

2171 self.bridge.moveToTrash(ref, transaction=self._transaction) 

2172 except Exception as e: 

2173 if ignore_errors: 

2174 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2175 else: 

2176 raise 

2177 return 

2178 

2179 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2180 

2181 fileLocations = self._get_dataset_locations_info(ref) 

2182 

2183 if not fileLocations: 

2184 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2185 if ignore_errors: 

2186 log.warning(err_msg) 

2187 return 

2188 else: 

2189 raise FileNotFoundError(err_msg) 

2190 

2191 for location, _ in fileLocations: 

2192 if not self._artifact_exists(location): 

2193 err_msg = ( 

2194 f"Dataset is known to datastore {self.name} but " 

2195 f"associated artifact ({location.uri}) is missing" 

2196 ) 

2197 if ignore_errors: 

2198 log.warning(err_msg) 

2199 return 

2200 else: 

2201 raise FileNotFoundError(err_msg) 

2202 

2203 # Mark dataset as trashed 

2204 try: 

2205 self.bridge.moveToTrash([ref], transaction=self._transaction) 

2206 except Exception as e: 

2207 if ignore_errors: 

2208 log.warning( 

2209 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2210 "but encountered an error: %s", 

2211 ref, 

2212 self.name, 

2213 e, 

2214 ) 

2215 pass 

2216 else: 

2217 raise 

2218 

2219 @transactional 

2220 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2221 """Remove all datasets from the trash. 

2222 

2223 Parameters 

2224 ---------- 

2225 ignore_errors : `bool` 

2226 If `True` return without error even if something went wrong. 

2227 Problems could occur if another process is simultaneously trying 

2228 to delete. 

2229 """ 

2230 log.debug("Emptying trash in datastore %s", self.name) 

2231 

2232 # Context manager will empty trash iff we finish it without raising. 

2233 # It will also automatically delete the relevant rows from the 

2234 # trash table and the records table. 

2235 with self.bridge.emptyTrash( 

2236 self._table, record_class=StoredFileInfo, record_column="path" 

2237 ) as trash_data: 

2238 # Removing the artifacts themselves requires that the files are 

2239 # not also associated with refs that are not to be trashed. 

2240 # Therefore need to do a query with the file paths themselves 

2241 # and return all the refs associated with them. Can only delete 

2242 # a file if the refs to be trashed are the only refs associated 

2243 # with the file. 

2244 # This requires multiple copies of the trashed items 

2245 trashed, artifacts_to_keep = trash_data 

2246 

2247 if artifacts_to_keep is None: 

2248 # The bridge is not helping us so have to work it out 

2249 # ourselves. This is not going to be as efficient. 

2250 trashed = list(trashed) 

2251 

2252 # The instance check is for mypy since up to this point it 

2253 # does not know the type of info. 

2254 path_map = self._refs_associated_with_artifacts( 

2255 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2256 ) 

2257 

2258 for ref, info in trashed: 

2259 # Mypy needs to know this is not the base class 

2260 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2261 

2262 path_map[info.path].remove(ref.id) 

2263 if not path_map[info.path]: 

2264 del path_map[info.path] 

2265 

2266 artifacts_to_keep = set(path_map) 

2267 

2268 for ref, info in trashed: 

2269 # Should not happen for this implementation but need 

2270 # to keep mypy happy. 

2271 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2272 

2273 # Mypy needs to know this is not the base class 

2274 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2275 

2276 if info.path in artifacts_to_keep: 

2277 # This is a multi-dataset artifact and we are not 

2278 # removing all associated refs. 

2279 continue 

2280 

2281 # Only trashed refs still known to datastore will be returned. 

2282 location = info.file_location(self.locationFactory) 

2283 

2284 # Point of no return for this artifact 

2285 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2286 try: 

2287 self._delete_artifact(location) 

2288 except FileNotFoundError: 

2289 # If the file itself has been deleted there is nothing 

2290 # we can do about it. It is possible that trash has 

2291 # been run in parallel in another process or someone 

2292 # decided to delete the file. It is unlikely to come 

2293 # back and so we should still continue with the removal 

2294 # of the entry from the trash table. It is also possible 

2295 # we removed it in a previous iteration if it was 

2296 # a multi-dataset artifact. The delete artifact method 

2297 # will log a debug message in this scenario. 

2298 # Distinguishing file missing before trash started and 

2299 # file already removed previously as part of this trash 

2300 # is not worth the distinction with regards to potential 

2301 # memory cost. 

2302 pass 

2303 except Exception as e: 

2304 if ignore_errors: 

2305 # Use a debug message here even though it's not 

2306 # a good situation. In some cases this can be 

2307 # caused by a race between user A and user B 

2308 # and neither of them has permissions for the 

2309 # other's files. Butler does not know about users 

2310 # and trash has no idea what collections these 

2311 # files were in (without guessing from a path). 

2312 log.debug( 

2313 "Encountered error removing artifact %s from datastore %s: %s", 

2314 location.uri, 

2315 self.name, 

2316 e, 

2317 ) 

2318 else: 

2319 raise 

2320 
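# [Editor's sketch -- not part of the original source.] Deletion is two-stage:
# trash() marks the datasets (and drops them from the local cache) and
# emptyTrash() later removes artifacts that no remaining ref uses.
# `datastore` and `refs` are placeholders.

def delete_datasets(datastore: "FileDatastore", refs: list[DatasetRef]) -> None:
    datastore.trash(refs, ignore_errors=True)
    # Typically run later, e.g. from a maintenance task.
    datastore.emptyTrash(ignore_errors=True)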

2321 @transactional 

2322 def transfer_from( 

2323 self, 

2324 source_datastore: Datastore, 

2325 refs: Iterable[DatasetRef], 

2326 transfer: str = "auto", 

2327 artifact_existence: dict[ResourcePath, bool] | None = None, 

2328 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

2329 # Docstring inherited 

2330 if type(self) is not type(source_datastore): 

2331 raise TypeError( 

2332 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2333 f"source datastore ({type(source_datastore)})." 

2334 ) 

2335 

2336 # Be explicit for mypy 

2337 if not isinstance(source_datastore, FileDatastore): 

2338 raise TypeError( 

2339 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2340 f" {type(source_datastore)}" 

2341 ) 

2342 

2343 # Stop early if "direct" transfer mode is requested. That would 

2344 # require that the URI inside the source datastore should be stored 

2345 # directly in the target datastore, which seems unlikely to be useful 

2346 # since at any moment the source datastore could delete the file. 

2347 if transfer in ("direct", "split"): 

2348 raise ValueError( 

2349 f"Can not transfer from a source datastore using {transfer} mode since" 

2350 " those files are controlled by the other datastore." 

2351 ) 

2352 

2353 # Empty existence lookup if none given. 

2354 if artifact_existence is None: 

2355 artifact_existence = {} 

2356 

2357 # We will go through the list multiple times so must convert 

2358 # generators to lists. 

2359 refs = list(refs) 

2360 

2361 # In order to handle disassembled composites the code works 

2362 # at the records level since it can assume that internal APIs 

2363 # can be used. 

2364 # - If the record already exists in the destination this is assumed 

2365 # to be okay. 

2366 # - If there is no record but the source and destination URIs are 

2367 # identical no transfer is done but the record is added. 

2368 # - If the source record refers to an absolute URI currently assume 

2369 # that that URI should remain absolute and will be visible to the 

2370 # destination butler. May need to have a flag to indicate whether 

2371 # the dataset should be transferred. This will only happen if 

2372 # the detached Butler has had a local ingest. 

2373 

2374 # What we really want is all the records in the source datastore 

2375 # associated with these refs. Or derived ones if they don't exist 

2376 # in the source. 

2377 source_records = source_datastore._get_stored_records_associated_with_refs( 

2378 refs, ignore_datastore_records=True 

2379 ) 

2380 

2381 # The source dataset_ids are the keys in these records 

2382 source_ids = set(source_records) 

2383 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2384 

2385 requested_ids = {ref.id for ref in refs} 

2386 missing_ids = requested_ids - source_ids 

2387 

2388 # Missing IDs can be okay if that datastore has allowed 

2389 # gets based on file existence. Should we transfer what we can 

2390 # or complain about it and warn? 

2391 if missing_ids and not source_datastore.trustGetRequest: 

2392 raise ValueError( 

2393 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2394 ) 

2395 

2396 # Need to map these missing IDs to a DatasetRef so we can guess 

2397 # the details. 

2398 if missing_ids: 

2399 log.info( 

2400 "Number of expected datasets missing from source datastore records: %d out of %d", 

2401 len(missing_ids), 

2402 len(requested_ids), 

2403 ) 

2404 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2405 

2406 # This should be chunked in case we end up having to check 

2407 # the file store since we need some log output to show 

2408 # progress. 

2409 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2410 records = {} 

2411 for missing in missing_ids_chunk: 

2412 # Ask the source datastore where the missing artifacts 

2413 # should be. An execution butler might not know about the 

2414 # artifacts even if they are there. 

2415 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2416 records[missing] = [info for _, info in expected] 

2417 

2418 # Call the mexists helper method in case we have not already 

2419 # checked these artifacts and artifact_existence is still 

2420 # empty. This allows us to benefit from parallelism. 

2421 # datastore.mexists() itself does not give us access to the 

2422 # derived datastore record. 

2423 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2424 ref_exists = source_datastore._process_mexists_records( 

2425 id_to_ref, records, False, artifact_existence=artifact_existence 

2426 ) 

2427 

2428 # Now go through the records and propagate the ones that exist. 

2429 location_factory = source_datastore.locationFactory 

2430 for missing, record_list in records.items(): 

2431 # Skip completely if the ref does not exist. 

2432 ref = id_to_ref[missing] 

2433 if not ref_exists[ref]: 

2434 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2435 continue 

2436 # Check for file artifact to decide which parts of a 

2437 # disassembled composite do exist. If there is only a 

2438 # single record we don't even need to look because it can't 

2439 # be a composite and must exist. 

2440 if len(record_list) == 1: 

2441 dataset_records = record_list 

2442 else: 

2443 dataset_records = [ 

2444 record 

2445 for record in record_list 

2446 if artifact_existence[record.file_location(location_factory).uri] 

2447 ] 

2448 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2449 

2450 # Rely on source_records being a defaultdict. 

2451 source_records[missing].extend(dataset_records) 

2452 

2453 # See if we already have these records 

2454 target_records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) 

2455 

2456 # The artifacts to register 

2457 artifacts = [] 

2458 

2459 # Refs that already exist 

2460 already_present = [] 

2461 

2462 # Refs that were rejected by this datastore. 

2463 rejected = set() 

2464 

2465 # Refs that were transferred successfully. 

2466 accepted = set() 

2467 

2468 # Record each time we have done a "direct" transfer. 

2469 direct_transfers = [] 

2470 

2471 # Now can transfer the artifacts 

2472 for ref in refs: 

2473 if not self.constraints.isAcceptable(ref): 

2474 # This datastore should not be accepting this dataset. 

2475 rejected.add(ref) 

2476 continue 

2477 

2478 accepted.add(ref) 

2479 

2480 if ref.id in target_records: 

2481 # Already have an artifact for this. 

2482 already_present.append(ref) 

2483 continue 

2484 

2485 # mypy needs to know these are always resolved refs 

2486 for info in source_records[ref.id]: 

2487 source_location = info.file_location(source_datastore.locationFactory) 

2488 target_location = info.file_location(self.locationFactory) 

2489 if source_location == target_location and not source_location.pathInStore.isabs(): 

2490 # Artifact is already in the target location. 

2491 # (which is how execution butler currently runs) 

2492 pass 

2493 else: 

2494 if target_location.pathInStore.isabs(): 

2495 # Just because we can see the artifact when running 

2496 # the transfer doesn't mean it will be generally 

2497 # accessible to a user of this butler. Need to decide 

2498 # what to do about an absolute path. 

2499 if transfer == "auto": 

2500 # For "auto" transfers we allow the absolute URI 

2501 # to be recorded in the target datastore. 

2502 direct_transfers.append(source_location) 

2503 else: 

2504 # The user is explicitly requesting a transfer 

2505 # even for an absolute URI. This requires us to 

2506 # calculate the target path. 

2507 template_ref = ref 

2508 if info.component: 

2509 template_ref = ref.makeComponentRef(info.component) 

2510 target_location = self._calculate_ingested_datastore_name( 

2511 source_location.uri, 

2512 template_ref, 

2513 ) 

2514 

2515 info = info.update(path=target_location.pathInStore.path) 

2516 

2517 # Need to transfer it to the new location. 

2518 # Assume we should always overwrite. If the artifact 

2519 # is there this might indicate that a previous transfer 

2520 # was interrupted but was not able to be rolled back 

2521 # completely (eg pre-emption) so follow Datastore default 

2522 # and overwrite. 

2523 target_location.uri.transfer_from( 

2524 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2525 ) 

2526 

2527 artifacts.append((ref, info)) 

2528 

2529 if direct_transfers: 

2530 log.info( 

2531 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s", 

2532 len(direct_transfers), 

2533 "" if len(direct_transfers) == 1 else "s", 

2534 ) 

2535 

2536 # We are overwriting previous datasets that may have already 

2537 # existed. We therefore should ensure that we force the 

2538 # datastore records to agree. Note that this can potentially lead 

2539 # to difficulties if the dataset has previously been ingested 

2540 # disassembled and is somehow now assembled, or vice versa. 

2541 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE) 

2542 

2543 if already_present: 

2544 n_skipped = len(already_present) 

2545 log.info( 

2546 "Skipped transfer of %d dataset%s already present in datastore", 

2547 n_skipped, 

2548 "" if n_skipped == 1 else "s", 

2549 ) 

2550 

2551 return accepted, rejected 

2552 
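# [Editor's sketch -- not part of the original source.] Transferring datasets
# between two FileDatastore instances; `source_store`, `target_store`, and
# `refs` are placeholders and both stores must be of the same datastore type.

def copy_between_stores(
    target_store: "FileDatastore", source_store: "FileDatastore", refs: list[DatasetRef]
) -> tuple[set[DatasetRef], set[DatasetRef]]:
    artifact_existence: dict[ResourcePath, bool] = {}
    # transfer_from() returns the refs this datastore accepted and the refs
    # its constraints rejected.
    return target_store.transfer_from(
        source_store,
        refs,
        transfer="copy",                        # "direct" and "split" are refused
        artifact_existence=artifact_existence,  # reusable cache of URI checks
    )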

2553 @transactional 

2554 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2555 # Docstring inherited. 

2556 refs = list(refs) 

2557 self.bridge.forget(refs) 

2558 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs]) 

2559 

2560 def validateConfiguration( 

2561 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

2562 ) -> None: 

2563 """Validate some of the configuration for this datastore. 

2564 

2565 Parameters 

2566 ---------- 

2567 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2568 Entities to test against this configuration. Can be differing 

2569 types. 

2570 logFailures : `bool`, optional 

2571 If `True`, output a log message for every validation error 

2572 detected. 

2573 

2574 Raises 

2575 ------ 

2576 DatastoreValidationError 

2577 Raised if there is a validation problem with a configuration. 

2578 All the problems are reported in a single exception. 

2579 

2580 Notes 

2581 ----- 

2582 This method checks that all the supplied entities have valid file 

2583 templates and also have formatters defined. 

2584 """ 

2585 templateFailed = None 

2586 try: 

2587 self.templates.validateTemplates(entities, logFailures=logFailures) 

2588 except FileTemplateValidationError as e: 

2589 templateFailed = str(e) 

2590 

2591 formatterFailed = [] 

2592 for entity in entities: 

2593 try: 

2594 self.formatterFactory.getFormatterClass(entity) 

2595 except KeyError as e: 

2596 formatterFailed.append(str(e)) 

2597 if logFailures: 

2598 log.critical("Formatter failure: %s", e) 

2599 

2600 if templateFailed or formatterFailed: 

2601 messages = [] 

2602 if templateFailed: 

2603 messages.append(templateFailed) 

2604 if formatterFailed: 

2605 messages.append(",".join(formatterFailed)) 

2606 msg = ";\n".join(messages) 

2607 raise DatastoreValidationError(msg) 

2608 
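# [Editor's sketch -- not part of the original source.] Validating that a set
# of dataset types has both a usable file template and a formatter before any
# puts are attempted; `datastore` and `dataset_types` are placeholders.

def check_configuration(datastore: "FileDatastore", dataset_types: list[DatasetType]) -> str | None:
    try:
        datastore.validateConfiguration(dataset_types, logFailures=True)
    except DatastoreValidationError as err:
        # All template and formatter problems are reported in one exception.
        return str(err)
    return None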

2609 def getLookupKeys(self) -> set[LookupKey]: 

2610 # Docstring is inherited from base class 

2611 return ( 

2612 self.templates.getLookupKeys() 

2613 | self.formatterFactory.getLookupKeys() 

2614 | self.constraints.getLookupKeys() 

2615 ) 

2616 

2617 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

2618 # Docstring is inherited from base class 

2619 # The key can be valid in either formatters or templates so we can 

2620 # only check the template if it exists 

2621 if lookupKey in self.templates: 

2622 try: 

2623 self.templates[lookupKey].validateTemplate(entity) 

2624 except FileTemplateValidationError as e: 

2625 raise DatastoreValidationError(e) from e 

2626 

2627 def export( 

2628 self, 

2629 refs: Iterable[DatasetRef], 

2630 *, 

2631 directory: ResourcePathExpression | None = None, 

2632 transfer: str | None = "auto", 

2633 ) -> Iterable[FileDataset]: 

2634 # Docstring inherited from Datastore.export. 

2635 if transfer == "auto" and directory is None: 

2636 transfer = None 

2637 

2638 if transfer is not None and directory is None: 

2639 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2640 

2641 if transfer == "move": 

2642 raise TypeError("Can not export by moving files out of datastore.") 

2643 elif transfer == "direct": 

2644 # For an export, treat this as equivalent to None. We do not 

2645 # want an import to risk using absolute URIs to datasets owned 

2646 # by another datastore. 

2647 log.info("Treating 'direct' transfer mode as in-place export.") 

2648 transfer = None 

2649 

2650 # Force the directory to be a URI object 

2651 directoryUri: ResourcePath | None = None 

2652 if directory is not None: 

2653 directoryUri = ResourcePath(directory, forceDirectory=True) 

2654 

2655 if transfer is not None and directoryUri is not None and not directoryUri.exists(): 

2656 # mypy needs the second test 

2657 raise FileNotFoundError(f"Export location {directory} does not exist") 

2658 

2659 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2660 for ref in progress.wrap(refs, "Exporting dataset files"): 

2661 fileLocations = self._get_dataset_locations_info(ref) 

2662 if not fileLocations: 

2663 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2664 # For now we can not export disassembled datasets 

2665 if len(fileLocations) > 1: 

2666 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2667 location, storedFileInfo = fileLocations[0] 

2668 

2669 pathInStore = location.pathInStore.path 

2670 if transfer is None: 

2671 # TODO: do we also need to return the readStorageClass somehow? 

2672 # We will use the path in store directly. If this is an 

2673 # absolute URI, preserve it. 

2674 if location.pathInStore.isabs(): 

2675 pathInStore = str(location.uri) 

2676 elif transfer == "direct": 

2677 # Use full URIs to the remote store in the export 

2678 pathInStore = str(location.uri) 

2679 else: 

2680 # mypy needs help 

2681 assert directoryUri is not None, "directoryUri must be defined to get here" 

2682 storeUri = ResourcePath(location.uri) 

2683 

2684 # if the datastore has an absolute URI to a resource, we 

2685 # have two options: 

2686 # 1. Keep the absolute URI in the exported YAML 

2687 # 2. Allocate a new name in the local datastore and transfer 

2688 # it. 

2689 # For now go with option 2 

2690 if location.pathInStore.isabs(): 

2691 template = self.templates.getTemplate(ref) 

2692 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2693 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2694 

2695 exportUri = directoryUri.join(pathInStore) 

2696 exportUri.transfer_from(storeUri, transfer=transfer) 

2697 

2698 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2699 
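# [Editor's sketch -- not part of the original source.] Exporting file
# datasets; the generator must be consumed for the transfers to happen and the
# destination directory must already exist. `datastore`, `refs`, and the
# directory are placeholders.

def export_datasets(datastore: "FileDatastore", refs: list[DatasetRef]) -> list[FileDataset]:
    export_dir = ResourcePath("/tmp/butler_export/", forceDirectory=True)
    return list(datastore.export(refs, directory=export_dir, transfer="copy"))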

2700 @staticmethod 

2701 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None: 

2702 """Compute the checksum of the supplied file. 

2703 

2704 Parameters 

2705 ---------- 

2706 uri : `lsst.resources.ResourcePath` 

2707 Name of resource to calculate checksum from. 

2708 algorithm : `str`, optional 

2709 Name of algorithm to use. Must be one of the algorithms supported 

2710 by the :py:mod:`hashlib` module. 

2711 block_size : `int` 

2712 Number of bytes to read from file at one time. 

2713 

2714 Returns 

2715 ------- 

2716 hexdigest : `str` or `None` 

2717 Hex digest of the file. 

2718 

2719 Notes 

2720 ----- 

2721 Currently returns None if the URI is for a remote resource. 

2722 """ 

2723 if algorithm not in hashlib.algorithms_guaranteed: 

2724 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib") 

2725 

2726 if not uri.isLocal: 

2727 return None 

2728 

2729 hasher = hashlib.new(algorithm) 

2730 

2731 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f: 

2732 for chunk in iter(lambda: f.read(block_size), b""): 

2733 hasher.update(chunk) 

2734 

2735 return hasher.hexdigest() 

2736 
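# [Editor's sketch -- not part of the original source.] computeChecksum() is a
# static method, so no datastore instance is needed; it returns None for
# non-local URIs. The file path is a placeholder.

def checksum_local_file(path: str) -> str | None:
    digest = FileDatastore.computeChecksum(
        ResourcePath(path), algorithm="sha256", block_size=1 << 20
    )
    if digest is None:
        print("Checksum skipped: resource is not local")
    return digest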

2737 def needs_expanded_data_ids( 

2738 self, 

2739 transfer: str | None, 

2740 entity: DatasetRef | DatasetType | StorageClass | None = None, 

2741 ) -> bool: 

2742 # Docstring inherited. 

2743 # This _could_ also use entity to inspect whether the filename template 

2744 # involves placeholders other than the required dimensions for its 

2745 # dataset type, but that's not necessary for correctness; it just 

2746 # enables more optimizations (perhaps only in theory). 

2747 return transfer not in ("direct", None) 

2748 

2749 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2750 # Docstring inherited from the base class. 

2751 record_data = data.get(self.name) 

2752 if not record_data: 

2753 return 

2754 

2755 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records) 

2756 

2757 # TODO: Verify that there are no unexpected table names in the dict? 

2758 unpacked_records = [] 

2759 for dataset_id, dataset_data in record_data.records.items(): 

2760 records = dataset_data.get(self._table.name) 

2761 if records: 

2762 for info in records: 

2763 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

2764 unpacked_records.append(info.to_record(dataset_id=dataset_id)) 

2765 if unpacked_records: 

2766 self._table.insert(*unpacked_records, transaction=self._transaction) 

2767 

2768 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

2769 # Docstring inherited from the base class. 

2770 exported_refs = list(self._bridge.check(refs)) 

2771 ids = {ref.id for ref in exported_refs} 

2772 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids} 

2773 for row in self._table.fetch(dataset_id=ids): 

2774 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

2775 dataset_records = records.setdefault(row["dataset_id"], {}) 

2776 dataset_records.setdefault(self._table.name, []).append(info) 

2777 

2778 record_data = DatastoreRecordData(records=records) 

2779 return {self.name: record_data} 

2780 

2781 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

2782 # Docstring inherited from the base class. 

2783 self._retrieve_dataset_method = method 

2784 

2785 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef: 

2786 """Update dataset reference to use the storage class from registry.""" 

2787 if self._retrieve_dataset_method is None: 

2788 # We could raise an exception here but unit tests do not define 

2789 # this method. 

2790 return ref 

2791 dataset_type = self._retrieve_dataset_method(ref.datasetType.name) 

2792 if dataset_type is not None: 

2793 ref = ref.overrideStorageClass(dataset_type.storageClass) 

2794 return ref 

2795 

2796 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]: 

2797 # Docstring inherited from the base class. 

2798 return {self._opaque_table_name: DatastoreOpaqueTable(self.makeTableSpec(ddl.GUID), StoredFileInfo)}