Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%

927 statements  

« prev     ^ index     » next       coverage.py v7.4.1, created at 2024-02-13 10:57 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Generic file-based datastore code.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("FileDatastore",) 

33 

34import contextlib 

35import hashlib 

36import logging 

37from collections import defaultdict 

38from collections.abc import Callable, Collection, Iterable, Mapping, Sequence 

39from typing import TYPE_CHECKING, Any, ClassVar, cast 

40 

41from lsst.daf.butler import ( 

42 Config, 

43 DatasetDatastoreRecords, 

44 DatasetId, 

45 DatasetRef, 

46 DatasetType, 

47 DatasetTypeNotSupportedError, 

48 FileDataset, 

49 FileDescriptor, 

50 Formatter, 

51 FormatterFactory, 

52 Location, 

53 LocationFactory, 

54 Progress, 

55 StorageClass, 

56 ddl, 

57) 

58from lsst.daf.butler.datastore import ( 

59 DatasetRefURIs, 

60 Datastore, 

61 DatastoreConfig, 

62 DatastoreOpaqueTable, 

63 DatastoreValidationError, 

64) 

65from lsst.daf.butler.datastore.cache_manager import ( 

66 AbstractDatastoreCacheManager, 

67 DatastoreCacheManager, 

68 DatastoreDisabledCacheManager, 

69) 

70from lsst.daf.butler.datastore.composites import CompositesMap 

71from lsst.daf.butler.datastore.file_templates import FileTemplates, FileTemplateValidationError 

72from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore 

73from lsst.daf.butler.datastore.record_data import DatastoreRecordData 

74from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo, StoredFileInfo 

75from lsst.daf.butler.datastores.file_datastore.get import ( 

76 DatasetLocationInformation, 

77 DatastoreFileGetInformation, 

78 generate_datastore_get_information, 

79 get_dataset_as_python_object_from_get_info, 

80) 

81from lsst.daf.butler.datastores.fileDatastoreClient import ( 

82 FileDatastoreGetPayload, 

83 FileDatastoreGetPayloadFileInfo, 

84) 

85from lsst.daf.butler.registry.interfaces import ( 

86 DatabaseInsertMode, 

87 DatastoreRegistryBridge, 

88 FakeDatasetRef, 

89 ReadOnlyDatabaseError, 

90) 

91from lsst.daf.butler.repo_relocation import replaceRoot 

92from lsst.daf.butler.utils import transactional 

93from lsst.resources import ResourcePath, ResourcePathExpression 

94from lsst.utils.introspection import get_class_of 

95from lsst.utils.iteration import chunk_iterable 

96 

97# For VERBOSE logging usage. 

98from lsst.utils.logging import VERBOSE, getLogger 

99from sqlalchemy import BigInteger, String 

100 

101if TYPE_CHECKING: 

102 from lsst.daf.butler import LookupKey 

103 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

104 

105log = getLogger(__name__) 

106 

107 

108class _IngestPrepData(Datastore.IngestPrepData): 

109 """Helper class for FileDatastore ingest implementation. 

110 

111 Parameters 

112 ---------- 

113 datasets : `~collections.abc.Iterable` of `FileDataset` 

114 Files to be ingested by this datastore. 

115 """ 

116 

117 def __init__(self, datasets: Iterable[FileDataset]): 

118 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

119 self.datasets = datasets 

120 

121 

122class FileDatastore(GenericBaseDatastore[StoredFileInfo]): 

123 """Generic Datastore for file-based implementations. 

124 

125 Should always be sub-classed since key abstract methods are missing. 

126 

127 Parameters 

128 ---------- 

129 config : `DatastoreConfig` or `str` 

130 Configuration as either a `Config` object or URI to file. 

131 bridgeManager : `DatastoreRegistryBridgeManager` 

132 Object that manages the interface between `Registry` and datastores. 

133 root : `ResourcePath` 

134 Root directory URI of this `Datastore`. 

135 formatterFactory : `FormatterFactory` 

136 Factory for creating instances of formatters. 

137 templates : `FileTemplates` 

138 File templates that can be used by this `Datastore`. 

139 composites : `CompositesMap` 

140 Determines whether a dataset should be disassembled on put. 

141 trustGetRequest : `bool` 

142 Determine whether we can fall back to configuration if a requested 

143 dataset is not known to registry. 

144 

145 Raises 

146 ------ 

147 ValueError 

148 If root location does not exist and ``create`` is `False` in the 

149 configuration. 

150 """ 

151 

152 defaultConfigFile: ClassVar[str | None] = None 

153 """Path to configuration defaults. Accessed within the ``config`` resource 

154 or relative to a search path. Can be None if no defaults specified. 

155 """ 

156 

157 root: ResourcePath 

158 """Root directory URI of this `Datastore`.""" 

159 

160 locationFactory: LocationFactory 

161 """Factory for creating locations relative to the datastore root.""" 

162 

163 formatterFactory: FormatterFactory 

164 """Factory for creating instances of formatters.""" 

165 

166 templates: FileTemplates 

167 """File templates that can be used by this `Datastore`.""" 

168 

169 composites: CompositesMap 

170 """Determines whether a dataset should be disassembled on put.""" 

171 

172 defaultConfigFile = "datastores/fileDatastore.yaml" 

173 """Path to configuration defaults. Accessed within the ``config`` resource 

174 or relative to a search path. Can be None if no defaults specified. 

175 """ 

176 

177 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None 

178 """Callable that is used in trusted mode to retrieve registry definition 

179 of a named dataset type. 

180 """ 

181 

182 @classmethod 

183 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

184 """Set any filesystem-dependent config options for this Datastore to 

185 be appropriate for a new empty repository with the given root. 

186 

187 Parameters 

188 ---------- 

189 root : `str` 

190 URI to the root of the data repository. 

191 config : `Config` 

192 A `Config` to update. Only the subset understood by 

193 this component will be updated. Will not expand 

194 defaults. 

195 full : `Config` 

196 A complete config with all defaults expanded that can be 

197 converted to a `DatastoreConfig`. Read-only and will not be 

198 modified by this method. 

199 Repository-specific options that should not be obtained 

200 from defaults when Butler instances are constructed 

201 should be copied from ``full`` to ``config``. 

202 overwrite : `bool`, optional 

203 If `False`, do not modify a value in ``config`` if the value 

204 already exists. Default is always to overwrite with the provided 

205 ``root``. 

206 

207 Notes 

208 ----- 

209 If a keyword is explicitly defined in the supplied ``config`` it 

210 will not be overridden by this method if ``overwrite`` is `False`. 

211 This allows explicit values set in external configs to be retained. 

212 """ 

213 Config.updateParameters( 

214 DatastoreConfig, 

215 config, 

216 full, 

217 toUpdate={"root": root}, 

218 toCopy=("cls", ("records", "table")), 

219 overwrite=overwrite, 

220 ) 

221 

222 @classmethod 

223 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

224 return ddl.TableSpec( 

225 fields=[ 

226 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

227 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

228 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

229 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

230 # Use empty string to indicate no component 

231 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

232 # TODO: should checksum be Base64Bytes instead? 

233 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

234 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

235 ], 

236 unique=frozenset(), 

237 indexes=[ddl.IndexSpec("path")], 

238 ) 

239 

240 def __init__( 

241 self, 

242 config: DatastoreConfig, 

243 bridgeManager: DatastoreRegistryBridgeManager, 

244 root: ResourcePath, 

245 formatterFactory: FormatterFactory, 

246 templates: FileTemplates, 

247 composites: CompositesMap, 

248 trustGetRequest: bool, 

249 ): 

250 super().__init__(config, bridgeManager) 

251 self.root = ResourcePath(root) 

252 self.formatterFactory = formatterFactory 

253 self.templates = templates 

254 self.composites = composites 

255 self.trustGetRequest = trustGetRequest 

256 

257 # Name ourselves either using an explicit name or a name 

258 # derived from the (unexpanded) root 

259 if "name" in self.config: 

260 self.name = self.config["name"] 

261 else: 

262 # We use the unexpanded root in the name to indicate that this 

263 # datastore can be moved without having to update registry. 

264 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

265 

266 self.locationFactory = LocationFactory(self.root) 

267 

268 self._opaque_table_name = self.config["records", "table"] 

269 try: 

270 # Storage of paths and formatters, keyed by dataset_id 

271 self._table = bridgeManager.opaque.register( 

272 self._opaque_table_name, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

273 ) 

274 # Interface to Registry. 

275 self._bridge = bridgeManager.register(self.name) 

276 except ReadOnlyDatabaseError: 

277 # If the database is read only and we just tried and failed to 

278 # create a table, it means someone is trying to create a read-only 

279 # butler client for an empty repo. That should be okay, as long 

280 # as they then try to get any datasets before some other client 

281 # creates the table. Chances are they're just validating 

282 # configuration. 

283 pass 

284 

285 # Determine whether checksums should be used - default to False 

286 self.useChecksum = self.config.get("checksum", False) 

287 

288 # Create a cache manager 

289 self.cacheManager: AbstractDatastoreCacheManager 

290 if "cached" in self.config: 

291 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

292 else: 

293 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

294 

295 @classmethod 

296 def _create_from_config( 

297 cls, 

298 config: DatastoreConfig, 

299 bridgeManager: DatastoreRegistryBridgeManager, 

300 butlerRoot: ResourcePathExpression | None, 

301 ) -> FileDatastore: 

302 if "root" not in config: 

303 raise ValueError("No root directory specified in configuration") 

304 

305 # Support repository relocation in config 

306 # Existence of self.root is checked in subclass 

307 root = ResourcePath(replaceRoot(config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True) 

308 

309 # Now associate formatters with storage classes 

310 formatterFactory = FormatterFactory() 

311 formatterFactory.registerFormatters(config["formatters"], universe=bridgeManager.universe) 

312 

313 # Read the file naming templates 

314 templates = FileTemplates(config["templates"], universe=bridgeManager.universe) 

315 

316 # See if composites should be disassembled 

317 composites = CompositesMap(config["composites"], universe=bridgeManager.universe) 

318 

319 # Determine whether we can fall back to configuration if a 

320 # requested dataset is not known to registry 

321 trustGetRequest = config.get("trust_get_request", False) 

322 

323 self = FileDatastore( 

324 config, bridgeManager, root, formatterFactory, templates, composites, trustGetRequest 

325 ) 

326 

327 # Check existence and create directory structure if necessary 

328 if not self.root.exists(): 

329 if "create" not in self.config or not self.config["create"]: 

330 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

331 try: 

332 self.root.mkdir() 

333 except Exception as e: 

334 raise ValueError( 

335 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

336 ) from e 

337 

338 return self 

339 

340 def clone(self, bridgeManager: DatastoreRegistryBridgeManager) -> Datastore: 

341 return FileDatastore( 

342 self.config, 

343 bridgeManager, 

344 self.root, 

345 self.formatterFactory, 

346 self.templates, 

347 self.composites, 

348 self.trustGetRequest, 

349 ) 

350 

351 def __str__(self) -> str: 

352 return str(self.root) 

353 

354 @property 

355 def bridge(self) -> DatastoreRegistryBridge: 

356 return self._bridge 

357 

358 @property 

359 def roots(self) -> dict[str, ResourcePath | None]: 

360 # Docstring inherited. 

361 return {self.name: self.root} 

362 

363 def _set_trust_mode(self, mode: bool) -> None: 

364 self.trustGetRequest = mode 

365 

366 def _artifact_exists(self, location: Location) -> bool: 

367 """Check that an artifact exists in this datastore at the specified 

368 location. 

369 

370 Parameters 

371 ---------- 

372 location : `Location` 

373 Expected location of the artifact associated with this datastore. 

374 

375 Returns 

376 ------- 

377 exists : `bool` 

378 True if the location can be found, false otherwise. 

379 """ 

380 log.debug("Checking if resource exists: %s", location.uri) 

381 return location.uri.exists() 

382 

383 def _delete_artifact(self, location: Location) -> None: 

384 """Delete the artifact from the datastore. 

385 

386 Parameters 

387 ---------- 

388 location : `Location` 

389 Location of the artifact associated with this datastore. 

390 """ 

391 if location.pathInStore.isabs(): 

392 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

393 

394 try: 

395 location.uri.remove() 

396 except FileNotFoundError: 

397 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

398 raise 

399 except Exception as e: 

400 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

401 raise 

402 log.debug("Successfully deleted file: %s", location.uri) 

403 

404 def addStoredItemInfo( 

405 self, 

406 refs: Iterable[DatasetRef], 

407 infos: Iterable[StoredFileInfo], 

408 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT, 

409 ) -> None: 

410 """Record internal storage information associated with one or more 

411 datasets. 

412 

413 Parameters 

414 ---------- 

415 refs : sequence of `DatasetRef` 

416 The datasets that have been stored. 

417 infos : sequence of `StoredDatastoreItemInfo` 

418 Metadata associated with the stored datasets. 

419 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode` 

420 Mode to use to insert the new records into the table. The 

421 options are ``INSERT`` (error if pre-existing), ``REPLACE`` 

422 (replace content with new values), and ``ENSURE`` (skip if the row 

423 already exists). 

424 """ 

425 records = [ 

426 info.rebase(ref).to_record(dataset_id=ref.id) for ref, info in zip(refs, infos, strict=True) 

427 ] 

428 match insert_mode: 

429 case DatabaseInsertMode.INSERT: 

430 self._table.insert(*records, transaction=self._transaction) 

431 case DatabaseInsertMode.ENSURE: 

432 self._table.ensure(*records, transaction=self._transaction) 

433 case DatabaseInsertMode.REPLACE: 

434 self._table.replace(*records, transaction=self._transaction) 

435 case _: 

436 raise ValueError(f"Unknown insert mode of '{insert_mode}'") 

437 

438 def getStoredItemsInfo( 

439 self, ref: DatasetIdRef, ignore_datastore_records: bool = False 

440 ) -> list[StoredFileInfo]: 

441 """Retrieve information associated with files stored in this 

442 `Datastore` associated with this dataset ref. 

443 

444 Parameters 

445 ---------- 

446 ref : `DatasetRef` 

447 The dataset that is to be queried. 

448 ignore_datastore_records : `bool` 

449 If `True` then do not use datastore records stored in refs. 

450 

451 Returns 

452 ------- 

453 items : `~collections.abc.Iterable` [`StoredDatastoreItemInfo`] 

454 Stored information about the files and associated formatters 

455 associated with this dataset. Only one file will be returned 

456 if the dataset has not been disassembled. Can return an empty 

457 list if no matching datasets can be found. 

458 """ 

459 # Try to get them from the ref first. 

460 if ref._datastore_records is not None and not ignore_datastore_records: 

461 ref_records = ref._datastore_records.get(self._table.name, []) 

462 # Need to make sure they have correct type. 

463 for record in ref_records: 

464 if not isinstance(record, StoredFileInfo): 

465 raise TypeError(f"Datastore record has unexpected type {record.__class__.__name__}") 

466 return cast(list[StoredFileInfo], ref_records) 

467 

468 # Look for the dataset_id -- there might be multiple matches 

469 # if we have disassembled the dataset. 

470 records = self._table.fetch(dataset_id=ref.id) 

471 return [StoredFileInfo.from_record(record) for record in records] 

472 

473 def _register_datasets( 

474 self, 

475 refsAndInfos: Iterable[tuple[DatasetRef, StoredFileInfo]], 

476 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT, 

477 ) -> None: 

478 """Update registry to indicate that one or more datasets have been 

479 stored. 

480 

481 Parameters 

482 ---------- 

483 refsAndInfos : sequence `tuple` [`DatasetRef`, 

484 `StoredDatastoreItemInfo`] 

485 Datasets to register and the internal datastore metadata associated 

486 with them. 

487 insert_mode : `str`, optional 

488 Indicate whether the new records should be new ("insert", default), 

489 or allowed to exists ("ensure") or be replaced if already present 

490 ("replace"). 

491 """ 

492 expandedRefs: list[DatasetRef] = [] 

493 expandedItemInfos: list[StoredFileInfo] = [] 

494 

495 for ref, itemInfo in refsAndInfos: 

496 expandedRefs.append(ref) 

497 expandedItemInfos.append(itemInfo) 

498 

499 # Dataset location only cares about registry ID so if we have 

500 # disassembled in datastore we have to deduplicate. Since they 

501 # will have different datasetTypes we can't use a set 

502 registryRefs = {r.id: r for r in expandedRefs} 

503 if insert_mode == DatabaseInsertMode.INSERT: 

504 self.bridge.insert(registryRefs.values()) 

505 else: 

506 # There are only two columns and all that matters is the 

507 # dataset ID. 

508 self.bridge.ensure(registryRefs.values()) 

509 self.addStoredItemInfo(expandedRefs, expandedItemInfos, insert_mode=insert_mode) 

510 

511 def _get_stored_records_associated_with_refs( 

512 self, refs: Iterable[DatasetIdRef], ignore_datastore_records: bool = False 

513 ) -> dict[DatasetId, list[StoredFileInfo]]: 

514 """Retrieve all records associated with the provided refs. 

515 

516 Parameters 

517 ---------- 

518 refs : iterable of `DatasetIdRef` 

519 The refs for which records are to be retrieved. 

520 ignore_datastore_records : `bool` 

521 If `True` then do not use datastore records stored in refs. 

522 

523 Returns 

524 ------- 

525 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

526 The matching records indexed by the ref ID. The number of entries 

527 in the dict can be smaller than the number of requested refs. 

528 """ 

529 # Check datastore records in refs first. 

530 records_by_ref: defaultdict[DatasetId, list[StoredFileInfo]] = defaultdict(list) 

531 refs_with_no_records = [] 

532 for ref in refs: 

533 if ignore_datastore_records or ref._datastore_records is None: 

534 refs_with_no_records.append(ref) 

535 else: 

536 if (ref_records := ref._datastore_records.get(self._table.name)) is not None: 

537 # Need to make sure they have correct type. 

538 for ref_record in ref_records: 

539 if not isinstance(ref_record, StoredFileInfo): 

540 raise TypeError( 

541 f"Datastore record has unexpected type {ref_record.__class__.__name__}" 

542 ) 

543 records_by_ref[ref.id].append(ref_record) 

544 

545 # If there were any refs without datastore records, check opaque table. 

546 records = self._table.fetch(dataset_id=[ref.id for ref in refs_with_no_records]) 

547 

548 # Uniqueness is dataset_id + component so can have multiple records 

549 # per ref. 

550 for record in records: 

551 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

552 return records_by_ref 

553 

554 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]: 

555 """Return paths and associated dataset refs. 

556 

557 Parameters 

558 ---------- 

559 paths : `list` of `str` or `lsst.resources.ResourcePath` 

560 All the paths to include in search. 

561 

562 Returns 

563 ------- 

564 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

565 Mapping of each path to a set of associated database IDs. 

566 """ 

567 records = self._table.fetch(path=[str(path) for path in paths]) 

568 result = defaultdict(set) 

569 for row in records: 

570 result[row["path"]].add(row["dataset_id"]) 

571 return result 

572 

573 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]: 

574 """Return all dataset refs associated with the supplied path. 

575 

576 Parameters 

577 ---------- 

578 pathInStore : `lsst.resources.ResourcePath` 

579 Path of interest in the data store. 

580 

581 Returns 

582 ------- 

583 ids : `set` of `int` 

584 All `DatasetRef` IDs associated with this path. 

585 """ 

586 records = list(self._table.fetch(path=str(pathInStore))) 

587 ids = {r["dataset_id"] for r in records} 

588 return ids 

589 

590 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

591 """Remove information about the file associated with this dataset. 

592 

593 Parameters 

594 ---------- 

595 ref : `DatasetRef` 

596 The dataset that has been removed. 

597 """ 

598 # Note that this method is actually not used by this implementation, 

599 # we depend on bridge to delete opaque records. But there are some 

600 # tests that check that this method works, so we keep it for now. 

601 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

602 

603 def _get_dataset_locations_info( 

604 self, ref: DatasetIdRef, ignore_datastore_records: bool = False 

605 ) -> list[DatasetLocationInformation]: 

606 r"""Find all the `Location`\ s of the requested dataset in the 

607 `Datastore` and the associated stored file information. 

608 

609 Parameters 

610 ---------- 

611 ref : `DatasetRef` 

612 Reference to the required `Dataset`. 

613 ignore_datastore_records : `bool` 

614 If `True` then do not use datastore records stored in refs. 

615 

616 Returns 

617 ------- 

618 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

619 Location of the dataset within the datastore and 

620 stored information about each file and its formatter. 

621 """ 

622 # Get the file information (this will fail if no file) 

623 records = self.getStoredItemsInfo(ref, ignore_datastore_records) 

624 

625 # Use the path to determine the location -- we need to take 

626 # into account absolute URIs in the datastore record 

627 return [(r.file_location(self.locationFactory), r) for r in records] 

628 

629 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

630 """Check that there is only one dataset associated with the 

631 specified artifact. 

632 

633 Parameters 

634 ---------- 

635 ref : `DatasetRef` or `FakeDatasetRef` 

636 Dataset to be removed. 

637 location : `Location` 

638 The location of the artifact to be removed. 

639 

640 Returns 

641 ------- 

642 can_remove : `Bool` 

643 True if the artifact can be safely removed. 

644 """ 

645 # Can't ever delete absolute URIs. 

646 if location.pathInStore.isabs(): 

647 return False 

648 

649 # Get all entries associated with this path 

650 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

651 if not allRefs: 

652 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

653 

654 # Remove these refs from all the refs and if there is nothing left 

655 # then we can delete 

656 remainingRefs = allRefs - {ref.id} 

657 

658 if remainingRefs: 

659 return False 

660 return True 

661 

662 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]: 

663 """Predict the location and related file information of the requested 

664 dataset in this datastore. 

665 

666 Parameters 

667 ---------- 

668 ref : `DatasetRef` 

669 Reference to the required `Dataset`. 

670 

671 Returns 

672 ------- 

673 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

674 Expected Location of the dataset within the datastore and 

675 placeholder information about each file and its formatter. 

676 

677 Notes 

678 ----- 

679 Uses the current configuration to determine how we would expect the 

680 datastore files to have been written if we couldn't ask registry. 

681 This is safe so long as there has been no change to datastore 

682 configuration between writing the dataset and wanting to read it. 

683 Will not work for files that have been ingested without using the 

684 standard file template or default formatter. 

685 """ 

686 # If we have a component ref we always need to ask the questions 

687 # of the composite. If the composite is disassembled this routine 

688 # should return all components. If the composite was not 

689 # disassembled the composite is what is stored regardless of 

690 # component request. Note that if the caller has disassembled 

691 # a composite there is no way for this guess to know that 

692 # without trying both the composite and component ref and seeing 

693 # if there is something at the component Location even without 

694 # disassembly being enabled. 

695 if ref.datasetType.isComponent(): 

696 ref = ref.makeCompositeRef() 

697 

698 # See if the ref is a composite that should be disassembled 

699 doDisassembly = self.composites.shouldBeDisassembled(ref) 

700 

701 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = [] 

702 

703 if doDisassembly: 

704 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

705 compRef = ref.makeComponentRef(component) 

706 location, formatter = self._determine_put_formatter_location(compRef) 

707 all_info.append((location, formatter, componentStorage, component)) 

708 

709 else: 

710 # Always use the composite ref if no disassembly 

711 location, formatter = self._determine_put_formatter_location(ref) 

712 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

713 

714 # Convert the list of tuples to have StoredFileInfo as second element 

715 return [ 

716 ( 

717 location, 

718 StoredFileInfo( 

719 formatter=formatter, 

720 path=location.pathInStore.path, 

721 storageClass=storageClass, 

722 component=component, 

723 checksum=None, 

724 file_size=-1, 

725 ), 

726 ) 

727 for location, formatter, storageClass, component in all_info 

728 ] 

729 

730 def _prepare_for_direct_get( 

731 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None 

732 ) -> list[DatastoreFileGetInformation]: 

733 """Check parameters for ``get`` and obtain formatter and 

734 location. 

735 

736 Parameters 

737 ---------- 

738 ref : `DatasetRef` 

739 Reference to the required Dataset. 

740 parameters : `dict` 

741 `StorageClass`-specific parameters that specify, for example, 

742 a slice of the dataset to be loaded. 

743 

744 Returns 

745 ------- 

746 getInfo : `list` [`DatastoreFileGetInformation`] 

747 Parameters needed to retrieve each file. 

748 """ 

749 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

750 

751 # The storage class we want to use eventually 

752 refStorageClass = ref.datasetType.storageClass 

753 

754 # For trusted mode need to reset storage class. 

755 ref = self._cast_storage_class(ref) 

756 

757 # Get file metadata and internal metadata 

758 fileLocations = self._get_dataset_locations_info(ref) 

759 if not fileLocations: 

760 if not self.trustGetRequest: 

761 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

762 # Assume the dataset is where we think it should be 

763 fileLocations = self._get_expected_dataset_locations_info(ref) 

764 

765 if len(fileLocations) > 1: 

766 # If trust is involved it is possible that there will be 

767 # components listed here that do not exist in the datastore. 

768 # Explicitly check for file artifact existence and filter out any 

769 # that are missing. 

770 if self.trustGetRequest: 

771 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

772 

773 # For now complain only if we have no components at all. One 

774 # component is probably a problem but we can punt that to the 

775 # assembler. 

776 if not fileLocations: 

777 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

778 

779 return generate_datastore_get_information( 

780 fileLocations, 

781 readStorageClass=refStorageClass, 

782 ref=ref, 

783 parameters=parameters, 

784 ) 

785 

786 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]: 

787 """Check the arguments for ``put`` and obtain formatter and 

788 location. 

789 

790 Parameters 

791 ---------- 

792 inMemoryDataset : `object` 

793 The dataset to store. 

794 ref : `DatasetRef` 

795 Reference to the associated Dataset. 

796 

797 Returns 

798 ------- 

799 location : `Location` 

800 The location to write the dataset. 

801 formatter : `Formatter` 

802 The `Formatter` to use to write the dataset. 

803 

804 Raises 

805 ------ 

806 TypeError 

807 Supplied object and storage class are inconsistent. 

808 DatasetTypeNotSupportedError 

809 The associated `DatasetType` is not handled by this datastore. 

810 """ 

811 self._validate_put_parameters(inMemoryDataset, ref) 

812 return self._determine_put_formatter_location(ref) 

813 

814 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]: 

815 """Calculate the formatter and output location to use for put. 

816 

817 Parameters 

818 ---------- 

819 ref : `DatasetRef` 

820 Reference to the associated Dataset. 

821 

822 Returns 

823 ------- 

824 location : `Location` 

825 The location to write the dataset. 

826 formatter : `Formatter` 

827 The `Formatter` to use to write the dataset. 

828 """ 

829 # Work out output file name 

830 try: 

831 template = self.templates.getTemplate(ref) 

832 except KeyError as e: 

833 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

834 

835 # Validate the template to protect against filenames from different 

836 # dataIds returning the same and causing overwrite confusion. 

837 template.validateTemplate(ref) 

838 

839 location = self.locationFactory.fromPath(template.format(ref), trusted_path=True) 

840 

841 # Get the formatter based on the storage class 

842 storageClass = ref.datasetType.storageClass 

843 try: 

844 formatter = self.formatterFactory.getFormatter( 

845 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

846 ) 

847 except KeyError as e: 

848 raise DatasetTypeNotSupportedError( 

849 f"Unable to find formatter for {ref} in datastore {self.name}" 

850 ) from e 

851 

852 # Now that we know the formatter, update the location 

853 location = formatter.makeUpdatedLocation(location) 

854 

855 return location, formatter 

856 

857 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None: 

858 # Docstring inherited from base class 

859 if transfer != "auto": 

860 return transfer 

861 

862 # See if the paths are within the datastore or not 

863 inside = [self._pathInStore(d.path) is not None for d in datasets] 

864 

865 if all(inside): 

866 transfer = None 

867 elif not any(inside): 

868 # Allow ResourcePath to use its own knowledge 

869 transfer = "auto" 

870 else: 

871 # This can happen when importing from a datastore that 

872 # has had some datasets ingested using "direct" mode. 

873 # Also allow ResourcePath to sort it out but warn about it. 

874 # This can happen if you are importing from a datastore 

875 # that had some direct transfer datasets. 

876 log.warning( 

877 "Some datasets are inside the datastore and some are outside. Using 'split' " 

878 "transfer mode. This assumes that the files outside the datastore are " 

879 "still accessible to the new butler since they will not be copied into " 

880 "the target datastore." 

881 ) 

882 transfer = "split" 

883 

884 return transfer 

885 

886 def _pathInStore(self, path: ResourcePathExpression) -> str | None: 

887 """Return path relative to datastore root. 

888 

889 Parameters 

890 ---------- 

891 path : `lsst.resources.ResourcePathExpression` 

892 Path to dataset. Can be absolute URI. If relative assumed to 

893 be relative to the datastore. Returns path in datastore 

894 or raises an exception if the path it outside. 

895 

896 Returns 

897 ------- 

898 inStore : `str` 

899 Path relative to datastore root. Returns `None` if the file is 

900 outside the root. 

901 """ 

902 # Relative path will always be relative to datastore 

903 pathUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False) 

904 return pathUri.relative_to(self.root) 

905 

906 def _standardizeIngestPath( 

907 self, path: str | ResourcePath, *, transfer: str | None = None 

908 ) -> str | ResourcePath: 

909 """Standardize the path of a to-be-ingested file. 

910 

911 Parameters 

912 ---------- 

913 path : `str` or `lsst.resources.ResourcePath` 

914 Path of a file to be ingested. This parameter is not expected 

915 to be all the types that can be used to construct a 

916 `~lsst.resources.ResourcePath`. 

917 transfer : `str`, optional 

918 How (and whether) the dataset should be added to the datastore. 

919 See `ingest` for details of transfer modes. 

920 This implementation is provided only so 

921 `NotImplementedError` can be raised if the mode is not supported; 

922 actual transfers are deferred to `_extractIngestInfo`. 

923 

924 Returns 

925 ------- 

926 path : `str` or `lsst.resources.ResourcePath` 

927 New path in what the datastore considers standard form. If an 

928 absolute URI was given that will be returned unchanged. 

929 

930 Notes 

931 ----- 

932 Subclasses of `FileDatastore` can implement this method instead 

933 of `_prepIngest`. It should not modify the data repository or given 

934 file in any way. 

935 

936 Raises 

937 ------ 

938 NotImplementedError 

939 Raised if the datastore does not support the given transfer mode 

940 (including the case where ingest is not supported at all). 

941 FileNotFoundError 

942 Raised if one of the given files does not exist. 

943 """ 

944 if transfer not in (None, "direct", "split") + self.root.transferModes: 

945 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

946 

947 # A relative URI indicates relative to datastore root 

948 srcUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False) 

949 if not srcUri.isabs(): 

950 srcUri = self.root.join(path) 

951 

952 if not srcUri.exists(): 

953 raise FileNotFoundError( 

954 f"Resource at {srcUri} does not exist; note that paths to ingest " 

955 f"are assumed to be relative to {self.root} unless they are absolute." 

956 ) 

957 

958 if transfer is None: 

959 relpath = srcUri.relative_to(self.root) 

960 if not relpath: 

961 raise RuntimeError( 

962 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

963 ) 

964 

965 # Return the relative path within the datastore for internal 

966 # transfer 

967 path = relpath 

968 

969 return path 

970 

971 def _extractIngestInfo( 

972 self, 

973 path: ResourcePathExpression, 

974 ref: DatasetRef, 

975 *, 

976 formatter: Formatter | type[Formatter], 

977 transfer: str | None = None, 

978 record_validation_info: bool = True, 

979 ) -> StoredFileInfo: 

980 """Relocate (if necessary) and extract `StoredFileInfo` from a 

981 to-be-ingested file. 

982 

983 Parameters 

984 ---------- 

985 path : `lsst.resources.ResourcePathExpression` 

986 URI or path of a file to be ingested. 

987 ref : `DatasetRef` 

988 Reference for the dataset being ingested. Guaranteed to have 

989 ``dataset_id not None`. 

990 formatter : `type` or `Formatter` 

991 `Formatter` subclass to use for this dataset or an instance. 

992 transfer : `str`, optional 

993 How (and whether) the dataset should be added to the datastore. 

994 See `ingest` for details of transfer modes. 

995 record_validation_info : `bool`, optional 

996 If `True`, the default, the datastore can record validation 

997 information associated with the file. If `False` the datastore 

998 will not attempt to track any information such as checksums 

999 or file sizes. This can be useful if such information is tracked 

1000 in an external system or if the file is to be compressed in place. 

1001 It is up to the datastore whether this parameter is relevant. 

1002 

1003 Returns 

1004 ------- 

1005 info : `StoredFileInfo` 

1006 Internal datastore record for this file. This will be inserted by 

1007 the caller; the `_extractIngestInfo` is only responsible for 

1008 creating and populating the struct. 

1009 

1010 Raises 

1011 ------ 

1012 FileNotFoundError 

1013 Raised if one of the given files does not exist. 

1014 FileExistsError 

1015 Raised if transfer is not `None` but the (internal) location the 

1016 file would be moved to is already occupied. 

1017 """ 

1018 if self._transaction is None: 

1019 raise RuntimeError("Ingest called without transaction enabled") 

1020 

1021 # Create URI of the source path, do not need to force a relative 

1022 # path to absolute. 

1023 srcUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False) 

1024 

1025 # Track whether we have read the size of the source yet 

1026 have_sized = False 

1027 

1028 tgtLocation: Location | None 

1029 if transfer is None or transfer == "split": 

1030 # A relative path is assumed to be relative to the datastore 

1031 # in this context 

1032 if not srcUri.isabs(): 

1033 tgtLocation = self.locationFactory.fromPath(srcUri.ospath, trusted_path=False) 

1034 else: 

1035 # Work out the path in the datastore from an absolute URI 

1036 # This is required to be within the datastore. 

1037 pathInStore = srcUri.relative_to(self.root) 

1038 if pathInStore is None and transfer is None: 

1039 raise RuntimeError( 

1040 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

1041 ) 

1042 if pathInStore: 

1043 tgtLocation = self.locationFactory.fromPath(pathInStore, trusted_path=True) 

1044 elif transfer == "split": 

1045 # Outside the datastore but treat that as a direct ingest 

1046 # instead. 

1047 tgtLocation = None 

1048 else: 

1049 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

1050 elif transfer == "direct": 

1051 # Want to store the full URI to the resource directly in 

1052 # datastore. This is useful for referring to permanent archive 

1053 # storage for raw data. 

1054 # Trust that people know what they are doing. 

1055 tgtLocation = None 

1056 else: 

1057 # Work out the name we want this ingested file to have 

1058 # inside the datastore 

1059 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

1060 if not tgtLocation.uri.dirname().exists(): 

1061 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

1062 tgtLocation.uri.dirname().mkdir() 

1063 

1064 # if we are transferring from a local file to a remote location 

1065 # it may be more efficient to get the size and checksum of the 

1066 # local file rather than the transferred one 

1067 if record_validation_info and srcUri.isLocal: 

1068 size = srcUri.size() 

1069 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

1070 have_sized = True 

1071 

1072 # Transfer the resource to the destination. 

1073 # Allow overwrite of an existing file. This matches the behavior 

1074 # of datastore.put() in that it trusts that registry would not 

1075 # be asking to overwrite unless registry thought that the 

1076 # overwrite was allowed. 

1077 tgtLocation.uri.transfer_from( 

1078 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

1079 ) 

1080 

1081 if tgtLocation is None: 

1082 # This means we are using direct mode 

1083 targetUri = srcUri 

1084 targetPath = str(srcUri) 

1085 else: 

1086 targetUri = tgtLocation.uri 

1087 targetPath = tgtLocation.pathInStore.path 

1088 

1089 # the file should exist in the datastore now 

1090 if record_validation_info: 

1091 if not have_sized: 

1092 size = targetUri.size() 

1093 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

1094 else: 

1095 # Not recording any file information. 

1096 size = -1 

1097 checksum = None 

1098 

1099 return StoredFileInfo( 

1100 formatter=formatter, 

1101 path=targetPath, 

1102 storageClass=ref.datasetType.storageClass, 

1103 component=ref.datasetType.component(), 

1104 file_size=size, 

1105 checksum=checksum, 

1106 ) 

1107 

1108 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

1109 # Docstring inherited from Datastore._prepIngest. 

1110 filtered = [] 

1111 for dataset in datasets: 

1112 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

1113 if not acceptable: 

1114 continue 

1115 else: 

1116 dataset.refs = acceptable 

1117 if dataset.formatter is None: 

1118 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1119 else: 

1120 assert isinstance(dataset.formatter, type | str) 

1121 formatter_class = get_class_of(dataset.formatter) 

1122 if not issubclass(formatter_class, Formatter): 

1123 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1124 dataset.formatter = formatter_class 

1125 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1126 filtered.append(dataset) 

1127 return _IngestPrepData(filtered) 

1128 

1129 @transactional 

1130 def _finishIngest( 

1131 self, 

1132 prepData: Datastore.IngestPrepData, 

1133 *, 

1134 transfer: str | None = None, 

1135 record_validation_info: bool = True, 

1136 ) -> None: 

1137 # Docstring inherited from Datastore._finishIngest. 

1138 refsAndInfos = [] 

1139 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1140 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1141 # Do ingest as if the first dataset ref is associated with the file 

1142 info = self._extractIngestInfo( 

1143 dataset.path, 

1144 dataset.refs[0], 

1145 formatter=dataset.formatter, 

1146 transfer=transfer, 

1147 record_validation_info=record_validation_info, 

1148 ) 

1149 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1150 

1151 # In direct mode we can allow repeated ingests of the same thing 

1152 # if we are sure that the external dataset is immutable. We use 

1153 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are 

1154 # separated. 

1155 refs_and_infos_replace = [] 

1156 refs_and_infos_insert = [] 

1157 if transfer == "direct": 

1158 for entry in refsAndInfos: 

1159 if entry[0].id.version == 5: 

1160 refs_and_infos_replace.append(entry) 

1161 else: 

1162 refs_and_infos_insert.append(entry) 

1163 else: 

1164 refs_and_infos_insert = refsAndInfos 

1165 

1166 if refs_and_infos_insert: 

1167 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT) 

1168 if refs_and_infos_replace: 

1169 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE) 

1170 

1171 def _calculate_ingested_datastore_name( 

1172 self, 

1173 srcUri: ResourcePath, 

1174 ref: DatasetRef, 

1175 formatter: Formatter | type[Formatter] | None = None, 

1176 ) -> Location: 

1177 """Given a source URI and a DatasetRef, determine the name the 

1178 dataset will have inside datastore. 

1179 

1180 Parameters 

1181 ---------- 

1182 srcUri : `lsst.resources.ResourcePath` 

1183 URI to the source dataset file. 

1184 ref : `DatasetRef` 

1185 Ref associated with the newly-ingested dataset artifact. This 

1186 is used to determine the name within the datastore. 

1187 formatter : `Formatter` or Formatter class. 

1188 Formatter to use for validation. Can be a class or an instance. 

1189 No validation of the file extension is performed if the 

1190 ``formatter`` is `None`. This can be used if the caller knows 

1191 that the source URI and target URI will use the same formatter. 

1192 

1193 Returns 

1194 ------- 

1195 location : `Location` 

1196 Target location for the newly-ingested dataset. 

1197 """ 

1198 # Ingesting a file from outside the datastore. 

1199 # This involves a new name. 

1200 template = self.templates.getTemplate(ref) 

1201 location = self.locationFactory.fromPath(template.format(ref), trusted_path=True) 

1202 

1203 # Get the extension 

1204 ext = srcUri.getExtension() 

1205 

1206 # Update the destination to include that extension 

1207 location.updateExtension(ext) 

1208 

1209 # Ask the formatter to validate this extension 

1210 if formatter is not None: 

1211 formatter.validateExtension(location) 

1212 

1213 return location 

1214 

1215 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1216 """Write out in memory dataset to datastore. 

1217 

1218 Parameters 

1219 ---------- 

1220 inMemoryDataset : `object` 

1221 Dataset to write to datastore. 

1222 ref : `DatasetRef` 

1223 Registry information associated with this dataset. 

1224 

1225 Returns 

1226 ------- 

1227 info : `StoredFileInfo` 

1228 Information describing the artifact written to the datastore. 

1229 """ 

1230 # May need to coerce the in memory dataset to the correct 

1231 # python type, but first we need to make sure the storage class 

1232 # reflects the one defined in the data repository. 

1233 ref = self._cast_storage_class(ref) 

1234 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1235 

1236 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1237 uri = location.uri 

1238 

1239 if not uri.dirname().exists(): 

1240 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1241 uri.dirname().mkdir() 

1242 

1243 if self._transaction is None: 

1244 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1245 

1246 def _removeFileExists(uri: ResourcePath) -> None: 

1247 """Remove a file and do not complain if it is not there. 

1248 

1249 This is important since a formatter might fail before the file 

1250 is written and we should not confuse people by writing spurious 

1251 error messages to the log. 

1252 """ 

1253 with contextlib.suppress(FileNotFoundError): 

1254 uri.remove() 

1255 

1256 # Register a callback to try to delete the uploaded data if 

1257 # something fails below 

1258 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1259 

1260 data_written = False 

1261 

1262 # For remote URIs some datasets can be serialized directly 

1263 # to bytes and sent to the remote datastore without writing a 

1264 # file. If the dataset is intended to be saved to the cache 

1265 # a file is always written and direct write to the remote 

1266 # datastore is bypassed. 

1267 if not uri.isLocal and not self.cacheManager.should_be_cached(ref): 

1268 # Remote URI that is not cached so can write directly. 

1269 try: 

1270 serializedDataset = formatter.toBytes(inMemoryDataset) 

1271 except NotImplementedError: 

1272 # Fallback to the file writing option. 

1273 pass 

1274 except Exception as e: 

1275 raise RuntimeError( 

1276 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1277 ) from e 

1278 else: 

1279 log.debug("Writing bytes directly to %s", uri) 

1280 uri.write(serializedDataset, overwrite=True) 

1281 log.debug("Successfully wrote bytes directly to %s", uri) 

1282 data_written = True 

1283 

1284 if not data_written: 

1285 # Did not write the bytes directly to object store so instead 

1286 # write to temporary file. Always write to a temporary even if 

1287 # using a local file system -- that gives us atomic writes. 

1288 # If a process is killed as the file is being written we do not 

1289 # want it to remain in the correct place but in corrupt state. 

1290 # For local files write to the output directory not temporary dir. 

1291 prefix = uri.dirname() if uri.isLocal else None 

1292 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri: 

1293 # Need to configure the formatter to write to a different 

1294 # location and that needs us to overwrite internals 

1295 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1296 with formatter._updateLocation(Location(None, temporary_uri)): 

1297 try: 

1298 formatter.write(inMemoryDataset) 

1299 except Exception as e: 

1300 raise RuntimeError( 

1301 f"Failed to serialize dataset {ref} of type" 

1302 f" {type(inMemoryDataset)} to " 

1303 f"temporary location {temporary_uri}" 

1304 ) from e 

1305 

1306 # Use move for a local file since that becomes an efficient 

1307 # os.rename. For remote resources we use copy to allow the 

1308 # file to be cached afterwards. 

1309 transfer = "move" if uri.isLocal else "copy" 

1310 

1311 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True) 

1312 

1313 if transfer == "copy": 

1314 # Cache if required 

1315 self.cacheManager.move_to_cache(temporary_uri, ref) 

1316 

1317 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1318 

1319 # URI is needed to resolve what ingest case are we dealing with 

1320 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1321 

1322 def knows(self, ref: DatasetRef) -> bool: 

1323 """Check if the dataset is known to the datastore. 

1324 

1325 Does not check for existence of any artifact. 

1326 

1327 Parameters 

1328 ---------- 

1329 ref : `DatasetRef` 

1330 Reference to the required dataset. 

1331 

1332 Returns 

1333 ------- 

1334 exists : `bool` 

1335 `True` if the dataset is known to the datastore. 

1336 """ 

1337 fileLocations = self._get_dataset_locations_info(ref) 

1338 if fileLocations: 

1339 return True 

1340 return False 

1341 

1342 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

1343 # Docstring inherited from the base class. 

1344 

1345 # The records themselves. Could be missing some entries. 

1346 records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) 

1347 

1348 return {ref: ref.id in records for ref in refs} 

1349 

1350 def _process_mexists_records( 

1351 self, 

1352 id_to_ref: dict[DatasetId, DatasetRef], 

1353 records: dict[DatasetId, list[StoredFileInfo]], 

1354 all_required: bool, 

1355 artifact_existence: dict[ResourcePath, bool] | None = None, 

1356 ) -> dict[DatasetRef, bool]: 

1357 """Check given records for existence. 

1358 

1359 Helper function for `mexists()`. 

1360 

1361 Parameters 

1362 ---------- 

1363 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1364 Mapping of the dataset ID to the dataset ref itself. 

1365 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1366 Records as generally returned by 

1367 ``_get_stored_records_associated_with_refs``. 

1368 all_required : `bool` 

1369 Flag to indicate whether existence requires all artifacts 

1370 associated with a dataset ID to exist or not for existence. 

1371 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1372 Optional mapping of datastore artifact to existence. Updated by 

1373 this method with details of all artifacts tested. Can be `None` 

1374 if the caller is not interested. 

1375 

1376 Returns 

1377 ------- 

1378 existence : `dict` of [`DatasetRef`, `bool`] 

1379 Mapping from dataset to boolean indicating existence. 

1380 """ 

1381 # The URIs to be checked and a mapping of those URIs to 

1382 # the dataset ID. 

1383 uris_to_check: list[ResourcePath] = [] 

1384 location_map: dict[ResourcePath, DatasetId] = {} 

1385 

1386 location_factory = self.locationFactory 

1387 

1388 uri_existence: dict[ResourcePath, bool] = {} 

1389 for ref_id, infos in records.items(): 

1390 # Key is the dataset Id, value is list of StoredItemInfo 

1391 uris = [info.file_location(location_factory).uri for info in infos] 

1392 location_map.update({uri: ref_id for uri in uris}) 

1393 

1394 # Check the local cache directly for a dataset corresponding 

1395 # to the remote URI. 

1396 if self.cacheManager.file_count > 0: 

1397 ref = id_to_ref[ref_id] 

1398 for uri, storedFileInfo in zip(uris, infos, strict=True): 

1399 check_ref = ref 

1400 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1401 check_ref = ref.makeComponentRef(component) 

1402 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1403 # Proxy for URI existence. 

1404 uri_existence[uri] = True 

1405 else: 

1406 uris_to_check.append(uri) 

1407 else: 

1408 # Check all of them. 

1409 uris_to_check.extend(uris) 

1410 

1411 if artifact_existence is not None: 

1412 # If a URI has already been checked remove it from the list 

1413 # and immediately add the status to the output dict. 

1414 filtered_uris_to_check = [] 

1415 for uri in uris_to_check: 

1416 if uri in artifact_existence: 

1417 uri_existence[uri] = artifact_existence[uri] 

1418 else: 

1419 filtered_uris_to_check.append(uri) 

1420 uris_to_check = filtered_uris_to_check 

1421 

1422 # Results. 

1423 dataset_existence: dict[DatasetRef, bool] = {} 

1424 

1425 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1426 for uri, exists in uri_existence.items(): 

1427 dataset_id = location_map[uri] 

1428 ref = id_to_ref[dataset_id] 

1429 

1430 # Disassembled composite needs to check all locations. 

1431 # all_required indicates whether all need to exist or not. 

1432 if ref in dataset_existence: 

1433 if all_required: 

1434 exists = dataset_existence[ref] and exists 

1435 else: 

1436 exists = dataset_existence[ref] or exists 

1437 dataset_existence[ref] = exists 

1438 

1439 if artifact_existence is not None: 

1440 artifact_existence.update(uri_existence) 

1441 

1442 return dataset_existence 

1443 

1444 def mexists( 

1445 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1446 ) -> dict[DatasetRef, bool]: 

1447 """Check the existence of multiple datasets at once. 

1448 

1449 Parameters 

1450 ---------- 

1451 refs : iterable of `DatasetRef` 

1452 The datasets to be checked. 

1453 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1454 Optional mapping of datastore artifact to existence. Updated by 

1455 this method with details of all artifacts tested. Can be `None` 

1456 if the caller is not interested. 

1457 

1458 Returns 

1459 ------- 

1460 existence : `dict` of [`DatasetRef`, `bool`] 

1461 Mapping from dataset to boolean indicating existence. 

1462 

1463 Notes 

1464 ----- 

1465 To minimize potentially costly remote existence checks, the local 

1466 cache is checked as a proxy for existence. If a file for this 

1467 `DatasetRef` does exist no check is done for the actual URI. This 

1468 could result in possibly unexpected behavior if the dataset itself 

1469 has been removed from the datastore by another process whilst it is 

1470 still in the cache. 

1471 """ 

1472 chunk_size = 10_000 

1473 dataset_existence: dict[DatasetRef, bool] = {} 

1474 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1475 n_found_total = 0 

1476 n_checked = 0 

1477 n_chunks = 0 

1478 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1479 chunk_result = self._mexists(chunk, artifact_existence) 

1480 

1481 # The log message level and content depend on how many 

1482 # datasets we are processing. 

1483 n_results = len(chunk_result) 

1484 

1485 # Use verbose logging to ensure that messages can be seen 

1486 # easily if many refs are being checked. 

1487 log_threshold = VERBOSE 

1488 n_checked += n_results 

1489 

1490 # This sum can take some time so only do it if we know the 

1491 # result is going to be used. 

1492 n_found = 0 

1493 if log.isEnabledFor(log_threshold): 

1494 # Can treat the booleans as 0, 1 integers and sum them. 

1495 n_found = sum(chunk_result.values()) 

1496 n_found_total += n_found 

1497 

1498 # We are deliberately not trying to count the number of refs 

1499 # provided in case it's in the millions. This means there is a 

1500 # situation where the number of refs exactly matches the chunk 

1501 # size and we will switch to the multi-chunk path even though 

1502 # we only have a single chunk. 

1503 if n_results < chunk_size and n_chunks == 0: 

1504 # Single chunk will be processed so we can provide more detail. 

1505 if n_results == 1: 

1506 ref = list(chunk_result)[0] 

1507 # Use debug logging to be consistent with `exists()`. 

1508 log.debug( 

1509 "Calling mexists() with single ref that does%s exist (%s).", 

1510 "" if chunk_result[ref] else " not", 

1511 ref, 

1512 ) 

1513 else: 

1514 # Single chunk but multiple files. Summarize. 

1515 log.log( 

1516 log_threshold, 

1517 "Number of datasets found in datastore: %d out of %d datasets checked.", 

1518 n_found, 

1519 n_checked, 

1520 ) 

1521 

1522 else: 

1523 # Use incremental verbose logging when we have multiple chunks. 

1524 log.log( 

1525 log_threshold, 

1526 "Number of datasets found in datastore for chunk %d: %d out of %d checked " 

1527 "(running total from all chunks so far: %d found out of %d checked)", 

1528 n_chunks, 

1529 n_found, 

1530 n_results, 

1531 n_found_total, 

1532 n_checked, 

1533 ) 

1534 dataset_existence.update(chunk_result) 

1535 n_chunks += 1 

1536 

1537 return dataset_existence 

1538 

1539 def _mexists( 

1540 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1541 ) -> dict[DatasetRef, bool]: 

1542 """Check the existence of multiple datasets at once. 

1543 

1544 Parameters 

1545 ---------- 

1546 refs : iterable of `DatasetRef` 

1547 The datasets to be checked. 

1548 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1549 Optional mapping of datastore artifact to existence. Updated by 

1550 this method with details of all artifacts tested. Can be `None` 

1551 if the caller is not interested. 

1552 

1553 Returns 

1554 ------- 

1555 existence : `dict` of [`DatasetRef`, `bool`] 

1556 Mapping from dataset to boolean indicating existence. 

1557 """ 

1558 # Make a mapping from refs with the internal storage class to the given 

1559 # refs that may have a different one. We'll use the internal refs 

1560 # throughout this method and convert back at the very end. 

1561 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs} 

1562 

1563 # Need a mapping of dataset_id to (internal) dataset ref since some 

1564 # internal APIs work with dataset_id. 

1565 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref} 

1566 

1567 # Set of all IDs we are checking for. 

1568 requested_ids = set(id_to_ref.keys()) 

1569 

1570 # The records themselves. Could be missing some entries. 

1571 records = self._get_stored_records_associated_with_refs( 

1572 id_to_ref.values(), ignore_datastore_records=True 

1573 ) 

1574 

1575 dataset_existence = self._process_mexists_records( 

1576 id_to_ref, records, True, artifact_existence=artifact_existence 

1577 ) 

1578 

1579 # Set of IDs that have been handled. 

1580 handled_ids = {ref.id for ref in dataset_existence} 

1581 

1582 missing_ids = requested_ids - handled_ids 

1583 if missing_ids: 

1584 dataset_existence.update( 

1585 self._mexists_check_expected( 

1586 [id_to_ref[missing] for missing in missing_ids], artifact_existence 

1587 ) 

1588 ) 

1589 

1590 return { 

1591 internal_ref_to_input_ref[internal_ref]: existence 

1592 for internal_ref, existence in dataset_existence.items() 

1593 } 

1594 

1595 def _mexists_check_expected( 

1596 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1597 ) -> dict[DatasetRef, bool]: 

1598 """Check existence of refs that are not known to datastore. 

1599 

1600 Parameters 

1601 ---------- 

1602 refs : iterable of `DatasetRef` 

1603 The datasets to be checked. These are assumed not to be known 

1604 to datastore. 

1605 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1606 Optional mapping of datastore artifact to existence. Updated by 

1607 this method with details of all artifacts tested. Can be `None` 

1608 if the caller is not interested. 

1609 

1610 Returns 

1611 ------- 

1612 existence : `dict` of [`DatasetRef`, `bool`] 

1613 Mapping from dataset to boolean indicating existence. 

1614 """ 

1615 dataset_existence: dict[DatasetRef, bool] = {} 

1616 if not self.trustGetRequest: 

1617 # Must assume these do not exist 

1618 for ref in refs: 

1619 dataset_existence[ref] = False 

1620 else: 

1621 log.debug( 

1622 "%d datasets were not known to datastore during initial existence check.", 

1623 len(refs), 

1624 ) 

1625 

1626 # Construct data structure identical to that returned 

1627 # by _get_stored_records_associated_with_refs() but using 

1628 # guessed names. 

1629 records = {} 

1630 id_to_ref = {} 

1631 for missing_ref in refs: 

1632 expected = self._get_expected_dataset_locations_info(missing_ref) 

1633 dataset_id = missing_ref.id 

1634 records[dataset_id] = [info for _, info in expected] 

1635 id_to_ref[dataset_id] = missing_ref 

1636 

1637 dataset_existence.update( 

1638 self._process_mexists_records( 

1639 id_to_ref, 

1640 records, 

1641 False, 

1642 artifact_existence=artifact_existence, 

1643 ) 

1644 ) 

1645 

1646 return dataset_existence 

1647 

1648 def exists(self, ref: DatasetRef) -> bool: 

1649 """Check if the dataset exists in the datastore. 

1650 

1651 Parameters 

1652 ---------- 

1653 ref : `DatasetRef` 

1654 Reference to the required dataset. 

1655 

1656 Returns 

1657 ------- 

1658 exists : `bool` 

1659 `True` if the entity exists in the `Datastore`. 

1660 

1661 Notes 

1662 ----- 

1663 The local cache is checked as a proxy for existence in the remote 

1664 object store. It is possible that another process on a different 

1665 compute node could remove the file from the object store even 

1666 though it is present in the local cache. 

1667 """ 

1668 ref = self._cast_storage_class(ref) 

1669 # We cannot trust datastore records from ref, as many unit tests delete 

1670 # datasets and check their existence. 

1671 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True) 

1672 

1673 # if we are being asked to trust that registry might not be correct 

1674 # we ask for the expected locations and check them explicitly 

1675 if not fileLocations: 

1676 if not self.trustGetRequest: 

1677 return False 

1678 

1679 # First check the cache. If it is not found we must check 

1680 # the datastore itself. Assume that any component in the cache 

1681 # means that the dataset does exist somewhere. 

1682 if self.cacheManager.known_to_cache(ref): 

1683 return True 

1684 

1685 # When we are guessing a dataset location we can not check 

1686 # for the existence of every component since we can not 

1687 # know if every component was written. Instead we check 

1688 # for the existence of any of the expected locations. 

1689 for location, _ in self._get_expected_dataset_locations_info(ref): 

1690 if self._artifact_exists(location): 

1691 return True 

1692 return False 

1693 

1694 # All listed artifacts must exist. 

1695 for location, storedFileInfo in fileLocations: 

1696 # Checking in cache needs the component ref. 

1697 check_ref = ref 

1698 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1699 check_ref = ref.makeComponentRef(component) 

1700 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1701 continue 

1702 

1703 if not self._artifact_exists(location): 

1704 return False 

1705 

1706 return True 

1707 

1708 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

1709 """Return URIs associated with dataset. 

1710 

1711 Parameters 

1712 ---------- 

1713 ref : `DatasetRef` 

1714 Reference to the required dataset. 

1715 predict : `bool`, optional 

1716 If the datastore does not know about the dataset, controls whether 

1717 it should return a predicted URI or not. 

1718 

1719 Returns 

1720 ------- 

1721 uris : `DatasetRefURIs` 

1722 The URI to the primary artifact associated with this dataset (if 

1723 the dataset was disassembled within the datastore this may be 

1724 `None`), and the URIs to any components associated with the dataset 

1725 artifact. (can be empty if there are no components). 

1726 """ 

1727 many = self.getManyURIs([ref], predict=predict, allow_missing=False) 

1728 return many[ref] 

1729 

1730 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1731 """URI to the Dataset. 

1732 

1733 Parameters 

1734 ---------- 

1735 ref : `DatasetRef` 

1736 Reference to the required Dataset. 

1737 predict : `bool` 

1738 If `True`, allow URIs to be returned of datasets that have not 

1739 been written. 

1740 

1741 Returns 

1742 ------- 

1743 uri : `str` 

1744 URI pointing to the dataset within the datastore. If the 

1745 dataset does not exist in the datastore, and if ``predict`` is 

1746 `True`, the URI will be a prediction and will include a URI 

1747 fragment "#predicted". 

1748 If the datastore does not have entities that relate well 

1749 to the concept of a URI the returned URI will be 

1750 descriptive. The returned URI is not guaranteed to be obtainable. 

1751 

1752 Raises 

1753 ------ 

1754 FileNotFoundError 

1755 Raised if a URI has been requested for a dataset that does not 

1756 exist and guessing is not allowed. 

1757 RuntimeError 

1758 Raised if a request is made for a single URI but multiple URIs 

1759 are associated with this dataset. 

1760 

1761 Notes 

1762 ----- 

1763 When a predicted URI is requested an attempt will be made to form 

1764 a reasonable URI based on file templates and the expected formatter. 

1765 """ 

1766 primary, components = self.getURIs(ref, predict) 

1767 if primary is None or components: 

1768 raise RuntimeError( 

1769 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1770 ) 

1771 return primary 

1772 

1773 def _predict_URIs( 

1774 self, 

1775 ref: DatasetRef, 

1776 ) -> DatasetRefURIs: 

1777 """Predict the URIs of a dataset ref. 

1778 

1779 Parameters 

1780 ---------- 

1781 ref : `DatasetRef` 

1782 Reference to the required Dataset. 

1783 

1784 Returns 

1785 ------- 

1786 URI : DatasetRefUris 

1787 Primary and component URIs. URIs will contain a URI fragment 

1788 "#predicted". 

1789 """ 

1790 uris = DatasetRefURIs() 

1791 

1792 if self.composites.shouldBeDisassembled(ref): 

1793 for component, _ in ref.datasetType.storageClass.components.items(): 

1794 comp_ref = ref.makeComponentRef(component) 

1795 comp_location, _ = self._determine_put_formatter_location(comp_ref) 

1796 

1797 # Add the "#predicted" URI fragment to indicate this is a 

1798 # guess 

1799 uris.componentURIs[component] = ResourcePath( 

1800 comp_location.uri.geturl() + "#predicted", forceDirectory=comp_location.uri.dirLike 

1801 ) 

1802 

1803 else: 

1804 location, _ = self._determine_put_formatter_location(ref) 

1805 

1806 # Add the "#predicted" URI fragment to indicate this is a guess 

1807 uris.primaryURI = ResourcePath( 

1808 location.uri.geturl() + "#predicted", forceDirectory=location.uri.dirLike 

1809 ) 

1810 

1811 return uris 

1812 

1813 def getManyURIs( 

1814 self, 

1815 refs: Iterable[DatasetRef], 

1816 predict: bool = False, 

1817 allow_missing: bool = False, 

1818 ) -> dict[DatasetRef, DatasetRefURIs]: 

1819 # Docstring inherited 

1820 

1821 uris: dict[DatasetRef, DatasetRefURIs] = {} 

1822 

1823 records = self._get_stored_records_associated_with_refs(refs) 

1824 records_keys = records.keys() 

1825 

1826 existing_refs = tuple(ref for ref in refs if ref.id in records_keys) 

1827 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys) 

1828 

1829 # Have to handle trustGetRequest mode by checking for the existence 

1830 # of the missing refs on disk. 

1831 if missing_refs: 

1832 dataset_existence = self._mexists_check_expected(missing_refs, None) 

1833 really_missing = set() 

1834 not_missing = set() 

1835 for ref, exists in dataset_existence.items(): 

1836 if exists: 

1837 not_missing.add(ref) 

1838 else: 

1839 really_missing.add(ref) 

1840 

1841 if not_missing: 

1842 # Need to recalculate the missing/existing split. 

1843 existing_refs = existing_refs + tuple(not_missing) 

1844 missing_refs = tuple(really_missing) 

1845 

1846 for ref in missing_refs: 

1847 # if this has never been written then we have to guess 

1848 if not predict: 

1849 if not allow_missing: 

1850 raise FileNotFoundError(f"Dataset {ref} not in this datastore.") 

1851 else: 

1852 uris[ref] = self._predict_URIs(ref) 

1853 

1854 for ref in existing_refs: 

1855 file_infos = records[ref.id] 

1856 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos] 

1857 uris[ref] = self._locations_to_URI(ref, file_locations) 

1858 

1859 return uris 

1860 

1861 def _locations_to_URI( 

1862 self, 

1863 ref: DatasetRef, 

1864 file_locations: Sequence[tuple[Location, StoredFileInfo]], 

1865 ) -> DatasetRefURIs: 

1866 """Convert one or more file locations associated with a DatasetRef 

1867 to a DatasetRefURIs. 

1868 

1869 Parameters 

1870 ---------- 

1871 ref : `DatasetRef` 

1872 Reference to the dataset. 

1873 file_locations : Sequence[Tuple[Location, StoredFileInfo]] 

1874 Each item in the sequence is the location of the dataset within the 

1875 datastore and stored information about the file and its formatter. 

1876 If there is only one item in the sequence then it is treated as the 

1877 primary URI. If there is more than one item then they are treated 

1878 as component URIs. If there are no items then an error is raised 

1879 unless ``self.trustGetRequest`` is `True`. 

1880 

1881 Returns 

1882 ------- 

1883 uris: DatasetRefURIs 

1884 Represents the primary URI or component URIs described by the 

1885 inputs. 

1886 

1887 Raises 

1888 ------ 

1889 RuntimeError 

1890 If no file locations are passed in and ``self.trustGetRequest`` is 

1891 `False`. 

1892 FileNotFoundError 

1893 If the a passed-in URI does not exist, and ``self.trustGetRequest`` 

1894 is `False`. 

1895 RuntimeError 

1896 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is 

1897 unexpected). 

1898 """ 

1899 guessing = False 

1900 uris = DatasetRefURIs() 

1901 

1902 if not file_locations: 

1903 if not self.trustGetRequest: 

1904 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1905 file_locations = self._get_expected_dataset_locations_info(ref) 

1906 guessing = True 

1907 

1908 if len(file_locations) == 1: 

1909 # No disassembly so this is the primary URI 

1910 uris.primaryURI = file_locations[0][0].uri 

1911 if guessing and not uris.primaryURI.exists(): 

1912 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist") 

1913 else: 

1914 for location, file_info in file_locations: 

1915 if file_info.component is None: 

1916 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1917 if guessing and not location.uri.exists(): 

1918 # If we are trusting then it is entirely possible for 

1919 # some components to be missing. In that case we skip 

1920 # to the next component. 

1921 if self.trustGetRequest: 

1922 continue 

1923 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist") 

1924 uris.componentURIs[file_info.component] = location.uri 

1925 

1926 return uris 

1927 

1928 def retrieveArtifacts( 

1929 self, 

1930 refs: Iterable[DatasetRef], 

1931 destination: ResourcePath, 

1932 transfer: str = "auto", 

1933 preserve_path: bool = True, 

1934 overwrite: bool = False, 

1935 ) -> list[ResourcePath]: 

1936 """Retrieve the file artifacts associated with the supplied refs. 

1937 

1938 Parameters 

1939 ---------- 

1940 refs : iterable of `DatasetRef` 

1941 The datasets for which file artifacts are to be retrieved. 

1942 A single ref can result in multiple files. The refs must 

1943 be resolved. 

1944 destination : `lsst.resources.ResourcePath` 

1945 Location to write the file artifacts. 

1946 transfer : `str`, optional 

1947 Method to use to transfer the artifacts. Must be one of the options 

1948 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1949 "move" is not allowed. 

1950 preserve_path : `bool`, optional 

1951 If `True` the full path of the file artifact within the datastore 

1952 is preserved. If `False` the final file component of the path 

1953 is used. 

1954 overwrite : `bool`, optional 

1955 If `True` allow transfers to overwrite existing files at the 

1956 destination. 

1957 

1958 Returns 

1959 ------- 

1960 targets : `list` of `lsst.resources.ResourcePath` 

1961 URIs of file artifacts in destination location. Order is not 

1962 preserved. 

1963 """ 

1964 if not destination.isdir(): 

1965 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1966 

1967 if transfer == "move": 

1968 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1969 

1970 # Source -> Destination 

1971 # This also helps filter out duplicate DatasetRef in the request 

1972 # that will map to the same underlying file transfer. 

1973 to_transfer: dict[ResourcePath, ResourcePath] = {} 

1974 

1975 for ref in refs: 

1976 locations = self._get_dataset_locations_info(ref) 

1977 for location, _ in locations: 

1978 source_uri = location.uri 

1979 target_path: ResourcePathExpression 

1980 if preserve_path: 

1981 target_path = location.pathInStore 

1982 if target_path.isabs(): 

1983 # This is an absolute path to an external file. 

1984 # Use the full path. 

1985 target_path = target_path.relativeToPathRoot 

1986 else: 

1987 target_path = source_uri.basename() 

1988 target_uri = destination.join(target_path) 

1989 to_transfer[source_uri] = target_uri 

1990 

1991 # In theory can now parallelize the transfer 

1992 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

1993 for source_uri, target_uri in to_transfer.items(): 

1994 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1995 

1996 return list(to_transfer.values()) 

1997 

1998 def get( 

1999 self, 

2000 ref: DatasetRef, 

2001 parameters: Mapping[str, Any] | None = None, 

2002 storageClass: StorageClass | str | None = None, 

2003 ) -> Any: 

2004 """Load an InMemoryDataset from the store. 

2005 

2006 Parameters 

2007 ---------- 

2008 ref : `DatasetRef` 

2009 Reference to the required Dataset. 

2010 parameters : `dict` 

2011 `StorageClass`-specific parameters that specify, for example, 

2012 a slice of the dataset to be loaded. 

2013 storageClass : `StorageClass` or `str`, optional 

2014 The storage class to be used to override the Python type 

2015 returned by this method. By default the returned type matches 

2016 the dataset type definition for this dataset. Specifying a 

2017 read `StorageClass` can force a different type to be returned. 

2018 This type must be compatible with the original type. 

2019 

2020 Returns 

2021 ------- 

2022 inMemoryDataset : `object` 

2023 Requested dataset or slice thereof as an InMemoryDataset. 

2024 

2025 Raises 

2026 ------ 

2027 FileNotFoundError 

2028 Requested dataset can not be retrieved. 

2029 TypeError 

2030 Return value from formatter has unexpected type. 

2031 ValueError 

2032 Formatter failed to process the dataset. 

2033 """ 

2034 # Supplied storage class for the component being read is either 

2035 # from the ref itself or some an override if we want to force 

2036 # type conversion. 

2037 if storageClass is not None: 

2038 ref = ref.overrideStorageClass(storageClass) 

2039 

2040 allGetInfo = self._prepare_for_direct_get(ref, parameters) 

2041 return get_dataset_as_python_object_from_get_info( 

2042 allGetInfo, ref=ref, parameters=parameters, cache_manager=self.cacheManager 

2043 ) 

2044 

2045 def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload: 

2046 # Docstring inherited 

2047 

2048 # 1 hour. Chosen somewhat arbitrarily -- this is long enough that the 

2049 # client should have time to download a large file with retries if 

2050 # needed, but short enough that it will become obvious quickly that 

2051 # these URLs expire. 

2052 # From a strictly technical standpoint there is no reason this 

2053 # shouldn't be a day or more, but there seems to be a political issue 

2054 # where people think there is a risk of end users posting presigned 

2055 # URLs for people without access rights to download. 

2056 url_expiration_time_seconds = 1 * 60 * 60 

2057 

2058 def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo: 

2059 location, file_info = info 

2060 return FileDatastoreGetPayloadFileInfo( 

2061 url=location.uri.generate_presigned_get_url( 

2062 expiration_time_seconds=url_expiration_time_seconds 

2063 ), 

2064 datastoreRecords=file_info.to_simple(), 

2065 ) 

2066 

2067 locations = self._get_dataset_locations_info(ref) 

2068 if len(locations) == 0: 

2069 raise FileNotFoundError(f"No artifacts found for DatasetId '{ref.id}'") 

2070 

2071 return FileDatastoreGetPayload( 

2072 datastore_type="file", 

2073 dataset_ref=ref.to_simple(), 

2074 file_info=[to_file_info_payload(info) for info in locations], 

2075 ) 

2076 

2077 @transactional 

2078 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2079 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2080 

2081 Parameters 

2082 ---------- 

2083 inMemoryDataset : `object` 

2084 The dataset to store. 

2085 ref : `DatasetRef` 

2086 Reference to the associated Dataset. 

2087 

2088 Raises 

2089 ------ 

2090 TypeError 

2091 Supplied object and storage class are inconsistent. 

2092 DatasetTypeNotSupportedError 

2093 The associated `DatasetType` is not handled by this datastore. 

2094 

2095 Notes 

2096 ----- 

2097 If the datastore is configured to reject certain dataset types it 

2098 is possible that the put will fail and raise a 

2099 `DatasetTypeNotSupportedError`. The main use case for this is to 

2100 allow `ChainedDatastore` to put to multiple datastores without 

2101 requiring that every datastore accepts the dataset. 

2102 """ 

2103 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2104 # doDisassembly = True 

2105 

2106 artifacts = [] 

2107 if doDisassembly: 

2108 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2109 if components is None: 

2110 raise RuntimeError( 

2111 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2112 f"with storage class {ref.datasetType.storageClass.name} " 

2113 "is configured to be disassembled, but cannot be." 

2114 ) 

2115 for component, componentInfo in components.items(): 

2116 # Don't recurse because we want to take advantage of 

2117 # bulk insert -- need a new DatasetRef that refers to the 

2118 # same dataset_id but has the component DatasetType 

2119 # DatasetType does not refer to the types of components 

2120 # So we construct one ourselves. 

2121 compRef = ref.makeComponentRef(component) 

2122 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2123 artifacts.append((compRef, storedInfo)) 

2124 else: 

2125 # Write the entire thing out 

2126 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2127 artifacts.append((ref, storedInfo)) 

2128 

2129 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT) 

2130 

2131 @transactional 

2132 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]: 

2133 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2134 # doDisassembly = True 

2135 

2136 artifacts = [] 

2137 if doDisassembly: 

2138 components = ref.datasetType.storageClass.delegate().disassemble(in_memory_dataset) 

2139 if components is None: 

2140 raise RuntimeError( 

2141 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2142 f"with storage class {ref.datasetType.storageClass.name} " 

2143 "is configured to be disassembled, but cannot be." 

2144 ) 

2145 for component, componentInfo in components.items(): 

2146 # Don't recurse because we want to take advantage of 

2147 # bulk insert -- need a new DatasetRef that refers to the 

2148 # same dataset_id but has the component DatasetType 

2149 # DatasetType does not refer to the types of components 

2150 # So we construct one ourselves. 

2151 compRef = ref.makeComponentRef(component) 

2152 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2153 artifacts.append((compRef, storedInfo)) 

2154 else: 

2155 # Write the entire thing out 

2156 storedInfo = self._write_in_memory_to_artifact(in_memory_dataset, ref) 

2157 artifacts.append((ref, storedInfo)) 

2158 

2159 ref_records: DatasetDatastoreRecords = {self._opaque_table_name: [info for _, info in artifacts]} 

2160 ref = ref.replace(datastore_records=ref_records) 

2161 return {self.name: ref} 

2162 

2163 @transactional 

2164 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

2165 # At this point can safely remove these datasets from the cache 

2166 # to avoid confusion later on. If they are not trashed later 

2167 # the cache will simply be refilled. 

2168 self.cacheManager.remove_from_cache(ref) 

2169 

2170 # If we are in trust mode there will be nothing to move to 

2171 # the trash table and we will have to try to delete the file 

2172 # immediately. 

2173 if self.trustGetRequest: 

2174 # Try to keep the logic below for a single file trash. 

2175 if isinstance(ref, DatasetRef): 

2176 refs = {ref} 

2177 else: 

2178 # Will recreate ref at the end of this branch. 

2179 refs = set(ref) 

2180 

2181 # Determine which datasets are known to datastore directly. 

2182 id_to_ref = {ref.id: ref for ref in refs} 

2183 existing_ids = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) 

2184 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2185 

2186 missing = refs - existing_refs 

2187 if missing: 

2188 # Do an explicit existence check on these refs. 

2189 # We only care about the artifacts at this point and not 

2190 # the dataset existence. 

2191 artifact_existence: dict[ResourcePath, bool] = {} 

2192 _ = self.mexists(missing, artifact_existence) 

2193 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2194 

2195 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2196 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2197 for uri in uris: 

2198 try: 

2199 uri.remove() 

2200 except Exception as e: 

2201 if ignore_errors: 

2202 log.debug("Artifact %s could not be removed: %s", uri, e) 

2203 continue 

2204 raise 

2205 

2206 # There is no point asking the code below to remove refs we 

2207 # know are missing so update it with the list of existing 

2208 # records. Try to retain one vs many logic. 

2209 if not existing_refs: 

2210 # Nothing more to do since none of the datasets were 

2211 # known to the datastore record table. 

2212 return 

2213 ref = list(existing_refs) 

2214 if len(ref) == 1: 

2215 ref = ref[0] 

2216 

2217 # Get file metadata and internal metadata 

2218 if not isinstance(ref, DatasetRef): 

2219 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2220 # Assumed to be an iterable of refs so bulk mode enabled. 

2221 try: 

2222 self.bridge.moveToTrash(ref, transaction=self._transaction) 

2223 except Exception as e: 

2224 if ignore_errors: 

2225 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2226 else: 

2227 raise 

2228 return 

2229 

2230 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2231 

2232 fileLocations = self._get_dataset_locations_info(ref) 

2233 

2234 if not fileLocations: 

2235 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2236 if ignore_errors: 

2237 log.warning(err_msg) 

2238 return 

2239 else: 

2240 raise FileNotFoundError(err_msg) 

2241 

2242 for location, _ in fileLocations: 

2243 if not self._artifact_exists(location): 

2244 err_msg = ( 

2245 f"Dataset is known to datastore {self.name} but " 

2246 f"associated artifact ({location.uri}) is missing" 

2247 ) 

2248 if ignore_errors: 

2249 log.warning(err_msg) 

2250 return 

2251 else: 

2252 raise FileNotFoundError(err_msg) 

2253 

2254 # Mark dataset as trashed 

2255 try: 

2256 self.bridge.moveToTrash([ref], transaction=self._transaction) 

2257 except Exception as e: 

2258 if ignore_errors: 

2259 log.warning( 

2260 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2261 "but encountered an error: %s", 

2262 ref, 

2263 self.name, 

2264 e, 

2265 ) 

2266 pass 

2267 else: 

2268 raise 

2269 

2270 @transactional 

2271 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2272 """Remove all datasets from the trash. 

2273 

2274 Parameters 

2275 ---------- 

2276 ignore_errors : `bool` 

2277 If `True` return without error even if something went wrong. 

2278 Problems could occur if another process is simultaneously trying 

2279 to delete. 

2280 """ 

2281 log.debug("Emptying trash in datastore %s", self.name) 

2282 

2283 # Context manager will empty trash iff we finish it without raising. 

2284 # It will also automatically delete the relevant rows from the 

2285 # trash table and the records table. 

2286 with self.bridge.emptyTrash( 

2287 self._table, record_class=StoredFileInfo, record_column="path" 

2288 ) as trash_data: 

2289 # Removing the artifacts themselves requires that the files are 

2290 # not also associated with refs that are not to be trashed. 

2291 # Therefore need to do a query with the file paths themselves 

2292 # and return all the refs associated with them. Can only delete 

2293 # a file if the refs to be trashed are the only refs associated 

2294 # with the file. 

2295 # This requires multiple copies of the trashed items 

2296 trashed, artifacts_to_keep = trash_data 

2297 

2298 if artifacts_to_keep is None: 

2299 # The bridge is not helping us so have to work it out 

2300 # ourselves. This is not going to be as efficient. 

2301 trashed = list(trashed) 

2302 

2303 # The instance check is for mypy since up to this point it 

2304 # does not know the type of info. 

2305 path_map = self._refs_associated_with_artifacts( 

2306 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2307 ) 

2308 

2309 for ref, info in trashed: 

2310 # Mypy needs to know this is not the base class 

2311 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2312 

2313 path_map[info.path].remove(ref.id) 

2314 if not path_map[info.path]: 

2315 del path_map[info.path] 

2316 

2317 artifacts_to_keep = set(path_map) 

2318 

2319 for ref, info in trashed: 

2320 # Should not happen for this implementation but need 

2321 # to keep mypy happy. 

2322 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2323 

2324 # Mypy needs to know this is not the base class 

2325 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2326 

2327 if info.path in artifacts_to_keep: 

2328 # This is a multi-dataset artifact and we are not 

2329 # removing all associated refs. 

2330 continue 

2331 

2332 # Only trashed refs still known to datastore will be returned. 

2333 location = info.file_location(self.locationFactory) 

2334 

2335 # Point of no return for this artifact 

2336 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2337 try: 

2338 self._delete_artifact(location) 

2339 except FileNotFoundError: 

2340 # If the file itself has been deleted there is nothing 

2341 # we can do about it. It is possible that trash has 

2342 # been run in parallel in another process or someone 

2343 # decided to delete the file. It is unlikely to come 

2344 # back and so we should still continue with the removal 

2345 # of the entry from the trash table. It is also possible 

2346 # we removed it in a previous iteration if it was 

2347 # a multi-dataset artifact. The delete artifact method 

2348 # will log a debug message in this scenario. 

2349 # Distinguishing file missing before trash started and 

2350 # file already removed previously as part of this trash 

2351 # is not worth the distinction with regards to potential 

2352 # memory cost. 

2353 pass 

2354 except Exception as e: 

2355 if ignore_errors: 

2356 # Use a debug message here even though it's not 

2357 # a good situation. In some cases this can be 

2358 # caused by a race between user A and user B 

2359 # and neither of them has permissions for the 

2360 # other's files. Butler does not know about users 

2361 # and trash has no idea what collections these 

2362 # files were in (without guessing from a path). 

2363 log.debug( 

2364 "Encountered error removing artifact %s from datastore %s: %s", 

2365 location.uri, 

2366 self.name, 

2367 e, 

2368 ) 

2369 else: 

2370 raise 

2371 

2372 @transactional 

2373 def transfer_from( 

2374 self, 

2375 source_datastore: Datastore, 

2376 refs: Collection[DatasetRef], 

2377 transfer: str = "auto", 

2378 artifact_existence: dict[ResourcePath, bool] | None = None, 

2379 dry_run: bool = False, 

2380 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

2381 # Docstring inherited 

2382 if type(self) is not type(source_datastore): 

2383 raise TypeError( 

2384 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2385 f"source datastore ({type(source_datastore)})." 

2386 ) 

2387 

2388 # Be explicit for mypy 

2389 if not isinstance(source_datastore, FileDatastore): 

2390 raise TypeError( 

2391 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2392 f" {type(source_datastore)}" 

2393 ) 

2394 

2395 # Stop early if "direct" transfer mode is requested. That would 

2396 # require that the URI inside the source datastore should be stored 

2397 # directly in the target datastore, which seems unlikely to be useful 

2398 # since at any moment the source datastore could delete the file. 

2399 if transfer in ("direct", "split"): 

2400 raise ValueError( 

2401 f"Can not transfer from a source datastore using {transfer} mode since" 

2402 " those files are controlled by the other datastore." 

2403 ) 

2404 

2405 # Empty existence lookup if none given. 

2406 if artifact_existence is None: 

2407 artifact_existence = {} 

2408 

2409 # In order to handle disassembled composites the code works 

2410 # at the records level since it can assume that internal APIs 

2411 # can be used. 

2412 # - If the record already exists in the destination this is assumed 

2413 # to be okay. 

2414 # - If there is no record but the source and destination URIs are 

2415 # identical no transfer is done but the record is added. 

2416 # - If the source record refers to an absolute URI currently assume 

2417 # that that URI should remain absolute and will be visible to the 

2418 # destination butler. May need to have a flag to indicate whether 

2419 # the dataset should be transferred. This will only happen if 

2420 # the detached Butler has had a local ingest. 

2421 

2422 # What we really want is all the records in the source datastore 

2423 # associated with these refs. Or derived ones if they don't exist 

2424 # in the source. 

2425 source_records = source_datastore._get_stored_records_associated_with_refs( 

2426 refs, ignore_datastore_records=True 

2427 ) 

2428 

2429 # The source dataset_ids are the keys in these records 

2430 source_ids = set(source_records) 

2431 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2432 

2433 requested_ids = {ref.id for ref in refs} 

2434 missing_ids = requested_ids - source_ids 

2435 

2436 # Missing IDs can be okay if that datastore has allowed 

2437 # gets based on file existence. Should we transfer what we can 

2438 # or complain about it and warn? 

2439 if missing_ids and not source_datastore.trustGetRequest: 

2440 raise ValueError( 

2441 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2442 ) 

2443 

2444 # Need to map these missing IDs to a DatasetRef so we can guess 

2445 # the details. 

2446 if missing_ids: 

2447 log.info( 

2448 "Number of expected datasets missing from source datastore records: %d out of %d", 

2449 len(missing_ids), 

2450 len(requested_ids), 

2451 ) 

2452 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2453 

2454 # This should be chunked in case we end up having to check 

2455 # the file store since we need some log output to show 

2456 # progress. 

2457 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2458 records = {} 

2459 for missing in missing_ids_chunk: 

2460 # Ask the source datastore where the missing artifacts 

2461 # should be. An execution butler might not know about the 

2462 # artifacts even if they are there. 

2463 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2464 records[missing] = [info for _, info in expected] 

2465 

2466 # Call the mexist helper method in case we have not already 

2467 # checked these artifacts such that artifact_existence is 

2468 # empty. This allows us to benefit from parallelism. 

2469 # datastore.mexists() itself does not give us access to the 

2470 # derived datastore record. 

2471 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2472 ref_exists = source_datastore._process_mexists_records( 

2473 id_to_ref, records, False, artifact_existence=artifact_existence 

2474 ) 

2475 

2476 # Now go through the records and propagate the ones that exist. 

2477 location_factory = source_datastore.locationFactory 

2478 for missing, record_list in records.items(): 

2479 # Skip completely if the ref does not exist. 

2480 ref = id_to_ref[missing] 

2481 if not ref_exists[ref]: 

2482 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2483 continue 

2484 # Check for file artifact to decide which parts of a 

2485 # disassembled composite do exist. If there is only a 

2486 # single record we don't even need to look because it can't 

2487 # be a composite and must exist. 

2488 if len(record_list) == 1: 

2489 dataset_records = record_list 

2490 else: 

2491 dataset_records = [ 

2492 record 

2493 for record in record_list 

2494 if artifact_existence[record.file_location(location_factory).uri] 

2495 ] 

2496 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2497 

2498 # Rely on source_records being a defaultdict. 

2499 source_records[missing].extend(dataset_records) 

2500 log.verbose("Completed scan for missing data files") 

2501 

2502 # See if we already have these records 

2503 target_records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) 

2504 

2505 # The artifacts to register 

2506 artifacts = [] 

2507 

2508 # Refs that already exist 

2509 already_present = [] 

2510 

2511 # Refs that were rejected by this datastore. 

2512 rejected = set() 

2513 

2514 # Refs that were transferred successfully. 

2515 accepted = set() 

2516 

2517 # Record each time we have done a "direct" transfer. 

2518 direct_transfers = [] 

2519 

2520 # Now can transfer the artifacts 

2521 for ref in refs: 

2522 if not self.constraints.isAcceptable(ref): 

2523 # This datastore should not be accepting this dataset. 

2524 rejected.add(ref) 

2525 continue 

2526 

2527 accepted.add(ref) 

2528 

2529 if ref.id in target_records: 

2530 # Already have an artifact for this. 

2531 already_present.append(ref) 

2532 continue 

2533 

2534 # mypy needs to know these are always resolved refs 

2535 for info in source_records[ref.id]: 

2536 source_location = info.file_location(source_datastore.locationFactory) 

2537 target_location = info.file_location(self.locationFactory) 

2538 if source_location == target_location and not source_location.pathInStore.isabs(): 

2539 # Artifact is already in the target location. 

2540 # (which is how execution butler currently runs) 

2541 pass 

2542 else: 

2543 if target_location.pathInStore.isabs(): 

2544 # Just because we can see the artifact when running 

2545 # the transfer doesn't mean it will be generally 

2546 # accessible to a user of this butler. Need to decide 

2547 # what to do about an absolute path. 

2548 if transfer == "auto": 

2549 # For "auto" transfers we allow the absolute URI 

2550 # to be recorded in the target datastore. 

2551 direct_transfers.append(source_location) 

2552 else: 

2553 # The user is explicitly requesting a transfer 

2554 # even for an absolute URI. This requires us to 

2555 # calculate the target path. 

2556 template_ref = ref 

2557 if info.component: 

2558 template_ref = ref.makeComponentRef(info.component) 

2559 target_location = self._calculate_ingested_datastore_name( 

2560 source_location.uri, 

2561 template_ref, 

2562 ) 

2563 

2564 info = info.update(path=target_location.pathInStore.path) 

2565 

2566 # Need to transfer it to the new location. 

2567 # Assume we should always overwrite. If the artifact 

2568 # is there this might indicate that a previous transfer 

2569 # was interrupted but was not able to be rolled back 

2570 # completely (eg pre-emption) so follow Datastore default 

2571 # and overwrite. Do not copy if we are in dry-run mode. 

2572 if not dry_run: 

2573 target_location.uri.transfer_from( 

2574 source_location.uri, 

2575 transfer=transfer, 

2576 overwrite=True, 

2577 transaction=self._transaction, 

2578 ) 

2579 

2580 artifacts.append((ref, info)) 

2581 

2582 if direct_transfers: 

2583 log.info( 

2584 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s", 

2585 len(direct_transfers), 

2586 "" if len(direct_transfers) == 1 else "s", 

2587 ) 

2588 

2589 # We are overwriting previous datasets that may have already 

2590 # existed. We therefore should ensure that we force the 

2591 # datastore records to agree. Note that this can potentially lead 

2592 # to difficulties if the dataset has previously been ingested 

2593 # disassembled and is somehow now assembled, or vice versa. 

2594 if not dry_run: 

2595 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE) 

2596 

2597 if already_present: 

2598 n_skipped = len(already_present) 

2599 log.info( 

2600 "Skipped transfer of %d dataset%s already present in datastore", 

2601 n_skipped, 

2602 "" if n_skipped == 1 else "s", 

2603 ) 

2604 

2605 return accepted, rejected 

2606 

2607 @transactional 

2608 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2609 # Docstring inherited. 

2610 refs = list(refs) 

2611 self.bridge.forget(refs) 

2612 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs]) 

2613 

2614 def validateConfiguration( 

2615 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

2616 ) -> None: 

2617 """Validate some of the configuration for this datastore. 

2618 

2619 Parameters 

2620 ---------- 

2621 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2622 Entities to test against this configuration. Can be differing 

2623 types. 

2624 logFailures : `bool`, optional 

2625 If `True`, output a log message for every validation error 

2626 detected. 

2627 

2628 Raises 

2629 ------ 

2630 DatastoreValidationError 

2631 Raised if there is a validation problem with a configuration. 

2632 All the problems are reported in a single exception. 

2633 

2634 Notes 

2635 ----- 

2636 This method checks that all the supplied entities have valid file 

2637 templates and also have formatters defined. 

2638 """ 

2639 templateFailed = None 

2640 try: 

2641 self.templates.validateTemplates(entities, logFailures=logFailures) 

2642 except FileTemplateValidationError as e: 

2643 templateFailed = str(e) 

2644 

2645 formatterFailed = [] 

2646 for entity in entities: 

2647 try: 

2648 self.formatterFactory.getFormatterClass(entity) 

2649 except KeyError as e: 

2650 formatterFailed.append(str(e)) 

2651 if logFailures: 

2652 log.critical("Formatter failure: %s", e) 

2653 

2654 if templateFailed or formatterFailed: 

2655 messages = [] 

2656 if templateFailed: 

2657 messages.append(templateFailed) 

2658 if formatterFailed: 

2659 messages.append(",".join(formatterFailed)) 

2660 msg = ";\n".join(messages) 

2661 raise DatastoreValidationError(msg) 

2662 

2663 def getLookupKeys(self) -> set[LookupKey]: 

2664 # Docstring is inherited from base class 

2665 return ( 

2666 self.templates.getLookupKeys() 

2667 | self.formatterFactory.getLookupKeys() 

2668 | self.constraints.getLookupKeys() 

2669 ) 

2670 

2671 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

2672 # Docstring is inherited from base class 

2673 # The key can be valid in either formatters or templates so we can 

2674 # only check the template if it exists 

2675 if lookupKey in self.templates: 

2676 try: 

2677 self.templates[lookupKey].validateTemplate(entity) 

2678 except FileTemplateValidationError as e: 

2679 raise DatastoreValidationError(e) from e 

2680 

2681 def export( 

2682 self, 

2683 refs: Iterable[DatasetRef], 

2684 *, 

2685 directory: ResourcePathExpression | None = None, 

2686 transfer: str | None = "auto", 

2687 ) -> Iterable[FileDataset]: 

2688 # Docstring inherited from Datastore.export. 

2689 if transfer == "auto" and directory is None: 

2690 transfer = None 

2691 

2692 if transfer is not None and directory is None: 

2693 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2694 

2695 if transfer == "move": 

2696 raise TypeError("Can not export by moving files out of datastore.") 

2697 elif transfer == "direct": 

2698 # For an export, treat this as equivalent to None. We do not 

2699 # want an import to risk using absolute URIs to datasets owned 

2700 # by another datastore. 

2701 log.info("Treating 'direct' transfer mode as in-place export.") 

2702 transfer = None 

2703 

2704 # Force the directory to be a URI object 

2705 directoryUri: ResourcePath | None = None 

2706 if directory is not None: 

2707 directoryUri = ResourcePath(directory, forceDirectory=True) 

2708 

2709 if transfer is not None and directoryUri is not None and not directoryUri.exists(): 

2710 # mypy needs the second test 

2711 raise FileNotFoundError(f"Export location {directory} does not exist") 

2712 

2713 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2714 for ref in progress.wrap(refs, "Exporting dataset files"): 

2715 fileLocations = self._get_dataset_locations_info(ref) 

2716 if not fileLocations: 

2717 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2718 # For now we can not export disassembled datasets 

2719 if len(fileLocations) > 1: 

2720 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2721 location, storedFileInfo = fileLocations[0] 

2722 

2723 pathInStore = location.pathInStore.path 

2724 if transfer is None: 

2725 # TODO: do we also need to return the readStorageClass somehow? 

2726 # We will use the path in store directly. If this is an 

2727 # absolute URI, preserve it. 

2728 if location.pathInStore.isabs(): 

2729 pathInStore = str(location.uri) 

2730 elif transfer == "direct": 

2731 # Use full URIs to the remote store in the export 

2732 pathInStore = str(location.uri) 

2733 else: 

2734 # mypy needs help 

2735 assert directoryUri is not None, "directoryUri must be defined to get here" 

2736 storeUri = ResourcePath(location.uri, forceDirectory=False) 

2737 

2738 # if the datastore has an absolute URI to a resource, we 

2739 # have two options: 

2740 # 1. Keep the absolute URI in the exported YAML 

2741 # 2. Allocate a new name in the local datastore and transfer 

2742 # it. 

2743 # For now go with option 2 

2744 if location.pathInStore.isabs(): 

2745 template = self.templates.getTemplate(ref) 

2746 newURI = ResourcePath(template.format(ref), forceAbsolute=False, forceDirectory=False) 

2747 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2748 

2749 exportUri = directoryUri.join(pathInStore) 

2750 exportUri.transfer_from(storeUri, transfer=transfer) 

2751 

2752 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2753 

2754 @staticmethod 

2755 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None: 

2756 """Compute the checksum of the supplied file. 

2757 

2758 Parameters 

2759 ---------- 

2760 uri : `lsst.resources.ResourcePath` 

2761 Name of resource to calculate checksum from. 

2762 algorithm : `str`, optional 

2763 Name of algorithm to use. Must be one of the algorithms supported 

2764 by :py:class`hashlib`. 

2765 block_size : `int` 

2766 Number of bytes to read from file at one time. 

2767 

2768 Returns 

2769 ------- 

2770 hexdigest : `str` 

2771 Hex digest of the file. 

2772 

2773 Notes 

2774 ----- 

2775 Currently returns None if the URI is for a remote resource. 

2776 """ 

2777 if algorithm not in hashlib.algorithms_guaranteed: 

2778 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib") 

2779 

2780 if not uri.isLocal: 

2781 return None 

2782 

2783 hasher = hashlib.new(algorithm) 

2784 

2785 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f: 

2786 for chunk in iter(lambda: f.read(block_size), b""): 

2787 hasher.update(chunk) 

2788 

2789 return hasher.hexdigest() 

2790 

2791 def needs_expanded_data_ids( 

2792 self, 

2793 transfer: str | None, 

2794 entity: DatasetRef | DatasetType | StorageClass | None = None, 

2795 ) -> bool: 

2796 # Docstring inherited. 

2797 # This _could_ also use entity to inspect whether the filename template 

2798 # involves placeholders other than the required dimensions for its 

2799 # dataset type, but that's not necessary for correctness; it just 

2800 # enables more optimizations (perhaps only in theory). 

2801 return transfer not in ("direct", None) 

2802 

2803 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2804 # Docstring inherited from the base class. 

2805 record_data = data.get(self.name) 

2806 if not record_data: 

2807 return 

2808 

2809 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records) 

2810 

2811 # TODO: Verify that there are no unexpected table names in the dict? 

2812 unpacked_records = [] 

2813 for dataset_id, dataset_data in record_data.records.items(): 

2814 records = dataset_data.get(self._table.name) 

2815 if records: 

2816 for info in records: 

2817 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

2818 unpacked_records.append(info.to_record(dataset_id=dataset_id)) 

2819 if unpacked_records: 

2820 self._table.insert(*unpacked_records, transaction=self._transaction) 

2821 

2822 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

2823 # Docstring inherited from the base class. 

2824 exported_refs = list(self._bridge.check(refs)) 

2825 ids = {ref.id for ref in exported_refs} 

2826 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids} 

2827 for row in self._table.fetch(dataset_id=ids): 

2828 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

2829 dataset_records = records.setdefault(row["dataset_id"], {}) 

2830 dataset_records.setdefault(self._table.name, []).append(info) 

2831 

2832 record_data = DatastoreRecordData(records=records) 

2833 return {self.name: record_data} 

2834 

2835 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

2836 # Docstring inherited from the base class. 

2837 self._retrieve_dataset_method = method 

2838 

2839 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef: 

2840 """Update dataset reference to use the storage class from registry.""" 

2841 if self._retrieve_dataset_method is None: 

2842 # We could raise an exception here but unit tests do not define 

2843 # this method. 

2844 return ref 

2845 dataset_type = self._retrieve_dataset_method(ref.datasetType.name) 

2846 if dataset_type is not None: 

2847 ref = ref.overrideStorageClass(dataset_type.storageClass) 

2848 return ref 

2849 

2850 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]: 

2851 # Docstring inherited from the base class. 

2852 return {self._opaque_table_name: DatastoreOpaqueTable(self.makeTableSpec(ddl.GUID), StoredFileInfo)}