Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%

923 statements  


1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Generic file-based datastore code.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("FileDatastore",) 

33 

34import contextlib 

35import hashlib 

36import logging 

37from collections import defaultdict 

38from collections.abc import Callable, Iterable, Mapping, Sequence 

39from typing import TYPE_CHECKING, Any, ClassVar, cast 

40 

41from lsst.daf.butler import ( 

42 Config, 

43 DatasetId, 

44 DatasetRef, 

45 DatasetType, 

46 DatasetTypeNotSupportedError, 

47 FileDataset, 

48 FileDescriptor, 

49 Formatter, 

50 FormatterFactory, 

51 Location, 

52 LocationFactory, 

53 Progress, 

54 StorageClass, 

55 ddl, 

56) 

57from lsst.daf.butler.datastore import ( 

58 DatasetRefURIs, 

59 Datastore, 

60 DatastoreConfig, 

61 DatastoreOpaqueTable, 

62 DatastoreValidationError, 

63) 

64from lsst.daf.butler.datastore.cache_manager import ( 

65 AbstractDatastoreCacheManager, 

66 DatastoreCacheManager, 

67 DatastoreDisabledCacheManager, 

68) 

69from lsst.daf.butler.datastore.composites import CompositesMap 

70from lsst.daf.butler.datastore.file_templates import FileTemplates, FileTemplateValidationError 

71from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore 

72from lsst.daf.butler.datastore.record_data import DatastoreRecordData 

73from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo, StoredFileInfo 

74from lsst.daf.butler.datastores.file_datastore.get import ( 

75 DatasetLocationInformation, 

76 DatastoreFileGetInformation, 

77 generate_datastore_get_information, 

78 get_dataset_as_python_object_from_get_info, 

79) 

80from lsst.daf.butler.datastores.fileDatastoreClient import ( 

81 FileDatastoreGetPayload, 

82 FileDatastoreGetPayloadFileInfo, 

83) 

84from lsst.daf.butler.registry.interfaces import ( 

85 DatabaseInsertMode, 

86 DatastoreRegistryBridge, 

87 FakeDatasetRef, 

88 ReadOnlyDatabaseError, 

89) 

90from lsst.daf.butler.repo_relocation import replaceRoot 

91from lsst.daf.butler.utils import transactional 

92from lsst.resources import ResourcePath, ResourcePathExpression 

93from lsst.utils.introspection import get_class_of 

94from lsst.utils.iteration import chunk_iterable 

95 

96# For VERBOSE logging usage. 

97from lsst.utils.logging import VERBOSE, getLogger 

98from sqlalchemy import BigInteger, String 

99 

100if TYPE_CHECKING: 

101 from lsst.daf.butler import LookupKey 

102 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

103 

104log = getLogger(__name__) 

105 

106 

107class _IngestPrepData(Datastore.IngestPrepData): 

108 """Helper class for FileDatastore ingest implementation. 

109 

110 Parameters 

111 ---------- 

112 datasets : `~collections.abc.Iterable` of `FileDataset` 

113 Files to be ingested by this datastore. 

114 """ 

115 

116 def __init__(self, datasets: Iterable[FileDataset]): 

117 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

118 self.datasets = datasets 

119 

120 

121class FileDatastore(GenericBaseDatastore[StoredFileInfo]): 

122 """Generic Datastore for file-based implementations. 

123 

124 Should always be sub-classed since key abstract methods are missing. 

125 

126 Parameters 

127 ---------- 

128 config : `DatastoreConfig` or `str` 

129 Configuration as either a `Config` object or URI to file. 

130 bridgeManager : `DatastoreRegistryBridgeManager` 

131 Object that manages the interface between `Registry` and datastores. 

132 root : `ResourcePath` 

133 Root directory URI of this `Datastore`. 

134 formatterFactory : `FormatterFactory` 

135 Factory for creating instances of formatters. 

136 templates : `FileTemplates` 

137 File templates that can be used by this `Datastore`. 

138 composites : `CompositesMap` 

139 Determines whether a dataset should be disassembled on put. 

140 trustGetRequest : `bool` 

141 Determines whether we can fall back to configuration if a requested

142 dataset is not known to registry. 

143 

144 Raises 

145 ------ 

146 ValueError 

147 If root location does not exist and ``create`` is `False` in the 

148 configuration. 

149 """ 

150 

151 defaultConfigFile: ClassVar[str | None] = None 

152 """Path to configuration defaults. Accessed within the ``config`` resource 

153 or relative to a search path. Can be None if no defaults specified. 

154 """ 

155 

156 root: ResourcePath 

157 """Root directory URI of this `Datastore`.""" 

158 

159 locationFactory: LocationFactory 

160 """Factory for creating locations relative to the datastore root.""" 

161 

162 formatterFactory: FormatterFactory 

163 """Factory for creating instances of formatters.""" 

164 

165 templates: FileTemplates 

166 """File templates that can be used by this `Datastore`.""" 

167 

168 composites: CompositesMap 

169 """Determines whether a dataset should be disassembled on put.""" 

170 

171 defaultConfigFile = "datastores/fileDatastore.yaml" 

172 """Path to configuration defaults. Accessed within the ``config`` resource 

173 or relative to a search path. Can be None if no defaults specified. 

174 """ 

175 

176 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None 

177 """Callable that is used in trusted mode to retrieve registry definition 

178 of a named dataset type. 

179 """ 

180 

181 @classmethod 

182 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

183 """Set any filesystem-dependent config options for this Datastore to 

184 be appropriate for a new empty repository with the given root. 

185 

186 Parameters 

187 ---------- 

188 root : `str` 

189 URI to the root of the data repository. 

190 config : `Config` 

191 A `Config` to update. Only the subset understood by 

192 this component will be updated. Will not expand 

193 defaults. 

194 full : `Config` 

195 A complete config with all defaults expanded that can be 

196 converted to a `DatastoreConfig`. Read-only and will not be 

197 modified by this method. 

198 Repository-specific options that should not be obtained 

199 from defaults when Butler instances are constructed 

200 should be copied from ``full`` to ``config``. 

201 overwrite : `bool`, optional 

202 If `False`, do not modify a value in ``config`` if the value 

203 already exists. Default is always to overwrite with the provided 

204 ``root``. 

205 

206 Notes 

207 ----- 

208 If a keyword is explicitly defined in the supplied ``config`` it 

209 will not be overridden by this method if ``overwrite`` is `False`. 

210 This allows explicit values set in external configs to be retained. 

211 """ 

212 Config.updateParameters( 

213 DatastoreConfig, 

214 config, 

215 full, 

216 toUpdate={"root": root}, 

217 toCopy=("cls", ("records", "table")), 

218 overwrite=overwrite, 

219 ) 
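
# Illustrative sketch only (not called from this module): repository
# creation code is expected to invoke this hook roughly as follows. The
# root URI and the fully-expanded ``full`` config shown here are
# hypothetical.
#
#     config = Config()
#     FileDatastore.setConfigRoot("file:///data/repo", config, full)
#     # The datastore section of ``config`` now has ``root`` set to the new
#     # repo root; "cls" and the ("records", "table") entries are copied
#     # over from ``full``.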

220 

221 @classmethod 

222 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

223 return ddl.TableSpec( 

224 fields=[ 

225 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

226 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

227 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

228 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

229 # Use empty string to indicate no component 

230 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

231 # TODO: should checksum be Base64Bytes instead? 

232 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

233 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

234 ], 

235 unique=frozenset(), 

236 indexes=[ddl.IndexSpec("path")], 

237 ) 

238 

239 def __init__( 

240 self, 

241 config: DatastoreConfig, 

242 bridgeManager: DatastoreRegistryBridgeManager, 

243 root: ResourcePath, 

244 formatterFactory: FormatterFactory, 

245 templates: FileTemplates, 

246 composites: CompositesMap, 

247 trustGetRequest: bool, 

248 ): 

249 super().__init__(config, bridgeManager) 

250 self.root = ResourcePath(root) 

251 self.formatterFactory = formatterFactory 

252 self.templates = templates 

253 self.composites = composites 

254 self.trustGetRequest = trustGetRequest 

255 

256 # Name ourselves either using an explicit name or a name 

257 # derived from the (unexpanded) root 

258 if "name" in self.config: 

259 self.name = self.config["name"] 

260 else: 

261 # We use the unexpanded root in the name to indicate that this 

262 # datastore can be moved without having to update registry. 

263 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 
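# Illustrative example of the derived name (the configured, unexpanded
# root value shown here is hypothetical):
#     "FileDatastore@<butlerRoot>/datastore"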

264 

265 self.locationFactory = LocationFactory(self.root) 

266 

267 self._opaque_table_name = self.config["records", "table"] 

268 try: 

269 # Storage of paths and formatters, keyed by dataset_id 

270 self._table = bridgeManager.opaque.register( 

271 self._opaque_table_name, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

272 ) 

273 # Interface to Registry. 

274 self._bridge = bridgeManager.register(self.name) 

275 except ReadOnlyDatabaseError: 

276 # If the database is read only and we just tried and failed to 

277 # create a table, it means someone is trying to create a read-only 

278 # butler client for an empty repo. That should be okay, as long 

279 # as they then try to get any datasets before some other client 

280 # creates the table. Chances are they're just validating 

281 # configuration. 

282 pass 

283 

284 # Determine whether checksums should be used - default to False 

285 self.useChecksum = self.config.get("checksum", False) 

286 

287 # Create a cache manager 

288 self.cacheManager: AbstractDatastoreCacheManager 

289 if "cached" in self.config: 

290 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

291 else: 

292 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

293 

294 @classmethod 

295 def _create_from_config( 

296 cls, 

297 config: DatastoreConfig, 

298 bridgeManager: DatastoreRegistryBridgeManager, 

299 butlerRoot: ResourcePathExpression | None, 

300 ) -> FileDatastore: 

301 if "root" not in config: 

302 raise ValueError("No root directory specified in configuration") 

303 

304 # Support repository relocation in config 

305 # Existence of self.root is checked in subclass 

306 root = ResourcePath(replaceRoot(config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True) 

307 

308 # Now associate formatters with storage classes 

309 formatterFactory = FormatterFactory() 

310 formatterFactory.registerFormatters(config["formatters"], universe=bridgeManager.universe) 

311 

312 # Read the file naming templates 

313 templates = FileTemplates(config["templates"], universe=bridgeManager.universe) 

314 

315 # See if composites should be disassembled 

316 composites = CompositesMap(config["composites"], universe=bridgeManager.universe) 

317 

318 # Determine whether we can fall back to configuration if a 

319 # requested dataset is not known to registry 

320 trustGetRequest = config.get("trust_get_request", False) 

321 

322 self = FileDatastore( 

323 config, bridgeManager, root, formatterFactory, templates, composites, trustGetRequest 

324 ) 

325 

326 # Check existence and create directory structure if necessary 

327 if not self.root.exists(): 

328 if "create" not in self.config or not self.config["create"]: 

329 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

330 try: 

331 self.root.mkdir() 

332 except Exception as e: 

333 raise ValueError( 

334 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

335 ) from e 

336 

337 return self 

338 

339 def clone(self, bridgeManager: DatastoreRegistryBridgeManager) -> Datastore: 

340 return FileDatastore( 

341 self.config, 

342 bridgeManager, 

343 self.root, 

344 self.formatterFactory, 

345 self.templates, 

346 self.composites, 

347 self.trustGetRequest, 

348 ) 

349 

350 def __str__(self) -> str: 

351 return str(self.root) 

352 

353 @property 

354 def bridge(self) -> DatastoreRegistryBridge: 

355 return self._bridge 

356 

357 @property 

358 def roots(self) -> dict[str, ResourcePath | None]: 

359 # Docstring inherited. 

360 return {self.name: self.root} 

361 

362 def _artifact_exists(self, location: Location) -> bool: 

363 """Check that an artifact exists in this datastore at the specified 

364 location. 

365 

366 Parameters 

367 ---------- 

368 location : `Location` 

369 Expected location of the artifact associated with this datastore. 

370 

371 Returns 

372 ------- 

373 exists : `bool` 

374 `True` if the location can be found, `False` otherwise.

375 """ 

376 log.debug("Checking if resource exists: %s", location.uri) 

377 return location.uri.exists() 

378 

379 def _delete_artifact(self, location: Location) -> None: 

380 """Delete the artifact from the datastore. 

381 

382 Parameters 

383 ---------- 

384 location : `Location` 

385 Location of the artifact associated with this datastore. 

386 """ 

387 if location.pathInStore.isabs(): 

388 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

389 

390 try: 

391 location.uri.remove() 

392 except FileNotFoundError: 

393 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

394 raise 

395 except Exception as e: 

396 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

397 raise 

398 log.debug("Successfully deleted file: %s", location.uri) 

399 

400 def addStoredItemInfo( 

401 self, 

402 refs: Iterable[DatasetRef], 

403 infos: Iterable[StoredFileInfo], 

404 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT, 

405 ) -> None: 

406 """Record internal storage information associated with one or more 

407 datasets. 

408 

409 Parameters 

410 ---------- 

411 refs : sequence of `DatasetRef` 

412 The datasets that have been stored. 

413 infos : sequence of `StoredFileInfo`

414 Metadata associated with the stored datasets. 

415 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode` 

416 Mode to use to insert the new records into the table. The 

417 options are ``INSERT`` (error if pre-existing), ``REPLACE`` 

418 (replace content with new values), and ``ENSURE`` (skip if the row 

419 already exists). 

420 """ 

421 records = [ 

422 info.rebase(ref).to_record(dataset_id=ref.id) for ref, info in zip(refs, infos, strict=True) 

423 ] 

424 match insert_mode: 

425 case DatabaseInsertMode.INSERT: 

426 self._table.insert(*records, transaction=self._transaction) 

427 case DatabaseInsertMode.ENSURE: 

428 self._table.ensure(*records, transaction=self._transaction) 

429 case DatabaseInsertMode.REPLACE: 

430 self._table.replace(*records, transaction=self._transaction) 

431 case _: 

432 raise ValueError(f"Unknown insert mode of '{insert_mode}'") 
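
# Illustrative usage sketch (hypothetical: ``ref`` is a resolved DatasetRef
# already known to this datastore and ``info`` is the StoredFileInfo
# produced when its artifact was written):
#
#     datastore.addStoredItemInfo(
#         [ref], [info], insert_mode=DatabaseInsertMode.REPLACE
#     )
#     # REPLACE overwrites any existing record for the dataset/component,
#     # ENSURE keeps an existing row, and INSERT errors if one exists.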

433 

434 def getStoredItemsInfo( 

435 self, ref: DatasetIdRef, ignore_datastore_records: bool = False 

436 ) -> list[StoredFileInfo]: 

437 """Retrieve information associated with files stored in this 

438 `Datastore` associated with this dataset ref. 

439 

440 Parameters 

441 ---------- 

442 ref : `DatasetRef` 

443 The dataset that is to be queried. 

444 ignore_datastore_records : `bool` 

445 If `True` then do not use datastore records stored in refs. 

446 

447 Returns 

448 ------- 

449 items : `list` [`StoredFileInfo`]

450 Stored information about the files and formatters associated

451 with this dataset. Only one file will be returned

452 if the dataset has not been disassembled. Can return an empty 

453 list if no matching datasets can be found. 

454 """ 

455 # Try to get them from the ref first. 

456 if ref._datastore_records is not None and not ignore_datastore_records: 

457 if (ref_records := ref._datastore_records.get(self._table.name)) is not None: 

458 # Need to make sure they have correct type. 

459 for record in ref_records: 

460 if not isinstance(record, StoredFileInfo): 

461 raise TypeError(f"Datastore record has unexpected type {record.__class__.__name__}") 

462 return cast(list[StoredFileInfo], ref_records) 

463 

464 # Look for the dataset_id -- there might be multiple matches 

465 # if we have disassembled the dataset. 

466 records = self._table.fetch(dataset_id=ref.id) 

467 return [StoredFileInfo.from_record(record) for record in records] 

468 

469 def _register_datasets( 

470 self, 

471 refsAndInfos: Iterable[tuple[DatasetRef, StoredFileInfo]], 

472 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT, 

473 ) -> None: 

474 """Update registry to indicate that one or more datasets have been 

475 stored. 

476 

477 Parameters 

478 ---------- 

479 refsAndInfos : sequence of `tuple` [`DatasetRef`,

480 `StoredDatastoreItemInfo`] 

481 Datasets to register and the internal datastore metadata associated 

482 with them. 

483 insert_mode : `DatabaseInsertMode`, optional

484 Indicate whether the new records should be new (``INSERT``, the

485 default), allowed to exist already (``ENSURE``), or replaced if

486 already present (``REPLACE``).

487 """ 

488 expandedRefs: list[DatasetRef] = [] 

489 expandedItemInfos: list[StoredFileInfo] = [] 

490 

491 for ref, itemInfo in refsAndInfos: 

492 expandedRefs.append(ref) 

493 expandedItemInfos.append(itemInfo) 

494 

495 # Dataset location only cares about registry ID so if we have 

496 # disassembled in datastore we have to deduplicate. Since they 

497 # will have different datasetTypes we can't use a set 

498 registryRefs = {r.id: r for r in expandedRefs} 

499 if insert_mode == DatabaseInsertMode.INSERT: 

500 self.bridge.insert(registryRefs.values()) 

501 else: 

502 # There are only two columns and all that matters is the 

503 # dataset ID. 

504 self.bridge.ensure(registryRefs.values()) 

505 self.addStoredItemInfo(expandedRefs, expandedItemInfos, insert_mode=insert_mode) 
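
# Illustrative sketch (hypothetical refs): for a disassembled composite the
# component refs share one dataset ID, so the bridge sees a single entry
# while the opaque table receives one record per component.
#
#     pairs = [(comp_ref, comp_info) for comp_ref, comp_info in components]
#     datastore._register_datasets(pairs, insert_mode=DatabaseInsertMode.INSERT)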

506 

507 def _get_stored_records_associated_with_refs( 

508 self, refs: Iterable[DatasetIdRef], ignore_datastore_records: bool = False 

509 ) -> dict[DatasetId, list[StoredFileInfo]]: 

510 """Retrieve all records associated with the provided refs. 

511 

512 Parameters 

513 ---------- 

514 refs : iterable of `DatasetIdRef` 

515 The refs for which records are to be retrieved. 

516 ignore_datastore_records : `bool` 

517 If `True` then do not use datastore records stored in refs. 

518 

519 Returns 

520 ------- 

521 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

522 The matching records indexed by the ref ID. The number of entries 

523 in the dict can be smaller than the number of requested refs. 

524 """ 

525 # Check datastore records in refs first. 

526 records_by_ref: defaultdict[DatasetId, list[StoredFileInfo]] = defaultdict(list) 

527 refs_with_no_records = [] 

528 for ref in refs: 

529 if ignore_datastore_records or ref._datastore_records is None: 

530 refs_with_no_records.append(ref) 

531 else: 

532 if (ref_records := ref._datastore_records.get(self._table.name)) is not None: 

533 # Need to make sure they have correct type. 

534 for ref_record in ref_records: 

535 if not isinstance(ref_record, StoredFileInfo): 

536 raise TypeError( 

537 f"Datastore record has unexpected type {ref_record.__class__.__name__}" 

538 ) 

539 records_by_ref[ref.id].append(ref_record) 

540 

541 # If there were any refs without datastore records, check opaque table. 

542 records = self._table.fetch(dataset_id=[ref.id for ref in refs_with_no_records]) 

543 

544 # Uniqueness is dataset_id + component so can have multiple records 

545 # per ref. 

546 for record in records: 

547 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

548 return records_by_ref 

549 

550 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]: 

551 """Return paths and associated dataset refs. 

552 

553 Parameters 

554 ---------- 

555 paths : `list` of `str` or `lsst.resources.ResourcePath` 

556 All the paths to include in search. 

557 

558 Returns 

559 ------- 

560 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

561 Mapping of each path to a set of associated database IDs. 

562 """ 

563 records = self._table.fetch(path=[str(path) for path in paths]) 

564 result = defaultdict(set) 

565 for row in records: 

566 result[row["path"]].add(row["dataset_id"]) 

567 return result 
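
# Illustrative result shape (hypothetical path and IDs; `DatasetId` values
# are UUIDs in current butler versions):
#
#     datastore._refs_associated_with_artifacts(["dir/file.fits"])
#     # -> {"dir/file.fits": {UUID("..."), UUID("...")}}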

568 

569 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]: 

570 """Return all dataset refs associated with the supplied path. 

571 

572 Parameters 

573 ---------- 

574 pathInStore : `lsst.resources.ResourcePath` 

575 Path of interest in the data store. 

576 

577 Returns 

578 ------- 

579 ids : `set` [`DatasetId`]

580 All `DatasetRef` IDs associated with this path. 

581 """ 

582 records = list(self._table.fetch(path=str(pathInStore))) 

583 ids = {r["dataset_id"] for r in records} 

584 return ids 

585 

586 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

587 """Remove information about the file associated with this dataset. 

588 

589 Parameters 

590 ---------- 

591 ref : `DatasetRef` 

592 The dataset that has been removed. 

593 """ 

594 # Note that this method is not actually used by this implementation;

595 # we depend on the bridge to delete opaque records. But there are some

596 # tests that check that this method works, so we keep it for now. 

597 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

598 

599 def _get_dataset_locations_info( 

600 self, ref: DatasetIdRef, ignore_datastore_records: bool = False 

601 ) -> list[DatasetLocationInformation]: 

602 r"""Find all the `Location`\ s of the requested dataset in the 

603 `Datastore` and the associated stored file information. 

604 

605 Parameters 

606 ---------- 

607 ref : `DatasetRef` 

608 Reference to the required `Dataset`. 

609 ignore_datastore_records : `bool` 

610 If `True` then do not use datastore records stored in refs. 

611 

612 Returns 

613 ------- 

614 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

615 Location of the dataset within the datastore and 

616 stored information about each file and its formatter. 

617 """ 

618 # Get the file information (this will fail if no file) 

619 records = self.getStoredItemsInfo(ref, ignore_datastore_records) 

620 

621 # Use the path to determine the location -- we need to take 

622 # into account absolute URIs in the datastore record 

623 return [(r.file_location(self.locationFactory), r) for r in records] 

624 

625 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

626 """Check that there is only one dataset associated with the 

627 specified artifact. 

628 

629 Parameters 

630 ---------- 

631 ref : `DatasetRef` or `FakeDatasetRef` 

632 Dataset to be removed. 

633 location : `Location` 

634 The location of the artifact to be removed. 

635 

636 Returns 

637 ------- 

638 can_remove : `bool`

639 `True` if the artifact can be safely removed.

640 """ 

641 # Can't ever delete absolute URIs. 

642 if location.pathInStore.isabs(): 

643 return False 

644 

645 # Get all entries associated with this path 

646 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

647 if not allRefs: 

648 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

649 

650 # Remove this ref from the set of all refs and if there is nothing

651 # left then we can delete the artifact.

652 remainingRefs = allRefs - {ref.id} 

653 

654 if remainingRefs: 

655 return False 

656 return True 

657 

658 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]: 

659 """Predict the location and related file information of the requested 

660 dataset in this datastore. 

661 

662 Parameters 

663 ---------- 

664 ref : `DatasetRef` 

665 Reference to the required `Dataset`. 

666 

667 Returns 

668 ------- 

669 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

670 Expected Location of the dataset within the datastore and 

671 placeholder information about each file and its formatter. 

672 

673 Notes 

674 ----- 

675 Uses the current configuration to determine how we would expect the 

676 datastore files to have been written if we couldn't ask registry. 

677 This is safe so long as there has been no change to datastore 

678 configuration between writing the dataset and wanting to read it. 

679 Will not work for files that have been ingested without using the 

680 standard file template or default formatter. 

681 """ 

682 # If we have a component ref we always need to ask the questions 

683 # of the composite. If the composite is disassembled this routine 

684 # should return all components. If the composite was not 

685 # disassembled the composite is what is stored regardless of 

686 # component request. Note that if the caller has disassembled 

687 # a composite there is no way for this guess to know that 

688 # without trying both the composite and component ref and seeing 

689 # if there is something at the component Location even without 

690 # disassembly being enabled. 

691 if ref.datasetType.isComponent(): 

692 ref = ref.makeCompositeRef() 

693 

694 # See if the ref is a composite that should be disassembled 

695 doDisassembly = self.composites.shouldBeDisassembled(ref) 

696 

697 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = [] 

698 

699 if doDisassembly: 

700 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

701 compRef = ref.makeComponentRef(component) 

702 location, formatter = self._determine_put_formatter_location(compRef) 

703 all_info.append((location, formatter, componentStorage, component)) 

704 

705 else: 

706 # Always use the composite ref if no disassembly 

707 location, formatter = self._determine_put_formatter_location(ref) 

708 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

709 

710 # Convert the list of tuples to have StoredFileInfo as second element 

711 return [ 

712 ( 

713 location, 

714 StoredFileInfo( 

715 formatter=formatter, 

716 path=location.pathInStore.path, 

717 storageClass=storageClass, 

718 component=component, 

719 checksum=None, 

720 file_size=-1, 

721 ), 

722 ) 

723 for location, formatter, storageClass, component in all_info 

724 ] 

725 

726 def _prepare_for_direct_get( 

727 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None 

728 ) -> list[DatastoreFileGetInformation]: 

729 """Check parameters for ``get`` and obtain formatter and 

730 location. 

731 

732 Parameters 

733 ---------- 

734 ref : `DatasetRef` 

735 Reference to the required Dataset. 

736 parameters : `dict` 

737 `StorageClass`-specific parameters that specify, for example, 

738 a slice of the dataset to be loaded. 

739 

740 Returns 

741 ------- 

742 getInfo : `list` [`DatastoreFileGetInformation`] 

743 Parameters needed to retrieve each file. 

744 """ 

745 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

746 

747 # The storage class we want to use eventually 

748 refStorageClass = ref.datasetType.storageClass 

749 

750 # For trusted mode need to reset storage class. 

751 ref = self._cast_storage_class(ref) 

752 

753 # Get file metadata and internal metadata 

754 fileLocations = self._get_dataset_locations_info(ref) 

755 if not fileLocations: 

756 if not self.trustGetRequest: 

757 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

758 # Assume the dataset is where we think it should be 

759 fileLocations = self._get_expected_dataset_locations_info(ref) 

760 

761 if len(fileLocations) > 1: 

762 # If trust is involved it is possible that there will be 

763 # components listed here that do not exist in the datastore. 

764 # Explicitly check for file artifact existence and filter out any 

765 # that are missing. 

766 if self.trustGetRequest: 

767 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

768 

769 # For now complain only if we have no components at all. A single

770 # missing component is probably a problem but we can punt that to

771 # the assembler.

772 if not fileLocations: 

773 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

774 

775 return generate_datastore_get_information( 

776 fileLocations, 

777 readStorageClass=refStorageClass, 

778 ref=ref, 

779 parameters=parameters, 

780 ) 

781 

782 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]: 

783 """Check the arguments for ``put`` and obtain formatter and 

784 location. 

785 

786 Parameters 

787 ---------- 

788 inMemoryDataset : `object` 

789 The dataset to store. 

790 ref : `DatasetRef` 

791 Reference to the associated Dataset. 

792 

793 Returns 

794 ------- 

795 location : `Location` 

796 The location to write the dataset. 

797 formatter : `Formatter` 

798 The `Formatter` to use to write the dataset. 

799 

800 Raises 

801 ------ 

802 TypeError 

803 Supplied object and storage class are inconsistent. 

804 DatasetTypeNotSupportedError 

805 The associated `DatasetType` is not handled by this datastore. 

806 """ 

807 self._validate_put_parameters(inMemoryDataset, ref) 

808 return self._determine_put_formatter_location(ref) 

809 

810 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]: 

811 """Calculate the formatter and output location to use for put. 

812 

813 Parameters 

814 ---------- 

815 ref : `DatasetRef` 

816 Reference to the associated Dataset. 

817 

818 Returns 

819 ------- 

820 location : `Location` 

821 The location to write the dataset. 

822 formatter : `Formatter` 

823 The `Formatter` to use to write the dataset. 

824 """ 

825 # Work out output file name 

826 try: 

827 template = self.templates.getTemplate(ref) 

828 except KeyError as e: 

829 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

830 

831 # Validate the template to protect against filenames from different 

832 # dataIds returning the same and causing overwrite confusion. 

833 template.validateTemplate(ref) 

834 

835 location = self.locationFactory.fromPath(template.format(ref), trusted_path=True) 

836 

837 # Get the formatter based on the storage class 

838 storageClass = ref.datasetType.storageClass 

839 try: 

840 formatter = self.formatterFactory.getFormatter( 

841 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

842 ) 

843 except KeyError as e: 

844 raise DatasetTypeNotSupportedError( 

845 f"Unable to find formatter for {ref} in datastore {self.name}" 

846 ) from e 

847 

848 # Now that we know the formatter, update the location 

849 location = formatter.makeUpdatedLocation(location) 

850 

851 return location, formatter 

852 

853 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None: 

854 # Docstring inherited from base class 

855 if transfer != "auto": 

856 return transfer 

857 

858 # See if the paths are within the datastore or not 

859 inside = [self._pathInStore(d.path) is not None for d in datasets] 

860 

861 if all(inside): 

862 transfer = None 

863 elif not any(inside): 

864 # Allow ResourcePath to use its own knowledge 

865 transfer = "auto" 

866 else: 

867 # This can happen when importing from a datastore that

868 # has had some datasets ingested using "direct" mode.

869 # Allow ResourcePath to sort it out but warn about it.

872 log.warning( 

873 "Some datasets are inside the datastore and some are outside. Using 'split' " 

874 "transfer mode. This assumes that the files outside the datastore are " 

875 "still accessible to the new butler since they will not be copied into " 

876 "the target datastore." 

877 ) 

878 transfer = "split" 

879 

880 return transfer 
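
# Illustrative decision table for transfer="auto" (paths hypothetical):
#
#     all FileDataset paths inside the datastore root  -> transfer = None
#     no paths inside the datastore root               -> transfer = "auto"
#     a mixture of inside and outside                  -> transfer = "split"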

881 

882 def _pathInStore(self, path: ResourcePathExpression) -> str | None: 

883 """Return path relative to datastore root. 

884 

885 Parameters 

886 ---------- 

887 path : `lsst.resources.ResourcePathExpression` 

888 Path to dataset. Can be an absolute URI. If relative, it is

889 assumed to be relative to the datastore root.

891 

892 Returns 

893 ------- 

894 inStore : `str` 

895 Path relative to datastore root. Returns `None` if the file is 

896 outside the root. 

897 """ 

898 # Relative path will always be relative to datastore 

899 pathUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False) 

900 return pathUri.relative_to(self.root) 
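
# Illustrative behaviour, assuming a hypothetical datastore root of
# "file:///repo/":
#
#     datastore._pathInStore("file:///repo/data/a.fits")   # -> "data/a.fits"
#     datastore._pathInStore("file:///elsewhere/a.fits")   # -> None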

901 

902 def _standardizeIngestPath( 

903 self, path: str | ResourcePath, *, transfer: str | None = None 

904 ) -> str | ResourcePath: 

905 """Standardize the path of a to-be-ingested file. 

906 

907 Parameters 

908 ---------- 

909 path : `str` or `lsst.resources.ResourcePath` 

910 Path of a file to be ingested. This parameter is not expected

911 to accept all the types that can be used to construct a

912 `~lsst.resources.ResourcePath`.

913 transfer : `str`, optional 

914 How (and whether) the dataset should be added to the datastore. 

915 See `ingest` for details of transfer modes. 

916 This implementation is provided only so 

917 `NotImplementedError` can be raised if the mode is not supported; 

918 actual transfers are deferred to `_extractIngestInfo`. 

919 

920 Returns 

921 ------- 

922 path : `str` or `lsst.resources.ResourcePath` 

923 New path in what the datastore considers standard form. If an 

924 absolute URI was given that will be returned unchanged. 

925 

926 Notes 

927 ----- 

928 Subclasses of `FileDatastore` can implement this method instead 

929 of `_prepIngest`. It should not modify the data repository or given 

930 file in any way. 

931 

932 Raises 

933 ------ 

934 NotImplementedError 

935 Raised if the datastore does not support the given transfer mode 

936 (including the case where ingest is not supported at all). 

937 FileNotFoundError 

938 Raised if one of the given files does not exist. 

939 """ 

940 if transfer not in (None, "direct", "split") + self.root.transferModes: 

941 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

942 

943 # A relative URI indicates relative to datastore root 

944 srcUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False) 

945 if not srcUri.isabs(): 

946 srcUri = self.root.join(path) 

947 

948 if not srcUri.exists(): 

949 raise FileNotFoundError( 

950 f"Resource at {srcUri} does not exist; note that paths to ingest " 

951 f"are assumed to be relative to {self.root} unless they are absolute." 

952 ) 

953 

954 if transfer is None: 

955 relpath = srcUri.relative_to(self.root) 

956 if not relpath: 

957 raise RuntimeError( 

958 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

959 ) 

960 

961 # Return the relative path within the datastore for internal 

962 # transfer 

963 path = relpath 

964 

965 return path 

966 

967 def _extractIngestInfo( 

968 self, 

969 path: ResourcePathExpression, 

970 ref: DatasetRef, 

971 *, 

972 formatter: Formatter | type[Formatter], 

973 transfer: str | None = None, 

974 record_validation_info: bool = True, 

975 ) -> StoredFileInfo: 

976 """Relocate (if necessary) and extract `StoredFileInfo` from a 

977 to-be-ingested file. 

978 

979 Parameters 

980 ---------- 

981 path : `lsst.resources.ResourcePathExpression` 

982 URI or path of a file to be ingested. 

983 ref : `DatasetRef` 

984 Reference for the dataset being ingested. Guaranteed to have 

985 ``dataset_id`` not `None`.

986 formatter : `type` or `Formatter` 

987 `Formatter` subclass to use for this dataset or an instance. 

988 transfer : `str`, optional 

989 How (and whether) the dataset should be added to the datastore. 

990 See `ingest` for details of transfer modes. 

991 record_validation_info : `bool`, optional 

992 If `True`, the default, the datastore can record validation 

993 information associated with the file. If `False` the datastore 

994 will not attempt to track any information such as checksums 

995 or file sizes. This can be useful if such information is tracked 

996 in an external system or if the file is to be compressed in place. 

997 It is up to the datastore whether this parameter is relevant. 

998 

999 Returns 

1000 ------- 

1001 info : `StoredFileInfo` 

1002 Internal datastore record for this file. This will be inserted by 

1003 the caller; the `_extractIngestInfo` is only responsible for 

1004 creating and populating the struct. 

1005 

1006 Raises 

1007 ------ 

1008 FileNotFoundError 

1009 Raised if one of the given files does not exist. 

1010 FileExistsError 

1011 Raised if transfer is not `None` but the (internal) location the 

1012 file would be moved to is already occupied. 

1013 """ 

1014 if self._transaction is None: 

1015 raise RuntimeError("Ingest called without transaction enabled") 

1016 

1017 # Create URI of the source path, do not need to force a relative 

1018 # path to absolute. 

1019 srcUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False) 

1020 

1021 # Track whether we have read the size of the source yet 

1022 have_sized = False 

1023 

1024 tgtLocation: Location | None 

1025 if transfer is None or transfer == "split": 

1026 # A relative path is assumed to be relative to the datastore 

1027 # in this context 

1028 if not srcUri.isabs(): 

1029 tgtLocation = self.locationFactory.fromPath(srcUri.ospath, trusted_path=False) 

1030 else: 

1031 # Work out the path in the datastore from an absolute URI 

1032 # This is required to be within the datastore. 

1033 pathInStore = srcUri.relative_to(self.root) 

1034 if pathInStore is None and transfer is None: 

1035 raise RuntimeError( 

1036 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

1037 ) 

1038 if pathInStore: 

1039 tgtLocation = self.locationFactory.fromPath(pathInStore, trusted_path=True) 

1040 elif transfer == "split": 

1041 # Outside the datastore but treat that as a direct ingest 

1042 # instead. 

1043 tgtLocation = None 

1044 else: 

1045 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

1046 elif transfer == "direct": 

1047 # Want to store the full URI to the resource directly in 

1048 # datastore. This is useful for referring to permanent archive 

1049 # storage for raw data. 

1050 # Trust that people know what they are doing. 

1051 tgtLocation = None 

1052 else: 

1053 # Work out the name we want this ingested file to have 

1054 # inside the datastore 

1055 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

1056 if not tgtLocation.uri.dirname().exists(): 

1057 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

1058 tgtLocation.uri.dirname().mkdir() 

1059 

1060 # if we are transferring from a local file to a remote location 

1061 # it may be more efficient to get the size and checksum of the 

1062 # local file rather than the transferred one 

1063 if record_validation_info and srcUri.isLocal: 

1064 size = srcUri.size() 

1065 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

1066 have_sized = True 

1067 

1068 # Transfer the resource to the destination. 

1069 # Allow overwrite of an existing file. This matches the behavior 

1070 # of datastore.put() in that it trusts that registry would not 

1071 # be asking to overwrite unless registry thought that the 

1072 # overwrite was allowed. 

1073 tgtLocation.uri.transfer_from( 

1074 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

1075 ) 

1076 

1077 if tgtLocation is None: 

1078 # This means we are using direct mode 

1079 targetUri = srcUri 

1080 targetPath = str(srcUri) 

1081 else: 

1082 targetUri = tgtLocation.uri 

1083 targetPath = tgtLocation.pathInStore.path 

1084 

1085 # the file should exist in the datastore now 

1086 if record_validation_info: 

1087 if not have_sized: 

1088 size = targetUri.size() 

1089 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

1090 else: 

1091 # Not recording any file information. 

1092 size = -1 

1093 checksum = None 

1094 

1095 return StoredFileInfo( 

1096 formatter=formatter, 

1097 path=targetPath, 

1098 storageClass=ref.datasetType.storageClass, 

1099 component=ref.datasetType.component(), 

1100 file_size=size, 

1101 checksum=checksum, 

1102 ) 

1103 

1104 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

1105 # Docstring inherited from Datastore._prepIngest. 

1106 filtered = [] 

1107 for dataset in datasets: 

1108 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

1109 if not acceptable: 

1110 continue 

1111 else: 

1112 dataset.refs = acceptable 

1113 if dataset.formatter is None: 

1114 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1115 else: 

1116 assert isinstance(dataset.formatter, type | str) 

1117 formatter_class = get_class_of(dataset.formatter) 

1118 if not issubclass(formatter_class, Formatter): 

1119 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1120 dataset.formatter = formatter_class 

1121 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1122 filtered.append(dataset) 

1123 return _IngestPrepData(filtered) 

1124 

1125 @transactional 

1126 def _finishIngest( 

1127 self, 

1128 prepData: Datastore.IngestPrepData, 

1129 *, 

1130 transfer: str | None = None, 

1131 record_validation_info: bool = True, 

1132 ) -> None: 

1133 # Docstring inherited from Datastore._finishIngest. 

1134 refsAndInfos = [] 

1135 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1136 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1137 # Do ingest as if the first dataset ref is associated with the file 

1138 info = self._extractIngestInfo( 

1139 dataset.path, 

1140 dataset.refs[0], 

1141 formatter=dataset.formatter, 

1142 transfer=transfer, 

1143 record_validation_info=record_validation_info, 

1144 ) 

1145 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1146 

1147 # In direct mode we can allow repeated ingests of the same thing 

1148 # if we are sure that the external dataset is immutable. We use 

1149 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are 

1150 # separated. 

1151 refs_and_infos_replace = [] 

1152 refs_and_infos_insert = [] 

1153 if transfer == "direct": 

1154 for entry in refsAndInfos: 

1155 if entry[0].id.version == 5: 

1156 refs_and_infos_replace.append(entry) 

1157 else: 

1158 refs_and_infos_insert.append(entry) 

1159 else: 

1160 refs_and_infos_insert = refsAndInfos 

1161 

1162 if refs_and_infos_insert: 

1163 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT) 

1164 if refs_and_infos_replace: 

1165 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE) 

1166 

1167 def _calculate_ingested_datastore_name( 

1168 self, 

1169 srcUri: ResourcePath, 

1170 ref: DatasetRef, 

1171 formatter: Formatter | type[Formatter] | None = None, 

1172 ) -> Location: 

1173 """Given a source URI and a DatasetRef, determine the name the 

1174 dataset will have inside datastore. 

1175 

1176 Parameters 

1177 ---------- 

1178 srcUri : `lsst.resources.ResourcePath` 

1179 URI to the source dataset file. 

1180 ref : `DatasetRef` 

1181 Ref associated with the newly-ingested dataset artifact. This 

1182 is used to determine the name within the datastore. 

1183 formatter : `Formatter` or Formatter class. 

1184 Formatter to use for validation. Can be a class or an instance. 

1185 No validation of the file extension is performed if the 

1186 ``formatter`` is `None`. This can be used if the caller knows 

1187 that the source URI and target URI will use the same formatter. 

1188 

1189 Returns 

1190 ------- 

1191 location : `Location` 

1192 Target location for the newly-ingested dataset. 

1193 """ 

1194 # Ingesting a file from outside the datastore. 

1195 # This involves a new name. 

1196 template = self.templates.getTemplate(ref) 

1197 location = self.locationFactory.fromPath(template.format(ref), trusted_path=True) 

1198 

1199 # Get the extension 

1200 ext = srcUri.getExtension() 

1201 

1202 # Update the destination to include that extension 

1203 location.updateExtension(ext) 

1204 

1205 # Ask the formatter to validate this extension 

1206 if formatter is not None: 

1207 formatter.validateExtension(location) 

1208 

1209 return location 
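
# Illustrative sketch (hypothetical template expansion): for a source URI
# "file:///staging/raw_0001.fits" whose template expands to "raw/r/raw_0001",
# the returned Location points at "<datastore root>/raw/r/raw_0001.fits";
# the source extension is preserved and, when a formatter is supplied,
# validated against it.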

1210 

1211 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1212 """Write out in memory dataset to datastore. 

1213 

1214 Parameters 

1215 ---------- 

1216 inMemoryDataset : `object` 

1217 Dataset to write to datastore. 

1218 ref : `DatasetRef` 

1219 Registry information associated with this dataset. 

1220 

1221 Returns 

1222 ------- 

1223 info : `StoredFileInfo` 

1224 Information describing the artifact written to the datastore. 

1225 """ 

1226 # May need to coerce the in memory dataset to the correct 

1227 # python type, but first we need to make sure the storage class 

1228 # reflects the one defined in the data repository. 

1229 ref = self._cast_storage_class(ref) 

1230 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1231 

1232 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1233 uri = location.uri 

1234 

1235 if not uri.dirname().exists(): 

1236 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1237 uri.dirname().mkdir() 

1238 

1239 if self._transaction is None: 

1240 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1241 

1242 def _removeFileExists(uri: ResourcePath) -> None: 

1243 """Remove a file and do not complain if it is not there. 

1244 

1245 This is important since a formatter might fail before the file 

1246 is written and we should not confuse people by writing spurious 

1247 error messages to the log. 

1248 """ 

1249 with contextlib.suppress(FileNotFoundError): 

1250 uri.remove() 

1251 

1252 # Register a callback to try to delete the uploaded data if 

1253 # something fails below 

1254 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1255 

1256 data_written = False 

1257 

1258 # For remote URIs some datasets can be serialized directly 

1259 # to bytes and sent to the remote datastore without writing a 

1260 # file. If the dataset is intended to be saved to the cache 

1261 # a file is always written and direct write to the remote 

1262 # datastore is bypassed. 

1263 if not uri.isLocal and not self.cacheManager.should_be_cached(ref): 

1264 # Remote URI that is not cached so can write directly. 

1265 try: 

1266 serializedDataset = formatter.toBytes(inMemoryDataset) 

1267 except NotImplementedError: 

1268 # Fallback to the file writing option. 

1269 pass 

1270 except Exception as e: 

1271 raise RuntimeError( 

1272 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1273 ) from e 

1274 else: 

1275 log.debug("Writing bytes directly to %s", uri) 

1276 uri.write(serializedDataset, overwrite=True) 

1277 log.debug("Successfully wrote bytes directly to %s", uri) 

1278 data_written = True 

1279 

1280 if not data_written: 

1281 # Did not write the bytes directly to object store so instead 

1282 # write to temporary file. Always write to a temporary even if 

1283 # using a local file system -- that gives us atomic writes. 

1284 # If a process is killed as the file is being written we do not 

1285 # want it to remain in the correct place but in a corrupt state.

1286 # For local files write to the output directory not temporary dir. 

1287 prefix = uri.dirname() if uri.isLocal else None 

1288 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri: 

1289 # Need to configure the formatter to write to a different 

1290 # location and that needs us to overwrite internals 

1291 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1292 with formatter._updateLocation(Location(None, temporary_uri)): 

1293 try: 

1294 formatter.write(inMemoryDataset) 

1295 except Exception as e: 

1296 raise RuntimeError( 

1297 f"Failed to serialize dataset {ref} of type" 

1298 f" {type(inMemoryDataset)} to " 

1299 f"temporary location {temporary_uri}" 

1300 ) from e 

1301 

1302 # Use move for a local file since that becomes an efficient 

1303 # os.rename. For remote resources we use copy to allow the 

1304 # file to be cached afterwards. 

1305 transfer = "move" if uri.isLocal else "copy" 

1306 

1307 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True) 

1308 

1309 if transfer == "copy": 

1310 # Cache if required 

1311 self.cacheManager.move_to_cache(temporary_uri, ref) 

1312 

1313 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1314 

1315 # URI is needed to resolve which ingest case we are dealing with.

1316 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1317 

1318 def knows(self, ref: DatasetRef) -> bool: 

1319 """Check if the dataset is known to the datastore. 

1320 

1321 Does not check for existence of any artifact. 

1322 

1323 Parameters 

1324 ---------- 

1325 ref : `DatasetRef` 

1326 Reference to the required dataset. 

1327 

1328 Returns 

1329 ------- 

1330 exists : `bool` 

1331 `True` if the dataset is known to the datastore. 

1332 """ 

1333 # We cannot trust datastore records from ref, as many unit tests delete 

1334 # datasets and check their existence. 

1335 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True) 

1336 if fileLocations: 

1337 return True 

1338 return False 

1339 

1340 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

1341 # Docstring inherited from the base class. 

1342 

1343 # The records themselves. Could be missing some entries. 

1344 records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) 

1345 

1346 return {ref: ref.id in records for ref in refs} 
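
# Illustrative usage (hypothetical refs):
#
#     known = datastore.knows_these([ref1, ref2])
#     # e.g. {ref1: True, ref2: False}; only datastore records are
#     # consulted, no artifact existence checks are performed.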

1347 

1348 def _process_mexists_records( 

1349 self, 

1350 id_to_ref: dict[DatasetId, DatasetRef], 

1351 records: dict[DatasetId, list[StoredFileInfo]], 

1352 all_required: bool, 

1353 artifact_existence: dict[ResourcePath, bool] | None = None, 

1354 ) -> dict[DatasetRef, bool]: 

1355 """Check given records for existence. 

1356 

1357 Helper function for `mexists()`. 

1358 

1359 Parameters 

1360 ---------- 

1361 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1362 Mapping of the dataset ID to the dataset ref itself. 

1363 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1364 Records as generally returned by 

1365 ``_get_stored_records_associated_with_refs``. 

1366 all_required : `bool` 

1367 Flag indicating whether all artifacts associated with a dataset

1368 ID must exist for that dataset to be reported as existing.

1369 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1370 Optional mapping of datastore artifact to existence. Updated by 

1371 this method with details of all artifacts tested. Can be `None` 

1372 if the caller is not interested. 

1373 

1374 Returns 

1375 ------- 

1376 existence : `dict` of [`DatasetRef`, `bool`] 

1377 Mapping from dataset to boolean indicating existence. 

1378 """ 

1379 # The URIs to be checked and a mapping of those URIs to 

1380 # the dataset ID. 

1381 uris_to_check: list[ResourcePath] = [] 

1382 location_map: dict[ResourcePath, DatasetId] = {} 

1383 

1384 location_factory = self.locationFactory 

1385 

1386 uri_existence: dict[ResourcePath, bool] = {} 

1387 for ref_id, infos in records.items(): 

1388 # Key is the dataset Id, value is list of StoredItemInfo 

1389 uris = [info.file_location(location_factory).uri for info in infos] 

1390 location_map.update({uri: ref_id for uri in uris}) 

1391 

1392 # Check the local cache directly for a dataset corresponding 

1393 # to the remote URI. 

1394 if self.cacheManager.file_count > 0: 

1395 ref = id_to_ref[ref_id] 

1396 for uri, storedFileInfo in zip(uris, infos, strict=True): 

1397 check_ref = ref 

1398 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1399 check_ref = ref.makeComponentRef(component) 

1400 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1401 # Proxy for URI existence. 

1402 uri_existence[uri] = True 

1403 else: 

1404 uris_to_check.append(uri) 

1405 else: 

1406 # Check all of them. 

1407 uris_to_check.extend(uris) 

1408 

1409 if artifact_existence is not None: 

1410 # If a URI has already been checked remove it from the list 

1411 # and immediately add the status to the output dict. 

1412 filtered_uris_to_check = [] 

1413 for uri in uris_to_check: 

1414 if uri in artifact_existence: 

1415 uri_existence[uri] = artifact_existence[uri] 

1416 else: 

1417 filtered_uris_to_check.append(uri) 

1418 uris_to_check = filtered_uris_to_check 

1419 

1420 # Results. 

1421 dataset_existence: dict[DatasetRef, bool] = {} 

1422 

1423 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1424 for uri, exists in uri_existence.items(): 

1425 dataset_id = location_map[uri] 

1426 ref = id_to_ref[dataset_id] 

1427 

1428 # Disassembled composite needs to check all locations. 

1429 # all_required indicates whether all need to exist or not. 

1430 if ref in dataset_existence: 

1431 if all_required: 

1432 exists = dataset_existence[ref] and exists 

1433 else: 

1434 exists = dataset_existence[ref] or exists 

1435 dataset_existence[ref] = exists 

1436 

1437 if artifact_existence is not None: 

1438 artifact_existence.update(uri_existence) 

1439 

1440 return dataset_existence 

1441 

1442 def mexists( 

1443 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1444 ) -> dict[DatasetRef, bool]: 

1445 """Check the existence of multiple datasets at once. 

1446 

1447 Parameters 

1448 ---------- 

1449 refs : iterable of `DatasetRef` 

1450 The datasets to be checked. 

1451 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1452 Optional mapping of datastore artifact to existence. Updated by 

1453 this method with details of all artifacts tested. Can be `None` 

1454 if the caller is not interested. 

1455 

1456 Returns 

1457 ------- 

1458 existence : `dict` of [`DatasetRef`, `bool`] 

1459 Mapping from dataset to boolean indicating existence. 

1460 

1461 Notes 

1462 ----- 

1463 To minimize potentially costly remote existence checks, the local

1464 cache is checked as a proxy for existence. If a cached file exists

1465 for a `DatasetRef`, no check is made against the actual URI. This

1466 can lead to unexpected behavior if the dataset itself has been

1467 removed from the datastore by another process whilst it is still

1468 present in the local cache.
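
Examples
--------
A minimal sketch; ``datastore`` and ``refs`` are placeholder names
for an initialized `FileDatastore` and a list of resolved
`DatasetRef` instances:

>>> existence = datastore.mexists(refs)
>>> missing = [ref for ref, found in existence.items() if not found]

Passing a dictionary as ``artifact_existence`` allows the per-URI
results to be reused by later calls:

>>> artifact_existence = {}
>>> existence = datastore.mexists(refs, artifact_existence)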

1469 """ 

1470 chunk_size = 10_000 

1471 dataset_existence: dict[DatasetRef, bool] = {} 

1472 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1473 n_found_total = 0 

1474 n_checked = 0 

1475 n_chunks = 0 

1476 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1477 chunk_result = self._mexists(chunk, artifact_existence) 

1478 

1479 # The log message level and content depend on how many 

1480 # datasets we are processing. 

1481 n_results = len(chunk_result) 

1482 

1483 # Use verbose logging to ensure that messages can be seen 

1484 # easily if many refs are being checked. 

1485 log_threshold = VERBOSE 

1486 n_checked += n_results 

1487 

1488 # This sum can take some time so only do it if we know the 

1489 # result is going to be used. 

1490 n_found = 0 

1491 if log.isEnabledFor(log_threshold): 

1492 # Can treat the booleans as 0, 1 integers and sum them. 

1493 n_found = sum(chunk_result.values()) 

1494 n_found_total += n_found 

1495 

1496 # We are deliberately not trying to count the number of refs 

1497 # provided in case it's in the millions. This means there is a 

1498 # situation where the number of refs exactly matches the chunk 

1499 # size and we will switch to the multi-chunk path even though 

1500 # we only have a single chunk. 

1501 if n_results < chunk_size and n_chunks == 0: 

1502 # Single chunk will be processed so we can provide more detail. 

1503 if n_results == 1: 

1504 ref = list(chunk_result)[0] 

1505 # Use debug logging to be consistent with `exists()`. 

1506 log.debug( 

1507 "Calling mexists() with single ref that does%s exist (%s).", 

1508 "" if chunk_result[ref] else " not", 

1509 ref, 

1510 ) 

1511 else: 

1512 # Single chunk but multiple files. Summarize. 

1513 log.log( 

1514 log_threshold, 

1515 "Number of datasets found in datastore: %d out of %d datasets checked.", 

1516 n_found, 

1517 n_checked, 

1518 ) 

1519 

1520 else: 

1521 # Use incremental verbose logging when we have multiple chunks. 

1522 log.log( 

1523 log_threshold, 

1524 "Number of datasets found in datastore for chunk %d: %d out of %d checked " 

1525 "(running total from all chunks so far: %d found out of %d checked)", 

1526 n_chunks, 

1527 n_found, 

1528 n_results, 

1529 n_found_total, 

1530 n_checked, 

1531 ) 

1532 dataset_existence.update(chunk_result) 

1533 n_chunks += 1 

1534 

1535 return dataset_existence 

1536 

1537 def _mexists( 

1538 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1539 ) -> dict[DatasetRef, bool]: 

1540 """Check the existence of multiple datasets at once. 

1541 

1542 Parameters 

1543 ---------- 

1544 refs : iterable of `DatasetRef` 

1545 The datasets to be checked. 

1546 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1547 Optional mapping of datastore artifact to existence. Updated by 

1548 this method with details of all artifacts tested. Can be `None` 

1549 if the caller is not interested. 

1550 

1551 Returns 

1552 ------- 

1553 existence : `dict` of [`DatasetRef`, `bool`] 

1554 Mapping from dataset to boolean indicating existence. 

1555 """ 

1556 # Make a mapping from refs with the internal storage class to the given 

1557 # refs that may have a different one. We'll use the internal refs 

1558 # throughout this method and convert back at the very end. 

1559 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs} 

1560 

1561 # Need a mapping of dataset_id to (internal) dataset ref since some 

1562 # internal APIs work with dataset_id. 

1563 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref} 

1564 

1565 # Set of all IDs we are checking for. 

1566 requested_ids = set(id_to_ref.keys()) 

1567 

1568 # The records themselves. Could be missing some entries. 

1569 records = self._get_stored_records_associated_with_refs( 

1570 id_to_ref.values(), ignore_datastore_records=True 

1571 ) 

1572 

1573 dataset_existence = self._process_mexists_records( 

1574 id_to_ref, records, True, artifact_existence=artifact_existence 

1575 ) 

1576 

1577 # Set of IDs that have been handled. 

1578 handled_ids = {ref.id for ref in dataset_existence} 

1579 

1580 missing_ids = requested_ids - handled_ids 

1581 if missing_ids: 

1582 dataset_existence.update( 

1583 self._mexists_check_expected( 

1584 [id_to_ref[missing] for missing in missing_ids], artifact_existence 

1585 ) 

1586 ) 

1587 

1588 return { 

1589 internal_ref_to_input_ref[internal_ref]: existence 

1590 for internal_ref, existence in dataset_existence.items() 

1591 } 

1592 

1593 def _mexists_check_expected( 

1594 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1595 ) -> dict[DatasetRef, bool]: 

1596 """Check existence of refs that are not known to datastore. 

1597 

1598 Parameters 

1599 ---------- 

1600 refs : iterable of `DatasetRef` 

1601 The datasets to be checked. These are assumed not to be known 

1602 to datastore. 

1603 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1604 Optional mapping of datastore artifact to existence. Updated by 

1605 this method with details of all artifacts tested. Can be `None` 

1606 if the caller is not interested. 

1607 

1608 Returns 

1609 ------- 

1610 existence : `dict` of [`DatasetRef`, `bool`] 

1611 Mapping from dataset to boolean indicating existence. 

1612 """ 

1613 dataset_existence: dict[DatasetRef, bool] = {} 

1614 if not self.trustGetRequest: 

1615 # Must assume these do not exist 

1616 for ref in refs: 

1617 dataset_existence[ref] = False 

1618 else: 

1619 log.debug( 

1620 "%d datasets were not known to datastore during initial existence check.", 

1621 len(refs), 

1622 ) 

1623 

1624 # Construct data structure identical to that returned 

1625 # by _get_stored_records_associated_with_refs() but using 

1626 # guessed names. 

1627 records = {} 

1628 id_to_ref = {} 

1629 for missing_ref in refs: 

1630 expected = self._get_expected_dataset_locations_info(missing_ref) 

1631 dataset_id = missing_ref.id 

1632 records[dataset_id] = [info for _, info in expected] 

1633 id_to_ref[dataset_id] = missing_ref 

1634 

1635 dataset_existence.update( 

1636 self._process_mexists_records( 

1637 id_to_ref, 

1638 records, 

1639 False, 

1640 artifact_existence=artifact_existence, 

1641 ) 

1642 ) 

1643 

1644 return dataset_existence 

1645 

1646 def exists(self, ref: DatasetRef) -> bool: 

1647 """Check if the dataset exists in the datastore. 

1648 

1649 Parameters 

1650 ---------- 

1651 ref : `DatasetRef` 

1652 Reference to the required dataset. 

1653 

1654 Returns 

1655 ------- 

1656 exists : `bool` 

1657 `True` if the entity exists in the `Datastore`. 

1658 

1659 Notes 

1660 ----- 

1661 The local cache is checked as a proxy for existence in the remote 

1662 object store. It is possible that another process on a different 

1663 compute node could remove the file from the object store even 

1664 though it is present in the local cache. 
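
Examples
--------
A minimal sketch; ``datastore`` and ``ref`` are placeholder names
for an initialized `FileDatastore` and a resolved `DatasetRef`:

>>> if datastore.exists(ref):
...     dataset = datastore.get(ref)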

1665 """ 

1666 ref = self._cast_storage_class(ref) 

1667 # We cannot trust datastore records from ref, as many unit tests delete 

1668 # datasets and check their existence. 

1669 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True) 

1670 

1671 # if we are being asked to trust that registry might not be correct 

1672 # we ask for the expected locations and check them explicitly 

1673 if not fileLocations: 

1674 if not self.trustGetRequest: 

1675 return False 

1676 

1677 # First check the cache. If it is not found we must check 

1678 # the datastore itself. Assume that any component in the cache 

1679 # means that the dataset does exist somewhere. 

1680 if self.cacheManager.known_to_cache(ref): 

1681 return True 

1682 

1683 # When we are guessing a dataset location we can not check 

1684 # for the existence of every component since we can not 

1685 # know if every component was written. Instead we check 

1686 # for the existence of any of the expected locations. 

1687 for location, _ in self._get_expected_dataset_locations_info(ref): 

1688 if self._artifact_exists(location): 

1689 return True 

1690 return False 

1691 

1692 # All listed artifacts must exist. 

1693 for location, storedFileInfo in fileLocations: 

1694 # Checking in cache needs the component ref. 

1695 check_ref = ref 

1696 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1697 check_ref = ref.makeComponentRef(component) 

1698 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1699 continue 

1700 

1701 if not self._artifact_exists(location): 

1702 return False 

1703 

1704 return True 

1705 

1706 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

1707 """Return URIs associated with dataset. 

1708 

1709 Parameters 

1710 ---------- 

1711 ref : `DatasetRef` 

1712 Reference to the required dataset. 

1713 predict : `bool`, optional 

1714 If the datastore does not know about the dataset, controls whether 

1715 it should return a predicted URI or not. 

1716 

1717 Returns 

1718 ------- 

1719 uris : `DatasetRefURIs` 

1720 The URI to the primary artifact associated with this dataset (if 

1721 the dataset was disassembled within the datastore this may be 

1722 `None`), and the URIs to any components associated with the dataset

1723 artifact (this mapping can be empty if there are no components).
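
Examples
--------
A minimal sketch; ``datastore`` and ``ref`` are placeholder names
for an initialized `FileDatastore` and a resolved `DatasetRef`:

>>> uris = datastore.getURIs(ref, predict=True)
>>> primary = uris.primaryURI
>>> components = uris.componentURIs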

1724 """ 

1725 many = self.getManyURIs([ref], predict=predict, allow_missing=False) 

1726 return many[ref] 

1727 

1728 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1729 """URI to the Dataset. 

1730 

1731 Parameters 

1732 ---------- 

1733 ref : `DatasetRef` 

1734 Reference to the required Dataset. 

1735 predict : `bool` 

1736 If `True`, allow URIs to be returned of datasets that have not 

1737 been written. 

1738 

1739 Returns 

1740 ------- 

1741 uri : `lsst.resources.ResourcePath`

1742 URI pointing to the dataset within the datastore. If the 

1743 dataset does not exist in the datastore, and if ``predict`` is 

1744 `True`, the URI will be a prediction and will include a URI 

1745 fragment "#predicted". 

1746 If the datastore does not have entities that relate well 

1747 to the concept of a URI, the returned URI will be

1748 descriptive. The returned URI is not guaranteed to be obtainable. 

1749 

1750 Raises 

1751 ------ 

1752 FileNotFoundError 

1753 Raised if a URI has been requested for a dataset that does not 

1754 exist and guessing is not allowed. 

1755 RuntimeError 

1756 Raised if a request is made for a single URI but multiple URIs 

1757 are associated with this dataset. 

1758 

1759 Notes 

1760 ----- 

1761 When a predicted URI is requested an attempt will be made to form 

1762 a reasonable URI based on file templates and the expected formatter. 
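
Examples
--------
A minimal sketch; ``datastore`` and ``ref`` are placeholder names
for an initialized `FileDatastore` and a resolved `DatasetRef`:

>>> uri = datastore.getURI(ref)

For a dataset that has not yet been written, a predicted URI (with
a "#predicted" fragment) can be requested instead:

>>> uri = datastore.getURI(ref, predict=True)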

1763 """ 

1764 primary, components = self.getURIs(ref, predict) 

1765 if primary is None or components: 

1766 raise RuntimeError( 

1767 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1768 ) 

1769 return primary 

1770 

1771 def _predict_URIs( 

1772 self, 

1773 ref: DatasetRef, 

1774 ) -> DatasetRefURIs: 

1775 """Predict the URIs of a dataset ref. 

1776 

1777 Parameters 

1778 ---------- 

1779 ref : `DatasetRef` 

1780 Reference to the required Dataset. 

1781 

1782 Returns 

1783 ------- 

1784 uris : `DatasetRefURIs`

1785 Primary and component URIs. URIs will contain a URI fragment 

1786 "#predicted". 

1787 """ 

1788 uris = DatasetRefURIs() 

1789 

1790 if self.composites.shouldBeDisassembled(ref): 

1791 for component, _ in ref.datasetType.storageClass.components.items(): 

1792 comp_ref = ref.makeComponentRef(component) 

1793 comp_location, _ = self._determine_put_formatter_location(comp_ref) 

1794 

1795 # Add the "#predicted" URI fragment to indicate this is a 

1796 # guess 

1797 uris.componentURIs[component] = ResourcePath( 

1798 comp_location.uri.geturl() + "#predicted", forceDirectory=comp_location.uri.dirLike 

1799 ) 

1800 

1801 else: 

1802 location, _ = self._determine_put_formatter_location(ref) 

1803 

1804 # Add the "#predicted" URI fragment to indicate this is a guess 

1805 uris.primaryURI = ResourcePath( 

1806 location.uri.geturl() + "#predicted", forceDirectory=location.uri.dirLike 

1807 ) 

1808 

1809 return uris 

1810 

1811 def getManyURIs( 

1812 self, 

1813 refs: Iterable[DatasetRef], 

1814 predict: bool = False, 

1815 allow_missing: bool = False, 

1816 ) -> dict[DatasetRef, DatasetRefURIs]: 

1817 # Docstring inherited 

1818 

1819 uris: dict[DatasetRef, DatasetRefURIs] = {} 

1820 

1821 records = self._get_stored_records_associated_with_refs(refs) 

1822 records_keys = records.keys() 

1823 

1824 existing_refs = tuple(ref for ref in refs if ref.id in records_keys) 

1825 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys) 

1826 

1827 # Have to handle trustGetRequest mode by checking for the existence 

1828 # of the missing refs on disk. 

1829 if missing_refs: 

1830 dataset_existence = self._mexists_check_expected(missing_refs, None) 

1831 really_missing = set() 

1832 not_missing = set() 

1833 for ref, exists in dataset_existence.items(): 

1834 if exists: 

1835 not_missing.add(ref) 

1836 else: 

1837 really_missing.add(ref) 

1838 

1839 if not_missing: 

1840 # Need to recalculate the missing/existing split. 

1841 existing_refs = existing_refs + tuple(not_missing) 

1842 missing_refs = tuple(really_missing) 

1843 

1844 for ref in missing_refs: 

1845 # if this has never been written then we have to guess 

1846 if not predict: 

1847 if not allow_missing: 

1848 raise FileNotFoundError(f"Dataset {ref} not in this datastore.") 

1849 else: 

1850 uris[ref] = self._predict_URIs(ref) 

1851 

1852 for ref in existing_refs: 

1853 file_infos = records[ref.id] 

1854 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos] 

1855 uris[ref] = self._locations_to_URI(ref, file_locations) 

1856 

1857 return uris 

1858 

1859 def _locations_to_URI( 

1860 self, 

1861 ref: DatasetRef, 

1862 file_locations: Sequence[tuple[Location, StoredFileInfo]], 

1863 ) -> DatasetRefURIs: 

1864 """Convert one or more file locations associated with a DatasetRef 

1865 to a DatasetRefURIs. 

1866 

1867 Parameters 

1868 ---------- 

1869 ref : `DatasetRef` 

1870 Reference to the dataset. 

1871 file_locations : `Sequence` [`tuple` [`Location`, `StoredFileInfo`]]

1872 Each item in the sequence is the location of the dataset within the 

1873 datastore and stored information about the file and its formatter. 

1874 If there is only one item in the sequence then it is treated as the 

1875 primary URI. If there is more than one item then they are treated 

1876 as component URIs. If there are no items then an error is raised 

1877 unless ``self.trustGetRequest`` is `True`. 

1878 

1879 Returns 

1880 ------- 

1881 uris : `DatasetRefURIs`

1882 Represents the primary URI or component URIs described by the 

1883 inputs. 

1884 

1885 Raises 

1886 ------ 

1887 RuntimeError 

1888 If no file locations are passed in and ``self.trustGetRequest`` is 

1889 `False`. 

1890 FileNotFoundError 

1891 If a passed-in URI does not exist, and ``self.trustGetRequest``

1892 is `False`. 

1893 RuntimeError 

1894 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is 

1895 unexpected). 

1896 """ 

1897 guessing = False 

1898 uris = DatasetRefURIs() 

1899 

1900 if not file_locations: 

1901 if not self.trustGetRequest: 

1902 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1903 file_locations = self._get_expected_dataset_locations_info(ref) 

1904 guessing = True 

1905 

1906 if len(file_locations) == 1: 

1907 # No disassembly so this is the primary URI 

1908 uris.primaryURI = file_locations[0][0].uri 

1909 if guessing and not uris.primaryURI.exists(): 

1910 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist") 

1911 else: 

1912 for location, file_info in file_locations: 

1913 if file_info.component is None: 

1914 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1915 if guessing and not location.uri.exists(): 

1916 # If we are trusting then it is entirely possible for 

1917 # some components to be missing. In that case we skip 

1918 # to the next component. 

1919 if self.trustGetRequest: 

1920 continue 

1921 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist") 

1922 uris.componentURIs[file_info.component] = location.uri 

1923 

1924 return uris 

1925 

1926 def retrieveArtifacts( 

1927 self, 

1928 refs: Iterable[DatasetRef], 

1929 destination: ResourcePath, 

1930 transfer: str = "auto", 

1931 preserve_path: bool = True, 

1932 overwrite: bool = False, 

1933 ) -> list[ResourcePath]: 

1934 """Retrieve the file artifacts associated with the supplied refs. 

1935 

1936 Parameters 

1937 ---------- 

1938 refs : iterable of `DatasetRef` 

1939 The datasets for which file artifacts are to be retrieved. 

1940 A single ref can result in multiple files. The refs must 

1941 be resolved. 

1942 destination : `lsst.resources.ResourcePath` 

1943 Location to write the file artifacts. 

1944 transfer : `str`, optional 

1945 Method to use to transfer the artifacts. Must be one of the options 

1946 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1947 "move" is not allowed. 

1948 preserve_path : `bool`, optional 

1949 If `True` the full path of the file artifact within the datastore 

1950 is preserved. If `False` the final file component of the path 

1951 is used. 

1952 overwrite : `bool`, optional 

1953 If `True` allow transfers to overwrite existing files at the 

1954 destination. 

1955 

1956 Returns 

1957 ------- 

1958 targets : `list` of `lsst.resources.ResourcePath` 

1959 URIs of file artifacts in destination location. Order is not 

1960 preserved. 
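
Examples
--------
A minimal sketch; ``datastore`` and ``refs`` are placeholder names
for an initialized `FileDatastore` and an iterable of resolved
`DatasetRef` instances, and the destination path is illustrative:

>>> from lsst.resources import ResourcePath
>>> destination = ResourcePath("/tmp/export/", forceDirectory=True)
>>> targets = datastore.retrieveArtifacts(refs, destination, transfer="copy")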

1961 """ 

1962 if not destination.isdir(): 

1963 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1964 

1965 if transfer == "move": 

1966 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1967 

1968 # Source -> Destination 

1969 # This also helps filter out duplicate DatasetRef in the request 

1970 # that will map to the same underlying file transfer. 

1971 to_transfer: dict[ResourcePath, ResourcePath] = {} 

1972 

1973 for ref in refs: 

1974 locations = self._get_dataset_locations_info(ref) 

1975 for location, _ in locations: 

1976 source_uri = location.uri 

1977 target_path: ResourcePathExpression 

1978 if preserve_path: 

1979 target_path = location.pathInStore 

1980 if target_path.isabs(): 

1981 # This is an absolute path to an external file. 

1982 # Use the full path. 

1983 target_path = target_path.relativeToPathRoot 

1984 else: 

1985 target_path = source_uri.basename() 

1986 target_uri = destination.join(target_path) 

1987 to_transfer[source_uri] = target_uri 

1988 

1989 # In theory can now parallelize the transfer 

1990 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

1991 for source_uri, target_uri in to_transfer.items(): 

1992 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1993 

1994 return list(to_transfer.values()) 

1995 

1996 def get( 

1997 self, 

1998 ref: DatasetRef, 

1999 parameters: Mapping[str, Any] | None = None, 

2000 storageClass: StorageClass | str | None = None, 

2001 ) -> Any: 

2002 """Load an InMemoryDataset from the store. 

2003 

2004 Parameters 

2005 ---------- 

2006 ref : `DatasetRef` 

2007 Reference to the required Dataset. 

2008 parameters : `dict` 

2009 `StorageClass`-specific parameters that specify, for example, 

2010 a slice of the dataset to be loaded. 

2011 storageClass : `StorageClass` or `str`, optional 

2012 The storage class to be used to override the Python type 

2013 returned by this method. By default the returned type matches 

2014 the dataset type definition for this dataset. Specifying a 

2015 read `StorageClass` can force a different type to be returned. 

2016 This type must be compatible with the original type. 

2017 

2018 Returns 

2019 ------- 

2020 inMemoryDataset : `object` 

2021 Requested dataset or slice thereof as an InMemoryDataset. 

2022 

2023 Raises 

2024 ------ 

2025 FileNotFoundError 

2026 Requested dataset can not be retrieved. 

2027 TypeError 

2028 Return value from formatter has unexpected type. 

2029 ValueError 

2030 Formatter failed to process the dataset. 
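
Examples
--------
A minimal sketch; ``datastore`` and ``ref`` are placeholder names
for an initialized `FileDatastore` and a resolved `DatasetRef`:

>>> dataset = datastore.get(ref)

Storage-class parameters or a read storage class can be supplied to
change what is returned; the parameter name and storage class shown
here are illustrative and depend on the dataset type:

>>> subset = datastore.get(ref, parameters={"bbox": bbox})
>>> converted = datastore.get(ref, storageClass="ArrowAstropy")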

2031 """ 

2032 # Supplied storage class for the component being read is either 

2033 # from the ref itself or from an override if we want to force

2034 # type conversion. 

2035 if storageClass is not None: 

2036 ref = ref.overrideStorageClass(storageClass) 

2037 

2038 allGetInfo = self._prepare_for_direct_get(ref, parameters) 

2039 return get_dataset_as_python_object_from_get_info( 

2040 allGetInfo, ref=ref, parameters=parameters, cache_manager=self.cacheManager 

2041 ) 

2042 

2043 def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload: 

2044 # Docstring inherited 

2045 

2046 # 1 hour. Chosen somewhat arbitrarily -- this is long enough that the 

2047 # client should have time to download a large file with retries if 

2048 # needed, but short enough that it will become obvious quickly that 

2049 # these URLs expire. 

2050 # From a strictly technical standpoint there is no reason this 

2051 # shouldn't be a day or more, but there seems to be a political issue 

2052 # where people think there is a risk of end users posting presigned 

2053 # URLs for people without access rights to download. 

2054 url_expiration_time_seconds = 1 * 60 * 60 

2055 

2056 def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo: 

2057 location, file_info = info 

2058 return FileDatastoreGetPayloadFileInfo( 

2059 url=location.uri.generate_presigned_get_url( 

2060 expiration_time_seconds=url_expiration_time_seconds 

2061 ), 

2062 datastoreRecords=file_info.to_simple(), 

2063 ) 

2064 

2065 return FileDatastoreGetPayload( 

2066 datastore_type="file", 

2067 dataset_ref=ref.to_simple(), 

2068 file_info=[to_file_info_payload(info) for info in self._get_dataset_locations_info(ref)], 

2069 ) 

2070 

2071 @transactional 

2072 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2073 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2074 

2075 Parameters 

2076 ---------- 

2077 inMemoryDataset : `object` 

2078 The dataset to store. 

2079 ref : `DatasetRef` 

2080 Reference to the associated Dataset. 

2081 

2082 Raises 

2083 ------ 

2084 TypeError 

2085 Supplied object and storage class are inconsistent. 

2086 DatasetTypeNotSupportedError 

2087 The associated `DatasetType` is not handled by this datastore. 

2088 

2089 Notes 

2090 ----- 

2091 If the datastore is configured to reject certain dataset types it 

2092 is possible that the put will fail and raise a 

2093 `DatasetTypeNotSupportedError`. The main use case for this is to 

2094 allow `ChainedDatastore` to put to multiple datastores without 

2095 requiring that every datastore accepts the dataset. 
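
Examples
--------
A minimal sketch; ``datastore``, ``ref`` and ``dataset`` are
placeholder names for an initialized `FileDatastore`, a resolved
`DatasetRef` and an in-memory object compatible with the ref's
storage class:

>>> datastore.put(dataset, ref)
>>> assert datastore.exists(ref)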

2096 """ 

2097 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2098 # doDisassembly = True 

2099 

2100 artifacts = [] 

2101 if doDisassembly: 

2102 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2103 if components is None: 

2104 raise RuntimeError( 

2105 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2106 f"with storage class {ref.datasetType.storageClass.name} " 

2107 "is configured to be disassembled, but cannot be." 

2108 ) 

2109 for component, componentInfo in components.items(): 

2110 # Don't recurse because we want to take advantage of 

2111 # bulk insert -- need a new DatasetRef that refers to the 

2112 # same dataset_id but has the component DatasetType.

2113 # DatasetType does not refer to the types of its components,

2114 # so we construct one ourselves.

2115 compRef = ref.makeComponentRef(component) 

2116 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2117 artifacts.append((compRef, storedInfo)) 

2118 else: 

2119 # Write the entire thing out 

2120 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2121 artifacts.append((ref, storedInfo)) 

2122 

2123 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT) 

2124 

2125 @transactional 

2126 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]: 

2127 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2128 # doDisassembly = True 

2129 

2130 artifacts = [] 

2131 if doDisassembly: 

2132 components = ref.datasetType.storageClass.delegate().disassemble(in_memory_dataset) 

2133 if components is None: 

2134 raise RuntimeError( 

2135 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2136 f"with storage class {ref.datasetType.storageClass.name} " 

2137 "is configured to be disassembled, but cannot be." 

2138 ) 

2139 for component, componentInfo in components.items(): 

2140 # Don't recurse because we want to take advantage of 

2141 # bulk insert -- need a new DatasetRef that refers to the 

2142 # same dataset_id but has the component DatasetType.

2143 # DatasetType does not refer to the types of its components,

2144 # so we construct one ourselves.

2145 compRef = ref.makeComponentRef(component) 

2146 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2147 artifacts.append((compRef, storedInfo)) 

2148 else: 

2149 # Write the entire thing out 

2150 storedInfo = self._write_in_memory_to_artifact(in_memory_dataset, ref) 

2151 artifacts.append((ref, storedInfo)) 

2152 

2153 ref_records = {self._opaque_table_name: [info for _, info in artifacts]} 

2154 ref = ref.replace(datastore_records=ref_records) 

2155 return {self.name: ref} 

2156 

2157 @transactional 

2158 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

2159 # At this point can safely remove these datasets from the cache 

2160 # to avoid confusion later on. If they are not trashed later 

2161 # the cache will simply be refilled. 

2162 self.cacheManager.remove_from_cache(ref) 

2163 

2164 # If we are in trust mode there will be nothing to move to 

2165 # the trash table and we will have to try to delete the file 

2166 # immediately. 

2167 if self.trustGetRequest: 

2168 # Try to keep the logic below for a single file trash. 

2169 if isinstance(ref, DatasetRef): 

2170 refs = {ref} 

2171 else: 

2172 # Will recreate ref at the end of this branch. 

2173 refs = set(ref) 

2174 

2175 # Determine which datasets are known to datastore directly. 

2176 id_to_ref = {ref.id: ref for ref in refs} 

2177 existing_ids = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) 

2178 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2179 

2180 missing = refs - existing_refs 

2181 if missing: 

2182 # Do an explicit existence check on these refs. 

2183 # We only care about the artifacts at this point and not 

2184 # the dataset existence. 

2185 artifact_existence: dict[ResourcePath, bool] = {} 

2186 _ = self.mexists(missing, artifact_existence) 

2187 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2188 

2189 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2190 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2191 for uri in uris: 

2192 try: 

2193 uri.remove() 

2194 except Exception as e: 

2195 if ignore_errors: 

2196 log.debug("Artifact %s could not be removed: %s", uri, e) 

2197 continue 

2198 raise 

2199 

2200 # There is no point asking the code below to remove refs we 

2201 # know are missing so update it with the list of existing 

2202 # records. Try to retain one vs many logic. 

2203 if not existing_refs: 

2204 # Nothing more to do since none of the datasets were 

2205 # known to the datastore record table. 

2206 return 

2207 ref = list(existing_refs) 

2208 if len(ref) == 1: 

2209 ref = ref[0] 

2210 

2211 # Get file metadata and internal metadata 

2212 if not isinstance(ref, DatasetRef): 

2213 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2214 # Assumed to be an iterable of refs so bulk mode enabled. 

2215 try: 

2216 self.bridge.moveToTrash(ref, transaction=self._transaction) 

2217 except Exception as e: 

2218 if ignore_errors: 

2219 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2220 else: 

2221 raise 

2222 return 

2223 

2224 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2225 

2226 fileLocations = self._get_dataset_locations_info(ref) 

2227 

2228 if not fileLocations: 

2229 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2230 if ignore_errors: 

2231 log.warning(err_msg) 

2232 return 

2233 else: 

2234 raise FileNotFoundError(err_msg) 

2235 

2236 for location, _ in fileLocations: 

2237 if not self._artifact_exists(location): 

2238 err_msg = ( 

2239 f"Dataset is known to datastore {self.name} but " 

2240 f"associated artifact ({location.uri}) is missing" 

2241 ) 

2242 if ignore_errors: 

2243 log.warning(err_msg) 

2244 return 

2245 else: 

2246 raise FileNotFoundError(err_msg) 

2247 

2248 # Mark dataset as trashed 

2249 try: 

2250 self.bridge.moveToTrash([ref], transaction=self._transaction) 

2251 except Exception as e: 

2252 if ignore_errors: 

2253 log.warning( 

2254 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2255 "but encountered an error: %s", 

2256 ref, 

2257 self.name, 

2258 e, 

2259 ) 

2260 pass 

2261 else: 

2262 raise 

2263 

2264 @transactional 

2265 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2266 """Remove all datasets from the trash. 

2267 

2268 Parameters 

2269 ---------- 

2270 ignore_errors : `bool` 

2271 If `True` return without error even if something went wrong. 

2272 Problems could occur if another process is simultaneously trying 

2273 to delete. 
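
Examples
--------
A minimal sketch; ``datastore`` and ``refs`` are placeholder names
for an initialized `FileDatastore` and an iterable of resolved
`DatasetRef` instances. Datasets are first moved to the trash and
the associated artifacts are only removed when the trash is emptied:

>>> datastore.trash(refs)
>>> datastore.emptyTrash()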

2274 """ 

2275 log.debug("Emptying trash in datastore %s", self.name) 

2276 

2277 # Context manager will empty trash iff we finish it without raising. 

2278 # It will also automatically delete the relevant rows from the 

2279 # trash table and the records table. 

2280 with self.bridge.emptyTrash( 

2281 self._table, record_class=StoredFileInfo, record_column="path" 

2282 ) as trash_data: 

2283 # Removing the artifacts themselves requires that the files are 

2284 # not also associated with refs that are not to be trashed. 

2285 # Therefore need to do a query with the file paths themselves 

2286 # and return all the refs associated with them. Can only delete 

2287 # a file if the refs to be trashed are the only refs associated 

2288 # with the file. 

2289 # This requires multiple copies of the trashed items 

2290 trashed, artifacts_to_keep = trash_data 

2291 

2292 if artifacts_to_keep is None: 

2293 # The bridge is not helping us so have to work it out 

2294 # ourselves. This is not going to be as efficient. 

2295 trashed = list(trashed) 

2296 

2297 # The instance check is for mypy since up to this point it 

2298 # does not know the type of info. 

2299 path_map = self._refs_associated_with_artifacts( 

2300 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2301 ) 

2302 

2303 for ref, info in trashed: 

2304 # Mypy needs to know this is not the base class 

2305 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2306 

2307 path_map[info.path].remove(ref.id) 

2308 if not path_map[info.path]: 

2309 del path_map[info.path] 

2310 

2311 artifacts_to_keep = set(path_map) 

2312 

2313 for ref, info in trashed: 

2314 # Should not happen for this implementation but need 

2315 # to keep mypy happy. 

2316 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2317 

2318 # Mypy needs to know this is not the base class 

2319 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2320 

2321 if info.path in artifacts_to_keep: 

2322 # This is a multi-dataset artifact and we are not 

2323 # removing all associated refs. 

2324 continue 

2325 

2326 # Only trashed refs still known to datastore will be returned. 

2327 location = info.file_location(self.locationFactory) 

2328 

2329 # Point of no return for this artifact 

2330 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2331 try: 

2332 self._delete_artifact(location) 

2333 except FileNotFoundError: 

2334 # If the file itself has been deleted there is nothing 

2335 # we can do about it. It is possible that trash has 

2336 # been run in parallel in another process or someone 

2337 # decided to delete the file. It is unlikely to come 

2338 # back and so we should still continue with the removal 

2339 # of the entry from the trash table. It is also possible 

2340 # we removed it in a previous iteration if it was 

2341 # a multi-dataset artifact. The delete artifact method 

2342 # will log a debug message in this scenario. 

2343 # Distinguishing file missing before trash started and 

2344 # file already removed previously as part of this trash 

2345 # is not worth the distinction with regards to potential 

2346 # memory cost. 

2347 pass 

2348 except Exception as e: 

2349 if ignore_errors: 

2350 # Use a debug message here even though it's not 

2351 # a good situation. In some cases this can be 

2352 # caused by a race between user A and user B 

2353 # and neither of them has permissions for the 

2354 # other's files. Butler does not know about users 

2355 # and trash has no idea what collections these 

2356 # files were in (without guessing from a path). 

2357 log.debug( 

2358 "Encountered error removing artifact %s from datastore %s: %s", 

2359 location.uri, 

2360 self.name, 

2361 e, 

2362 ) 

2363 else: 

2364 raise 

2365 

2366 @transactional 

2367 def transfer_from( 

2368 self, 

2369 source_datastore: Datastore, 

2370 refs: Iterable[DatasetRef], 

2371 transfer: str = "auto", 

2372 artifact_existence: dict[ResourcePath, bool] | None = None, 

2373 dry_run: bool = False, 

2374 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

2375 # Docstring inherited 

2376 if type(self) is not type(source_datastore): 

2377 raise TypeError( 

2378 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2379 f"source datastore ({type(source_datastore)})." 

2380 ) 

2381 

2382 # Be explicit for mypy 

2383 if not isinstance(source_datastore, FileDatastore): 

2384 raise TypeError( 

2385 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2386 f" {type(source_datastore)}" 

2387 ) 

2388 

2389 # Stop early if "direct" transfer mode is requested. That would 

2390 # require that the URI inside the source datastore should be stored 

2391 # directly in the target datastore, which seems unlikely to be useful 

2392 # since at any moment the source datastore could delete the file. 

2393 if transfer in ("direct", "split"): 

2394 raise ValueError( 

2395 f"Can not transfer from a source datastore using {transfer} mode since" 

2396 " those files are controlled by the other datastore." 

2397 ) 

2398 

2399 # Empty existence lookup if none given. 

2400 if artifact_existence is None: 

2401 artifact_existence = {} 

2402 

2403 # We will go through the list multiple times so must convert 

2404 # generators to lists. 

2405 refs = list(refs) 

2406 

2407 # In order to handle disassembled composites the code works 

2408 # at the records level since it can assume that internal APIs 

2409 # can be used. 

2410 # - If the record already exists in the destination this is assumed 

2411 # to be okay. 

2412 # - If there is no record but the source and destination URIs are 

2413 # identical no transfer is done but the record is added. 

2414 # - If the source record refers to an absolute URI currently assume 

2415 # that that URI should remain absolute and will be visible to the 

2416 # destination butler. May need to have a flag to indicate whether 

2417 # the dataset should be transferred. This will only happen if 

2418 # the detached Butler has had a local ingest. 

2419 

2420 # What we really want is all the records in the source datastore 

2421 # associated with these refs. Or derived ones if they don't exist 

2422 # in the source. 

2423 source_records = source_datastore._get_stored_records_associated_with_refs( 

2424 refs, ignore_datastore_records=True 

2425 ) 

2426 

2427 # The source dataset_ids are the keys in these records 

2428 source_ids = set(source_records) 

2429 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2430 

2431 requested_ids = {ref.id for ref in refs} 

2432 missing_ids = requested_ids - source_ids 

2433 

2434 # Missing IDs can be okay if that datastore has allowed 

2435 # gets based on file existence. Should we transfer what we can 

2436 # or complain about it and warn? 

2437 if missing_ids and not source_datastore.trustGetRequest: 

2438 raise ValueError( 

2439 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2440 ) 

2441 

2442 # Need to map these missing IDs to a DatasetRef so we can guess 

2443 # the details. 

2444 if missing_ids: 

2445 log.info( 

2446 "Number of expected datasets missing from source datastore records: %d out of %d", 

2447 len(missing_ids), 

2448 len(requested_ids), 

2449 ) 

2450 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2451 

2452 # This should be chunked in case we end up having to check 

2453 # the file store since we need some log output to show 

2454 # progress. 

2455 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2456 records = {} 

2457 for missing in missing_ids_chunk: 

2458 # Ask the source datastore where the missing artifacts 

2459 # should be. An execution butler might not know about the 

2460 # artifacts even if they are there. 

2461 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2462 records[missing] = [info for _, info in expected] 

2463 

2464 # Call the mexists helper method in case we have not already

2465 # checked these artifacts and artifact_existence is still

2466 # empty. This allows us to benefit from parallelism.

2467 # datastore.mexists() itself does not give us access to the 

2468 # derived datastore record. 

2469 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2470 ref_exists = source_datastore._process_mexists_records( 

2471 id_to_ref, records, False, artifact_existence=artifact_existence 

2472 ) 

2473 

2474 # Now go through the records and propagate the ones that exist. 

2475 location_factory = source_datastore.locationFactory 

2476 for missing, record_list in records.items(): 

2477 # Skip completely if the ref does not exist. 

2478 ref = id_to_ref[missing] 

2479 if not ref_exists[ref]: 

2480 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2481 continue 

2482 # Check for file artifact to decide which parts of a 

2483 # disassembled composite do exist. If there is only a 

2484 # single record we don't even need to look because it can't 

2485 # be a composite and must exist. 

2486 if len(record_list) == 1: 

2487 dataset_records = record_list 

2488 else: 

2489 dataset_records = [ 

2490 record 

2491 for record in record_list 

2492 if artifact_existence[record.file_location(location_factory).uri] 

2493 ] 

2494 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2495 

2496 # Rely on source_records being a defaultdict. 

2497 source_records[missing].extend(dataset_records) 

2498 log.verbose("Completed scan for missing data files") 

2499 

2500 # See if we already have these records 

2501 target_records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) 

2502 

2503 # The artifacts to register 

2504 artifacts = [] 

2505 

2506 # Refs that already exist 

2507 already_present = [] 

2508 

2509 # Refs that were rejected by this datastore. 

2510 rejected = set() 

2511 

2512 # Refs that were transferred successfully. 

2513 accepted = set() 

2514 

2515 # Record each time we have done a "direct" transfer. 

2516 direct_transfers = [] 

2517 

2518 # Now can transfer the artifacts 

2519 for ref in refs: 

2520 if not self.constraints.isAcceptable(ref): 

2521 # This datastore should not be accepting this dataset. 

2522 rejected.add(ref) 

2523 continue 

2524 

2525 accepted.add(ref) 

2526 

2527 if ref.id in target_records: 

2528 # Already have an artifact for this. 

2529 already_present.append(ref) 

2530 continue 

2531 

2532 # mypy needs to know these are always resolved refs 

2533 for info in source_records[ref.id]: 

2534 source_location = info.file_location(source_datastore.locationFactory) 

2535 target_location = info.file_location(self.locationFactory) 

2536 if source_location == target_location and not source_location.pathInStore.isabs(): 

2537 # Artifact is already in the target location. 

2538 # (which is how execution butler currently runs) 

2539 pass 

2540 else: 

2541 if target_location.pathInStore.isabs(): 

2542 # Just because we can see the artifact when running 

2543 # the transfer doesn't mean it will be generally 

2544 # accessible to a user of this butler. Need to decide 

2545 # what to do about an absolute path. 

2546 if transfer == "auto": 

2547 # For "auto" transfers we allow the absolute URI 

2548 # to be recorded in the target datastore. 

2549 direct_transfers.append(source_location) 

2550 else: 

2551 # The user is explicitly requesting a transfer 

2552 # even for an absolute URI. This requires us to 

2553 # calculate the target path. 

2554 template_ref = ref 

2555 if info.component: 

2556 template_ref = ref.makeComponentRef(info.component) 

2557 target_location = self._calculate_ingested_datastore_name( 

2558 source_location.uri, 

2559 template_ref, 

2560 ) 

2561 

2562 info = info.update(path=target_location.pathInStore.path) 

2563 

2564 # Need to transfer it to the new location. 

2565 # Assume we should always overwrite. If the artifact 

2566 # is there this might indicate that a previous transfer 

2567 # was interrupted but was not able to be rolled back 

2568 # completely (eg pre-emption) so follow Datastore default 

2569 # and overwrite. Do not copy if we are in dry-run mode. 

2570 if not dry_run: 

2571 target_location.uri.transfer_from( 

2572 source_location.uri, 

2573 transfer=transfer, 

2574 overwrite=True, 

2575 transaction=self._transaction, 

2576 ) 

2577 

2578 artifacts.append((ref, info)) 

2579 

2580 if direct_transfers: 

2581 log.info( 

2582 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s", 

2583 len(direct_transfers), 

2584 "" if len(direct_transfers) == 1 else "s", 

2585 ) 

2586 

2587 # We are overwriting previous datasets that may have already 

2588 # existed. We therefore should ensure that we force the 

2589 # datastore records to agree. Note that this can potentially lead 

2590 # to difficulties if the dataset has previously been ingested 

2591 # disassembled and is somehow now assembled, or vice versa. 

2592 if not dry_run: 

2593 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE) 

2594 

2595 if already_present: 

2596 n_skipped = len(already_present) 

2597 log.info( 

2598 "Skipped transfer of %d dataset%s already present in datastore", 

2599 n_skipped, 

2600 "" if n_skipped == 1 else "s", 

2601 ) 

2602 

2603 return accepted, rejected 

2604 

2605 @transactional 

2606 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2607 # Docstring inherited. 

2608 refs = list(refs) 

2609 self.bridge.forget(refs) 

2610 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs]) 

2611 

2612 def validateConfiguration( 

2613 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

2614 ) -> None: 

2615 """Validate some of the configuration for this datastore. 

2616 

2617 Parameters 

2618 ---------- 

2619 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2620 Entities to test against this configuration. Can be differing 

2621 types. 

2622 logFailures : `bool`, optional 

2623 If `True`, output a log message for every validation error 

2624 detected. 

2625 

2626 Raises 

2627 ------ 

2628 DatastoreValidationError 

2629 Raised if there is a validation problem with a configuration. 

2630 All the problems are reported in a single exception. 

2631 

2632 Notes 

2633 ----- 

2634 This method checks that all the supplied entities have valid file 

2635 templates and also have formatters defined. 
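
Examples
--------
A minimal sketch; ``datastore`` and ``dataset_type`` are placeholder
names for an initialized `FileDatastore` and a `DatasetType` of
interest:

>>> datastore.validateConfiguration([dataset_type], logFailures=True)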

2636 """ 

2637 templateFailed = None 

2638 try: 

2639 self.templates.validateTemplates(entities, logFailures=logFailures) 

2640 except FileTemplateValidationError as e: 

2641 templateFailed = str(e) 

2642 

2643 formatterFailed = [] 

2644 for entity in entities: 

2645 try: 

2646 self.formatterFactory.getFormatterClass(entity) 

2647 except KeyError as e: 

2648 formatterFailed.append(str(e)) 

2649 if logFailures: 

2650 log.critical("Formatter failure: %s", e) 

2651 

2652 if templateFailed or formatterFailed: 

2653 messages = [] 

2654 if templateFailed: 

2655 messages.append(templateFailed) 

2656 if formatterFailed: 

2657 messages.append(",".join(formatterFailed)) 

2658 msg = ";\n".join(messages) 

2659 raise DatastoreValidationError(msg) 

2660 

2661 def getLookupKeys(self) -> set[LookupKey]: 

2662 # Docstring is inherited from base class 

2663 return ( 

2664 self.templates.getLookupKeys() 

2665 | self.formatterFactory.getLookupKeys() 

2666 | self.constraints.getLookupKeys() 

2667 ) 

2668 

2669 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

2670 # Docstring is inherited from base class 

2671 # The key can be valid in either formatters or templates so we can 

2672 # only check the template if it exists 

2673 if lookupKey in self.templates: 

2674 try: 

2675 self.templates[lookupKey].validateTemplate(entity) 

2676 except FileTemplateValidationError as e: 

2677 raise DatastoreValidationError(e) from e 

2678 

2679 def export( 

2680 self, 

2681 refs: Iterable[DatasetRef], 

2682 *, 

2683 directory: ResourcePathExpression | None = None, 

2684 transfer: str | None = "auto", 

2685 ) -> Iterable[FileDataset]: 

2686 # Docstring inherited from Datastore.export. 

2687 if transfer == "auto" and directory is None: 

2688 transfer = None 

2689 

2690 if transfer is not None and directory is None: 

2691 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2692 

2693 if transfer == "move": 

2694 raise TypeError("Can not export by moving files out of datastore.") 

2695 elif transfer == "direct": 

2696 # For an export, treat this as equivalent to None. We do not 

2697 # want an import to risk using absolute URIs to datasets owned 

2698 # by another datastore. 

2699 log.info("Treating 'direct' transfer mode as in-place export.") 

2700 transfer = None 

2701 

2702 # Force the directory to be a URI object 

2703 directoryUri: ResourcePath | None = None 

2704 if directory is not None: 

2705 directoryUri = ResourcePath(directory, forceDirectory=True) 

2706 

2707 if transfer is not None and directoryUri is not None and not directoryUri.exists(): 

2708 # mypy needs the second test 

2709 raise FileNotFoundError(f"Export location {directory} does not exist") 

2710 

2711 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2712 for ref in progress.wrap(refs, "Exporting dataset files"): 

2713 fileLocations = self._get_dataset_locations_info(ref) 

2714 if not fileLocations: 

2715 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2716 # For now we can not export disassembled datasets 

2717 if len(fileLocations) > 1: 

2718 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2719 location, storedFileInfo = fileLocations[0] 

2720 

2721 pathInStore = location.pathInStore.path 

2722 if transfer is None: 

2723 # TODO: do we also need to return the readStorageClass somehow? 

2724 # We will use the path in store directly. If this is an 

2725 # absolute URI, preserve it. 

2726 if location.pathInStore.isabs(): 

2727 pathInStore = str(location.uri) 

2728 elif transfer == "direct": 

2729 # Use full URIs to the remote store in the export 

2730 pathInStore = str(location.uri) 

2731 else: 

2732 # mypy needs help 

2733 assert directoryUri is not None, "directoryUri must be defined to get here" 

2734 storeUri = ResourcePath(location.uri, forceDirectory=False) 

2735 

2736 # if the datastore has an absolute URI to a resource, we 

2737 # have two options: 

2738 # 1. Keep the absolute URI in the exported YAML 

2739 # 2. Allocate a new name in the local datastore and transfer 

2740 # it. 

2741 # For now go with option 2 

2742 if location.pathInStore.isabs(): 

2743 template = self.templates.getTemplate(ref) 

2744 newURI = ResourcePath(template.format(ref), forceAbsolute=False, forceDirectory=False) 

2745 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2746 

2747 exportUri = directoryUri.join(pathInStore) 

2748 exportUri.transfer_from(storeUri, transfer=transfer) 

2749 

2750 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2751 

2752 @staticmethod 

2753 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None: 

2754 """Compute the checksum of the supplied file. 

2755 

2756 Parameters 

2757 ---------- 

2758 uri : `lsst.resources.ResourcePath` 

2759 Name of resource to calculate checksum from. 

2760 algorithm : `str`, optional 

2761 Name of algorithm to use. Must be one of the algorithms supported 

2762 by :py:mod:`hashlib`.

2763 block_size : `int` 

2764 Number of bytes to read from file at one time. 

2765 

2766 Returns 

2767 ------- 

2768 hexdigest : `str` or `None`

2769 Hex digest of the file. 

2770 

2771 Notes 

2772 ----- 

2773 Currently returns `None` if the URI is for a remote resource.
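
Examples
--------
A minimal sketch; the file path is an illustrative placeholder:

>>> from lsst.resources import ResourcePath
>>> uri = ResourcePath("/data/file.fits")
>>> checksum = FileDatastore.computeChecksum(uri, algorithm="blake2b")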

2774 """ 

2775 if algorithm not in hashlib.algorithms_guaranteed: 

2776 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib") 

2777 

2778 if not uri.isLocal: 

2779 return None 

2780 

2781 hasher = hashlib.new(algorithm) 

2782 

2783 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f: 

2784 for chunk in iter(lambda: f.read(block_size), b""): 

2785 hasher.update(chunk) 

2786 

2787 return hasher.hexdigest() 

2788 

2789 def needs_expanded_data_ids( 

2790 self, 

2791 transfer: str | None, 

2792 entity: DatasetRef | DatasetType | StorageClass | None = None, 

2793 ) -> bool: 

2794 # Docstring inherited. 

2795 # This _could_ also use entity to inspect whether the filename template 

2796 # involves placeholders other than the required dimensions for its 

2797 # dataset type, but that's not necessary for correctness; it just 

2798 # enables more optimizations (perhaps only in theory). 

2799 return transfer not in ("direct", None) 

2800 

2801 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2802 # Docstring inherited from the base class. 

2803 record_data = data.get(self.name) 

2804 if not record_data: 

2805 return 

2806 

2807 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records) 

2808 

2809 # TODO: Verify that there are no unexpected table names in the dict? 

2810 unpacked_records = [] 

2811 for dataset_id, dataset_data in record_data.records.items(): 

2812 records = dataset_data.get(self._table.name) 

2813 if records: 

2814 for info in records: 

2815 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

2816 unpacked_records.append(info.to_record(dataset_id=dataset_id)) 

2817 if unpacked_records: 

2818 self._table.insert(*unpacked_records, transaction=self._transaction) 

2819 

2820 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

2821 # Docstring inherited from the base class. 

2822 exported_refs = list(self._bridge.check(refs)) 

2823 ids = {ref.id for ref in exported_refs} 

2824 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids} 

2825 for row in self._table.fetch(dataset_id=ids): 

2826 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

2827 dataset_records = records.setdefault(row["dataset_id"], {}) 

2828 dataset_records.setdefault(self._table.name, []).append(info) 

2829 

2830 record_data = DatastoreRecordData(records=records) 

2831 return {self.name: record_data} 

2832 

2833 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

2834 # Docstring inherited from the base class. 

2835 self._retrieve_dataset_method = method 

2836 

2837 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef: 

2838 """Update dataset reference to use the storage class from registry.""" 

2839 if self._retrieve_dataset_method is None: 

2840 # We could raise an exception here but unit tests do not define 

2841 # this method. 

2842 return ref 

2843 dataset_type = self._retrieve_dataset_method(ref.datasetType.name) 

2844 if dataset_type is not None: 

2845 ref = ref.overrideStorageClass(dataset_type.storageClass) 

2846 return ref 

2847 

2848 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]: 

2849 # Docstring inherited from the base class. 

2850 return {self._opaque_table_name: DatastoreOpaqueTable(self.makeTableSpec(ddl.GUID), StoredFileInfo)}