Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%

924 statements  

coverage.py v7.4.1, created at 2024-02-01 11:20 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Generic file-based datastore code.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("FileDatastore",) 

33 

34import contextlib 

35import hashlib 

36import logging 

37from collections import defaultdict 

38from collections.abc import Callable, Collection, Iterable, Mapping, Sequence 

39from typing import TYPE_CHECKING, Any, ClassVar, cast 

40 

41from lsst.daf.butler import ( 

42 Config, 

43 DatasetId, 

44 DatasetRef, 

45 DatasetType, 

46 DatasetTypeNotSupportedError, 

47 FileDataset, 

48 FileDescriptor, 

49 Formatter, 

50 FormatterFactory, 

51 Location, 

52 LocationFactory, 

53 Progress, 

54 StorageClass, 

55 ddl, 

56) 

57from lsst.daf.butler.datastore import ( 

58 DatasetRefURIs, 

59 Datastore, 

60 DatastoreConfig, 

61 DatastoreOpaqueTable, 

62 DatastoreValidationError, 

63) 

64from lsst.daf.butler.datastore.cache_manager import ( 

65 AbstractDatastoreCacheManager, 

66 DatastoreCacheManager, 

67 DatastoreDisabledCacheManager, 

68) 

69from lsst.daf.butler.datastore.composites import CompositesMap 

70from lsst.daf.butler.datastore.file_templates import FileTemplates, FileTemplateValidationError 

71from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore 

72from lsst.daf.butler.datastore.record_data import DatastoreRecordData 

73from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo, StoredFileInfo 

74from lsst.daf.butler.datastores.file_datastore.get import ( 

75 DatasetLocationInformation, 

76 DatastoreFileGetInformation, 

77 generate_datastore_get_information, 

78 get_dataset_as_python_object_from_get_info, 

79) 

80from lsst.daf.butler.datastores.fileDatastoreClient import ( 

81 FileDatastoreGetPayload, 

82 FileDatastoreGetPayloadFileInfo, 

83) 

84from lsst.daf.butler.registry.interfaces import ( 

85 DatabaseInsertMode, 

86 DatastoreRegistryBridge, 

87 FakeDatasetRef, 

88 ReadOnlyDatabaseError, 

89) 

90from lsst.daf.butler.repo_relocation import replaceRoot 

91from lsst.daf.butler.utils import transactional 

92from lsst.resources import ResourcePath, ResourcePathExpression 

93from lsst.utils.introspection import get_class_of 

94from lsst.utils.iteration import chunk_iterable 

95 

96# For VERBOSE logging usage. 

97from lsst.utils.logging import VERBOSE, getLogger 

98from sqlalchemy import BigInteger, String 

99 

100if TYPE_CHECKING: 

101 from lsst.daf.butler import LookupKey 

102 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

103 

104log = getLogger(__name__) 

105 

106 

107class _IngestPrepData(Datastore.IngestPrepData): 

108 """Helper class for FileDatastore ingest implementation. 

109 

110 Parameters 

111 ---------- 

112 datasets : `~collections.abc.Iterable` of `FileDataset` 

113 Files to be ingested by this datastore. 

114 """ 

115 

116 def __init__(self, datasets: Iterable[FileDataset]): 

117 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

118 self.datasets = datasets 

119 

120 

121class FileDatastore(GenericBaseDatastore[StoredFileInfo]): 

122 """Generic Datastore for file-based implementations. 

123 

124 Should always be sub-classed since key abstract methods are missing. 

125 

126 Parameters 

127 ---------- 

128 config : `DatastoreConfig` or `str` 

129 Configuration as either a `Config` object or URI to file. 

130 bridgeManager : `DatastoreRegistryBridgeManager` 

131 Object that manages the interface between `Registry` and datastores. 

132 root : `ResourcePath` 

133 Root directory URI of this `Datastore`. 

134 formatterFactory : `FormatterFactory` 

135 Factory for creating instances of formatters. 

136 templates : `FileTemplates` 

137 File templates that can be used by this `Datastore`. 

138 composites : `CompositesMap` 

139 Determines whether a dataset should be disassembled on put. 

140 trustGetRequest : `bool` 

141 Whether we can fall back to configuration if a requested 

142 dataset is not known to registry. 

143 

144 Raises 

145 ------ 

146 ValueError 

147 If root location does not exist and ``create`` is `False` in the 

148 configuration. 

149 """ 

150 

151 defaultConfigFile: ClassVar[str | None] = None 

152 """Path to configuration defaults. Accessed within the ``config`` resource 

153 or relative to a search path. Can be None if no defaults specified. 

154 """ 

155 

156 root: ResourcePath 

157 """Root directory URI of this `Datastore`.""" 

158 

159 locationFactory: LocationFactory 

160 """Factory for creating locations relative to the datastore root.""" 

161 

162 formatterFactory: FormatterFactory 

163 """Factory for creating instances of formatters.""" 

164 

165 templates: FileTemplates 

166 """File templates that can be used by this `Datastore`.""" 

167 

168 composites: CompositesMap 

169 """Determines whether a dataset should be disassembled on put.""" 

170 

171 defaultConfigFile = "datastores/fileDatastore.yaml" 

172 """Path to configuration defaults. Accessed within the ``config`` resource 

173 or relative to a search path. Can be None if no defaults specified. 

174 """ 

175 

176 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None 

177 """Callable that is used in trusted mode to retrieve registry definition 

178 of a named dataset type. 

179 """ 

180 

181 @classmethod 

182 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

183 """Set any filesystem-dependent config options for this Datastore to 

184 be appropriate for a new empty repository with the given root. 

185 

186 Parameters 

187 ---------- 

188 root : `str` 

189 URI to the root of the data repository. 

190 config : `Config` 

191 A `Config` to update. Only the subset understood by 

192 this component will be updated. Will not expand 

193 defaults. 

194 full : `Config` 

195 A complete config with all defaults expanded that can be 

196 converted to a `DatastoreConfig`. Read-only and will not be 

197 modified by this method. 

198 Repository-specific options that should not be obtained 

199 from defaults when Butler instances are constructed 

200 should be copied from ``full`` to ``config``. 

201 overwrite : `bool`, optional 

202 If `False`, do not modify a value in ``config`` if the value 

203 already exists. Default is always to overwrite with the provided 

204 ``root``. 

205 

206 Notes 

207 ----- 

208 If a keyword is explicitly defined in the supplied ``config`` it 

209 will not be overridden by this method if ``overwrite`` is `False`. 

210 This allows explicit values set in external configs to be retained. 

211 """ 

212 Config.updateParameters( 

213 DatastoreConfig, 

214 config, 

215 full, 

216 toUpdate={"root": root}, 

217 toCopy=("cls", ("records", "table")), 

218 overwrite=overwrite, 

219 ) 

220 

221 @classmethod 

222 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

223 return ddl.TableSpec( 

224 fields=[ 

225 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

226 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

227 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

228 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

229 # Use empty string to indicate no component 

230 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

231 # TODO: should checksum be Base64Bytes instead? 

232 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

233 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

234 ], 

235 unique=frozenset(), 

236 indexes=[ddl.IndexSpec("path")], 

237 ) 

238 
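# Illustrative sketch, not from the module: each row of the opaque records
# table defined by makeTableSpec() is a flat mapping from the field names
# above to values. The values below are hypothetical placeholders; real rows
# are produced by StoredFileInfo.to_record().
example_record = {
    "dataset_id": "c6bf1c5e-5b4e-5c2f-8f6a-0a1b2c3d4e5f",  # dataset UUID (primary key)
    "path": "raw/run1/example.fits",               # relative to the datastore root
    "formatter": "lsst.example.ExampleFormatter",  # hypothetical formatter class path
    "storage_class": "ExampleStorageClass",        # hypothetical storage class name
    "component": "",     # empty string means "no component" (see comment above)
    "checksum": None,    # only populated when checksums are enabled
    "file_size": 1024,   # bytes; -1 is used when the size is not recorded
}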

239 def __init__( 

240 self, 

241 config: DatastoreConfig, 

242 bridgeManager: DatastoreRegistryBridgeManager, 

243 root: ResourcePath, 

244 formatterFactory: FormatterFactory, 

245 templates: FileTemplates, 

246 composites: CompositesMap, 

247 trustGetRequest: bool, 

248 ): 

249 super().__init__(config, bridgeManager) 

250 self.root = ResourcePath(root) 

251 self.formatterFactory = formatterFactory 

252 self.templates = templates 

253 self.composites = composites 

254 self.trustGetRequest = trustGetRequest 

255 

256 # Name ourselves either using an explicit name or a name 

257 # derived from the (unexpanded) root 

258 if "name" in self.config: 

259 self.name = self.config["name"] 

260 else: 

261 # We use the unexpanded root in the name to indicate that this 

262 # datastore can be moved without having to update registry. 

263 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

264 

265 self.locationFactory = LocationFactory(self.root) 

266 

267 self._opaque_table_name = self.config["records", "table"] 

268 try: 

269 # Storage of paths and formatters, keyed by dataset_id 

270 self._table = bridgeManager.opaque.register( 

271 self._opaque_table_name, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

272 ) 

273 # Interface to Registry. 

274 self._bridge = bridgeManager.register(self.name) 

275 except ReadOnlyDatabaseError: 

276 # If the database is read only and we just tried and failed to 

277 # create a table, it means someone is trying to create a read-only 

278 # butler client for an empty repo. That should be okay, as long 

279 # as they then try to get any datasets before some other client 

280 # creates the table. Chances are they're just validating 

281 # configuration. 

282 pass 

283 

284 # Determine whether checksums should be used - default to False 

285 self.useChecksum = self.config.get("checksum", False) 

286 

287 # Create a cache manager 

288 self.cacheManager: AbstractDatastoreCacheManager 

289 if "cached" in self.config: 

290 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

291 else: 

292 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

293 

294 @classmethod 

295 def _create_from_config( 

296 cls, 

297 config: DatastoreConfig, 

298 bridgeManager: DatastoreRegistryBridgeManager, 

299 butlerRoot: ResourcePathExpression | None, 

300 ) -> FileDatastore: 

301 if "root" not in config: 

302 raise ValueError("No root directory specified in configuration") 

303 

304 # Support repository relocation in config 

305 # Existence of self.root is checked in subclass 

306 root = ResourcePath(replaceRoot(config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True) 

307 

308 # Now associate formatters with storage classes 

309 formatterFactory = FormatterFactory() 

310 formatterFactory.registerFormatters(config["formatters"], universe=bridgeManager.universe) 

311 

312 # Read the file naming templates 

313 templates = FileTemplates(config["templates"], universe=bridgeManager.universe) 

314 

315 # See if composites should be disassembled 

316 composites = CompositesMap(config["composites"], universe=bridgeManager.universe) 

317 

318 # Determine whether we can fall back to configuration if a 

319 # requested dataset is not known to registry 

320 trustGetRequest = config.get("trust_get_request", False) 

321 

322 self = FileDatastore( 

323 config, bridgeManager, root, formatterFactory, templates, composites, trustGetRequest 

324 ) 

325 

326 # Check existence and create directory structure if necessary 

327 if not self.root.exists(): 

328 if "create" not in self.config or not self.config["create"]: 

329 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

330 try: 

331 self.root.mkdir() 

332 except Exception as e: 

333 raise ValueError( 

334 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

335 ) from e 

336 

337 return self 

338 
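# Illustrative sketch, not from the module: a minimal subset of the
# configuration keys consumed above ("root", "create", "name",
# "records.table", "checksum", "cached", "trust_get_request", "formatters",
# "templates", "composites"), expressed as a plain dict with placeholder
# values. Real repositories supply this via a DatastoreConfig (YAML).
example_config = {
    "cls": "lsst.daf.butler.datastores.fileDatastore.FileDatastore",
    "root": "<butlerRoot>/datastore",  # may use the repo-relocation token
    "create": True,                    # allow the root directory to be created
    "records": {"table": "file_datastore_records"},  # opaque table name (placeholder)
    "checksum": False,                 # default: no checksums recorded
    "trust_get_request": False,        # default: do not guess artifact locations
    # "formatters", "templates", "composites" and "cached" are nested sections
    # keyed by dataset type / storage class and are omitted from this sketch.
}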

339 def clone(self, bridgeManager: DatastoreRegistryBridgeManager) -> Datastore: 

340 return FileDatastore( 

341 self.config, 

342 bridgeManager, 

343 self.root, 

344 self.formatterFactory, 

345 self.templates, 

346 self.composites, 

347 self.trustGetRequest, 

348 ) 

349 

350 def __str__(self) -> str: 

351 return str(self.root) 

352 

353 @property 

354 def bridge(self) -> DatastoreRegistryBridge: 

355 return self._bridge 

356 

357 @property 

358 def roots(self) -> dict[str, ResourcePath | None]: 

359 # Docstring inherited. 

360 return {self.name: self.root} 

361 

362 def _set_trust_mode(self, mode: bool) -> None: 

363 self.trustGetRequest = mode 

364 

365 def _artifact_exists(self, location: Location) -> bool: 

366 """Check that an artifact exists in this datastore at the specified 

367 location. 

368 

369 Parameters 

370 ---------- 

371 location : `Location` 

372 Expected location of the artifact associated with this datastore. 

373 

374 Returns 

375 ------- 

376 exists : `bool` 

377 True if the location can be found, false otherwise. 

378 """ 

379 log.debug("Checking if resource exists: %s", location.uri) 

380 return location.uri.exists() 

381 

382 def _delete_artifact(self, location: Location) -> None: 

383 """Delete the artifact from the datastore. 

384 

385 Parameters 

386 ---------- 

387 location : `Location` 

388 Location of the artifact associated with this datastore. 

389 """ 

390 if location.pathInStore.isabs(): 

391 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

392 

393 try: 

394 location.uri.remove() 

395 except FileNotFoundError: 

396 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

397 raise 

398 except Exception as e: 

399 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

400 raise 

401 log.debug("Successfully deleted file: %s", location.uri) 

402 

403 def addStoredItemInfo( 

404 self, 

405 refs: Iterable[DatasetRef], 

406 infos: Iterable[StoredFileInfo], 

407 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT, 

408 ) -> None: 

409 """Record internal storage information associated with one or more 

410 datasets. 

411 

412 Parameters 

413 ---------- 

414 refs : sequence of `DatasetRef` 

415 The datasets that have been stored. 

416 infos : sequence of `StoredDatastoreItemInfo` 

417 Metadata associated with the stored datasets. 

418 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode` 

419 Mode to use to insert the new records into the table. The 

420 options are ``INSERT`` (error if pre-existing), ``REPLACE`` 

421 (replace content with new values), and ``ENSURE`` (skip if the row 

422 already exists). 

423 """ 

424 records = [ 

425 info.rebase(ref).to_record(dataset_id=ref.id) for ref, info in zip(refs, infos, strict=True) 

426 ] 

427 match insert_mode: 

428 case DatabaseInsertMode.INSERT: 

429 self._table.insert(*records, transaction=self._transaction) 

430 case DatabaseInsertMode.ENSURE: 

431 self._table.ensure(*records, transaction=self._transaction) 

432 case DatabaseInsertMode.REPLACE: 

433 self._table.replace(*records, transaction=self._transaction) 

434 case _: 

435 raise ValueError(f"Unknown insert mode of '{insert_mode}'") 

436 

437 def getStoredItemsInfo( 

438 self, ref: DatasetIdRef, ignore_datastore_records: bool = False 

439 ) -> list[StoredFileInfo]: 

440 """Retrieve information associated with files stored in this 

441 `Datastore` associated with this dataset ref. 

442 

443 Parameters 

444 ---------- 

445 ref : `DatasetRef` 

446 The dataset that is to be queried. 

447 ignore_datastore_records : `bool` 

448 If `True` then do not use datastore records stored in refs. 

449 

450 Returns 

451 ------- 

452 items : `~collections.abc.Iterable` [`StoredDatastoreItemInfo`] 

453 Stored information about the files and associated formatters 

454 for this dataset. Only one file will be returned 

455 if the dataset has not been disassembled. Can return an empty 

456 list if no matching datasets can be found. 

457 """ 

458 # Try to get them from the ref first. 

459 if ref._datastore_records is not None and not ignore_datastore_records: 

460 if (ref_records := ref._datastore_records.get(self._table.name)) is not None: 

461 # Need to make sure they have correct type. 

462 for record in ref_records: 

463 if not isinstance(record, StoredFileInfo): 

464 raise TypeError(f"Datastore record has unexpected type {record.__class__.__name__}") 

465 return cast(list[StoredFileInfo], ref_records) 

466 

467 # Look for the dataset_id -- there might be multiple matches 

468 # if we have disassembled the dataset. 

469 records = self._table.fetch(dataset_id=ref.id) 

470 return [StoredFileInfo.from_record(record) for record in records] 

471 

472 def _register_datasets( 

473 self, 

474 refsAndInfos: Iterable[tuple[DatasetRef, StoredFileInfo]], 

475 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT, 

476 ) -> None: 

477 """Update registry to indicate that one or more datasets have been 

478 stored. 

479 

480 Parameters 

481 ---------- 

482 refsAndInfos : sequence of `tuple` [`DatasetRef`, 

483 `StoredDatastoreItemInfo`] 

484 Datasets to register and the internal datastore metadata associated 

485 with them. 

486 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode`, optional 

487 Indicate whether the new records must be new (``INSERT``, the default), 

488 are allowed to exist already (``ENSURE``), or should replace existing 

489 entries (``REPLACE``). 

490 """ 

491 expandedRefs: list[DatasetRef] = [] 

492 expandedItemInfos: list[StoredFileInfo] = [] 

493 

494 for ref, itemInfo in refsAndInfos: 

495 expandedRefs.append(ref) 

496 expandedItemInfos.append(itemInfo) 

497 

498 # Dataset location only cares about registry ID so if we have 

499 # disassembled in datastore we have to deduplicate. Since they 

500 # will have different datasetTypes we can't use a set 

501 registryRefs = {r.id: r for r in expandedRefs} 

502 if insert_mode == DatabaseInsertMode.INSERT: 

503 self.bridge.insert(registryRefs.values()) 

504 else: 

505 # There are only two columns and all that matters is the 

506 # dataset ID. 

507 self.bridge.ensure(registryRefs.values()) 

508 self.addStoredItemInfo(expandedRefs, expandedItemInfos, insert_mode=insert_mode) 

509 

510 def _get_stored_records_associated_with_refs( 

511 self, refs: Iterable[DatasetIdRef], ignore_datastore_records: bool = False 

512 ) -> dict[DatasetId, list[StoredFileInfo]]: 

513 """Retrieve all records associated with the provided refs. 

514 

515 Parameters 

516 ---------- 

517 refs : iterable of `DatasetIdRef` 

518 The refs for which records are to be retrieved. 

519 ignore_datastore_records : `bool` 

520 If `True` then do not use datastore records stored in refs. 

521 

522 Returns 

523 ------- 

524 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

525 The matching records indexed by the ref ID. The number of entries 

526 in the dict can be smaller than the number of requested refs. 

527 """ 

528 # Check datastore records in refs first. 

529 records_by_ref: defaultdict[DatasetId, list[StoredFileInfo]] = defaultdict(list) 

530 refs_with_no_records = [] 

531 for ref in refs: 

532 if ignore_datastore_records or ref._datastore_records is None: 

533 refs_with_no_records.append(ref) 

534 else: 

535 if (ref_records := ref._datastore_records.get(self._table.name)) is not None: 

536 # Need to make sure they have correct type. 

537 for ref_record in ref_records: 

538 if not isinstance(ref_record, StoredFileInfo): 

539 raise TypeError( 

540 f"Datastore record has unexpected type {ref_record.__class__.__name__}" 

541 ) 

542 records_by_ref[ref.id].append(ref_record) 

543 

544 # If there were any refs without datastore records, check opaque table. 

545 records = self._table.fetch(dataset_id=[ref.id for ref in refs_with_no_records]) 

546 

547 # Uniqueness is dataset_id + component so can have multiple records 

548 # per ref. 

549 for record in records: 

550 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

551 return records_by_ref 

552 

553 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]: 

554 """Return paths and associated dataset refs. 

555 

556 Parameters 

557 ---------- 

558 paths : `list` of `str` or `lsst.resources.ResourcePath` 

559 All the paths to include in search. 

560 

561 Returns 

562 ------- 

563 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

564 Mapping of each path to a set of associated database IDs. 

565 """ 

566 records = self._table.fetch(path=[str(path) for path in paths]) 

567 result = defaultdict(set) 

568 for row in records: 

569 result[row["path"]].add(row["dataset_id"]) 

570 return result 

571 

572 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]: 

573 """Return all dataset refs associated with the supplied path. 

574 

575 Parameters 

576 ---------- 

577 pathInStore : `lsst.resources.ResourcePath` 

578 Path of interest in the data store. 

579 

580 Returns 

581 ------- 

582 ids : `set` of `int` 

583 All `DatasetRef` IDs associated with this path. 

584 """ 

585 records = list(self._table.fetch(path=str(pathInStore))) 

586 ids = {r["dataset_id"] for r in records} 

587 return ids 

588 

589 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

590 """Remove information about the file associated with this dataset. 

591 

592 Parameters 

593 ---------- 

594 ref : `DatasetRef` 

595 The dataset that has been removed. 

596 """ 

597 # Note that this method is actually not used by this implementation, 

598 # we depend on bridge to delete opaque records. But there are some 

599 # tests that check that this method works, so we keep it for now. 

600 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

601 

602 def _get_dataset_locations_info( 

603 self, ref: DatasetIdRef, ignore_datastore_records: bool = False 

604 ) -> list[DatasetLocationInformation]: 

605 r"""Find all the `Location`\ s of the requested dataset in the 

606 `Datastore` and the associated stored file information. 

607 

608 Parameters 

609 ---------- 

610 ref : `DatasetRef` 

611 Reference to the required `Dataset`. 

612 ignore_datastore_records : `bool` 

613 If `True` then do not use datastore records stored in refs. 

614 

615 Returns 

616 ------- 

617 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

618 Location of the dataset within the datastore and 

619 stored information about each file and its formatter. 

620 """ 

621 # Get the file information (this will fail if no file) 

622 records = self.getStoredItemsInfo(ref, ignore_datastore_records) 

623 

624 # Use the path to determine the location -- we need to take 

625 # into account absolute URIs in the datastore record 

626 return [(r.file_location(self.locationFactory), r) for r in records] 

627 

628 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

629 """Check that there is only one dataset associated with the 

630 specified artifact. 

631 

632 Parameters 

633 ---------- 

634 ref : `DatasetRef` or `FakeDatasetRef` 

635 Dataset to be removed. 

636 location : `Location` 

637 The location of the artifact to be removed. 

638 

639 Returns 

640 ------- 

641 can_remove : `bool` 

642 True if the artifact can be safely removed. 

643 """ 

644 # Can't ever delete absolute URIs. 

645 if location.pathInStore.isabs(): 

646 return False 

647 

648 # Get all entries associated with this path 

649 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

650 if not allRefs: 

651 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

652 

653 # Remove these refs from all the refs and if there is nothing left 

654 # then we can delete 

655 remainingRefs = allRefs - {ref.id} 

656 

657 if remainingRefs: 

658 return False 

659 return True 

660 

661 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]: 

662 """Predict the location and related file information of the requested 

663 dataset in this datastore. 

664 

665 Parameters 

666 ---------- 

667 ref : `DatasetRef` 

668 Reference to the required `Dataset`. 

669 

670 Returns 

671 ------- 

672 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

673 Expected Location of the dataset within the datastore and 

674 placeholder information about each file and its formatter. 

675 

676 Notes 

677 ----- 

678 Uses the current configuration to determine how we would expect the 

679 datastore files to have been written if we couldn't ask registry. 

680 This is safe so long as there has been no change to datastore 

681 configuration between writing the dataset and wanting to read it. 

682 Will not work for files that have been ingested without using the 

683 standard file template or default formatter. 

684 """ 

685 # If we have a component ref we always need to ask the questions 

686 # of the composite. If the composite is disassembled this routine 

687 # should return all components. If the composite was not 

688 # disassembled the composite is what is stored regardless of 

689 # component request. Note that if the caller has disassembled 

690 # a composite there is no way for this guess to know that 

691 # without trying both the composite and component ref and seeing 

692 # if there is something at the component Location even without 

693 # disassembly being enabled. 

694 if ref.datasetType.isComponent(): 

695 ref = ref.makeCompositeRef() 

696 

697 # See if the ref is a composite that should be disassembled 

698 doDisassembly = self.composites.shouldBeDisassembled(ref) 

699 

700 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = [] 

701 

702 if doDisassembly: 

703 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

704 compRef = ref.makeComponentRef(component) 

705 location, formatter = self._determine_put_formatter_location(compRef) 

706 all_info.append((location, formatter, componentStorage, component)) 

707 

708 else: 

709 # Always use the composite ref if no disassembly 

710 location, formatter = self._determine_put_formatter_location(ref) 

711 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

712 

713 # Convert the list of tuples to have StoredFileInfo as second element 

714 return [ 

715 ( 

716 location, 

717 StoredFileInfo( 

718 formatter=formatter, 

719 path=location.pathInStore.path, 

720 storageClass=storageClass, 

721 component=component, 

722 checksum=None, 

723 file_size=-1, 

724 ), 

725 ) 

726 for location, formatter, storageClass, component in all_info 

727 ] 

728 

729 def _prepare_for_direct_get( 

730 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None 

731 ) -> list[DatastoreFileGetInformation]: 

732 """Check parameters for ``get`` and obtain formatter and 

733 location. 

734 

735 Parameters 

736 ---------- 

737 ref : `DatasetRef` 

738 Reference to the required Dataset. 

739 parameters : `dict` 

740 `StorageClass`-specific parameters that specify, for example, 

741 a slice of the dataset to be loaded. 

742 

743 Returns 

744 ------- 

745 getInfo : `list` [`DatastoreFileGetInformation`] 

746 Parameters needed to retrieve each file. 

747 """ 

748 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

749 

750 # The storage class we want to use eventually 

751 refStorageClass = ref.datasetType.storageClass 

752 

753 # For trusted mode need to reset storage class. 

754 ref = self._cast_storage_class(ref) 

755 

756 # Get file metadata and internal metadata 

757 fileLocations = self._get_dataset_locations_info(ref) 

758 if not fileLocations: 

759 if not self.trustGetRequest: 

760 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

761 # Assume the dataset is where we think it should be 

762 fileLocations = self._get_expected_dataset_locations_info(ref) 

763 

764 if len(fileLocations) > 1: 

765 # If trust is involved it is possible that there will be 

766 # components listed here that do not exist in the datastore. 

767 # Explicitly check for file artifact existence and filter out any 

768 # that are missing. 

769 if self.trustGetRequest: 

770 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

771 

772 # For now complain only if we have no components at all. One 

773 # component is probably a problem but we can punt that to the 

774 # assembler. 

775 if not fileLocations: 

776 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

777 

778 return generate_datastore_get_information( 

779 fileLocations, 

780 readStorageClass=refStorageClass, 

781 ref=ref, 

782 parameters=parameters, 

783 ) 

784 

785 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]: 

786 """Check the arguments for ``put`` and obtain formatter and 

787 location. 

788 

789 Parameters 

790 ---------- 

791 inMemoryDataset : `object` 

792 The dataset to store. 

793 ref : `DatasetRef` 

794 Reference to the associated Dataset. 

795 

796 Returns 

797 ------- 

798 location : `Location` 

799 The location to write the dataset. 

800 formatter : `Formatter` 

801 The `Formatter` to use to write the dataset. 

802 

803 Raises 

804 ------ 

805 TypeError 

806 Supplied object and storage class are inconsistent. 

807 DatasetTypeNotSupportedError 

808 The associated `DatasetType` is not handled by this datastore. 

809 """ 

810 self._validate_put_parameters(inMemoryDataset, ref) 

811 return self._determine_put_formatter_location(ref) 

812 

813 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]: 

814 """Calculate the formatter and output location to use for put. 

815 

816 Parameters 

817 ---------- 

818 ref : `DatasetRef` 

819 Reference to the associated Dataset. 

820 

821 Returns 

822 ------- 

823 location : `Location` 

824 The location to write the dataset. 

825 formatter : `Formatter` 

826 The `Formatter` to use to write the dataset. 

827 """ 

828 # Work out output file name 

829 try: 

830 template = self.templates.getTemplate(ref) 

831 except KeyError as e: 

832 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

833 

834 # Validate the template to protect against filenames from different 

835 # dataIds returning the same and causing overwrite confusion. 

836 template.validateTemplate(ref) 

837 

838 location = self.locationFactory.fromPath(template.format(ref), trusted_path=True) 

839 

840 # Get the formatter based on the storage class 

841 storageClass = ref.datasetType.storageClass 

842 try: 

843 formatter = self.formatterFactory.getFormatter( 

844 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

845 ) 

846 except KeyError as e: 

847 raise DatasetTypeNotSupportedError( 

848 f"Unable to find formatter for {ref} in datastore {self.name}" 

849 ) from e 

850 

851 # Now that we know the formatter, update the location 

852 location = formatter.makeUpdatedLocation(location) 

853 

854 return location, formatter 

855 

856 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None: 

857 # Docstring inherited from base class 

858 if transfer != "auto": 

859 return transfer 

860 

861 # See if the paths are within the datastore or not 

862 inside = [self._pathInStore(d.path) is not None for d in datasets] 

863 

864 if all(inside): 

865 transfer = None 

866 elif not any(inside): 

867 # Allow ResourcePath to use its own knowledge 

868 transfer = "auto" 

869 else: 

870 # This can happen when importing from a datastore that 

871 # has had some datasets ingested using "direct" mode. 

872 # Also allow ResourcePath to sort it out, but warn about it 

873 # since the externally-stored files must remain accessible 

874 # to the new butler. 

875 log.warning( 

876 "Some datasets are inside the datastore and some are outside. Using 'split' " 

877 "transfer mode. This assumes that the files outside the datastore are " 

878 "still accessible to the new butler since they will not be copied into " 

879 "the target datastore." 

880 ) 

881 transfer = "split" 

882 

883 return transfer 

884 
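# Illustrative sketch, not from the module: the "auto" handling above reduces
# to a small rule on whether each file already lives under the datastore root.
def _choose_transfer_for_auto(inside: list[bool]) -> str | None:
    """Return the transfer mode the "auto" branch above would pick."""
    if all(inside):
        return None    # everything already in place: no transfer needed
    if not any(inside):
        return "auto"  # everything external: let ResourcePath decide
    return "split"     # mixture: ingest the external files by reference


assert _choose_transfer_for_auto([True, True, False]) == "split"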

885 def _pathInStore(self, path: ResourcePathExpression) -> str | None: 

886 """Return path relative to datastore root. 

887 

888 Parameters 

889 ---------- 

890 path : `lsst.resources.ResourcePathExpression` 

891 Path to the dataset. Can be an absolute URI. If relative, it is 

892 assumed to be relative to the datastore root. `None` is returned 

893 if the path is outside the datastore. 

894 

895 Returns 

896 ------- 

897 inStore : `str` 

898 Path relative to datastore root. Returns `None` if the file is 

899 outside the root. 

900 """ 

901 # Relative path will always be relative to datastore 

902 pathUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False) 

903 return pathUri.relative_to(self.root) 

904 
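# Illustrative sketch, not from the module: the behaviour that matters here is
# that ResourcePath.relative_to() returns None when the path is not under the
# given root, which is how internal and external files are distinguished.
from lsst.resources import ResourcePath

root = ResourcePath("/repo/datastore/", forceDirectory=True)  # hypothetical root
inside = ResourcePath("/repo/datastore/raw/example.fits")
outside = ResourcePath("/elsewhere/example.fits")

print(inside.relative_to(root))    # "raw/example.fits"
print(outside.relative_to(root))   # None -> treated as outside the datastore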

905 def _standardizeIngestPath( 

906 self, path: str | ResourcePath, *, transfer: str | None = None 

907 ) -> str | ResourcePath: 

908 """Standardize the path of a to-be-ingested file. 

909 

910 Parameters 

911 ---------- 

912 path : `str` or `lsst.resources.ResourcePath` 

913 Path of a file to be ingested. This parameter is not expected 

914 to support all of the types that can be used to construct a 

915 `~lsst.resources.ResourcePath`. 

916 transfer : `str`, optional 

917 How (and whether) the dataset should be added to the datastore. 

918 See `ingest` for details of transfer modes. 

919 This implementation is provided only so 

920 `NotImplementedError` can be raised if the mode is not supported; 

921 actual transfers are deferred to `_extractIngestInfo`. 

922 

923 Returns 

924 ------- 

925 path : `str` or `lsst.resources.ResourcePath` 

926 New path in what the datastore considers standard form. If an 

927 absolute URI was given, it will be returned unchanged. 

928 

929 Notes 

930 ----- 

931 Subclasses of `FileDatastore` can implement this method instead 

932 of `_prepIngest`. It should not modify the data repository or given 

933 file in any way. 

934 

935 Raises 

936 ------ 

937 NotImplementedError 

938 Raised if the datastore does not support the given transfer mode 

939 (including the case where ingest is not supported at all). 

940 FileNotFoundError 

941 Raised if one of the given files does not exist. 

942 """ 

943 if transfer not in (None, "direct", "split") + self.root.transferModes: 

944 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

945 

946 # A relative URI indicates relative to datastore root 

947 srcUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False) 

948 if not srcUri.isabs(): 

949 srcUri = self.root.join(path) 

950 

951 if not srcUri.exists(): 

952 raise FileNotFoundError( 

953 f"Resource at {srcUri} does not exist; note that paths to ingest " 

954 f"are assumed to be relative to {self.root} unless they are absolute." 

955 ) 

956 

957 if transfer is None: 

958 relpath = srcUri.relative_to(self.root) 

959 if not relpath: 

960 raise RuntimeError( 

961 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

962 ) 

963 

964 # Return the relative path within the datastore for internal 

965 # transfer 

966 path = relpath 

967 

968 return path 

969 

970 def _extractIngestInfo( 

971 self, 

972 path: ResourcePathExpression, 

973 ref: DatasetRef, 

974 *, 

975 formatter: Formatter | type[Formatter], 

976 transfer: str | None = None, 

977 record_validation_info: bool = True, 

978 ) -> StoredFileInfo: 

979 """Relocate (if necessary) and extract `StoredFileInfo` from a 

980 to-be-ingested file. 

981 

982 Parameters 

983 ---------- 

984 path : `lsst.resources.ResourcePathExpression` 

985 URI or path of a file to be ingested. 

986 ref : `DatasetRef` 

987 Reference for the dataset being ingested. Guaranteed to have 

988 ``dataset_id is not None``. 

989 formatter : `type` or `Formatter` 

990 `Formatter` subclass to use for this dataset or an instance. 

991 transfer : `str`, optional 

992 How (and whether) the dataset should be added to the datastore. 

993 See `ingest` for details of transfer modes. 

994 record_validation_info : `bool`, optional 

995 If `True`, the default, the datastore can record validation 

996 information associated with the file. If `False` the datastore 

997 will not attempt to track any information such as checksums 

998 or file sizes. This can be useful if such information is tracked 

999 in an external system or if the file is to be compressed in place. 

1000 It is up to the datastore whether this parameter is relevant. 

1001 

1002 Returns 

1003 ------- 

1004 info : `StoredFileInfo` 

1005 Internal datastore record for this file. This will be inserted by 

1006 the caller; the `_extractIngestInfo` is only responsible for 

1007 creating and populating the struct. 

1008 

1009 Raises 

1010 ------ 

1011 FileNotFoundError 

1012 Raised if one of the given files does not exist. 

1013 FileExistsError 

1014 Raised if transfer is not `None` but the (internal) location the 

1015 file would be moved to is already occupied. 

1016 """ 

1017 if self._transaction is None: 

1018 raise RuntimeError("Ingest called without transaction enabled") 

1019 

1020 # Create URI of the source path, do not need to force a relative 

1021 # path to absolute. 

1022 srcUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False) 

1023 

1024 # Track whether we have read the size of the source yet 

1025 have_sized = False 

1026 

1027 tgtLocation: Location | None 

1028 if transfer is None or transfer == "split": 

1029 # A relative path is assumed to be relative to the datastore 

1030 # in this context 

1031 if not srcUri.isabs(): 

1032 tgtLocation = self.locationFactory.fromPath(srcUri.ospath, trusted_path=False) 

1033 else: 

1034 # Work out the path in the datastore from an absolute URI 

1035 # This is required to be within the datastore. 

1036 pathInStore = srcUri.relative_to(self.root) 

1037 if pathInStore is None and transfer is None: 

1038 raise RuntimeError( 

1039 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

1040 ) 

1041 if pathInStore: 

1042 tgtLocation = self.locationFactory.fromPath(pathInStore, trusted_path=True) 

1043 elif transfer == "split": 

1044 # Outside the datastore but treat that as a direct ingest 

1045 # instead. 

1046 tgtLocation = None 

1047 else: 

1048 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

1049 elif transfer == "direct": 

1050 # Want to store the full URI to the resource directly in 

1051 # datastore. This is useful for referring to permanent archive 

1052 # storage for raw data. 

1053 # Trust that people know what they are doing. 

1054 tgtLocation = None 

1055 else: 

1056 # Work out the name we want this ingested file to have 

1057 # inside the datastore 

1058 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

1059 if not tgtLocation.uri.dirname().exists(): 

1060 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

1061 tgtLocation.uri.dirname().mkdir() 

1062 

1063 # if we are transferring from a local file to a remote location 

1064 # it may be more efficient to get the size and checksum of the 

1065 # local file rather than the transferred one 

1066 if record_validation_info and srcUri.isLocal: 

1067 size = srcUri.size() 

1068 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

1069 have_sized = True 

1070 

1071 # Transfer the resource to the destination. 

1072 # Allow overwrite of an existing file. This matches the behavior 

1073 # of datastore.put() in that it trusts that registry would not 

1074 # be asking to overwrite unless registry thought that the 

1075 # overwrite was allowed. 

1076 tgtLocation.uri.transfer_from( 

1077 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

1078 ) 

1079 

1080 if tgtLocation is None: 

1081 # This means we are using direct mode 

1082 targetUri = srcUri 

1083 targetPath = str(srcUri) 

1084 else: 

1085 targetUri = tgtLocation.uri 

1086 targetPath = tgtLocation.pathInStore.path 

1087 

1088 # the file should exist in the datastore now 

1089 if record_validation_info: 

1090 if not have_sized: 

1091 size = targetUri.size() 

1092 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

1093 else: 

1094 # Not recording any file information. 

1095 size = -1 

1096 checksum = None 

1097 

1098 return StoredFileInfo( 

1099 formatter=formatter, 

1100 path=targetPath, 

1101 storageClass=ref.datasetType.storageClass, 

1102 component=ref.datasetType.component(), 

1103 file_size=size, 

1104 checksum=checksum, 

1105 ) 

1106 
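# Illustrative sketch, not from the module: computeChecksum() itself is defined
# later in the class and is not shown in this excerpt. A hypothetical helper
# with similar intent, using the hashlib module imported at the top of the
# file, could look like this for a local file.
import hashlib


def compute_file_checksum(path: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
    """Return the hex digest of a local file, reading it in chunks."""
    hasher = hashlib.new(algorithm)
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(block_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()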

1107 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

1108 # Docstring inherited from Datastore._prepIngest. 

1109 filtered = [] 

1110 for dataset in datasets: 

1111 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

1112 if not acceptable: 

1113 continue 

1114 else: 

1115 dataset.refs = acceptable 

1116 if dataset.formatter is None: 

1117 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1118 else: 

1119 assert isinstance(dataset.formatter, type | str) 

1120 formatter_class = get_class_of(dataset.formatter) 

1121 if not issubclass(formatter_class, Formatter): 

1122 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1123 dataset.formatter = formatter_class 

1124 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1125 filtered.append(dataset) 

1126 return _IngestPrepData(filtered) 

1127 

1128 @transactional 

1129 def _finishIngest( 

1130 self, 

1131 prepData: Datastore.IngestPrepData, 

1132 *, 

1133 transfer: str | None = None, 

1134 record_validation_info: bool = True, 

1135 ) -> None: 

1136 # Docstring inherited from Datastore._finishIngest. 

1137 refsAndInfos = [] 

1138 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1139 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1140 # Do ingest as if the first dataset ref is associated with the file 

1141 info = self._extractIngestInfo( 

1142 dataset.path, 

1143 dataset.refs[0], 

1144 formatter=dataset.formatter, 

1145 transfer=transfer, 

1146 record_validation_info=record_validation_info, 

1147 ) 

1148 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1149 

1150 # In direct mode we can allow repeated ingests of the same thing 

1151 # if we are sure that the external dataset is immutable. We use 

1152 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are 

1153 # separated. 

1154 refs_and_infos_replace = [] 

1155 refs_and_infos_insert = [] 

1156 if transfer == "direct": 

1157 for entry in refsAndInfos: 

1158 if entry[0].id.version == 5: 

1159 refs_and_infos_replace.append(entry) 

1160 else: 

1161 refs_and_infos_insert.append(entry) 

1162 else: 

1163 refs_and_infos_insert = refsAndInfos 

1164 

1165 if refs_and_infos_insert: 

1166 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT) 

1167 if refs_and_infos_replace: 

1168 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE) 

1169 
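# Illustrative sketch, not from the module: the v4/v5 split above uses the
# standard-library UUID "version" attribute. Name-based (UUIDv5) dataset IDs
# are reproducible, so repeated "direct" ingests of the same immutable file
# can safely REPLACE an existing record; random UUIDv4 IDs must be new.
import uuid

random_id = uuid.uuid4()                                                    # version 4
stable_id = uuid.uuid5(uuid.NAMESPACE_URL, "s3://bucket/raw/example.fits")  # version 5

print(random_id.version, stable_id.version)  # 4 5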

1170 def _calculate_ingested_datastore_name( 

1171 self, 

1172 srcUri: ResourcePath, 

1173 ref: DatasetRef, 

1174 formatter: Formatter | type[Formatter] | None = None, 

1175 ) -> Location: 

1176 """Given a source URI and a DatasetRef, determine the name the 

1177 dataset will have inside datastore. 

1178 

1179 Parameters 

1180 ---------- 

1181 srcUri : `lsst.resources.ResourcePath` 

1182 URI to the source dataset file. 

1183 ref : `DatasetRef` 

1184 Ref associated with the newly-ingested dataset artifact. This 

1185 is used to determine the name within the datastore. 

1186 formatter : `Formatter` or Formatter class. 

1187 Formatter to use for validation. Can be a class or an instance. 

1188 No validation of the file extension is performed if the 

1189 ``formatter`` is `None`. This can be used if the caller knows 

1190 that the source URI and target URI will use the same formatter. 

1191 

1192 Returns 

1193 ------- 

1194 location : `Location` 

1195 Target location for the newly-ingested dataset. 

1196 """ 

1197 # Ingesting a file from outside the datastore. 

1198 # This involves a new name. 

1199 template = self.templates.getTemplate(ref) 

1200 location = self.locationFactory.fromPath(template.format(ref), trusted_path=True) 

1201 

1202 # Get the extension 

1203 ext = srcUri.getExtension() 

1204 

1205 # Update the destination to include that extension 

1206 location.updateExtension(ext) 

1207 

1208 # Ask the formatter to validate this extension 

1209 if formatter is not None: 

1210 formatter.validateExtension(location) 

1211 

1212 return location 

1213 

1214 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1215 """Write out in memory dataset to datastore. 

1216 

1217 Parameters 

1218 ---------- 

1219 inMemoryDataset : `object` 

1220 Dataset to write to datastore. 

1221 ref : `DatasetRef` 

1222 Registry information associated with this dataset. 

1223 

1224 Returns 

1225 ------- 

1226 info : `StoredFileInfo` 

1227 Information describing the artifact written to the datastore. 

1228 """ 

1229 # May need to coerce the in memory dataset to the correct 

1230 # python type, but first we need to make sure the storage class 

1231 # reflects the one defined in the data repository. 

1232 ref = self._cast_storage_class(ref) 

1233 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1234 

1235 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1236 uri = location.uri 

1237 

1238 if not uri.dirname().exists(): 

1239 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1240 uri.dirname().mkdir() 

1241 

1242 if self._transaction is None: 

1243 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1244 

1245 def _removeFileExists(uri: ResourcePath) -> None: 

1246 """Remove a file and do not complain if it is not there. 

1247 

1248 This is important since a formatter might fail before the file 

1249 is written and we should not confuse people by writing spurious 

1250 error messages to the log. 

1251 """ 

1252 with contextlib.suppress(FileNotFoundError): 

1253 uri.remove() 

1254 

1255 # Register a callback to try to delete the uploaded data if 

1256 # something fails below 

1257 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1258 

1259 data_written = False 

1260 

1261 # For remote URIs some datasets can be serialized directly 

1262 # to bytes and sent to the remote datastore without writing a 

1263 # file. If the dataset is intended to be saved to the cache 

1264 # a file is always written and direct write to the remote 

1265 # datastore is bypassed. 

1266 if not uri.isLocal and not self.cacheManager.should_be_cached(ref): 

1267 # Remote URI that is not cached so can write directly. 

1268 try: 

1269 serializedDataset = formatter.toBytes(inMemoryDataset) 

1270 except NotImplementedError: 

1271 # Fallback to the file writing option. 

1272 pass 

1273 except Exception as e: 

1274 raise RuntimeError( 

1275 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1276 ) from e 

1277 else: 

1278 log.debug("Writing bytes directly to %s", uri) 

1279 uri.write(serializedDataset, overwrite=True) 

1280 log.debug("Successfully wrote bytes directly to %s", uri) 

1281 data_written = True 

1282 

1283 if not data_written: 

1284 # Did not write the bytes directly to object store so instead 

1285 # write to temporary file. Always write to a temporary even if 

1286 # using a local file system -- that gives us atomic writes. 

1287 # If a process is killed as the file is being written we do not 

1288 # want it to remain in the correct place but in corrupt state. 

1289 # For local files write to the output directory not temporary dir. 

1290 prefix = uri.dirname() if uri.isLocal else None 

1291 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri: 

1292 # Need to configure the formatter to write to a different 

1293 # location and that needs us to overwrite internals 

1294 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1295 with formatter._updateLocation(Location(None, temporary_uri)): 

1296 try: 

1297 formatter.write(inMemoryDataset) 

1298 except Exception as e: 

1299 raise RuntimeError( 

1300 f"Failed to serialize dataset {ref} of type" 

1301 f" {type(inMemoryDataset)} to " 

1302 f"temporary location {temporary_uri}" 

1303 ) from e 

1304 

1305 # Use move for a local file since that becomes an efficient 

1306 # os.rename. For remote resources we use copy to allow the 

1307 # file to be cached afterwards. 

1308 transfer = "move" if uri.isLocal else "copy" 

1309 

1310 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True) 

1311 

1312 if transfer == "copy": 

1313 # Cache if required 

1314 self.cacheManager.move_to_cache(temporary_uri, ref) 

1315 

1316 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1317 

1318 # URI is needed to resolve what ingest case are we dealing with 

1319 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1320 
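# Illustrative sketch, not from the module: the write path above serializes to
# a temporary URI and then transfers it into place so that a partially written
# file never appears at the final location. The destination and payload below
# are placeholders, and the real code additionally involves the formatter,
# the transaction undo callback, and the cache manager.
from lsst.resources import ResourcePath

destination = ResourcePath("/repo/datastore/example/output.dat")  # placeholder target
payload = b"example payload"

with ResourcePath.temporary_uri(suffix=destination.getExtension()) as tmp:
    tmp.write(payload, overwrite=True)
    transfer = "move" if destination.isLocal else "copy"  # move becomes an os.rename locally
    destination.transfer_from(tmp, transfer=transfer, overwrite=True)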

1321 def knows(self, ref: DatasetRef) -> bool: 

1322 """Check if the dataset is known to the datastore. 

1323 

1324 Does not check for existence of any artifact. 

1325 

1326 Parameters 

1327 ---------- 

1328 ref : `DatasetRef` 

1329 Reference to the required dataset. 

1330 

1331 Returns 

1332 ------- 

1333 exists : `bool` 

1334 `True` if the dataset is known to the datastore. 

1335 """ 

1336 # We cannot trust datastore records from ref, as many unit tests delete 

1337 # datasets and check their existence. 

1338 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True) 

1339 if fileLocations: 

1340 return True 

1341 return False 

1342 

1343 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

1344 # Docstring inherited from the base class. 

1345 

1346 # The records themselves. Could be missing some entries. 

1347 records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) 

1348 

1349 return {ref: ref.id in records for ref in refs} 

1350 

1351 def _process_mexists_records( 

1352 self, 

1353 id_to_ref: dict[DatasetId, DatasetRef], 

1354 records: dict[DatasetId, list[StoredFileInfo]], 

1355 all_required: bool, 

1356 artifact_existence: dict[ResourcePath, bool] | None = None, 

1357 ) -> dict[DatasetRef, bool]: 

1358 """Check given records for existence. 

1359 

1360 Helper function for `mexists()`. 

1361 

1362 Parameters 

1363 ---------- 

1364 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1365 Mapping of the dataset ID to the dataset ref itself. 

1366 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1367 Records as generally returned by 

1368 ``_get_stored_records_associated_with_refs``. 

1369 all_required : `bool` 

1370 Flag to indicate whether all artifacts associated with a 

1371 dataset ID must exist for the dataset to be reported as existing. 

1372 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1373 Optional mapping of datastore artifact to existence. Updated by 

1374 this method with details of all artifacts tested. Can be `None` 

1375 if the caller is not interested. 

1376 

1377 Returns 

1378 ------- 

1379 existence : `dict` of [`DatasetRef`, `bool`] 

1380 Mapping from dataset to boolean indicating existence. 

1381 """ 

1382 # The URIs to be checked and a mapping of those URIs to 

1383 # the dataset ID. 

1384 uris_to_check: list[ResourcePath] = [] 

1385 location_map: dict[ResourcePath, DatasetId] = {} 

1386 

1387 location_factory = self.locationFactory 

1388 

1389 uri_existence: dict[ResourcePath, bool] = {} 

1390 for ref_id, infos in records.items(): 

1391 # Key is the dataset Id, value is list of StoredItemInfo 

1392 uris = [info.file_location(location_factory).uri for info in infos] 

1393 location_map.update({uri: ref_id for uri in uris}) 

1394 

1395 # Check the local cache directly for a dataset corresponding 

1396 # to the remote URI. 

1397 if self.cacheManager.file_count > 0: 

1398 ref = id_to_ref[ref_id] 

1399 for uri, storedFileInfo in zip(uris, infos, strict=True): 

1400 check_ref = ref 

1401 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1402 check_ref = ref.makeComponentRef(component) 

1403 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1404 # Proxy for URI existence. 

1405 uri_existence[uri] = True 

1406 else: 

1407 uris_to_check.append(uri) 

1408 else: 

1409 # Check all of them. 

1410 uris_to_check.extend(uris) 

1411 

1412 if artifact_existence is not None: 

1413 # If a URI has already been checked remove it from the list 

1414 # and immediately add the status to the output dict. 

1415 filtered_uris_to_check = [] 

1416 for uri in uris_to_check: 

1417 if uri in artifact_existence: 

1418 uri_existence[uri] = artifact_existence[uri] 

1419 else: 

1420 filtered_uris_to_check.append(uri) 

1421 uris_to_check = filtered_uris_to_check 

1422 

1423 # Results. 

1424 dataset_existence: dict[DatasetRef, bool] = {} 

1425 

1426 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1427 for uri, exists in uri_existence.items(): 

1428 dataset_id = location_map[uri] 

1429 ref = id_to_ref[dataset_id] 

1430 

1431 # Disassembled composite needs to check all locations. 

1432 # all_required indicates whether all need to exist or not. 

1433 if ref in dataset_existence: 

1434 if all_required: 

1435 exists = dataset_existence[ref] and exists 

1436 else: 

1437 exists = dataset_existence[ref] or exists 

1438 dataset_existence[ref] = exists 

1439 

1440 if artifact_existence is not None: 

1441 artifact_existence.update(uri_existence) 

1442 

1443 return dataset_existence 

1444 

1445 def mexists( 

1446 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1447 ) -> dict[DatasetRef, bool]: 

1448 """Check the existence of multiple datasets at once. 

1449 

1450 Parameters 

1451 ---------- 

1452 refs : iterable of `DatasetRef` 

1453 The datasets to be checked. 

1454 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1455 Optional mapping of datastore artifact to existence. Updated by 

1456 this method with details of all artifacts tested. Can be `None` 

1457 if the caller is not interested. 

1458 

1459 Returns 

1460 ------- 

1461 existence : `dict` [`DatasetRef`, `bool`]

1462 Mapping from dataset to boolean indicating existence. 

1463 

1464 Notes 

1465 ----- 

1466 To minimize potentially costly remote existence checks, the local 

1467 cache is checked as a proxy for existence. If a file for this 

1468 `DatasetRef` does exist no check is done for the actual URI. This 

1469 could result in possibly unexpected behavior if the dataset itself 

1470 has been removed from the datastore by another process whilst it is 

1471 still in the cache. 
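
Examples
--------
A minimal sketch of a bulk existence check (``datastore`` and ``refs``
are assumed to exist already; the names are illustrative only)::

    artifact_existence = {}
    existence = datastore.mexists(refs, artifact_existence)
    missing = [ref for ref, found in existence.items() if not found]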

1472 """ 

1473 chunk_size = 10_000 

1474 dataset_existence: dict[DatasetRef, bool] = {} 

1475 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1476 n_found_total = 0 

1477 n_checked = 0 

1478 n_chunks = 0 

1479 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1480 chunk_result = self._mexists(chunk, artifact_existence) 

1481 

1482 # The log message level and content depend on how many 

1483 # datasets we are processing. 

1484 n_results = len(chunk_result) 

1485 

1486 # Use verbose logging to ensure that messages can be seen 

1487 # easily if many refs are being checked. 

1488 log_threshold = VERBOSE 

1489 n_checked += n_results 

1490 

1491 # This sum can take some time so only do it if we know the 

1492 # result is going to be used. 

1493 n_found = 0 

1494 if log.isEnabledFor(log_threshold): 

1495 # Can treat the booleans as 0, 1 integers and sum them. 

1496 n_found = sum(chunk_result.values()) 

1497 n_found_total += n_found 

1498 

1499 # We are deliberately not trying to count the number of refs 

1500 # provided in case it's in the millions. This means there is a 

1501 # situation where the number of refs exactly matches the chunk 

1502 # size and we will switch to the multi-chunk path even though 

1503 # we only have a single chunk. 

1504 if n_results < chunk_size and n_chunks == 0: 

1505 # Single chunk will be processed so we can provide more detail. 

1506 if n_results == 1: 

1507 ref = list(chunk_result)[0] 

1508 # Use debug logging to be consistent with `exists()`. 

1509 log.debug( 

1510 "Calling mexists() with single ref that does%s exist (%s).", 

1511 "" if chunk_result[ref] else " not", 

1512 ref, 

1513 ) 

1514 else: 

1515 # Single chunk but multiple files. Summarize. 

1516 log.log( 

1517 log_threshold, 

1518 "Number of datasets found in datastore: %d out of %d datasets checked.", 

1519 n_found, 

1520 n_checked, 

1521 ) 

1522 

1523 else: 

1524 # Use incremental verbose logging when we have multiple chunks. 

1525 log.log( 

1526 log_threshold, 

1527 "Number of datasets found in datastore for chunk %d: %d out of %d checked " 

1528 "(running total from all chunks so far: %d found out of %d checked)", 

1529 n_chunks, 

1530 n_found, 

1531 n_results, 

1532 n_found_total, 

1533 n_checked, 

1534 ) 

1535 dataset_existence.update(chunk_result) 

1536 n_chunks += 1 

1537 

1538 return dataset_existence 

1539 

1540 def _mexists( 

1541 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1542 ) -> dict[DatasetRef, bool]: 

1543 """Check the existence of multiple datasets at once. 

1544 

1545 Parameters 

1546 ---------- 

1547 refs : iterable of `DatasetRef` 

1548 The datasets to be checked. 

1549 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1550 Optional mapping of datastore artifact to existence. Updated by 

1551 this method with details of all artifacts tested. Can be `None` 

1552 if the caller is not interested. 

1553 

1554 Returns 

1555 ------- 

1556 existence : `dict` [`DatasetRef`, `bool`]

1557 Mapping from dataset to boolean indicating existence. 

1558 """ 

1559 # Make a mapping from refs with the internal storage class to the given 

1560 # refs that may have a different one. We'll use the internal refs 

1561 # throughout this method and convert back at the very end. 

1562 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs} 

1563 

1564 # Need a mapping of dataset_id to (internal) dataset ref since some 

1565 # internal APIs work with dataset_id. 

1566 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref} 

1567 

1568 # Set of all IDs we are checking for. 

1569 requested_ids = set(id_to_ref.keys()) 

1570 

1571 # The records themselves. Could be missing some entries. 

1572 records = self._get_stored_records_associated_with_refs( 

1573 id_to_ref.values(), ignore_datastore_records=True 

1574 ) 

1575 

1576 dataset_existence = self._process_mexists_records( 

1577 id_to_ref, records, True, artifact_existence=artifact_existence 

1578 ) 

1579 

1580 # Set of IDs that have been handled. 

1581 handled_ids = {ref.id for ref in dataset_existence} 

1582 

1583 missing_ids = requested_ids - handled_ids 

1584 if missing_ids: 

1585 dataset_existence.update( 

1586 self._mexists_check_expected( 

1587 [id_to_ref[missing] for missing in missing_ids], artifact_existence 

1588 ) 

1589 ) 

1590 

1591 return { 

1592 internal_ref_to_input_ref[internal_ref]: existence 

1593 for internal_ref, existence in dataset_existence.items() 

1594 } 

1595 

1596 def _mexists_check_expected( 

1597 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1598 ) -> dict[DatasetRef, bool]: 

1599 """Check existence of refs that are not known to datastore. 

1600 

1601 Parameters 

1602 ---------- 

1603 refs : iterable of `DatasetRef` 

1604 The datasets to be checked. These are assumed not to be known 

1605 to datastore. 

1606 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1607 Optional mapping of datastore artifact to existence. Updated by 

1608 this method with details of all artifacts tested. Can be `None` 

1609 if the caller is not interested. 

1610 

1611 Returns 

1612 ------- 

1613 existence : `dict` [`DatasetRef`, `bool`]

1614 Mapping from dataset to boolean indicating existence. 

1615 """ 

1616 dataset_existence: dict[DatasetRef, bool] = {} 

1617 if not self.trustGetRequest: 

1618 # Must assume these do not exist 

1619 for ref in refs: 

1620 dataset_existence[ref] = False 

1621 else: 

1622 log.debug( 

1623 "%d datasets were not known to datastore during initial existence check.", 

1624 len(refs), 

1625 ) 

1626 

1627 # Construct data structure identical to that returned 

1628 # by _get_stored_records_associated_with_refs() but using 

1629 # guessed names. 

1630 records = {} 

1631 id_to_ref = {} 

1632 for missing_ref in refs: 

1633 expected = self._get_expected_dataset_locations_info(missing_ref) 

1634 dataset_id = missing_ref.id 

1635 records[dataset_id] = [info for _, info in expected] 

1636 id_to_ref[dataset_id] = missing_ref 

1637 

1638 dataset_existence.update( 

1639 self._process_mexists_records( 

1640 id_to_ref, 

1641 records, 

1642 False, 

1643 artifact_existence=artifact_existence, 

1644 ) 

1645 ) 

1646 

1647 return dataset_existence 

1648 

1649 def exists(self, ref: DatasetRef) -> bool: 

1650 """Check if the dataset exists in the datastore. 

1651 

1652 Parameters 

1653 ---------- 

1654 ref : `DatasetRef` 

1655 Reference to the required dataset. 

1656 

1657 Returns 

1658 ------- 

1659 exists : `bool` 

1660 `True` if the entity exists in the `Datastore`. 

1661 

1662 Notes 

1663 ----- 

1664 The local cache is checked as a proxy for existence in the remote 

1665 object store. It is possible that another process on a different 

1666 compute node could remove the file from the object store even 

1667 though it is present in the local cache. 
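
Examples
--------
A minimal sketch (``datastore`` and ``ref`` are assumed to exist
already; the names are illustrative only)::

    if datastore.exists(ref):
        in_memory_dataset = datastore.get(ref)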

1668 """ 

1669 ref = self._cast_storage_class(ref) 

1670 # We cannot trust datastore records from ref, as many unit tests delete 

1671 # datasets and check their existence. 

1672 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True) 

1673 

1674 # If we are being asked to trust that the registry might not be correct,

1675 # we ask for the expected locations and check them explicitly.

1676 if not fileLocations: 

1677 if not self.trustGetRequest: 

1678 return False 

1679 

1680 # First check the cache. If it is not found we must check 

1681 # the datastore itself. Assume that any component in the cache 

1682 # means that the dataset does exist somewhere. 

1683 if self.cacheManager.known_to_cache(ref): 

1684 return True 

1685 

1686 # When we are guessing a dataset location we can not check 

1687 # for the existence of every component since we can not 

1688 # know if every component was written. Instead we check 

1689 # for the existence of any of the expected locations. 

1690 for location, _ in self._get_expected_dataset_locations_info(ref): 

1691 if self._artifact_exists(location): 

1692 return True 

1693 return False 

1694 

1695 # All listed artifacts must exist. 

1696 for location, storedFileInfo in fileLocations: 

1697 # Checking in cache needs the component ref. 

1698 check_ref = ref 

1699 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1700 check_ref = ref.makeComponentRef(component) 

1701 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1702 continue 

1703 

1704 if not self._artifact_exists(location): 

1705 return False 

1706 

1707 return True 

1708 

1709 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

1710 """Return URIs associated with dataset. 

1711 

1712 Parameters 

1713 ---------- 

1714 ref : `DatasetRef` 

1715 Reference to the required dataset. 

1716 predict : `bool`, optional 

1717 If the datastore does not know about the dataset, controls whether 

1718 it should return a predicted URI or not. 

1719 

1720 Returns 

1721 ------- 

1722 uris : `DatasetRefURIs` 

1723 The URI to the primary artifact associated with this dataset (if 

1724 the dataset was disassembled within the datastore this may be 

1725 `None`), and the URIs to any components associated with the dataset 

1726 artifact (this can be empty if there are no components).

1727 """ 

1728 many = self.getManyURIs([ref], predict=predict, allow_missing=False) 

1729 return many[ref] 

1730 

1731 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1732 """URI to the Dataset. 

1733 

1734 Parameters 

1735 ---------- 

1736 ref : `DatasetRef` 

1737 Reference to the required Dataset. 

1738 predict : `bool` 

1739 If `True`, allow URIs to be returned of datasets that have not 

1740 been written. 

1741 

1742 Returns 

1743 ------- 

1744 uri : `lsst.resources.ResourcePath`

1745 URI pointing to the dataset within the datastore. If the 

1746 dataset does not exist in the datastore, and if ``predict`` is 

1747 `True`, the URI will be a prediction and will include a URI 

1748 fragment "#predicted". 

1749 If the datastore does not have entities that relate well 

1750 to the concept of a URI, the returned URI will be

1751 descriptive. The returned URI is not guaranteed to be obtainable. 

1752 

1753 Raises 

1754 ------ 

1755 FileNotFoundError 

1756 Raised if a URI has been requested for a dataset that does not 

1757 exist and guessing is not allowed. 

1758 RuntimeError 

1759 Raised if a request is made for a single URI but multiple URIs 

1760 are associated with this dataset. 

1761 

1762 Notes 

1763 ----- 

1764 When a predicted URI is requested an attempt will be made to form 

1765 a reasonable URI based on file templates and the expected formatter. 
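
Examples
--------
A minimal sketch (``datastore`` and ``ref`` are assumed to exist
already; the names are illustrative only)::

    try:
        uri = datastore.getURI(ref)
    except RuntimeError:
        # Disassembled composite: fall back to the per-component URIs.
        primary, components = datastore.getURIs(ref)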

1766 """ 

1767 primary, components = self.getURIs(ref, predict) 

1768 if primary is None or components: 

1769 raise RuntimeError( 

1770 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1771 ) 

1772 return primary 

1773 

1774 def _predict_URIs( 

1775 self, 

1776 ref: DatasetRef, 

1777 ) -> DatasetRefURIs: 

1778 """Predict the URIs of a dataset ref. 

1779 

1780 Parameters 

1781 ---------- 

1782 ref : `DatasetRef` 

1783 Reference to the required Dataset. 

1784 

1785 Returns 

1786 ------- 

1787 uris : `DatasetRefURIs`

1788 Primary and component URIs. URIs will contain a URI fragment 

1789 "#predicted". 

1790 """ 

1791 uris = DatasetRefURIs() 

1792 

1793 if self.composites.shouldBeDisassembled(ref): 

1794 for component, _ in ref.datasetType.storageClass.components.items(): 

1795 comp_ref = ref.makeComponentRef(component) 

1796 comp_location, _ = self._determine_put_formatter_location(comp_ref) 

1797 

1798 # Add the "#predicted" URI fragment to indicate this is a 

1799 # guess 

1800 uris.componentURIs[component] = ResourcePath( 

1801 comp_location.uri.geturl() + "#predicted", forceDirectory=comp_location.uri.dirLike 

1802 ) 

1803 

1804 else: 

1805 location, _ = self._determine_put_formatter_location(ref) 

1806 

1807 # Add the "#predicted" URI fragment to indicate this is a guess 

1808 uris.primaryURI = ResourcePath( 

1809 location.uri.geturl() + "#predicted", forceDirectory=location.uri.dirLike 

1810 ) 

1811 

1812 return uris 

1813 

1814 def getManyURIs( 

1815 self, 

1816 refs: Iterable[DatasetRef], 

1817 predict: bool = False, 

1818 allow_missing: bool = False, 

1819 ) -> dict[DatasetRef, DatasetRefURIs]: 

1820 # Docstring inherited 

1821 

1822 uris: dict[DatasetRef, DatasetRefURIs] = {} 

1823 

1824 records = self._get_stored_records_associated_with_refs(refs) 

1825 records_keys = records.keys() 

1826 

1827 existing_refs = tuple(ref for ref in refs if ref.id in records_keys) 

1828 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys) 

1829 

1830 # Have to handle trustGetRequest mode by checking for the existence 

1831 # of the missing refs on disk. 

1832 if missing_refs: 

1833 dataset_existence = self._mexists_check_expected(missing_refs, None) 

1834 really_missing = set() 

1835 not_missing = set() 

1836 for ref, exists in dataset_existence.items(): 

1837 if exists: 

1838 not_missing.add(ref) 

1839 else: 

1840 really_missing.add(ref) 

1841 

1842 if not_missing: 

1843 # Need to recalculate the missing/existing split. 

1844 existing_refs = existing_refs + tuple(not_missing) 

1845 missing_refs = tuple(really_missing) 

1846 

1847 for ref in missing_refs: 

1848 # if this has never been written then we have to guess 

1849 if not predict: 

1850 if not allow_missing: 

1851 raise FileNotFoundError(f"Dataset {ref} not in this datastore.") 

1852 else: 

1853 uris[ref] = self._predict_URIs(ref) 

1854 

1855 for ref in existing_refs: 

1856 file_infos = records[ref.id] 

1857 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos] 

1858 uris[ref] = self._locations_to_URI(ref, file_locations) 

1859 

1860 return uris 

1861 

1862 def _locations_to_URI( 

1863 self, 

1864 ref: DatasetRef, 

1865 file_locations: Sequence[tuple[Location, StoredFileInfo]], 

1866 ) -> DatasetRefURIs: 

1867 """Convert one or more file locations associated with a DatasetRef 

1868 to a DatasetRefURIs. 

1869 

1870 Parameters 

1871 ---------- 

1872 ref : `DatasetRef` 

1873 Reference to the dataset. 

1874 file_locations : `~collections.abc.Sequence` [`tuple` [`Location`, `StoredFileInfo`]]

1875 Each item in the sequence is the location of the dataset within the 

1876 datastore and stored information about the file and its formatter. 

1877 If there is only one item in the sequence then it is treated as the 

1878 primary URI. If there is more than one item then they are treated 

1879 as component URIs. If there are no items then an error is raised 

1880 unless ``self.trustGetRequest`` is `True`. 

1881 

1882 Returns 

1883 ------- 

1884 uris : `DatasetRefURIs`

1885 Represents the primary URI or component URIs described by the 

1886 inputs. 

1887 

1888 Raises 

1889 ------ 

1890 RuntimeError 

1891 If no file locations are passed in and ``self.trustGetRequest`` is 

1892 `False`. 

1893 FileNotFoundError 

1894 If a passed-in URI does not exist, and ``self.trustGetRequest``

1895 is `False`. 

1896 RuntimeError 

1897 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is 

1898 unexpected). 

1899 """ 

1900 guessing = False 

1901 uris = DatasetRefURIs() 

1902 

1903 if not file_locations: 

1904 if not self.trustGetRequest: 

1905 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1906 file_locations = self._get_expected_dataset_locations_info(ref) 

1907 guessing = True 

1908 

1909 if len(file_locations) == 1: 

1910 # No disassembly so this is the primary URI 

1911 uris.primaryURI = file_locations[0][0].uri 

1912 if guessing and not uris.primaryURI.exists(): 

1913 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist") 

1914 else: 

1915 for location, file_info in file_locations: 

1916 if file_info.component is None: 

1917 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1918 if guessing and not location.uri.exists(): 

1919 # If we are trusting then it is entirely possible for 

1920 # some components to be missing. In that case we skip 

1921 # to the next component. 

1922 if self.trustGetRequest: 

1923 continue 

1924 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist") 

1925 uris.componentURIs[file_info.component] = location.uri 

1926 

1927 return uris 

1928 

1929 def retrieveArtifacts( 

1930 self, 

1931 refs: Iterable[DatasetRef], 

1932 destination: ResourcePath, 

1933 transfer: str = "auto", 

1934 preserve_path: bool = True, 

1935 overwrite: bool = False, 

1936 ) -> list[ResourcePath]: 

1937 """Retrieve the file artifacts associated with the supplied refs. 

1938 

1939 Parameters 

1940 ---------- 

1941 refs : iterable of `DatasetRef` 

1942 The datasets for which file artifacts are to be retrieved. 

1943 A single ref can result in multiple files. The refs must 

1944 be resolved. 

1945 destination : `lsst.resources.ResourcePath` 

1946 Location to write the file artifacts. 

1947 transfer : `str`, optional 

1948 Method to use to transfer the artifacts. Must be one of the options 

1949 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1950 "move" is not allowed. 

1951 preserve_path : `bool`, optional 

1952 If `True` the full path of the file artifact within the datastore 

1953 is preserved. If `False` the final file component of the path 

1954 is used. 

1955 overwrite : `bool`, optional 

1956 If `True` allow transfers to overwrite existing files at the 

1957 destination. 

1958 

1959 Returns 

1960 ------- 

1961 targets : `list` of `lsst.resources.ResourcePath` 

1962 URIs of file artifacts in destination location. Order is not 

1963 preserved. 
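
Examples
--------
A minimal sketch of copying artifacts to a local directory
(``datastore`` and ``refs`` are assumed to exist already; the
destination path is illustrative only)::

    from lsst.resources import ResourcePath

    destination = ResourcePath("/tmp/export/", forceDirectory=True)
    targets = datastore.retrieveArtifacts(refs, destination, transfer="copy")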

1964 """ 

1965 if not destination.isdir(): 

1966 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1967 

1968 if transfer == "move": 

1969 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1970 

1971 # Source -> Destination 

1972 # This also helps filter out duplicate DatasetRef in the request 

1973 # that will map to the same underlying file transfer. 

1974 to_transfer: dict[ResourcePath, ResourcePath] = {} 

1975 

1976 for ref in refs: 

1977 locations = self._get_dataset_locations_info(ref) 

1978 for location, _ in locations: 

1979 source_uri = location.uri 

1980 target_path: ResourcePathExpression 

1981 if preserve_path: 

1982 target_path = location.pathInStore 

1983 if target_path.isabs(): 

1984 # This is an absolute path to an external file. 

1985 # Use the full path. 

1986 target_path = target_path.relativeToPathRoot 

1987 else: 

1988 target_path = source_uri.basename() 

1989 target_uri = destination.join(target_path) 

1990 to_transfer[source_uri] = target_uri 

1991 

1992 # In theory can now parallelize the transfer 

1993 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

1994 for source_uri, target_uri in to_transfer.items(): 

1995 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1996 

1997 return list(to_transfer.values()) 

1998 

1999 def get( 

2000 self, 

2001 ref: DatasetRef, 

2002 parameters: Mapping[str, Any] | None = None, 

2003 storageClass: StorageClass | str | None = None, 

2004 ) -> Any: 

2005 """Load an InMemoryDataset from the store. 

2006 

2007 Parameters 

2008 ---------- 

2009 ref : `DatasetRef` 

2010 Reference to the required Dataset. 

2011 parameters : `dict` 

2012 `StorageClass`-specific parameters that specify, for example, 

2013 a slice of the dataset to be loaded. 

2014 storageClass : `StorageClass` or `str`, optional 

2015 The storage class to be used to override the Python type 

2016 returned by this method. By default the returned type matches 

2017 the dataset type definition for this dataset. Specifying a 

2018 read `StorageClass` can force a different type to be returned. 

2019 This type must be compatible with the original type. 

2020 

2021 Returns 

2022 ------- 

2023 inMemoryDataset : `object` 

2024 Requested dataset or slice thereof as an InMemoryDataset. 

2025 

2026 Raises 

2027 ------ 

2028 FileNotFoundError 

2029 Requested dataset can not be retrieved. 

2030 TypeError 

2031 Return value from formatter has unexpected type. 

2032 ValueError 

2033 Formatter failed to process the dataset. 
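
Examples
--------
A minimal sketch (``datastore``, ``ref`` and ``bbox`` are assumed to
exist already; the parameter and storage class names are illustrative
only and must be valid for the dataset's storage class)::

    # Read only a subset of the dataset.
    subset = datastore.get(ref, parameters={"bbox": bbox})

    # Force a different, compatible Python type to be returned.
    converted = datastore.get(ref, storageClass="ArrowAstropy")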

2034 """ 

2035 # Supplied storage class for the component being read is either 

2036 # from the ref itself or from an override if we want to force

2037 # type conversion. 

2038 if storageClass is not None: 

2039 ref = ref.overrideStorageClass(storageClass) 

2040 

2041 allGetInfo = self._prepare_for_direct_get(ref, parameters) 

2042 return get_dataset_as_python_object_from_get_info( 

2043 allGetInfo, ref=ref, parameters=parameters, cache_manager=self.cacheManager 

2044 ) 

2045 

2046 def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload: 

2047 # Docstring inherited 

2048 

2049 # 1 hour. Chosen somewhat arbitrarily -- this is long enough that the 

2050 # client should have time to download a large file with retries if 

2051 # needed, but short enough that it will become obvious quickly that 

2052 # these URLs expire. 

2053 # From a strictly technical standpoint there is no reason this 

2054 # shouldn't be a day or more, but there appears to be a policy concern

2055 # that end users could share presigned URLs with people who lack

2056 # the access rights to download the data.

2057 url_expiration_time_seconds = 1 * 60 * 60 

2058 

2059 def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo: 

2060 location, file_info = info 

2061 return FileDatastoreGetPayloadFileInfo( 

2062 url=location.uri.generate_presigned_get_url( 

2063 expiration_time_seconds=url_expiration_time_seconds 

2064 ), 

2065 datastoreRecords=file_info.to_simple(), 

2066 ) 

2067 

2068 return FileDatastoreGetPayload( 

2069 datastore_type="file", 

2070 dataset_ref=ref.to_simple(), 

2071 file_info=[to_file_info_payload(info) for info in self._get_dataset_locations_info(ref)], 

2072 ) 

2073 

2074 @transactional 

2075 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2076 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2077 

2078 Parameters 

2079 ---------- 

2080 inMemoryDataset : `object` 

2081 The dataset to store. 

2082 ref : `DatasetRef` 

2083 Reference to the associated Dataset. 

2084 

2085 Raises 

2086 ------ 

2087 TypeError 

2088 Supplied object and storage class are inconsistent. 

2089 DatasetTypeNotSupportedError 

2090 The associated `DatasetType` is not handled by this datastore. 

2091 

2092 Notes 

2093 ----- 

2094 If the datastore is configured to reject certain dataset types it 

2095 is possible that the put will fail and raise a 

2096 `DatasetTypeNotSupportedError`. The main use case for this is to 

2097 allow `ChainedDatastore` to put to multiple datastores without 

2098 requiring that every datastore accepts the dataset. 
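
Examples
--------
A minimal sketch of a put followed by a read-back (``datastore``,
``ref`` and ``in_memory_dataset`` are assumed to exist already; the
names are illustrative only)::

    datastore.put(in_memory_dataset, ref)
    round_tripped = datastore.get(ref)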

2099 """ 

2100 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2101 # doDisassembly = True 

2102 

2103 artifacts = [] 

2104 if doDisassembly: 

2105 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2106 if components is None: 

2107 raise RuntimeError( 

2108 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2109 f"with storage class {ref.datasetType.storageClass.name} " 

2110 "is configured to be disassembled, but cannot be." 

2111 ) 

2112 for component, componentInfo in components.items(): 

2113 # Don't recurse because we want to take advantage of 

2114 # bulk insert -- need a new DatasetRef that refers to the 

2115 # same dataset_id but has the component DatasetType 

2116 # DatasetType does not describe the types of its components,

2117 # so we construct the component ref ourselves.

2118 compRef = ref.makeComponentRef(component) 

2119 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2120 artifacts.append((compRef, storedInfo)) 

2121 else: 

2122 # Write the entire thing out 

2123 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2124 artifacts.append((ref, storedInfo)) 

2125 

2126 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT) 

2127 

2128 @transactional 

2129 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]: 

2130 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2131 # doDisassembly = True 

2132 

2133 artifacts = [] 

2134 if doDisassembly: 

2135 components = ref.datasetType.storageClass.delegate().disassemble(in_memory_dataset) 

2136 if components is None: 

2137 raise RuntimeError( 

2138 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2139 f"with storage class {ref.datasetType.storageClass.name} " 

2140 "is configured to be disassembled, but cannot be." 

2141 ) 

2142 for component, componentInfo in components.items(): 

2143 # Don't recurse because we want to take advantage of 

2144 # bulk insert -- need a new DatasetRef that refers to the 

2145 # same dataset_id but has the component DatasetType 

2146 # DatasetType does not describe the types of its components,

2147 # so we construct the component ref ourselves.

2148 compRef = ref.makeComponentRef(component) 

2149 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2150 artifacts.append((compRef, storedInfo)) 

2151 else: 

2152 # Write the entire thing out 

2153 storedInfo = self._write_in_memory_to_artifact(in_memory_dataset, ref) 

2154 artifacts.append((ref, storedInfo)) 

2155 

2156 ref_records = {self._opaque_table_name: [info for _, info in artifacts]} 

2157 ref = ref.replace(datastore_records=ref_records) 

2158 return {self.name: ref} 

2159 

2160 @transactional 

2161 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

2162 # At this point can safely remove these datasets from the cache 

2163 # to avoid confusion later on. If they are not trashed later 

2164 # the cache will simply be refilled. 

2165 self.cacheManager.remove_from_cache(ref) 

2166 

2167 # If we are in trust mode there will be nothing to move to 

2168 # the trash table and we will have to try to delete the file 

2169 # immediately. 

2170 if self.trustGetRequest: 

2171 # Try to keep the logic below for a single file trash. 

2172 if isinstance(ref, DatasetRef): 

2173 refs = {ref} 

2174 else: 

2175 # Will recreate ref at the end of this branch. 

2176 refs = set(ref) 

2177 

2178 # Determine which datasets are known to datastore directly. 

2179 id_to_ref = {ref.id: ref for ref in refs} 

2180 existing_ids = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) 

2181 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2182 

2183 missing = refs - existing_refs 

2184 if missing: 

2185 # Do an explicit existence check on these refs. 

2186 # We only care about the artifacts at this point and not 

2187 # the dataset existence. 

2188 artifact_existence: dict[ResourcePath, bool] = {} 

2189 _ = self.mexists(missing, artifact_existence) 

2190 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2191 

2192 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2193 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2194 for uri in uris: 

2195 try: 

2196 uri.remove() 

2197 except Exception as e: 

2198 if ignore_errors: 

2199 log.debug("Artifact %s could not be removed: %s", uri, e) 

2200 continue 

2201 raise 

2202 

2203 # There is no point asking the code below to remove refs we 

2204 # know are missing so update it with the list of existing 

2205 # records. Try to retain one vs many logic. 

2206 if not existing_refs: 

2207 # Nothing more to do since none of the datasets were 

2208 # known to the datastore record table. 

2209 return 

2210 ref = list(existing_refs) 

2211 if len(ref) == 1: 

2212 ref = ref[0] 

2213 

2214 # Get file metadata and internal metadata 

2215 if not isinstance(ref, DatasetRef): 

2216 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2217 # Assumed to be an iterable of refs so bulk mode enabled. 

2218 try: 

2219 self.bridge.moveToTrash(ref, transaction=self._transaction) 

2220 except Exception as e: 

2221 if ignore_errors: 

2222 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2223 else: 

2224 raise 

2225 return 

2226 

2227 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2228 

2229 fileLocations = self._get_dataset_locations_info(ref) 

2230 

2231 if not fileLocations: 

2232 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2233 if ignore_errors: 

2234 log.warning(err_msg) 

2235 return 

2236 else: 

2237 raise FileNotFoundError(err_msg) 

2238 

2239 for location, _ in fileLocations: 

2240 if not self._artifact_exists(location): 

2241 err_msg = ( 

2242 f"Dataset is known to datastore {self.name} but " 

2243 f"associated artifact ({location.uri}) is missing" 

2244 ) 

2245 if ignore_errors: 

2246 log.warning(err_msg) 

2247 return 

2248 else: 

2249 raise FileNotFoundError(err_msg) 

2250 

2251 # Mark dataset as trashed 

2252 try: 

2253 self.bridge.moveToTrash([ref], transaction=self._transaction) 

2254 except Exception as e: 

2255 if ignore_errors: 

2256 log.warning( 

2257 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2258 "but encountered an error: %s", 

2259 ref, 

2260 self.name, 

2261 e, 

2262 ) 

2263 pass 

2264 else: 

2265 raise 

2266 

2267 @transactional 

2268 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2269 """Remove all datasets from the trash. 

2270 

2271 Parameters 

2272 ---------- 

2273 ignore_errors : `bool` 

2274 If `True` return without error even if something went wrong. 

2275 Problems could occur if another process is simultaneously trying 

2276 to delete. 

2277 """ 

2278 log.debug("Emptying trash in datastore %s", self.name) 

2279 

2280 # Context manager will empty trash iff we finish it without raising. 

2281 # It will also automatically delete the relevant rows from the 

2282 # trash table and the records table. 

2283 with self.bridge.emptyTrash( 

2284 self._table, record_class=StoredFileInfo, record_column="path" 

2285 ) as trash_data: 

2286 # Removing the artifacts themselves requires that the files are 

2287 # not also associated with refs that are not to be trashed. 

2288 # Therefore need to do a query with the file paths themselves 

2289 # and return all the refs associated with them. Can only delete 

2290 # a file if the refs to be trashed are the only refs associated 

2291 # with the file. 

2292 # This requires multiple copies of the trashed items 

2293 trashed, artifacts_to_keep = trash_data 

2294 

2295 if artifacts_to_keep is None: 

2296 # The bridge is not helping us so have to work it out 

2297 # ourselves. This is not going to be as efficient. 

2298 trashed = list(trashed) 

2299 

2300 # The instance check is for mypy since up to this point it 

2301 # does not know the type of info. 

2302 path_map = self._refs_associated_with_artifacts( 

2303 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2304 ) 

2305 

2306 for ref, info in trashed: 

2307 # Mypy needs to know this is not the base class 

2308 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2309 

2310 path_map[info.path].remove(ref.id) 

2311 if not path_map[info.path]: 

2312 del path_map[info.path] 

2313 

2314 artifacts_to_keep = set(path_map) 

2315 

2316 for ref, info in trashed: 

2317 # Should not happen for this implementation but need 

2318 # to keep mypy happy. 

2319 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2320 

2321 # Mypy needs to know this is not the base class 

2322 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2323 

2324 if info.path in artifacts_to_keep: 

2325 # This is a multi-dataset artifact and we are not 

2326 # removing all associated refs. 

2327 continue 

2328 

2329 # Only trashed refs still known to datastore will be returned. 

2330 location = info.file_location(self.locationFactory) 

2331 

2332 # Point of no return for this artifact 

2333 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2334 try: 

2335 self._delete_artifact(location) 

2336 except FileNotFoundError: 

2337 # If the file itself has been deleted there is nothing 

2338 # we can do about it. It is possible that trash has 

2339 # been run in parallel in another process or someone 

2340 # decided to delete the file. It is unlikely to come 

2341 # back and so we should still continue with the removal 

2342 # of the entry from the trash table. It is also possible 

2343 # we removed it in a previous iteration if it was 

2344 # a multi-dataset artifact. The delete artifact method 

2345 # will log a debug message in this scenario. 

2346 # Distinguishing file missing before trash started and 

2347 # file already removed previously as part of this trash 

2348 # is not worth the distinction with regards to potential 

2349 # memory cost. 

2350 pass 

2351 except Exception as e: 

2352 if ignore_errors: 

2353 # Use a debug message here even though it's not 

2354 # a good situation. In some cases this can be 

2355 # caused by a race between user A and user B 

2356 # and neither of them has permissions for the 

2357 # other's files. Butler does not know about users 

2358 # and trash has no idea what collections these 

2359 # files were in (without guessing from a path). 

2360 log.debug( 

2361 "Encountered error removing artifact %s from datastore %s: %s", 

2362 location.uri, 

2363 self.name, 

2364 e, 

2365 ) 

2366 else: 

2367 raise 

2368 

2369 @transactional 

2370 def transfer_from( 

2371 self, 

2372 source_datastore: Datastore, 

2373 refs: Collection[DatasetRef], 

2374 transfer: str = "auto", 

2375 artifact_existence: dict[ResourcePath, bool] | None = None, 

2376 dry_run: bool = False, 

2377 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

2378 # Docstring inherited 

2379 if type(self) is not type(source_datastore): 

2380 raise TypeError( 

2381 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2382 f"source datastore ({type(source_datastore)})." 

2383 ) 

2384 

2385 # Be explicit for mypy 

2386 if not isinstance(source_datastore, FileDatastore): 

2387 raise TypeError( 

2388 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2389 f" {type(source_datastore)}" 

2390 ) 

2391 

2392 # Stop early if "direct" transfer mode is requested. That would 

2393 # require that the URI inside the source datastore should be stored 

2394 # directly in the target datastore, which seems unlikely to be useful 

2395 # since at any moment the source datastore could delete the file. 

2396 if transfer in ("direct", "split"): 

2397 raise ValueError( 

2398 f"Can not transfer from a source datastore using {transfer} mode since" 

2399 " those files are controlled by the other datastore." 

2400 ) 

2401 

2402 # Empty existence lookup if none given. 

2403 if artifact_existence is None: 

2404 artifact_existence = {} 

2405 

2406 # In order to handle disassembled composites the code works 

2407 # at the records level since it can assume that internal APIs 

2408 # can be used. 

2409 # - If the record already exists in the destination this is assumed 

2410 # to be okay. 

2411 # - If there is no record but the source and destination URIs are 

2412 # identical no transfer is done but the record is added. 

2413 # - If the source record refers to an absolute URI currently assume 

2414 # that that URI should remain absolute and will be visible to the 

2415 # destination butler. May need to have a flag to indicate whether 

2416 # the dataset should be transferred. This will only happen if 

2417 # the detached Butler has had a local ingest. 

2418 

2419 # What we really want is all the records in the source datastore 

2420 # associated with these refs. Or derived ones if they don't exist 

2421 # in the source. 

2422 source_records = source_datastore._get_stored_records_associated_with_refs( 

2423 refs, ignore_datastore_records=True 

2424 ) 

2425 

2426 # The source dataset_ids are the keys in these records 

2427 source_ids = set(source_records) 

2428 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2429 

2430 requested_ids = {ref.id for ref in refs} 

2431 missing_ids = requested_ids - source_ids 

2432 

2433 # Missing IDs can be okay if that datastore has allowed 

2434 # gets based on file existence. Should we transfer what we can 

2435 # or complain about it and warn? 

2436 if missing_ids and not source_datastore.trustGetRequest: 

2437 raise ValueError( 

2438 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2439 ) 

2440 

2441 # Need to map these missing IDs to a DatasetRef so we can guess 

2442 # the details. 

2443 if missing_ids: 

2444 log.info( 

2445 "Number of expected datasets missing from source datastore records: %d out of %d", 

2446 len(missing_ids), 

2447 len(requested_ids), 

2448 ) 

2449 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2450 

2451 # This should be chunked in case we end up having to check 

2452 # the file store since we need some log output to show 

2453 # progress. 

2454 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2455 records = {} 

2456 for missing in missing_ids_chunk: 

2457 # Ask the source datastore where the missing artifacts 

2458 # should be. An execution butler might not know about the 

2459 # artifacts even if they are there. 

2460 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2461 records[missing] = [info for _, info in expected] 

2462 

2463 # Call the mexist helper method in case we have not already 

2464 # checked these artifacts such that artifact_existence is 

2465 # empty. This allows us to benefit from parallelism. 

2466 # datastore.mexists() itself does not give us access to the 

2467 # derived datastore record. 

2468 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2469 ref_exists = source_datastore._process_mexists_records( 

2470 id_to_ref, records, False, artifact_existence=artifact_existence 

2471 ) 

2472 

2473 # Now go through the records and propagate the ones that exist. 

2474 location_factory = source_datastore.locationFactory 

2475 for missing, record_list in records.items(): 

2476 # Skip completely if the ref does not exist. 

2477 ref = id_to_ref[missing] 

2478 if not ref_exists[ref]: 

2479 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2480 continue 

2481 # Check for file artifact to decide which parts of a 

2482 # disassembled composite do exist. If there is only a 

2483 # single record we don't even need to look because it can't 

2484 # be a composite and must exist. 

2485 if len(record_list) == 1: 

2486 dataset_records = record_list 

2487 else: 

2488 dataset_records = [ 

2489 record 

2490 for record in record_list 

2491 if artifact_existence[record.file_location(location_factory).uri] 

2492 ] 

2493 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2494 

2495 # Rely on source_records being a defaultdict. 

2496 source_records[missing].extend(dataset_records) 

2497 log.verbose("Completed scan for missing data files") 

2498 

2499 # See if we already have these records 

2500 target_records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) 

2501 

2502 # The artifacts to register 

2503 artifacts = [] 

2504 

2505 # Refs that already exist 

2506 already_present = [] 

2507 

2508 # Refs that were rejected by this datastore. 

2509 rejected = set() 

2510 

2511 # Refs that were transferred successfully. 

2512 accepted = set() 

2513 

2514 # Record each time we have done a "direct" transfer. 

2515 direct_transfers = [] 

2516 

2517 # Now can transfer the artifacts 

2518 for ref in refs: 

2519 if not self.constraints.isAcceptable(ref): 

2520 # This datastore should not be accepting this dataset. 

2521 rejected.add(ref) 

2522 continue 

2523 

2524 accepted.add(ref) 

2525 

2526 if ref.id in target_records: 

2527 # Already have an artifact for this. 

2528 already_present.append(ref) 

2529 continue 

2530 

2531 # mypy needs to know these are always resolved refs 

2532 for info in source_records[ref.id]: 

2533 source_location = info.file_location(source_datastore.locationFactory) 

2534 target_location = info.file_location(self.locationFactory) 

2535 if source_location == target_location and not source_location.pathInStore.isabs(): 

2536 # Artifact is already in the target location. 

2537 # (which is how execution butler currently runs) 

2538 pass 

2539 else: 

2540 if target_location.pathInStore.isabs(): 

2541 # Just because we can see the artifact when running 

2542 # the transfer doesn't mean it will be generally 

2543 # accessible to a user of this butler. Need to decide 

2544 # what to do about an absolute path. 

2545 if transfer == "auto": 

2546 # For "auto" transfers we allow the absolute URI 

2547 # to be recorded in the target datastore. 

2548 direct_transfers.append(source_location) 

2549 else: 

2550 # The user is explicitly requesting a transfer 

2551 # even for an absolute URI. This requires us to 

2552 # calculate the target path. 

2553 template_ref = ref 

2554 if info.component: 

2555 template_ref = ref.makeComponentRef(info.component) 

2556 target_location = self._calculate_ingested_datastore_name( 

2557 source_location.uri, 

2558 template_ref, 

2559 ) 

2560 

2561 info = info.update(path=target_location.pathInStore.path) 

2562 

2563 # Need to transfer it to the new location. 

2564 # Assume we should always overwrite. If the artifact 

2565 # is there this might indicate that a previous transfer 

2566 # was interrupted but was not able to be rolled back 

2567 # completely (eg pre-emption) so follow Datastore default 

2568 # and overwrite. Do not copy if we are in dry-run mode. 

2569 if not dry_run: 

2570 target_location.uri.transfer_from( 

2571 source_location.uri, 

2572 transfer=transfer, 

2573 overwrite=True, 

2574 transaction=self._transaction, 

2575 ) 

2576 

2577 artifacts.append((ref, info)) 

2578 

2579 if direct_transfers: 

2580 log.info( 

2581 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s", 

2582 len(direct_transfers), 

2583 "" if len(direct_transfers) == 1 else "s", 

2584 ) 

2585 

2586 # We are overwriting previous datasets that may have already 

2587 # existed. We therefore should ensure that we force the 

2588 # datastore records to agree. Note that this can potentially lead 

2589 # to difficulties if the dataset has previously been ingested 

2590 # disassembled and is somehow now assembled, or vice versa. 

2591 if not dry_run: 

2592 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE) 

2593 

2594 if already_present: 

2595 n_skipped = len(already_present) 

2596 log.info( 

2597 "Skipped transfer of %d dataset%s already present in datastore", 

2598 n_skipped, 

2599 "" if n_skipped == 1 else "s", 

2600 ) 

2601 

2602 return accepted, rejected 

2603 

2604 @transactional 

2605 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2606 # Docstring inherited. 

2607 refs = list(refs) 

2608 self.bridge.forget(refs) 

2609 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs]) 

2610 

2611 def validateConfiguration( 

2612 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

2613 ) -> None: 

2614 """Validate some of the configuration for this datastore. 

2615 

2616 Parameters 

2617 ---------- 

2618 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2619 Entities to test against this configuration. Can be differing 

2620 types. 

2621 logFailures : `bool`, optional 

2622 If `True`, output a log message for every validation error 

2623 detected. 

2624 

2625 Raises 

2626 ------ 

2627 DatastoreValidationError 

2628 Raised if there is a validation problem with a configuration. 

2629 All the problems are reported in a single exception. 

2630 

2631 Notes 

2632 ----- 

2633 This method checks that all the supplied entities have valid file 

2634 templates and also have formatters defined. 
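
Examples
--------
A minimal sketch (``datastore`` and ``dataset_type`` are assumed to
exist already; the names are illustrative only)::

    try:
        datastore.validateConfiguration([dataset_type], logFailures=True)
    except DatastoreValidationError as err:
        # All detected problems are reported in this single exception.
        print(err)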

2635 """ 

2636 templateFailed = None 

2637 try: 

2638 self.templates.validateTemplates(entities, logFailures=logFailures) 

2639 except FileTemplateValidationError as e: 

2640 templateFailed = str(e) 

2641 

2642 formatterFailed = [] 

2643 for entity in entities: 

2644 try: 

2645 self.formatterFactory.getFormatterClass(entity) 

2646 except KeyError as e: 

2647 formatterFailed.append(str(e)) 

2648 if logFailures: 

2649 log.critical("Formatter failure: %s", e) 

2650 

2651 if templateFailed or formatterFailed: 

2652 messages = [] 

2653 if templateFailed: 

2654 messages.append(templateFailed) 

2655 if formatterFailed: 

2656 messages.append(",".join(formatterFailed)) 

2657 msg = ";\n".join(messages) 

2658 raise DatastoreValidationError(msg) 

2659 

2660 def getLookupKeys(self) -> set[LookupKey]: 

2661 # Docstring is inherited from base class 

2662 return ( 

2663 self.templates.getLookupKeys() 

2664 | self.formatterFactory.getLookupKeys() 

2665 | self.constraints.getLookupKeys() 

2666 ) 

2667 

2668 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

2669 # Docstring is inherited from base class 

2670 # The key can be valid in either formatters or templates so we can 

2671 # only check the template if it exists 

2672 if lookupKey in self.templates: 

2673 try: 

2674 self.templates[lookupKey].validateTemplate(entity) 

2675 except FileTemplateValidationError as e: 

2676 raise DatastoreValidationError(e) from e 

2677 

2678 def export( 

2679 self, 

2680 refs: Iterable[DatasetRef], 

2681 *, 

2682 directory: ResourcePathExpression | None = None, 

2683 transfer: str | None = "auto", 

2684 ) -> Iterable[FileDataset]: 

2685 # Docstring inherited from Datastore.export. 

2686 if transfer == "auto" and directory is None: 

2687 transfer = None 

2688 

2689 if transfer is not None and directory is None: 

2690 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2691 

2692 if transfer == "move": 

2693 raise TypeError("Can not export by moving files out of datastore.") 

2694 elif transfer == "direct": 

2695 # For an export, treat this as equivalent to None. We do not 

2696 # want an import to risk using absolute URIs to datasets owned 

2697 # by another datastore. 

2698 log.info("Treating 'direct' transfer mode as in-place export.") 

2699 transfer = None 

2700 

2701 # Force the directory to be a URI object 

2702 directoryUri: ResourcePath | None = None 

2703 if directory is not None: 

2704 directoryUri = ResourcePath(directory, forceDirectory=True) 

2705 

2706 if transfer is not None and directoryUri is not None and not directoryUri.exists(): 

2707 # mypy needs the second test 

2708 raise FileNotFoundError(f"Export location {directory} does not exist") 

2709 

2710 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2711 for ref in progress.wrap(refs, "Exporting dataset files"): 

2712 fileLocations = self._get_dataset_locations_info(ref) 

2713 if not fileLocations: 

2714 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2715 # For now we can not export disassembled datasets 

2716 if len(fileLocations) > 1: 

2717 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2718 location, storedFileInfo = fileLocations[0] 

2719 

2720 pathInStore = location.pathInStore.path 

2721 if transfer is None: 

2722 # TODO: do we also need to return the readStorageClass somehow? 

2723 # We will use the path in store directly. If this is an 

2724 # absolute URI, preserve it. 

2725 if location.pathInStore.isabs(): 

2726 pathInStore = str(location.uri) 

2727 elif transfer == "direct": 

2728 # Use full URIs to the remote store in the export 

2729 pathInStore = str(location.uri) 

2730 else: 

2731 # mypy needs help 

2732 assert directoryUri is not None, "directoryUri must be defined to get here" 

2733 storeUri = ResourcePath(location.uri, forceDirectory=False) 

2734 

2735 # if the datastore has an absolute URI to a resource, we 

2736 # have two options: 

2737 # 1. Keep the absolute URI in the exported YAML 

2738 # 2. Allocate a new name in the local datastore and transfer 

2739 # it. 

2740 # For now go with option 2 

2741 if location.pathInStore.isabs(): 

2742 template = self.templates.getTemplate(ref) 

2743 newURI = ResourcePath(template.format(ref), forceAbsolute=False, forceDirectory=False) 

2744 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2745 

2746 exportUri = directoryUri.join(pathInStore) 

2747 exportUri.transfer_from(storeUri, transfer=transfer) 

2748 

2749 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2750 

2751 @staticmethod 

2752 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None: 

2753 """Compute the checksum of the supplied file. 

2754 

2755 Parameters 

2756 ---------- 

2757 uri : `lsst.resources.ResourcePath` 

2758 Name of resource to calculate checksum from. 

2759 algorithm : `str`, optional 

2760 Name of algorithm to use. Must be one of the algorithms supported 

2761 by the :py:mod:`hashlib` module.

2762 block_size : `int` 

2763 Number of bytes to read from file at one time. 

2764 

2765 Returns 

2766 ------- 

2767 hexdigest : `str` 

2768 Hex digest of the file. 

2769 

2770 Notes 

2771 ----- 

2772 Currently returns `None` if the URI is for a remote resource.
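
Examples
--------
A minimal sketch for a local file (the path is illustrative only)::

    from lsst.resources import ResourcePath

    uri = ResourcePath("/tmp/example.fits")
    digest = FileDatastore.computeChecksum(uri, algorithm="sha256")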

2773 """ 

2774 if algorithm not in hashlib.algorithms_guaranteed: 

2775 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib") 

2776 

2777 if not uri.isLocal: 

2778 return None 

2779 

2780 hasher = hashlib.new(algorithm) 

2781 

2782 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f: 

2783 for chunk in iter(lambda: f.read(block_size), b""): 

2784 hasher.update(chunk) 

2785 

2786 return hasher.hexdigest() 

2787 

2788 def needs_expanded_data_ids( 

2789 self, 

2790 transfer: str | None, 

2791 entity: DatasetRef | DatasetType | StorageClass | None = None, 

2792 ) -> bool: 

2793 # Docstring inherited. 

2794 # This _could_ also use entity to inspect whether the filename template 

2795 # involves placeholders other than the required dimensions for its 

2796 # dataset type, but that's not necessary for correctness; it just 

2797 # enables more optimizations (perhaps only in theory). 

2798 return transfer not in ("direct", None) 

2799 

2800 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2801 # Docstring inherited from the base class. 

2802 record_data = data.get(self.name) 

2803 if not record_data: 

2804 return 

2805 

2806 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records) 

2807 

2808 # TODO: Verify that there are no unexpected table names in the dict? 

2809 unpacked_records = [] 

2810 for dataset_id, dataset_data in record_data.records.items(): 

2811 records = dataset_data.get(self._table.name) 

2812 if records: 

2813 for info in records: 

2814 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

2815 unpacked_records.append(info.to_record(dataset_id=dataset_id)) 

2816 if unpacked_records: 

2817 self._table.insert(*unpacked_records, transaction=self._transaction) 

2818 

2819 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

2820 # Docstring inherited from the base class. 

2821 exported_refs = list(self._bridge.check(refs)) 

2822 ids = {ref.id for ref in exported_refs} 

2823 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids} 

2824 for row in self._table.fetch(dataset_id=ids): 

2825 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

2826 dataset_records = records.setdefault(row["dataset_id"], {}) 

2827 dataset_records.setdefault(self._table.name, []).append(info) 

2828 

2829 record_data = DatastoreRecordData(records=records) 

2830 return {self.name: record_data} 

2831 

2832 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

2833 # Docstring inherited from the base class. 

2834 self._retrieve_dataset_method = method 

2835 

2836 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef: 

2837 """Update dataset reference to use the storage class from registry.""" 

2838 if self._retrieve_dataset_method is None: 

2839 # We could raise an exception here but unit tests do not define 

2840 # this method. 

2841 return ref 

2842 dataset_type = self._retrieve_dataset_method(ref.datasetType.name) 

2843 if dataset_type is not None: 

2844 ref = ref.overrideStorageClass(dataset_type.storageClass) 

2845 return ref 

2846 

2847 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]: 

2848 # Docstring inherited from the base class. 

2849 return {self._opaque_table_name: DatastoreOpaqueTable(self.makeTableSpec(ddl.GUID), StoredFileInfo)}