Coverage for python/lsst/daf/butler/core/datastore.py: 51%

210 statements  

coverage.py v7.2.7, created at 2023-06-28 10:10 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Support for generic data stores.""" 

23 

24from __future__ import annotations 

25 

26__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError", "DatasetRefURIs") 

27 

28import contextlib 

29import dataclasses 

30import logging 

31from abc import ABCMeta, abstractmethod 

32from collections import abc, defaultdict 

33from collections.abc import Callable, Iterable, Iterator, Mapping 

34from typing import TYPE_CHECKING, Any, ClassVar 

35 

36from lsst.utils import doImportType 

37 

38from .config import Config, ConfigSubset 

39from .constraints import Constraints 

40from .exceptions import DatasetTypeNotSupportedError, ValidationError 

41from .fileDataset import FileDataset 

42from .storageClass import StorageClassFactory 

43 

44if TYPE_CHECKING: 

45 from lsst.resources import ResourcePath, ResourcePathExpression 

46 

47 from ..registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

48 from .configSupport import LookupKey 

49 from .datasets import DatasetRef, DatasetType 

50 from .datastoreRecordData import DatastoreRecordData 

51 from .storageClass import StorageClass 

52 

53 

54class DatastoreConfig(ConfigSubset): 

55 """Configuration for Datastores.""" 

56 

57 component = "datastore" 

58 requiredKeys = ("cls",) 

59 defaultConfigFile = "datastore.yaml" 

60 

61 

62class DatastoreValidationError(ValidationError): 

63 """There is a problem with the Datastore configuration.""" 

64 

65 pass 

66 

67 

68@dataclasses.dataclass(frozen=True) 

69class Event: 

70 """Representation of an event that can be rolled back.""" 

71 

72 __slots__ = {"name", "undoFunc", "args", "kwargs"} 

73 name: str 

74 undoFunc: Callable 

75 args: tuple 

76 kwargs: dict 

77 

78 

79class IngestPrepData: 

80 """A helper base class for `Datastore` ingest implementations. 

81 

82 Datastore implementations will generally need a custom implementation of 

83 this class. 

84 

85 Should be accessed as ``Datastore.IngestPrepData`` instead of via direct 

86 import. 

87 

88 Parameters 

89 ---------- 

90 refs : iterable of `DatasetRef` 

91 References for the datasets that can be ingested by this datastore. 

92 """ 

93 

94 def __init__(self, refs: Iterable[DatasetRef]): 

95 self.refs = {ref.id: ref for ref in refs} 

96 

97 

98class DatastoreTransaction: 

99 """Keeps a log of `Datastore` activity and allow rollback. 

100 

101 Parameters 

102 ---------- 

103 parent : `DatastoreTransaction`, optional 

104 The parent transaction, if any. 

105 """ 

106 

107 Event: ClassVar[type] = Event 

108 

109 parent: DatastoreTransaction | None 

110 """The parent transaction. (`DatastoreTransaction`, optional)""" 

111 

112 def __init__(self, parent: DatastoreTransaction | None = None): 

113 self.parent = parent 

114 self._log: list[Event] = [] 

115 

116 def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None: 

117 """Register event with undo function. 

118 

119 Parameters 

120 ---------- 

121 name : `str` 

122 Name of the event. 

123 undoFunc : func 

124 Function to undo this event. 

125 args : `tuple` 

126 Positional arguments to `undoFunc`. 

127 **kwargs 

128 Keyword arguments to `undoFunc`. 

129 """ 

130 self._log.append(self.Event(name, undoFunc, args, kwargs)) 

131 

132 @contextlib.contextmanager 

133 def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Iterator[None]: 

134 """Register undo function if nested operation succeeds. 

135 

136 Calls `registerUndo`. 

137 

138 This can be used to wrap individual undo-able statements within a 

139 DatastoreTransaction block. Multiple statements that can fail 

140 separately should not be part of the same `undoWith` block. 

141 

142 All arguments are forwarded directly to `registerUndo`. 
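
Examples
--------
A minimal sketch, assuming an ``os`` import and treating ``write_file``
and ``path`` as stand-ins for a real undo-able operation::

    import os

    with datastore.transaction() as txn:
        with txn.undoWith("write file", os.remove, path):
            write_file(path)

If ``write_file`` raises, no undo is registered; if a later operation in
the surrounding transaction fails, ``os.remove(path)`` is called during
rollback.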

143 """ 

144 try: 

145 yield None 

146 except BaseException: 

147 raise 

148 else: 

149 self.registerUndo(name, undoFunc, *args, **kwargs) 

150 

151 def rollback(self) -> None: 

152 """Roll back all events in this transaction.""" 

153 log = logging.getLogger(__name__) 

154 while self._log: 

155 ev = self._log.pop() 

156 try: 

157 log.debug( 

158 "Rolling back transaction: %s: %s(%s,%s)", 

159 ev.name, 

160 ev.undoFunc, 

161 ",".join(str(a) for a in ev.args), 

162 ",".join(f"{k}={v}" for k, v in ev.kwargs.items()), 

163 ) 

164 except Exception: 

165 # In case we had a problem in stringification of arguments 

166 log.warning("Rolling back transaction: %s", ev.name) 

167 try: 

168 ev.undoFunc(*ev.args, **ev.kwargs) 

169 except BaseException as e: 

170 # Deliberately swallow error that may occur in unrolling 

171 log.warning("Exception: %s caught while unrolling: %s", e, ev.name) 

172 pass 

173 

174 def commit(self) -> None: 

175 """Commit this transaction.""" 

176 if self.parent is None: 

177 # Just forget about the events, they have already happened. 

178 return 

179 else: 

180 # We may still want to roll back the events from this transaction

181 # as part of the parent.

182 self.parent._log.extend(self._log) 

183 

184 

185@dataclasses.dataclass 

186class DatasetRefURIs(abc.Sequence): 

187 """Represents the primary and component ResourcePath(s) associated with a 

188 DatasetRef. 

189 

190 This is used in places where its members used to be represented as a tuple 

191 `(primaryURI, componentURIs)`. To maintain backward compatibility this 

192 inherits from Sequence and so instances can be treated as a two-item 

193 tuple. 
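
Examples
--------
A small sketch of the tuple-compatible access; ``datastore`` and ``ref``
are assumed to be defined by the caller::

    uris = datastore.getURIs(ref)
    primary, components = uris          # two-item unpacking still works
    assert uris[0] is uris.primaryURI
    assert uris[1] is uris.componentURIs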

194 """ 

195 

196 def __init__( 

197 self, 

198 primaryURI: ResourcePath | None = None, 

199 componentURIs: dict[str, ResourcePath] | None = None, 

200 ): 

201 self.primaryURI = primaryURI 

202 """The URI to the primary artifact associated with this dataset. If the 

203 dataset was disassembled within the datastore this may be `None`. 

204 """ 

205 

206 self.componentURIs = componentURIs or {} 

207 """The URIs to any components associated with the dataset artifact 

208 indexed by component name. This can be empty if there are no 

209 components. 

210 """ 

211 

212 def __getitem__(self, index: Any) -> Any: 

213 """Get primaryURI and componentURIs by index. 

214 

215 Provides support for tuple-like access. 

216 """ 

217 if index == 0: 

218 return self.primaryURI 

219 elif index == 1: 

220 return self.componentURIs 

221 raise IndexError("list index out of range") 

222 

223 def __len__(self) -> int: 

224 """Get the number of data members. 

225 

226 Provides support for tuple-like access. 

227 """ 

228 return 2 

229 

230 def __repr__(self) -> str: 

231 return f"DatasetRefURIs({repr(self.primaryURI)}, {repr(self.componentURIs)})" 

232 

233 

234class Datastore(metaclass=ABCMeta): 

235 """Datastore interface. 

236 

237 Parameters 

238 ---------- 

239 config : `DatastoreConfig` or `str` 

240 Load configuration either from an existing config instance or by 

241 referring to a configuration file. 

242 bridgeManager : `DatastoreRegistryBridgeManager` 

243 Object that manages the interface between `Registry` and datastores. 

244 butlerRoot : `str`, optional 

245 New datastore root to use to override the configuration value. 

246 """ 

247 

248 defaultConfigFile: ClassVar[str | None] = None 

249 """Path to configuration defaults. Accessed within the ``config`` resource 

250 or relative to a search path. Can be `None` if no defaults are specified. 

251 """ 

252 

253 containerKey: ClassVar[str | None] = None 

254 """Name of the key containing a list of subconfigurations that also 

255 need to be merged with defaults and will likely use different Python 

256 datastore classes (but all using DatastoreConfig). Assumed to be a 

257 list of configurations that can be represented in a DatastoreConfig 

258 and containing a "cls" definition. None indicates that no containers 

259 are expected in this Datastore.""" 

260 

261 isEphemeral: bool = False 

262 """Indicate whether this Datastore is ephemeral or not. An ephemeral 

263 datastore is one where the contents of the datastore will not exist 

264 across process restarts. This value can change per-instance.""" 

265 

266 config: DatastoreConfig 

267 """Configuration used to create Datastore.""" 

268 

269 name: str 

270 """Label associated with this Datastore.""" 

271 

272 storageClassFactory: StorageClassFactory 

273 """Factory for creating storage class instances from name.""" 

274 

275 constraints: Constraints 

276 """Constraints to apply when putting datasets into the datastore.""" 

277 

278 # MyPy does not like for this to be annotated as any kind of type, because 

279 # it can't do static checking on type variables that can change at runtime. 

280 IngestPrepData: ClassVar[Any] = IngestPrepData 

281 """Helper base class for ingest implementations. 

282 """ 

283 

284 @classmethod 

285 @abstractmethod 

286 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

287 """Set filesystem-dependent config options for this datastore. 

288 

289 The options will be appropriate for a new empty repository with the 

290 given root. 

291 

292 Parameters 

293 ---------- 

294 root : `str` 

295 Filesystem path to the root of the data repository. 

296 config : `Config` 

297 A `Config` to update. Only the subset understood by 

298 this component will be updated. Will not expand 

299 defaults. 

300 full : `Config` 

301 A complete config with all defaults expanded that can be 

302 converted to a `DatastoreConfig`. Read-only and will not be 

303 modified by this method. 

304 Repository-specific options that should not be obtained 

305 from defaults when Butler instances are constructed 

306 should be copied from ``full`` to ``config``. 

307 overwrite : `bool`, optional 

308 If `False`, do not modify a value in ``config`` if the value 

309 already exists. Default is always to overwrite with the provided 

310 ``root``. 

311 

312 Notes 

313 ----- 

314 If a keyword is explicitly defined in the supplied ``config`` it 

315 will not be overridden by this method if ``overwrite`` is `False`. 

316 This allows explicit values set in external configs to be retained. 

317 """ 

318 raise NotImplementedError() 

319 

320 @staticmethod 

321 def fromConfig( 

322 config: Config, 

323 bridgeManager: DatastoreRegistryBridgeManager, 

324 butlerRoot: ResourcePathExpression | None = None, 

325 ) -> Datastore: 

326 """Create datastore from type specified in config file. 

327 

328 Parameters 

329 ---------- 

330 config : `Config` or `~lsst.resources.ResourcePathExpression` 

331 Configuration instance. 

332 bridgeManager : `DatastoreRegistryBridgeManager` 

333 Object that manages the interface between `Registry` and 

334 datastores. 

335 butlerRoot : `str`, optional 

336 Butler root directory. 
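
Examples
--------
A hedged sketch; the configuration file name, ``bridgeManager`` and
``root`` are placeholders supplied by the caller::

    config = Config("butler.yaml")   # must contain a "datastore.cls" entry
    datastore = Datastore.fromConfig(config, bridgeManager, butlerRoot=root)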

337 """ 

338 cls = doImportType(config["datastore", "cls"]) 

339 if not issubclass(cls, Datastore): 

340 raise TypeError(f"Imported child class {config['datastore', 'cls']} is not a Datastore") 

341 return cls(config=config, bridgeManager=bridgeManager, butlerRoot=butlerRoot) 

342 

343 def __init__( 

344 self, 

345 config: Config | ResourcePathExpression, 

346 bridgeManager: DatastoreRegistryBridgeManager, 

347 butlerRoot: ResourcePathExpression | None = None, 

348 ): 

349 self.config = DatastoreConfig(config) 

350 self.name = "ABCDataStore" 

351 self._transaction: DatastoreTransaction | None = None 

352 

353 # All Datastores need storage classes and constraints 

354 self.storageClassFactory = StorageClassFactory() 

355 

356 # And read the constraints list 

357 constraintsConfig = self.config.get("constraints") 

358 self.constraints = Constraints(constraintsConfig, universe=bridgeManager.universe) 

359 

360 def __str__(self) -> str: 

361 return self.name 

362 

363 def __repr__(self) -> str: 

364 return self.name 

365 

366 @property 

367 def names(self) -> tuple[str, ...]: 

368 """Names associated with this datastore returned as a list. 

369 

370 Can be different to ``name`` for a chaining datastore. 

371 """ 

372 # Default implementation returns solely the name itself 

373 return (self.name,) 

374 

375 @contextlib.contextmanager 

376 def transaction(self) -> Iterator[DatastoreTransaction]: 

377 """Context manager supporting `Datastore` transactions. 

378 

379 Transactions can be nested, and are to be used in combination with 

380 `Registry.transaction`. 
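
Examples
--------
A minimal sketch of nesting with a registry transaction; ``registry``,
``dataset`` and ``ref`` are assumed to exist::

    with registry.transaction():
        with datastore.transaction():
            datastore.put(dataset, ref)

An exception raised inside the blocks triggers a rollback of both the
registry and the datastore changes.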

381 """ 

382 self._transaction = DatastoreTransaction(self._transaction) 

383 try: 

384 yield self._transaction 

385 except BaseException: 

386 self._transaction.rollback() 

387 raise 

388 else: 

389 self._transaction.commit() 

390 self._transaction = self._transaction.parent 

391 

392 @abstractmethod 

393 def knows(self, ref: DatasetRef) -> bool: 

394 """Check if the dataset is known to the datastore. 

395 

396 Does not check for existence of any artifact. 

397 

398 Parameters 

399 ---------- 

400 ref : `DatasetRef` 

401 Reference to the required dataset. 

402 

403 Returns 

404 ------- 

405 exists : `bool` 

406 `True` if the dataset is known to the datastore. 

407 """ 

408 raise NotImplementedError() 

409 

410 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

411 """Check which of the given datasets are known to this datastore. 

412 

413 This is like ``mexists()`` but does not check that the file exists. 

414 

415 Parameters 

416 ---------- 

417 refs : iterable of `DatasetRef` 

418 The datasets to check. 

419 

420 Returns 

421 ------- 

422 exists : `dict`[`DatasetRef`, `bool`] 

423 Mapping of dataset to boolean indicating whether the dataset 

424 is known to the datastore. 

425 """ 

426 # Non-optimized default calls knows() repeatedly. 

427 return {ref: self.knows(ref) for ref in refs} 

428 

429 def mexists( 

430 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

431 ) -> dict[DatasetRef, bool]: 

432 """Check the existence of multiple datasets at once. 

433 

434 Parameters 

435 ---------- 

436 refs : iterable of `DatasetRef` 

437 The datasets to be checked. 

438 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

439 Optional mapping of datastore artifact to existence. Updated by 

440 this method with details of all artifacts tested. Can be `None` 

441 if the caller is not interested. 

442 

443 Returns 

444 ------- 

445 existence : `dict` of [`DatasetRef`, `bool`] 

446 Mapping from dataset to boolean indicating existence. 
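
Examples
--------
A short sketch showing how the artifact existence cache can be shared
across calls; ``refs`` is assumed to hold resolved `DatasetRef` instances::

    artifact_existence: dict = {}
    existence = datastore.mexists(refs, artifact_existence=artifact_existence)
    missing = [ref for ref, found in existence.items() if not found]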

447 """ 

448 existence: dict[DatasetRef, bool] = {} 

449 # Non-optimized default. 

450 for ref in refs: 

451 existence[ref] = self.exists(ref) 

452 return existence 

453 

454 @abstractmethod 

455 def exists(self, datasetRef: DatasetRef) -> bool: 

456 """Check if the dataset exists in the datastore. 

457 

458 Parameters 

459 ---------- 

460 datasetRef : `DatasetRef` 

461 Reference to the required dataset. 

462 

463 Returns 

464 ------- 

465 exists : `bool` 

466 `True` if the entity exists in the `Datastore`. 

467 """ 

468 raise NotImplementedError("Must be implemented by subclass") 

469 

470 @abstractmethod 

471 def get( 

472 self, 

473 datasetRef: DatasetRef, 

474 parameters: Mapping[str, Any] | None = None, 

475 storageClass: StorageClass | str | None = None, 

476 ) -> Any: 

477 """Load an `InMemoryDataset` from the store. 

478 

479 Parameters 

480 ---------- 

481 datasetRef : `DatasetRef` 

482 Reference to the required Dataset. 

483 parameters : `dict` 

484 `StorageClass`-specific parameters that specify a slice of the 

485 Dataset to be loaded. 

486 storageClass : `StorageClass` or `str`, optional 

487 The storage class to be used to override the Python type 

488 returned by this method. By default the returned type matches 

489 the dataset type definition for this dataset. Specifying a 

490 read `StorageClass` can force a different type to be returned. 

491 This type must be compatible with the original type. 

492 

493 Returns 

494 ------- 

495 inMemoryDataset : `object` 

496 Requested Dataset or slice thereof as an InMemoryDataset. 
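
Examples
--------
A hedged sketch; ``parameters`` and the storage class name are
illustrative and depend on the dataset's storage class::

    in_memory = datastore.get(ref)
    subset = datastore.get(ref, parameters=parameters)
    as_table = datastore.get(ref, storageClass="AstropyTable")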

497 """ 

498 raise NotImplementedError("Must be implemented by subclass") 

499 

500 @abstractmethod 

501 def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None: 

502 """Write a `InMemoryDataset` with a given `DatasetRef` to the store. 

503 

504 Parameters 

505 ---------- 

506 inMemoryDataset : `object` 

507 The Dataset to store. 

508 datasetRef : `DatasetRef` 

509 Reference to the associated Dataset. 

510 """ 

511 raise NotImplementedError("Must be implemented by subclass") 

512 

513 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None: 

514 """Allow ingest transfer mode to be defaulted based on datasets. 

515 

516 Parameters 

517 ---------- 

518 datasets : `FileDataset` 

519 Each positional argument is a struct containing information about 

520 a file to be ingested, including its path (either absolute or 

521 relative to the datastore root, if applicable), a complete 

522 `DatasetRef` (with ``dataset_id not None``), and optionally a 

523 formatter class or its fully-qualified string name. If a formatter 

524 is not provided, this method should populate that attribute with 

525 the formatter the datastore would use for `put`. Subclasses are 

526 also permitted to modify the path attribute (typically to put it 

527 in what the datastore considers its standard form). 

528 transfer : `str`, optional 

529 How (and whether) the dataset should be added to the datastore. 

530 See `ingest` for details of transfer modes. 

531 

532 Returns 

533 ------- 

534 newTransfer : `str` 

535 Transfer mode to use. Will be identical to the supplied transfer 

536 mode unless "auto" is used. 

537 """ 

538 if transfer != "auto": 

539 return transfer 

540 raise RuntimeError(f"{transfer} is not allowed without specialization.") 

541 

542 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> IngestPrepData: 

543 """Process datasets to identify which ones can be ingested. 

544 

545 Parameters 

546 ---------- 

547 datasets : `FileDataset` 

548 Each positional argument is a struct containing information about 

549 a file to be ingested, including its path (either absolute or 

550 relative to the datastore root, if applicable), a complete 

551 `DatasetRef` (with ``dataset_id not None``), and optionally a 

552 formatter class or its fully-qualified string name. If a formatter 

553 is not provided, this method should populate that attribute with 

554 the formatter the datastore would use for `put`. Subclasses are 

555 also permitted to modify the path attribute (typically to put it 

556 in what the datastore considers its standard form). 

557 transfer : `str`, optional 

558 How (and whether) the dataset should be added to the datastore. 

559 See `ingest` for details of transfer modes. 

560 

561 Returns 

562 ------- 

563 data : `IngestPrepData` 

564 An instance of a subclass of `IngestPrepData`, used to pass 

565 arbitrary data from `_prepIngest` to `_finishIngest`. This should 

566 include only the datasets this datastore can actually ingest; 

567 others should be silently ignored (`Datastore.ingest` will inspect 

568 `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if 

569 necessary). 

570 

571 Raises 

572 ------ 

573 NotImplementedError 

574 Raised if the datastore does not support the given transfer mode 

575 (including the case where ingest is not supported at all). 

576 FileNotFoundError 

577 Raised if one of the given files does not exist. 

578 FileExistsError 

579 Raised if transfer is not `None` but the (internal) location the 

580 file would be moved to is already occupied. 

581 

582 Notes 

583 ----- 

584 This method (along with `_finishIngest`) should be implemented by 

585 subclasses to provide ingest support instead of implementing `ingest` 

586 directly. 

587 

588 `_prepIngest` should not modify the data repository or given files in 

589 any way; all changes should be deferred to `_finishIngest`. 

590 

591 When possible, exceptions should be raised in `_prepIngest` instead of 

592 `_finishIngest`. `NotImplementedError` exceptions that indicate that 

593 the transfer mode is not supported must be raised by `_prepIngest` 

594 instead of `_finishIngest`. 
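
Examples
--------
A hedged sketch of the expected division of labour; the class name and
the supported transfer modes are illustrative, and a real subclass must
also implement the other abstract methods::

    class MyDatastore(Datastore):
        def _prepIngest(self, *datasets, transfer=None):
            if transfer not in (None, "copy", "move"):
                raise NotImplementedError(f"Transfer mode {transfer} not supported.")
            refs = [ref for dataset in datasets for ref in dataset.refs]
            return self.IngestPrepData(refs)

        def _finishIngest(self, prepData, *, transfer=None, record_validation_info=True):
            ...  # Perform the deferred file operations and record the datasets.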

595 """ 

596 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.") 

597 

598 def _finishIngest( 

599 self, prepData: IngestPrepData, *, transfer: str | None = None, record_validation_info: bool = True 

600 ) -> None: 

601 """Complete an ingest operation. 

602 

603 Parameters 

604 ---------- 

605 prepData : `IngestPrepData` 

606 An instance of a subclass of `IngestPrepData`. Guaranteed to be 

607 the direct result of a call to `_prepIngest` on this datastore. 

608 transfer : `str`, optional 

609 How (and whether) the dataset should be added to the datastore. 

610 See `ingest` for details of transfer modes. 

611 record_validation_info : `bool`, optional 

612 If `True`, the default, the datastore can record validation 

613 information associated with the file. If `False` the datastore 

614 will not attempt to track any information such as checksums 

615 or file sizes. This can be useful if such information is tracked 

616 in an external system or if the file is to be compressed in place. 

617 It is up to the datastore whether this parameter is relevant. 

618 

619 Raises 

620 ------ 

621 FileNotFoundError 

622 Raised if one of the given files does not exist. 

623 FileExistsError 

624 Raised if transfer is not `None` but the (internal) location the 

625 file would be moved to is already occupied. 

626 

627 Notes 

628 ----- 

629 This method (along with `_prepIngest`) should be implemented by 

630 subclasses to provide ingest support instead of implementing `ingest` 

631 directly. 

632 """ 

633 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.") 

634 

635 def ingest( 

636 self, *datasets: FileDataset, transfer: str | None = None, record_validation_info: bool = True 

637 ) -> None: 

638 """Ingest one or more files into the datastore. 

639 

640 Parameters 

641 ---------- 

642 datasets : `FileDataset` 

643 Each positional argument is a struct containing information about 

644 a file to be ingested, including its path (either absolute or 

645 relative to the datastore root, if applicable), a complete 

646 `DatasetRef` (with ``dataset_id not None``), and optionally a 

647 formatter class or its fully-qualified string name. If a formatter 

648 is not provided, the one the datastore would use for ``put`` on 

649 that dataset is assumed. 

650 transfer : `str`, optional 

651 How (and whether) the dataset should be added to the datastore. 

652 If `None` (default), the file must already be in a location 

653 appropriate for the datastore (e.g. within its root directory), 

654 and will not be modified. Other choices include "move", "copy", 

655 "link", "symlink", "relsymlink", and "hardlink". "link" is a 

656 special transfer mode that will first try to make a hardlink and 

657 if that fails a symlink will be used instead. "relsymlink" creates 

658 a relative symlink rather than use an absolute path. 

659 Most datastores do not support all transfer modes. 

660 "auto" is a special option that will let the 

661 data store choose the most natural option for itself. 

662 record_validation_info : `bool`, optional 

663 If `True`, the default, the datastore can record validation 

664 information associated with the file. If `False` the datastore 

665 will not attempt to track any information such as checksums 

666 or file sizes. This can be useful if such information is tracked 

667 in an external system or if the file is to be compressed in place. 

668 It is up to the datastore whether this parameter is relevant. 

669 

670 Raises 

671 ------ 

672 NotImplementedError 

673 Raised if the datastore does not support the given transfer mode 

674 (including the case where ingest is not supported at all). 

675 DatasetTypeNotSupportedError 

676 Raised if one or more files to be ingested have a dataset type that 

677 is not supported by the datastore. 

678 FileNotFoundError 

679 Raised if one of the given files does not exist. 

680 FileExistsError 

681 Raised if transfer is not `None` but the (internal) location the 

682 file would be moved to is already occupied. 

683 

684 Notes 

685 ----- 

686 Subclasses should implement `_prepIngest` and `_finishIngest` instead 

687 of implementing `ingest` directly. Datastores that hold and 

688 delegate to child datastores may want to call those methods as well. 

689 

690 Subclasses are encouraged to document their supported transfer modes 

691 in their class documentation. 
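
Examples
--------
A minimal, hedged sketch of ingesting one existing file; the path is a
placeholder and ``ref`` must be a resolved `DatasetRef`::

    dataset = FileDataset(path="/data/raw_001.fits", refs=[ref])
    datastore.ingest(dataset, transfer="copy")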

692 """ 

693 # Allow a datastore to select a default transfer mode 

694 transfer = self._overrideTransferMode(*datasets, transfer=transfer) 

695 prepData = self._prepIngest(*datasets, transfer=transfer) 

696 refs = {ref.id: ref for dataset in datasets for ref in dataset.refs} 

697 if refs.keys() != prepData.refs.keys(): 

698 unsupported = refs.keys() - prepData.refs.keys() 

699 # Group unsupported refs by DatasetType for an informative 

700 # but still concise error message. 

701 byDatasetType = defaultdict(list) 

702 for datasetId in unsupported: 

703 ref = refs[datasetId] 

704 byDatasetType[ref.datasetType].append(ref) 

705 raise DatasetTypeNotSupportedError( 

706 "DatasetType(s) not supported in ingest: " 

707 + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items()) 

708 ) 

709 self._finishIngest(prepData, transfer=transfer, record_validation_info=record_validation_info) 

710 

711 def transfer_from( 

712 self, 

713 source_datastore: Datastore, 

714 refs: Iterable[DatasetRef], 

715 transfer: str = "auto", 

716 artifact_existence: dict[ResourcePath, bool] | None = None, 

717 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

718 """Transfer dataset artifacts from another datastore to this one. 

719 

720 Parameters 

721 ---------- 

722 source_datastore : `Datastore` 

723 The datastore from which to transfer artifacts. That datastore 

724 must be compatible with this datastore receiving the artifacts. 

725 refs : iterable of `DatasetRef` 

726 The datasets to transfer from the source datastore. 

727 transfer : `str`, optional 

728 How (and whether) the dataset should be added to the datastore. 

729 Choices include "move", "copy", 

730 "link", "symlink", "relsymlink", and "hardlink". "link" is a 

731 special transfer mode that will first try to make a hardlink and 

732 if that fails a symlink will be used instead. "relsymlink" creates 

733 a relative symlink rather than use an absolute path. 

734 Most datastores do not support all transfer modes. 

735 "auto" (the default) is a special option that will let the 

736 data store choose the most natural option for itself. 

737 If the source location and transfer location are identical the 

738 transfer mode will be ignored. 

739 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

740 Optional mapping of datastore artifact to existence. Updated by 

741 this method with details of all artifacts tested. Can be `None` 

742 if the caller is not interested. 

743 

744 Returns 

745 ------- 

746 accepted : `set` [`DatasetRef`] 

747 The datasets that were transferred. 

748 rejected : `set` [`DatasetRef`] 

749 The datasets that were rejected due to a constraints violation. 

750 

751 Raises 

752 ------ 

753 TypeError 

754 Raised if the two datastores are not compatible. 
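
Examples
--------
A hedged sketch of copying artifacts between two compatible datastores;
both datastores and ``refs`` are assumed to exist already::

    accepted, rejected = target_datastore.transfer_from(
        source_datastore, refs, transfer="copy"
    )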

755 """ 

756 if type(self) is not type(source_datastore): 

757 raise TypeError( 

758 f"Datastore mismatch between this datastore ({type(self)}) and the " 

759 f"source datastore ({type(source_datastore)})." 

760 ) 

761 

762 raise NotImplementedError(f"Datastore {type(self)} must implement a transfer_from method.") 

763 

764 def getManyURIs( 

765 self, 

766 refs: Iterable[DatasetRef], 

767 predict: bool = False, 

768 allow_missing: bool = False, 

769 ) -> dict[DatasetRef, DatasetRefURIs]: 

770 """Return URIs associated with many datasets. 

771 

772 Parameters 

773 ---------- 

774 refs : iterable of `DatasetRef` 

775 References to the required datasets. 

776 predict : `bool`, optional 

777 If the datastore does not know about a dataset, should it 

778 return a predicted URI or not? 

779 allow_missing : `bool` 

780 If `False`, and `predict` is `False`, will raise if a `DatasetRef` 

781 does not exist. 

782 

783 Returns 

784 ------- 

785 URIs : `dict` of [`DatasetRef`, `DatasetRefURIs`] 

786 A dict of primary and component URIs, indexed by the passed-in 

787 refs. 

788 

789 Raises 

790 ------ 

791 FileNotFoundError 

792 A URI has been requested for a dataset that does not exist and 

793 guessing is not allowed. 

794 

795 Notes 

796 ----- 

797 In file-based datastores, `getManyURIs` does not check that the file 

798 is really there; it assumes that if the datastore is aware of the 

799 file then the artifact exists. 

800 """ 

801 uris: dict[DatasetRef, DatasetRefURIs] = {} 

802 missing_refs = [] 

803 for ref in refs: 

804 try: 

805 uris[ref] = self.getURIs(ref, predict=predict) 

806 except FileNotFoundError: 

807 missing_refs.append(ref) 

808 if missing_refs and not allow_missing: 

809 raise FileNotFoundError( 

810 "Missing {} refs from datastore out of {} and predict=False.".format( 

811 num_missing := len(missing_refs), num_missing + len(uris) 

812 ) 

813 ) 

814 return uris 

815 

816 @abstractmethod 

817 def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

818 """Return URIs associated with dataset. 

819 

820 Parameters 

821 ---------- 

822 datasetRef : `DatasetRef` 

823 Reference to the required dataset. 

824 predict : `bool`, optional 

825 If the datastore does not know about the dataset, should it 

826 return a predicted URI or not? 

827 

828 Returns 

829 ------- 

830 uris : `DatasetRefURIs` 

831 The URI to the primary artifact associated with this dataset (if 

832 the dataset was disassembled within the datastore this may be 

833 `None`), and the URIs to any components associated with the dataset 

834 artifact. (can be empty if there are no components). 

835 """ 

836 raise NotImplementedError() 

837 

838 @abstractmethod 

839 def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath: 

840 """URI to the Dataset. 

841 

842 Parameters 

843 ---------- 

844 datasetRef : `DatasetRef` 

845 Reference to the required Dataset. 

846 predict : `bool` 

847 If `True` attempt to predict the URI for a dataset if it does 

848 not exist in datastore. 

849 

850 Returns 

851 ------- 

852 uri : `lsst.resources.ResourcePath` 

853 URI pointing to the Dataset within the datastore. If the 

854 Dataset does not exist in the datastore, the URI may be a guess. 

855 If the datastore does not have entities that relate well 

856 to the concept of a URI the returned URI string will be 

857 descriptive. The returned URI is not guaranteed to be obtainable. 

858 

859 Raises 

860 ------ 

861 FileNotFoundError 

862 A URI has been requested for a dataset that does not exist and 

863 guessing is not allowed. 

864 """ 

865 raise NotImplementedError("Must be implemented by subclass") 

866 

867 @abstractmethod 

868 def retrieveArtifacts( 

869 self, 

870 refs: Iterable[DatasetRef], 

871 destination: ResourcePath, 

872 transfer: str = "auto", 

873 preserve_path: bool = True, 

874 overwrite: bool = False, 

875 ) -> list[ResourcePath]: 

876 """Retrieve the artifacts associated with the supplied refs. 

877 

878 Parameters 

879 ---------- 

880 refs : iterable of `DatasetRef` 

881 The datasets for which artifacts are to be retrieved. 

882 A single ref can result in multiple artifacts. The refs must 

883 be resolved. 

884 destination : `lsst.resources.ResourcePath` 

885 Location to write the artifacts. 

886 transfer : `str`, optional 

887 Method to use to transfer the artifacts. Must be one of the options 

888 supported by `lsst.resources.ResourcePath.transfer_from()`. 

889 "move" is not allowed. 

890 preserve_path : `bool`, optional 

891 If `True` the full path of the artifact within the datastore 

892 is preserved. If `False` the final file component of the path 

893 is used. 

894 overwrite : `bool`, optional 

895 If `True` allow transfers to overwrite existing files at the 

896 destination. 

897 

898 Returns 

899 ------- 

900 targets : `list` of `lsst.resources.ResourcePath` 

901 URIs of file artifacts in destination location. Order is not 

902 preserved. 

903 

904 Notes 

905 ----- 

906 For non-file datastores the artifacts written to the destination 

907 may not match the representation inside the datastore. For example 

908 a hierarchical data structure in a NoSQL database may well be stored 

909 as a JSON file. 
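
Examples
--------
A hedged usage sketch; the destination directory is illustrative::

    from lsst.resources import ResourcePath

    destination = ResourcePath("/tmp/export/", forceDirectory=True)
    paths = datastore.retrieveArtifacts(refs, destination, transfer="copy")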

910 """ 

911 raise NotImplementedError() 

912 

913 @abstractmethod 

914 def remove(self, datasetRef: DatasetRef) -> None: 

915 """Indicate to the Datastore that a Dataset can be removed. 

916 

917 Parameters 

918 ---------- 

919 datasetRef : `DatasetRef` 

920 Reference to the required Dataset. 

921 

922 Raises 

923 ------ 

924 FileNotFoundError 

925 When Dataset does not exist. 

926 

927 Notes 

928 ----- 

929 Some Datastores may implement this method as a silent no-op to 

930 disable Dataset deletion through standard interfaces. 

931 """ 

932 raise NotImplementedError("Must be implemented by subclass") 

933 

934 @abstractmethod 

935 def forget(self, refs: Iterable[DatasetRef]) -> None: 

936 """Indicate to the Datastore that it should remove all records of the 

937 given datasets, without actually deleting them. 

938 

939 Parameters 

940 ---------- 

941 refs : `~collections.abc.Iterable` [ `DatasetRef` ] 

942 References to the datasets being forgotten. 

943 

944 Notes 

945 ----- 

946 Asking a datastore to forget a `DatasetRef` it does not hold should be 

947 a silent no-op, not an error. 

948 """ 

949 raise NotImplementedError("Must be implemented by subclass") 

950 

951 @abstractmethod 

952 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

953 """Indicate to the Datastore that a Dataset can be moved to the trash. 

954 

955 Parameters 

956 ---------- 

957 ref : `DatasetRef` or iterable thereof 

958 Reference(s) to the required Dataset. 

959 ignore_errors : `bool`, optional 

960 Determine whether errors should be ignored. When multiple 

961 refs are being trashed there will be no per-ref check. 

962 

963 Raises 

964 ------ 

965 FileNotFoundError 

966 When Dataset does not exist and errors are not ignored. Only 

967 checked if a single ref is supplied (and not in a list). 

968 

969 Notes 

970 ----- 

971 Some Datastores may implement this method as a silent no-op to 

972 disable Dataset deletion through standard interfaces. 

973 """ 

974 raise NotImplementedError("Must be implemented by subclass") 

975 

976 @abstractmethod 

977 def emptyTrash(self, ignore_errors: bool = True) -> None: 

978 """Remove all datasets from the trash. 

979 

980 Parameters 

981 ---------- 

982 ignore_errors : `bool`, optional 

983 Determine whether errors should be ignored. 

984 

985 Notes 

986 ----- 

987 Some Datastores may implement this method as a silent no-op to 

988 disable Dataset deletion through standard interfaces. 

989 """ 

990 raise NotImplementedError("Must be implemented by subclass") 

991 

992 @abstractmethod 

993 def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None: 

994 """Transfer a dataset from another datastore to this datastore. 

995 

996 Parameters 

997 ---------- 

998 inputDatastore : `Datastore` 

999 The external `Datastore` from which to retrieve the Dataset. 

1000 datasetRef : `DatasetRef` 

1001 Reference to the required Dataset. 

1002 """ 

1003 raise NotImplementedError("Must be implemented by subclass") 

1004 

1005 def export( 

1006 self, 

1007 refs: Iterable[DatasetRef], 

1008 *, 

1009 directory: ResourcePathExpression | None = None, 

1010 transfer: str | None = "auto", 

1011 ) -> Iterable[FileDataset]: 

1012 """Export datasets for transfer to another data repository. 

1013 

1014 Parameters 

1015 ---------- 

1016 refs : iterable of `DatasetRef` 

1017 Dataset references to be exported. 

1018 directory : `str`, optional 

1019 Path to a directory that should contain files corresponding to 

1020 output datasets. Ignored if ``transfer`` is explicitly `None`. 

1021 transfer : `str`, optional 

1022 Mode that should be used to move datasets out of the repository. 

1023 Valid options are the same as those of the ``transfer`` argument 

1024 to ``ingest``, and datastores may similarly signal that a transfer 

1025 mode is not supported by raising `NotImplementedError`. If "auto" 

1026 is given and no ``directory`` is specified, `None` will be 

1027 implied. 

1028 

1029 Returns 

1030 ------- 

1031 datasets : iterable of `FileDataset` 

1032 Structs containing information about the exported datasets, in the 

1033 same order as ``refs``. 

1034 

1035 Raises 

1036 ------ 

1037 NotImplementedError 

1038 Raised if the given transfer mode is not supported. 

1039 """ 

1040 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

1041 

1042 @abstractmethod 

1043 def validateConfiguration( 

1044 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

1045 ) -> None: 

1046 """Validate some of the configuration for this datastore. 

1047 

1048 Parameters 

1049 ---------- 

1050 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1051 Entities to test against this configuration. Can be differing 

1052 types. 

1053 logFailures : `bool`, optional 

1054 If `True`, output a log message for every validation error 

1055 detected. 

1056 

1057 Raises 

1058 ------ 

1059 DatastoreValidationError 

1060 Raised if there is a validation problem with a configuration. 

1061 

1062 Notes 

1063 ----- 

1064 Which parts of the configuration are validated is at the discretion 

1065 of each Datastore implementation. 

1066 """ 

1067 raise NotImplementedError("Must be implemented by subclass") 

1068 

1069 @abstractmethod 

1070 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

1071 """Validate a specific look up key with supplied entity. 

1072 

1073 Parameters 

1074 ---------- 

1075 lookupKey : `LookupKey` 

1076 Key to use to retrieve information from the datastore 

1077 configuration. 

1078 entity : `DatasetRef`, `DatasetType`, or `StorageClass` 

1079 Entity to compare with configuration retrieved using the 

1080 specified lookup key. 

1081 

1082 Raises 

1083 ------ 

1084 DatastoreValidationError 

1085 Raised if there is a problem with the combination of entity 

1086 and lookup key. 

1087 

1088 Notes 

1089 ----- 

1090 Bypasses the normal selection priorities by allowing a key that 

1091 would normally not be selected to be validated. 

1092 """ 

1093 raise NotImplementedError("Must be implemented by subclass") 

1094 

1095 @abstractmethod 

1096 def getLookupKeys(self) -> set[LookupKey]: 

1097 """Return all the lookup keys relevant to this datastore. 

1098 

1099 Returns 

1100 ------- 

1101 keys : `set` of `LookupKey` 

1102 The keys stored internally for looking up information based 

1103 on `DatasetType` name or `StorageClass`. 

1104 """ 

1105 raise NotImplementedError("Must be implemented by subclass") 

1106 

1107 def needs_expanded_data_ids( 

1108 self, 

1109 transfer: str | None, 

1110 entity: DatasetRef | DatasetType | StorageClass | None = None, 

1111 ) -> bool: 

1112 """Test whether this datastore needs expanded data IDs to ingest. 

1113 

1114 Parameters 

1115 ---------- 

1116 transfer : `str` or `None` 

1117 Transfer mode for ingest. 

1118 entity, optional 

1119 Object representing what will be ingested. If not provided (or not 

1120 specific enough), `True` may be returned even if expanded data 

1121 IDs aren't necessary. 

1122 

1123 Returns 

1124 ------- 

1125 needed : `bool` 

1126 If `True`, expanded data IDs may be needed. `False` only if 

1127 expansion definitely isn't necessary. 

1128 """ 

1129 return True 

1130 

1131 @abstractmethod 

1132 def import_records( 

1133 self, 

1134 data: Mapping[str, DatastoreRecordData], 

1135 ) -> None: 

1136 """Import datastore location and record data from an in-memory data 

1137 structure. 

1138 

1139 Parameters 

1140 ---------- 

1141 data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ] 

1142 Datastore records indexed by datastore name. May contain data for 

1143 other `Datastore` instances (generally because they are chained to 

1144 this one), which should be ignored. 

1145 

1146 Notes 

1147 ----- 

1148 Implementations should generally not check that any external resources 

1149 (e.g. files) referred to by these records actually exist, for 

1150 performance reasons; we expect higher-level code to guarantee that they 

1151 do. 

1152 

1153 Implementations are responsible for calling 

1154 `DatastoreRegistryBridge.insert` on all datasets in ``data.locations`` 

1155 where the key is in `names`, as well as loading any opaque table data. 

1156 """ 

1157 raise NotImplementedError() 

1158 

1159 @abstractmethod 

1160 def export_records( 

1161 self, 

1162 refs: Iterable[DatasetIdRef], 

1163 ) -> Mapping[str, DatastoreRecordData]: 

1164 """Export datastore records and locations to an in-memory data 

1165 structure. 

1166 

1167 Parameters 

1168 ---------- 

1169 refs : `~collections.abc.Iterable` [ `DatasetIdRef` ] 

1170 Datasets to save. This may include datasets not known to this 

1171 datastore, which should be ignored. 

1172 

1173 Returns 

1174 ------- 

1175 data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ] 

1176 Exported datastore records indexed by datastore name. 

1177 """ 

1178 raise NotImplementedError() 

1179 

1180 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

1181 """Specify a method that can be used by datastore to retrieve 

1182 registry-defined dataset type. 

1183 

1184 Parameters 

1185 ---------- 

1186 method : `~collections.abc.Callable` | `None` 

1187 Method that takes a name of the dataset type and returns a 

1188 corresponding `DatasetType` instance as defined in Registry. If 

1189 the dataset type name is not known to the registry, `None` is returned. 

1190 

1191 Notes 

1192 ----- 

1193 This method is only needed for a Datastore supporting a "trusted" mode 

1194 when it does not have access to datastore records and needs to 

1195 guess dataset location based on its stored dataset type. 

1196 """ 

1197 pass