Coverage for python/lsst/daf/butler/core/datastore.py: 59% (213 statements)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Support for generic data stores."""

from __future__ import annotations

__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError", "DatasetRefURIs")

import contextlib
import dataclasses
import logging
from abc import ABCMeta, abstractmethod
from collections import abc, defaultdict
from collections.abc import Callable, Iterable, Iterator, Mapping
from typing import TYPE_CHECKING, Any, ClassVar

from lsst.utils import doImportType

from .config import Config, ConfigSubset
from .constraints import Constraints
from .exceptions import DatasetTypeNotSupportedError, ValidationError
from .fileDataset import FileDataset
from .storageClass import StorageClassFactory

if TYPE_CHECKING:
    from lsst.resources import ResourcePath, ResourcePathExpression

    from ..registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
    from .configSupport import LookupKey
    from .datasets import DatasetRef, DatasetType
    from .datastoreRecordData import DatastoreRecordData
    from .storageClass import StorageClass

class DatastoreConfig(ConfigSubset):
    """Configuration for Datastores."""

    component = "datastore"
    requiredKeys = ("cls",)
    defaultConfigFile = "datastore.yaml"


class DatastoreValidationError(ValidationError):
    """There is a problem with the Datastore configuration."""

    pass


@dataclasses.dataclass(frozen=True)
class Event:
    """Representation of an event that can be rolled back."""

    __slots__ = {"name", "undoFunc", "args", "kwargs"}
    name: str
    undoFunc: Callable
    args: tuple
    kwargs: dict


class IngestPrepData:
    """A helper base class for `Datastore` ingest implementations.

    Datastore implementations will generally need a custom implementation of
    this class.

    Should be accessed as ``Datastore.IngestPrepData`` instead of via direct
    import.

    Parameters
    ----------
    refs : iterable of `DatasetRef`
        References for the datasets that can be ingested by this datastore.
    """

    def __init__(self, refs: Iterable[DatasetRef]):
        self.refs = {ref.id: ref for ref in refs}

class DatastoreTransaction:
    """Keeps a log of `Datastore` activity and allows rollback.

    Parameters
    ----------
    parent : `DatastoreTransaction`, optional
        The parent transaction (if any).
    """

    Event: ClassVar[type] = Event

    parent: DatastoreTransaction | None
    """The parent transaction. (`DatastoreTransaction`, optional)"""

    def __init__(self, parent: DatastoreTransaction | None = None):
        self.parent = parent
        self._log: list[Event] = []

    def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None:
        """Register event with undo function.

        Parameters
        ----------
        name : `str`
            Name of the event.
        undoFunc : func
            Function to undo this event.
        args : `tuple`
            Positional arguments to `undoFunc`.
        **kwargs
            Keyword arguments to `undoFunc`.
        """
        self._log.append(self.Event(name, undoFunc, args, kwargs))

    @contextlib.contextmanager
    def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Iterator[None]:
        """Register undo function if nested operation succeeds.

        Calls `registerUndo`.

        This can be used to wrap individual undo-able statements within a
        `DatastoreTransaction` block. Multiple statements that can fail
        separately should not be part of the same `undoWith` block.

        All arguments are forwarded directly to `registerUndo`.
        """
        try:
            yield None
        except BaseException:
            raise
        else:
            self.registerUndo(name, undoFunc, *args, **kwargs)

    def rollback(self) -> None:
        """Roll back all events in this transaction."""
        log = logging.getLogger(__name__)
        while self._log:
            ev = self._log.pop()
            try:
                log.debug(
                    "Rolling back transaction: %s: %s(%s,%s)",
                    ev.name,
                    ev.undoFunc,
                    ",".join(str(a) for a in ev.args),
                    ",".join(f"{k}={v}" for k, v in ev.kwargs.items()),
                )
            except Exception:
                # In case we had a problem in stringification of arguments.
                log.warning("Rolling back transaction: %s", ev.name)
            try:
                ev.undoFunc(*ev.args, **ev.kwargs)
            except BaseException as e:
                # Deliberately swallow any error that may occur in unrolling.
                log.warning("Exception: %s caught while unrolling: %s", e, ev.name)
                pass

    def commit(self) -> None:
        """Commit this transaction."""
        if self.parent is None:
            # Just forget about the events, they have already happened.
            return
        else:
            # We may still want to roll back the events from this transaction
            # as part of the parent.
            self.parent._log.extend(self._log)
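# Illustrative sketch (not part of this module): how a DatastoreTransaction
# records undo actions and rolls them back in reverse order. The temporary
# file used here is only for demonstration.
def _example_transaction_rollback() -> None:
    import os
    import tempfile

    txn = DatastoreTransaction()
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        # Record how to undo the "write a temporary file" event.
        txn.registerUndo("write temp file", os.remove, tmp.name)
    # Something failed at a higher level: undo every recorded event.
    txn.rollback()
    assert not os.path.exists(tmp.name)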

@dataclasses.dataclass
class DatasetRefURIs(abc.Sequence):
    """Represents the primary and component ResourcePath(s) associated with a
    DatasetRef.

    This is used in places where its members used to be represented as a tuple
    `(primaryURI, componentURIs)`. To maintain backward compatibility this
    inherits from Sequence and so instances can be treated as a two-item
    tuple.
    """

    def __init__(
        self,
        primaryURI: ResourcePath | None = None,
        componentURIs: dict[str, ResourcePath] | None = None,
    ):
        self.primaryURI = primaryURI
        """The URI to the primary artifact associated with this dataset. If the
        dataset was disassembled within the datastore this may be `None`.
        """

        self.componentURIs = componentURIs or {}
        """The URIs to any components associated with the dataset artifact
        indexed by component name. This can be empty if there are no
        components.
        """

    def __getitem__(self, index: Any) -> Any:
        """Get primaryURI and componentURIs by index.

        Provides support for tuple-like access.
        """
        if index == 0:
            return self.primaryURI
        elif index == 1:
            return self.componentURIs
        raise IndexError("list index out of range")

    def __len__(self) -> int:
        """Get the number of data members.

        Provides support for tuple-like access.
        """
        return 2

    def __repr__(self) -> str:
        return f"DatasetRefURIs({repr(self.primaryURI)}, {repr(self.componentURIs)})"
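# Illustrative sketch (not part of this module): DatasetRefURIs unpacks like
# the two-item tuple it replaced.
def _example_dataset_ref_uris() -> None:
    uris = DatasetRefURIs(primaryURI=None, componentURIs={})
    primary, components = uris  # tuple-style unpacking still works
    assert primary is uris.primaryURI
    assert components is uris.componentURIs
    assert len(uris) == 2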

class Datastore(metaclass=ABCMeta):
    """Datastore interface.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Load configuration either from an existing config instance or by
        referring to a configuration file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.
    """

    defaultConfigFile: ClassVar[str | None] = None
    """Path to configuration defaults. Accessed within the ``config`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    containerKey: ClassVar[str | None] = None
    """Name of the key containing a list of subconfigurations that also
    need to be merged with defaults and will likely use different Python
    datastore classes (but all using DatastoreConfig). Assumed to be a
    list of configurations that can be represented in a DatastoreConfig
    and containing a "cls" definition. None indicates that no containers
    are expected in this Datastore."""

    isEphemeral: bool = False
    """Indicate whether this Datastore is ephemeral or not. An ephemeral
    datastore is one where the contents of the datastore will not exist
    across process restarts. This value can change per-instance."""

    config: DatastoreConfig
    """Configuration used to create Datastore."""

    name: str
    """Label associated with this Datastore."""

    storageClassFactory: StorageClassFactory
    """Factory for creating storage class instances from name."""

    constraints: Constraints
    """Constraints to apply when putting datasets into the datastore."""

    # MyPy does not like for this to be annotated as any kind of type, because
    # it can't do static checking on type variables that can change at runtime.
    IngestPrepData: ClassVar[Any] = IngestPrepData
    """Helper base class for ingest implementations.
    """

    @classmethod
    @abstractmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
        """Set filesystem-dependent config options for this datastore.

        The options will be appropriate for a new empty repository with the
        given root.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        raise NotImplementedError()

    @staticmethod
    def fromConfig(
        config: Config,
        bridgeManager: DatastoreRegistryBridgeManager,
        butlerRoot: ResourcePathExpression | None = None,
    ) -> Datastore:
        """Create datastore from type specified in config file.

        Parameters
        ----------
        config : `Config` or `~lsst.resources.ResourcePathExpression`
            Configuration instance.
        bridgeManager : `DatastoreRegistryBridgeManager`
            Object that manages the interface between `Registry` and
            datastores.
        butlerRoot : `str`, optional
            Butler root directory.
        """
        cls = doImportType(config["datastore", "cls"])
        if not issubclass(cls, Datastore):
            raise TypeError(f"Imported child class {config['datastore', 'cls']} is not a Datastore")
        return cls(config=config, bridgeManager=bridgeManager, butlerRoot=butlerRoot)
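    # Illustrative sketch (comments only, not part of the class): a datastore
    # is normally constructed from configuration rather than directly. The
    # config values shown and ``bridge_manager`` are assumptions for the
    # example, not guaranteed defaults.
    #
    #     config = Config(
    #         {"datastore": {"cls": "lsst.daf.butler.datastores.fileDatastore.FileDatastore",
    #                        "root": "/tmp/repo"}}
    #     )
    #     datastore = Datastore.fromConfig(config, bridge_manager)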

    def __init__(
        self,
        config: Config | ResourcePathExpression,
        bridgeManager: DatastoreRegistryBridgeManager,
        butlerRoot: ResourcePathExpression | None = None,
    ):
        self.config = DatastoreConfig(config)
        self.name = "ABCDataStore"
        self._transaction: DatastoreTransaction | None = None

        # All Datastores need storage classes and constraints
        self.storageClassFactory = StorageClassFactory()

        # And read the constraints list
        constraintsConfig = self.config.get("constraints")
        self.constraints = Constraints(constraintsConfig, universe=bridgeManager.universe)

    def __str__(self) -> str:
        return self.name

    def __repr__(self) -> str:
        return self.name

    @property
    def names(self) -> tuple[str, ...]:
        """Names associated with this datastore returned as a tuple.

        Can be different from ``name`` for a chaining datastore.
        """
        # Default implementation returns solely the name itself
        return (self.name,)

    @property
    def roots(self) -> dict[str, ResourcePath | None]:
        """Return the root URIs for each named datastore.

        Mapping from datastore name to root URI. The URI can be `None`
        if a datastore has no concept of a root URI.
        (`dict` [`str`, `ResourcePath` | `None`])
        """
        return {self.name: None}

    @contextlib.contextmanager
    def transaction(self) -> Iterator[DatastoreTransaction]:
        """Context manager supporting `Datastore` transactions.

        Transactions can be nested, and are to be used in combination with
        `Registry.transaction`.
        """
        self._transaction = DatastoreTransaction(self._transaction)
        try:
            yield self._transaction
        except BaseException:
            self._transaction.rollback()
            raise
        else:
            self._transaction.commit()
        self._transaction = self._transaction.parent
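    # Illustrative sketch (comments only): wrapping datastore activity in a
    # transaction so that artifact writes are undone if a later step fails.
    # ``write_artifact`` and ``remove_artifact`` are hypothetical helpers.
    #
    #     with datastore.transaction() as txn:
    #         with txn.undoWith("write artifact", remove_artifact, path):
    #             write_artifact(path)
    #         ...  # an exception here triggers rollback, removing the artifact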

    @abstractmethod
    def knows(self, ref: DatasetRef) -> bool:
        """Check if the dataset is known to the datastore.

        Does not check for existence of any artifact.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the dataset is known to the datastore.
        """
        raise NotImplementedError()

    def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
        """Check which of the given datasets are known to this datastore.

        This is like ``mexists()`` but does not check that the file exists.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            The datasets to check.

        Returns
        -------
        exists : `dict`[`DatasetRef`, `bool`]
            Mapping of dataset to boolean indicating whether the dataset
            is known to the datastore.
        """
        # Non-optimized default calls knows() repeatedly.
        return {ref: self.knows(ref) for ref in refs}

    def mexists(
        self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
    ) -> dict[DatasetRef, bool]:
        """Check the existence of multiple datasets at once.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            The datasets to be checked.
        artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
            Optional mapping of datastore artifact to existence. Updated by
            this method with details of all artifacts tested. Can be `None`
            if the caller is not interested.

        Returns
        -------
        existence : `dict` of [`DatasetRef`, `bool`]
            Mapping from dataset to boolean indicating existence.
        """
        existence: dict[DatasetRef, bool] = {}
        # Non-optimized default.
        for ref in refs:
            existence[ref] = self.exists(ref)
        return existence
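    # Illustrative sketch (comments only): callers that check existence in
    # several batches can share one artifact-existence cache between calls.
    # ``refs_batch_1`` and ``refs_batch_2`` are hypothetical.
    #
    #     artifact_existence: dict[ResourcePath, bool] = {}
    #     first = datastore.mexists(refs_batch_1, artifact_existence)
    #     second = datastore.mexists(refs_batch_2, artifact_existence)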

    @abstractmethod
    def exists(self, datasetRef: DatasetRef) -> bool:
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def get(
        self,
        datasetRef: DatasetRef,
        parameters: Mapping[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> Any:
        """Load an `InMemoryDataset` from the store.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify a slice of the
            Dataset to be loaded.
        storageClass : `StorageClass` or `str`, optional
            The storage class to be used to override the Python type
            returned by this method. By default the returned type matches
            the dataset type definition for this dataset. Specifying a
            read `StorageClass` can force a different type to be returned.
            This type must be compatible with the original type.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
        """Write an `InMemoryDataset` with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Dataset to store.
        datasetRef : `DatasetRef`
            Reference to the associated Dataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
        """Allow ingest transfer mode to be defaulted based on datasets.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name. If a formatter
            is not provided, this method should populate that attribute with
            the formatter the datastore would use for `put`. Subclasses are
            also permitted to modify the path attribute (typically to put it
            in what the datastore considers its standard form).
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        newTransfer : `str`
            Transfer mode to use. Will be identical to the supplied transfer
            mode unless "auto" is used.
        """
        if transfer != "auto":
            return transfer
        raise RuntimeError(f"{transfer} is not allowed without specialization.")

    def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> IngestPrepData:
        """Process datasets to identify which ones can be ingested.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name. If a formatter
            is not provided, this method should populate that attribute with
            the formatter the datastore would use for `put`. Subclasses are
            also permitted to modify the path attribute (typically to put it
            in what the datastore considers its standard form).
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        data : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`, used to pass
            arbitrary data from `_prepIngest` to `_finishIngest`. This should
            include only the datasets this datastore can actually ingest;
            others should be silently ignored (`Datastore.ingest` will inspect
            `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if
            necessary).

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_finishIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.

        `_prepIngest` should not modify the data repository or given files in
        any way; all changes should be deferred to `_finishIngest`.

        When possible, exceptions should be raised in `_prepIngest` instead of
        `_finishIngest`. `NotImplementedError` exceptions that indicate that
        the transfer mode is not supported must be raised by `_prepIngest`
        instead of `_finishIngest`.
        """
        raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")

    def _finishIngest(
        self, prepData: IngestPrepData, *, transfer: str | None = None, record_validation_info: bool = True
    ) -> None:
        """Complete an ingest operation.

        Parameters
        ----------
        prepData : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`. Guaranteed to be
            the direct result of a call to `_prepIngest` on this datastore.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.
        record_validation_info : `bool`, optional
            If `True`, the default, the datastore can record validation
            information associated with the file. If `False` the datastore
            will not attempt to track any information such as checksums
            or file sizes. This can be useful if such information is tracked
            in an external system or if the file is to be compressed in place.
            It is up to the datastore whether this parameter is relevant.

        Raises
        ------
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_prepIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.
        """
        raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")

    def ingest(
        self, *datasets: FileDataset, transfer: str | None = None, record_validation_info: bool = True
    ) -> None:
        """Ingest one or more files into the datastore.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name. If a formatter
            is not provided, the one the datastore would use for ``put`` on
            that dataset is assumed.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            If `None` (default), the file must already be in a location
            appropriate for the datastore (e.g. within its root directory),
            and will not be modified. Other choices include "move", "copy",
            "link", "symlink", "relsymlink", and "hardlink". "link" is a
            special transfer mode that will first try to make a hardlink and
            if that fails a symlink will be used instead. "relsymlink" creates
            a relative symlink rather than use an absolute path.
            Most datastores do not support all transfer modes.
            "auto" is a special option that will let the
            data store choose the most natural option for itself.
        record_validation_info : `bool`, optional
            If `True`, the default, the datastore can record validation
            information associated with the file. If `False` the datastore
            will not attempt to track any information such as checksums
            or file sizes. This can be useful if such information is tracked
            in an external system or if the file is to be compressed in place.
            It is up to the datastore whether this parameter is relevant.

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        DatasetTypeNotSupportedError
            Raised if one or more files to be ingested have a dataset type
            that is not supported by the datastore.
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        Subclasses should implement `_prepIngest` and `_finishIngest` instead
        of implementing `ingest` directly. Datastores that hold and
        delegate to child datastores may want to call those methods as well.

        Subclasses are encouraged to document their supported transfer modes
        in their class documentation.
        """
        # Allow a datastore to select a default transfer mode
        transfer = self._overrideTransferMode(*datasets, transfer=transfer)
        prepData = self._prepIngest(*datasets, transfer=transfer)
        refs = {ref.id: ref for dataset in datasets for ref in dataset.refs}
        if refs.keys() != prepData.refs.keys():
            unsupported = refs.keys() - prepData.refs.keys()
            # Group unsupported refs by DatasetType for an informative
            # but still concise error message.
            byDatasetType = defaultdict(list)
            for datasetId in unsupported:
                ref = refs[datasetId]
                byDatasetType[ref.datasetType].append(ref)
            raise DatasetTypeNotSupportedError(
                "DatasetType(s) not supported in ingest: "
                + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items())
            )
        self._finishIngest(prepData, transfer=transfer, record_validation_info=record_validation_info)
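    # Illustrative sketch (comments only): ingesting one existing file. The
    # path and ``ref`` (a resolved DatasetRef) are hypothetical, and the
    # datastore must support the requested transfer mode.
    #
    #     dataset = FileDataset(path="data/raw_0001.fits", refs=[ref])
    #     datastore.ingest(dataset, transfer="copy")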

    def transfer_from(
        self,
        source_datastore: Datastore,
        refs: Iterable[DatasetRef],
        transfer: str = "auto",
        artifact_existence: dict[ResourcePath, bool] | None = None,
    ) -> tuple[set[DatasetRef], set[DatasetRef]]:
        """Transfer dataset artifacts from another datastore to this one.

        Parameters
        ----------
        source_datastore : `Datastore`
            The datastore from which to transfer artifacts. That datastore
            must be compatible with this datastore receiving the artifacts.
        refs : iterable of `DatasetRef`
            The datasets to transfer from the source datastore.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            Choices include "move", "copy",
            "link", "symlink", "relsymlink", and "hardlink". "link" is a
            special transfer mode that will first try to make a hardlink and
            if that fails a symlink will be used instead. "relsymlink" creates
            a relative symlink rather than use an absolute path.
            Most datastores do not support all transfer modes.
            "auto" (the default) is a special option that will let the
            data store choose the most natural option for itself.
            If the source location and transfer location are identical the
            transfer mode will be ignored.
        artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
            Optional mapping of datastore artifact to existence. Updated by
            this method with details of all artifacts tested. Can be `None`
            if the caller is not interested.

        Returns
        -------
        accepted : `set` [`DatasetRef`]
            The datasets that were transferred.
        rejected : `set` [`DatasetRef`]
            The datasets that were rejected due to a constraints violation.

        Raises
        ------
        TypeError
            Raised if the two datastores are not compatible.
        """
        if type(self) is not type(source_datastore):
            raise TypeError(
                f"Datastore mismatch between this datastore ({type(self)}) and the "
                f"source datastore ({type(source_datastore)})."
            )

        raise NotImplementedError(f"Datastore {type(self)} must implement a transfer_from method.")

    def getManyURIs(
        self,
        refs: Iterable[DatasetRef],
        predict: bool = False,
        allow_missing: bool = False,
    ) -> dict[DatasetRef, DatasetRefURIs]:
        """Return URIs associated with many datasets.

        Parameters
        ----------
        refs : iterable of `DatasetIdRef`
            References to the required datasets.
        predict : `bool`, optional
            If `True`, allow URIs to be returned of datasets that have not
            been written.
        allow_missing : `bool`
            If `False`, and ``predict`` is `False`, will raise if a
            `DatasetRef` does not exist.

        Returns
        -------
        URIs : `dict` of [`DatasetRef`, `DatasetRefURIs`]
            A dict of primary and component URIs, indexed by the passed-in
            refs.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.

        Notes
        -----
        In file-based datastores, getManyURIs does not check that the files
        really exist; it assumes that if the datastore is aware of a file
        then it exists.
        """
        uris: dict[DatasetRef, DatasetRefURIs] = {}
        missing_refs = []
        for ref in refs:
            try:
                uris[ref] = self.getURIs(ref, predict=predict)
            except FileNotFoundError:
                missing_refs.append(ref)
        if missing_refs and not allow_missing:
            raise FileNotFoundError(
                "Missing {} refs from datastore out of {} and predict=False.".format(
                    num_missing := len(missing_refs), num_missing + len(uris)
                )
            )
        return uris
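    # Illustrative sketch (comments only): looking up URIs for several refs
    # while tolerating unwritten datasets. ``refs`` is hypothetical.
    #
    #     for ref, uris in datastore.getManyURIs(refs, allow_missing=True).items():
    #         print(ref, uris.primaryURI, uris.componentURIs)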

    @abstractmethod
    def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefURIs:
        """Return URIs associated with dataset.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required dataset.
        predict : `bool`, optional
            If the datastore does not know about the dataset, should it
            return a predicted URI or not?

        Returns
        -------
        uris : `DatasetRefURIs`
            The URI to the primary artifact associated with this dataset (if
            the dataset was disassembled within the datastore this may be
            `None`), and the URIs to any components associated with the
            dataset artifact (can be empty if there are no components).
        """
        raise NotImplementedError()

    @abstractmethod
    def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath:
        """URI to the Dataset.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True` attempt to predict the URI for a dataset if it does
            not exist in datastore.

        Returns
        -------
        uri : `str`
            URI string pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, the URI may be a guess.
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def retrieveArtifacts(
        self,
        refs: Iterable[DatasetRef],
        destination: ResourcePath,
        transfer: str = "auto",
        preserve_path: bool = True,
        overwrite: bool = False,
    ) -> list[ResourcePath]:
        """Retrieve the artifacts associated with the supplied refs.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            The datasets for which artifacts are to be retrieved.
            A single ref can result in multiple artifacts. The refs must
            be resolved.
        destination : `lsst.resources.ResourcePath`
            Location to write the artifacts.
        transfer : `str`, optional
            Method to use to transfer the artifacts. Must be one of the
            options supported by `lsst.resources.ResourcePath.transfer_from()`.
            "move" is not allowed.
        preserve_path : `bool`, optional
            If `True` the full path of the artifact within the datastore
            is preserved. If `False` the final file component of the path
            is used.
        overwrite : `bool`, optional
            If `True` allow transfers to overwrite existing files at the
            destination.

        Returns
        -------
        targets : `list` of `lsst.resources.ResourcePath`
            URIs of file artifacts in destination location. Order is not
            preserved.

        Notes
        -----
        For non-file datastores the artifacts written to the destination
        may not match the representation inside the datastore. For example
        a hierarchical data structure in a NoSQL database may well be stored
        as a JSON file.
        """
        raise NotImplementedError()
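    # Illustrative sketch (comments only): copying the artifacts for some refs
    # into a local directory. ``refs`` and the destination path are
    # hypothetical.
    #
    #     destination = ResourcePath("/tmp/exported/", forceDirectory=True)
    #     copied = datastore.retrieveArtifacts(refs, destination, transfer="copy")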

    @abstractmethod
    def remove(self, datasetRef: DatasetRef) -> None:
        """Indicate to the Datastore that a Dataset can be removed.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            When Dataset does not exist.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def forget(self, refs: Iterable[DatasetRef]) -> None:
        """Indicate to the Datastore that it should remove all records of the
        given datasets, without actually deleting them.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the datasets being forgotten.

        Notes
        -----
        Asking a datastore to forget a `DatasetRef` it does not hold should be
        a silent no-op, not an error.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
        """Indicate to the Datastore that a Dataset can be moved to the trash.

        Parameters
        ----------
        ref : `DatasetRef` or iterable thereof
            Reference(s) to the required Dataset.
        ignore_errors : `bool`, optional
            Determine whether errors should be ignored. When multiple
            refs are being trashed there will be no per-ref check.

        Raises
        ------
        FileNotFoundError
            When Dataset does not exist and errors are not ignored. Only
            checked if a single ref is supplied (and not in a list).

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def emptyTrash(self, ignore_errors: bool = True) -> None:
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`, optional
            Determine whether errors should be ignored.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None:
        """Transfer a dataset from another datastore to this datastore.

        Parameters
        ----------
        inputDatastore : `Datastore`
            The external `Datastore` from which to retrieve the Dataset.
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def export(
        self,
        refs: Iterable[DatasetRef],
        *,
        directory: ResourcePathExpression | None = None,
        transfer: str | None = "auto",
    ) -> Iterable[FileDataset]:
        """Export datasets for transfer to another data repository.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            Dataset references to be exported.
        directory : `str`, optional
            Path to a directory that should contain files corresponding to
            output datasets. Ignored if ``transfer`` is explicitly `None`.
        transfer : `str`, optional
            Mode that should be used to move datasets out of the repository.
            Valid options are the same as those of the ``transfer`` argument
            to ``ingest``, and datastores may similarly signal that a transfer
            mode is not supported by raising `NotImplementedError`. If "auto"
            is given and no ``directory`` is specified, `None` will be
            implied.

        Returns
        -------
        dataset : iterable of `FileDataset`
            Structs containing information about the exported datasets, in the
            same order as ``refs``.

        Raises
        ------
        NotImplementedError
            Raised if the given transfer mode is not supported.
        """
        raise NotImplementedError(f"Transfer mode {transfer} not supported.")
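    # Illustrative sketch (comments only): exporting dataset files into a
    # staging directory, for a datastore subclass that implements ``export``.
    # ``refs`` and the directory name are hypothetical.
    #
    #     exported = list(datastore.export(refs, directory="staging", transfer="copy"))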

    @abstractmethod
    def validateConfiguration(
        self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
    ) -> None:
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.

        Notes
        -----
        Which parts of the configuration are validated is at the discretion
        of each Datastore implementation.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
        """Validate a specific look up key with supplied entity.

        Parameters
        ----------
        lookupKey : `LookupKey`
            Key to use to retrieve information from the datastore
            configuration.
        entity : `DatasetRef`, `DatasetType`, or `StorageClass`
            Entity to compare with configuration retrieved using the
            specified lookup key.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a problem with the combination of entity
            and lookup key.

        Notes
        -----
        Bypasses the normal selection priorities by allowing a key that
        would normally not be selected to be validated.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def getLookupKeys(self) -> set[LookupKey]:
        """Return all the lookup keys relevant to this datastore.

        Returns
        -------
        keys : `set` of `LookupKey`
            The keys stored internally for looking up information based
            on `DatasetType` name or `StorageClass`.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def needs_expanded_data_ids(
        self,
        transfer: str | None,
        entity: DatasetRef | DatasetType | StorageClass | None = None,
    ) -> bool:
        """Test whether this datastore needs expanded data IDs to ingest.

        Parameters
        ----------
        transfer : `str` or `None`
            Transfer mode for ingest.
        entity, optional
            Object representing what will be ingested. If not provided (or not
            specific enough), `True` may be returned even if expanded data
            IDs aren't necessary.

        Returns
        -------
        needed : `bool`
            If `True`, expanded data IDs may be needed. `False` only if
            expansion definitely isn't necessary.
        """
        return True

    @abstractmethod
    def import_records(
        self,
        data: Mapping[str, DatastoreRecordData],
    ) -> None:
        """Import datastore location and record data from an in-memory data
        structure.

        Parameters
        ----------
        data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ]
            Datastore records indexed by datastore name. May contain data for
            other `Datastore` instances (generally because they are chained to
            this one), which should be ignored.

        Notes
        -----
        Implementations should generally not check that any external resources
        (e.g. files) referred to by these records actually exist, for
        performance reasons; we expect higher-level code to guarantee that
        they do.

        Implementations are responsible for calling
        `DatastoreRegistryBridge.insert` on all datasets in ``data.locations``
        where the key is in `names`, as well as loading any opaque table data.
        """
        raise NotImplementedError()

    @abstractmethod
    def export_records(
        self,
        refs: Iterable[DatasetIdRef],
    ) -> Mapping[str, DatastoreRecordData]:
        """Export datastore records and locations to an in-memory data
        structure.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetIdRef` ]
            Datasets to save. This may include datasets not known to this
            datastore, which should be ignored.

        Returns
        -------
        data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ]
            Exported datastore records indexed by datastore name.
        """
        raise NotImplementedError()

    def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
        """Specify a method that can be used by the datastore to retrieve a
        registry-defined dataset type.

        Parameters
        ----------
        method : `~collections.abc.Callable` | `None`
            Method that takes the name of a dataset type and returns the
            corresponding `DatasetType` instance as defined in Registry. If
            the dataset type name is not known to the registry, `None` is
            returned.

        Notes
        -----
        This method is only needed for a Datastore supporting a "trusted" mode
        when it does not have access to datastore records and needs to
        guess dataset locations based on its stored dataset types.
        """
        pass
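# Illustrative sketch (not part of this module): wiring a dataset-type lookup
# into a datastore that supports a "trusted" mode. ``lookup`` is any callable
# returning the registry-defined DatasetType, or `None` for unknown names.
def _example_set_dataset_type_hook(
    datastore: Datastore, lookup: Callable[[str], DatasetType | None]
) -> None:
    datastore.set_retrieve_dataset_type_method(lookup)
    # Passing None removes the hook again.
    datastore.set_retrieve_dataset_type_method(None)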