Coverage for python/lsst/daf/butler/_butler.py: 8%

704 statements  

coverage.py v6.5.0, created at 2023-02-28 02:30 -0800

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36import collections.abc 

37import contextlib 

38import logging 

39import numbers 

40import os 

41from collections import defaultdict 

42from typing import ( 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Sequence, 

53 Set, 

54 TextIO, 

55 Tuple, 

56 Type, 

57 Union, 

58) 

59 

60from lsst.resources import ResourcePath, ResourcePathExpression 

61from lsst.utils import doImportType 

62from lsst.utils.introspection import get_class_of 

63from lsst.utils.logging import VERBOSE, getLogger 

64 

65from ._butlerConfig import ButlerConfig 

66from ._butlerRepoIndex import ButlerRepoIndex 

67from ._deferredDatasetHandle import DeferredDatasetHandle 

68from ._limited_butler import LimitedButler 

69from .core import ( 

70 AmbiguousDatasetError, 

71 Config, 

72 ConfigSubset, 

73 DataCoordinate, 

74 DataId, 

75 DataIdValue, 

76 DatasetRef, 

77 DatasetRefURIs, 

78 DatasetType, 

79 Datastore, 

80 Dimension, 

81 DimensionConfig, 

82 DimensionElement, 

83 DimensionRecord, 

84 DimensionUniverse, 

85 FileDataset, 

86 Progress, 

87 StorageClass, 

88 StorageClassFactory, 

89 Timespan, 

90 ValidationError, 

91) 

92from .core.repoRelocation import BUTLER_ROOT_TAG 

93from .core.utils import transactional 

94from .registry import ( 

95 CollectionType, 

96 ConflictingDefinitionError, 

97 DataIdError, 

98 DatasetIdGenEnum, 

99 MissingDatasetTypeError, 

100 Registry, 

101 RegistryConfig, 

102 RegistryDefaults, 

103) 

104from .transfers import RepoExportContext 

105 

106log = getLogger(__name__) 

107 

108 

109class ButlerValidationError(ValidationError): 

110 """There is a problem with the Butler configuration.""" 

111 

112 pass 

113 

114 

115class PruneCollectionsArgsError(TypeError): 

116 """Base class for errors relating to Butler.pruneCollections input 

117 arguments. 

118 """ 

119 

120 pass 

121 

122 

123class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

124 """Raised when purge and unstore are both required to be True, and 

125 purge is True but unstore is False. 

126 """ 

127 

128 def __init__(self) -> None: 

129 super().__init__("Cannot pass purge=True without unstore=True.") 

130 

131 

132class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

133 """Raised when pruning a RUN collection but purge is False.""" 

134 

135 def __init__(self, collectionType: CollectionType): 

136 self.collectionType = collectionType 

137 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

138 

139 

140class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

141 """Raised when purge is True but purging is not supported for the

142 given collection type."""

143 

144 def __init__(self, collectionType: CollectionType): 

145 self.collectionType = collectionType 

146 super().__init__( 

147 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True." 

148 ) 

149 

150 

151class Butler(LimitedButler): 

152 """Main entry point for the data access system. 

153 

154 Parameters 

155 ---------- 

156 config : `ButlerConfig`, `Config` or `str`, optional

157 Configuration. Anything acceptable to the 

158 `ButlerConfig` constructor. If a directory path 

159 is given the configuration will be read from a ``butler.yaml`` file in 

160 that location. If `None` is given default values will be used. 

161 butler : `Butler`, optional

162 If provided, construct a new Butler that uses the same registry and 

163 datastore as the given one, but with the given collection and run. 

164 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

165 arguments. 

166 collections : `str` or `Iterable` [ `str` ], optional 

167 An expression specifying the collections to be searched (in order) when 

168 reading datasets. 

169 This may be a `str` collection name or an iterable thereof. 

170 See :ref:`daf_butler_collection_expressions` for more information. 

171 These collections are not registered automatically and must be 

172 manually registered before they are used by any method, but they may be 

173 manually registered after the `Butler` is initialized. 

174 run : `str`, optional 

175 Name of the `~CollectionType.RUN` collection new datasets should be 

176 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

177 ``collections`` will be set to ``[run]``. If not `None`, this 

178 collection will automatically be registered. If this is not set (and 

179 ``writeable`` is not set either), a read-only butler will be created. 

180 searchPaths : `list` of `str`, optional 

181 Directory paths to search when calculating the full Butler 

182 configuration. Not used if the supplied config is already a 

183 `ButlerConfig`. 

184 writeable : `bool`, optional 

185 Explicitly sets whether the butler supports write operations. If not 

186 provided, a read-write butler is created if ``run`` is provided and a

187 read-only butler is created otherwise.

188 inferDefaults : `bool`, optional 

189 If `True` (default) infer default data ID values from the values 

190 present in the datasets in ``collections``: if all collections have the 

191 same value (or no value) for a governor dimension, that value will be 

192 the default for that dimension. Nonexistent collections are ignored. 

193 If a default value is provided explicitly for a governor dimension via 

194 ``**kwargs``, no default will be inferred for that dimension. 

195 **kwargs : `str` 

196 Default data ID key-value pairs. These may only identify "governor" 

197 dimensions like ``instrument`` and ``skymap``. 

198 

199 Examples 

200 -------- 

201 While there are many ways to control exactly how a `Butler` interacts with 

202 the collections in its `Registry`, the most common cases are still simple. 

203 

204 For a read-only `Butler` that searches one collection, do:: 

205 

206 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

207 

208 For a read-write `Butler` that writes to and reads from a 

209 `~CollectionType.RUN` collection:: 

210 

211 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

212 

213 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

214 because we want to write to one `~CollectionType.RUN` collection but read 

215 from several others (as well):: 

216 

217 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

218 collections=["u/alice/DM-50000/a", 

219 "u/bob/DM-49998", 

220 "HSC/defaults"]) 

221 

222 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

223 Datasets will be read first from that run (since it appears first in the 

224 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

225 

226 Finally, one can always create a `Butler` with no collections:: 

227 

228 butler = Butler("/path/to/repo", writeable=True) 

229 

230 This can be extremely useful when you just want to use ``butler.registry``, 

231 e.g. for inserting dimension data or managing collections, or when the 

232 collections you want to use with the butler are not consistent. 

233 Passing ``writeable`` explicitly here is only necessary if you want to be 

234 able to make changes to the repo - usually the value for ``writeable`` can 

235 be guessed from the collection arguments provided, but it defaults to 

236 `False` when there are no collection arguments.
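
Default data ID values can also be supplied as keyword arguments. A minimal
sketch, assuming the repository defines an ``instrument`` governor dimension
(the collection and instrument names below are illustrative only)::

    butler = Butler("/path/to/repo", collections=["HSC/defaults"],
                    instrument="HSC")

With ``inferDefaults=True`` (the default), an equivalent default is often
inferred automatically when every listed collection shares a single value
for that governor dimension.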

237 """ 

238 

239 def __init__( 

240 self, 

241 config: Union[Config, str, None] = None, 

242 *, 

243 butler: Optional[Butler] = None, 

244 collections: Any = None, 

245 run: Optional[str] = None, 

246 searchPaths: Optional[List[str]] = None, 

247 writeable: Optional[bool] = None, 

248 inferDefaults: bool = True, 

249 **kwargs: str, 

250 ): 

251 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

252 # Load registry, datastore, etc. from config or existing butler. 

253 if butler is not None: 

254 if config is not None or searchPaths is not None or writeable is not None: 

255 raise TypeError( 

256 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

257 ) 

258 self.registry = butler.registry.copy(defaults) 

259 self.datastore = butler.datastore 

260 self.storageClasses = butler.storageClasses 

261 self._config: ButlerConfig = butler._config 

262 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

263 else: 

264 # Can only look for strings in the known repos list. 

265 if isinstance(config, str) and config in self.get_known_repos(): 

266 config = str(self.get_repo_uri(config)) 

267 try: 

268 self._config = ButlerConfig(config, searchPaths=searchPaths) 

269 except FileNotFoundError as e: 

270 if known := self.get_known_repos(): 

271 aliases = f"(known aliases: {', '.join(known)})" 

272 else: 

273 aliases = "(no known aliases)" 

274 raise FileNotFoundError(f"{e} {aliases}") from e 

275 self._config = ButlerConfig(config, searchPaths=searchPaths) 

276 try: 

277 if "root" in self._config: 

278 butlerRoot = self._config["root"] 

279 else: 

280 butlerRoot = self._config.configDir 

281 if writeable is None: 

282 writeable = run is not None 

283 self.registry = Registry.fromConfig( 

284 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

285 ) 

286 self.datastore = Datastore.fromConfig( 

287 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

288 ) 

289 self.storageClasses = StorageClassFactory() 

290 self.storageClasses.addFromConfig(self._config) 

291 self._allow_put_of_predefined_dataset = self._config.get( 

292 "allow_put_of_predefined_dataset", False 

293 ) 

294 except Exception: 

295 # Failures here usually mean that configuration is incomplete, 

296 # just issue an error message which includes config file URI. 

297 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

298 raise 

299 

300 # For execution butler the datastore needs a special 

301 # dependency-inversion trick. This is not used by regular butler, 

302 # but we do not have a way to distinguish regular butler from execution 

303 # butler. 

304 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

305 

306 if "run" in self._config or "collection" in self._config: 

307 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

308 

309 GENERATION: ClassVar[int] = 3 

310 """This is a Generation 3 Butler. 

311 

312 This attribute may be removed in the future, once the Generation 2 Butler 

313 interface has been fully retired; it should only be used in transitional 

314 code. 

315 """ 

316 

317 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

318 """Return DatasetType defined in registry given dataset type name.""" 

319 try: 

320 return self.registry.getDatasetType(name) 

321 except MissingDatasetTypeError: 

322 return None 

323 

324 @classmethod 

325 def get_repo_uri(cls, label: str) -> ResourcePath: 

326 """Look up the label in a butler repository index. 

327 

328 Parameters 

329 ---------- 

330 label : `str` 

331 Label of the Butler repository to look up. 

332 

333 Returns 

334 ------- 

335 uri : `lsst.resources.ResourcePath` 

336 URI to the Butler repository associated with the given label. 

337 

338 Raises 

339 ------ 

340 KeyError 

341 Raised if the label is not found in the index, or if an index 

342 cannot be found at all.

343 

344 Notes 

345 ----- 

346 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

347 information is discovered. 
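
Examples
--------
A minimal sketch, assuming the repository index defines a label; the
``"main"`` label used here is illustrative only::

    uri = Butler.get_repo_uri("main")
    butler = Butler(str(uri))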

348 """ 

349 return ButlerRepoIndex.get_repo_uri(label) 

350 

351 @classmethod 

352 def get_known_repos(cls) -> Set[str]: 

353 """Retrieve the list of known repository labels. 

354 

355 Returns 

356 ------- 

357 repos : `set` of `str` 

358 All the known labels. Can be empty if no index can be found. 

359 

360 Notes 

361 ----- 

362 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

363 information is discovered. 
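
Examples
--------
A minimal sketch for listing whatever labels the index provides (the output
depends entirely on the local index and may be empty)::

    for label in sorted(Butler.get_known_repos()):
        print(label, Butler.get_repo_uri(label))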

364 """ 

365 return ButlerRepoIndex.get_known_repos() 

366 

367 @staticmethod 

368 def makeRepo( 

369 root: ResourcePathExpression, 

370 config: Union[Config, str, None] = None, 

371 dimensionConfig: Union[Config, str, None] = None, 

372 standalone: bool = False, 

373 searchPaths: Optional[List[str]] = None, 

374 forceConfigRoot: bool = True, 

375 outfile: Optional[ResourcePathExpression] = None, 

376 overwrite: bool = False, 

377 ) -> Config: 

378 """Create an empty data repository by adding a butler.yaml config 

379 to a repository root directory. 

380 

381 Parameters 

382 ---------- 

383 root : `lsst.resources.ResourcePathExpression` 

384 Path or URI to the root location of the new repository. Will be 

385 created if it does not exist. 

386 config : `Config` or `str`, optional 

387 Configuration to write to the repository, after setting any 

388 root-dependent Registry or Datastore config options. Can not 

389 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

390 configuration will be used. Root-dependent config options 

391 specified in this config are overwritten if ``forceConfigRoot`` 

392 is `True`. 

393 dimensionConfig : `Config` or `str`, optional 

394 Configuration for dimensions, will be used to initialize registry 

395 database. 

396 standalone : `bool` 

397 If True, write all expanded defaults, not just customized or 

398 repository-specific settings. 

399 This (mostly) decouples the repository from the default 

400 configuration, insulating it from changes to the defaults (which 

401 may be good or bad, depending on the nature of the changes). 

402 Future *additions* to the defaults will still be picked up when 

403 initializing `Butlers` to repos created with ``standalone=True``. 

404 searchPaths : `list` of `str`, optional 

405 Directory paths to search when calculating the full butler 

406 configuration. 

407 forceConfigRoot : `bool`, optional 

408 If `False`, any values present in the supplied ``config`` that 

409 would normally be reset are not overridden and will appear 

410 directly in the output config. This allows non-standard overrides 

411 of the root directory for a datastore or registry to be given. 

412 If this parameter is `True` the values for ``root`` will be 

413 forced into the resulting config if appropriate. 

414 outfile : `lsst.resources.ResourcePathExpression`, optional

415 If not-`None`, the output configuration will be written to this 

416 location rather than into the repository itself. Can be a URI 

417 string. Can refer to a directory that will be used to write 

418 ``butler.yaml``. 

419 overwrite : `bool`, optional 

420 Create a new configuration file even if one already exists 

421 in the specified output location. Default is to raise 

422 an exception. 

423 

424 Returns 

425 ------- 

426 config : `Config` 

427 The updated `Config` instance written to the repo. 

428 

429 Raises 

430 ------ 

431 ValueError 

432 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

433 regular Config (as these subclasses would make it impossible to 

434 support ``standalone=False``). 

435 FileExistsError 

436 Raised if the output config file already exists. 

437 os.error 

438 Raised if the directory does not exist, exists but is not a 

439 directory, or cannot be created. 

440 

441 Notes 

442 ----- 

443 Note that when ``standalone=False`` (the default), the configuration 

444 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

445 construct the repository should also be used to construct any Butlers 

446 to avoid configuration inconsistencies. 
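
Examples
--------
A minimal sketch; the repository root used here is illustrative and any
writable location would do::

    config = Butler.makeRepo("/tmp/new_repo")
    butler = Butler("/tmp/new_repo", writeable=True)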

447 """ 

448 if isinstance(config, (ButlerConfig, ConfigSubset)): 

449 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

450 

451 # Ensure that the root of the repository exists or can be made 

452 root_uri = ResourcePath(root, forceDirectory=True) 

453 root_uri.mkdir() 

454 

455 config = Config(config) 

456 

457 # If we are creating a new repo from scratch with relative roots, 

458 # do not propagate an explicit root from the config file 

459 if "root" in config: 

460 del config["root"] 

461 

462 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

463 imported_class = doImportType(full["datastore", "cls"]) 

464 if not issubclass(imported_class, Datastore): 

465 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

466 datastoreClass: Type[Datastore] = imported_class 

467 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

468 

469 # if key exists in given config, parse it, otherwise parse the defaults 

470 # in the expanded config 

471 if config.get(("registry", "db")): 

472 registryConfig = RegistryConfig(config) 

473 else: 

474 registryConfig = RegistryConfig(full) 

475 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

476 if defaultDatabaseUri is not None: 

477 Config.updateParameters( 

478 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

479 ) 

480 else: 

481 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

482 

483 if standalone: 

484 config.merge(full) 

485 else: 

486 # Always expand the registry.managers section into the per-repo 

487 # config, because after the database schema is created, it's not 

488 # allowed to change anymore. Note that in the standalone=True 

489 # branch, _everything_ in the config is expanded, so there's no 

490 # need to special case this. 

491 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

492 configURI: ResourcePathExpression 

493 if outfile is not None: 

494 # When writing to a separate location we must include 

495 # the root of the butler repo in the config else it won't know 

496 # where to look. 

497 config["root"] = root_uri.geturl() 

498 configURI = outfile 

499 else: 

500 configURI = root_uri 

501 # Strip obscore configuration, if it is present, before writing config 

502 # to a file; the obscore config will be stored in the registry.

503 config_to_write = config 

504 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config: 

505 config_to_write = config.copy() 

506 del config_to_write[obscore_config_key] 

507 config_to_write.dumpToUri(configURI, overwrite=overwrite) 

508 

509 # Create Registry and populate tables 

510 registryConfig = RegistryConfig(config.get("registry")) 

511 dimensionConfig = DimensionConfig(dimensionConfig) 

512 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

513 

514 log.verbose("Wrote new Butler configuration file to %s", configURI) 

515 

516 return config 

517 

518 @classmethod 

519 def _unpickle( 

520 cls, 

521 config: ButlerConfig, 

522 collections: Optional[tuple[str, ...]], 

523 run: Optional[str], 

524 defaultDataId: Dict[str, str], 

525 writeable: bool, 

526 ) -> Butler: 

527 """Callable used to unpickle a Butler. 

528 

529 We prefer not to use ``Butler.__init__`` directly so we can force some 

530 of its many arguments to be keyword-only (note that ``__reduce__`` 

531 can only invoke callables with positional arguments). 

532 

533 Parameters 

534 ---------- 

535 config : `ButlerConfig` 

536 Butler configuration, already coerced into a true `ButlerConfig` 

537 instance (and hence after any search paths for overrides have been 

538 utilized). 

539 collections : `tuple` [ `str` ] 

540 Names of the default collections to read from. 

541 run : `str`, optional 

542 Name of the default `~CollectionType.RUN` collection to write to. 

543 defaultDataId : `dict` [ `str`, `str` ] 

544 Default data ID values. 

545 writeable : `bool` 

546 Whether the Butler should support write operations. 

547 

548 Returns 

549 ------- 

550 butler : `Butler` 

551 A new `Butler` instance. 

552 """ 

553 # MyPy doesn't recognize that the kwargs below are totally valid; it 

554 # seems to think ``**defaultDataId`` is a _positional_ argument!

555 return cls( 

556 config=config, 

557 collections=collections, 

558 run=run, 

559 writeable=writeable, 

560 **defaultDataId, # type: ignore 

561 ) 

562 

563 def __reduce__(self) -> tuple: 

564 """Support pickling.""" 

565 return ( 

566 Butler._unpickle, 

567 ( 

568 self._config, 

569 self.collections, 

570 self.run, 

571 self.registry.defaults.dataId.byName(), 

572 self.registry.isWriteable(), 

573 ), 

574 ) 
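
    # A minimal round-trip sketch of the pickle support above (illustrative;
    # it assumes ``butler`` was constructed against a reachable repository):
    #
    #     import pickle
    #     clone = pickle.loads(pickle.dumps(butler))
    #     assert clone.isWriteable() == butler.isWriteable()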

575 

576 def __str__(self) -> str: 

577 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

578 self.collections, self.run, self.datastore, self.registry 

579 ) 

580 

581 def isWriteable(self) -> bool: 

582 """Return `True` if this `Butler` supports write operations.""" 

583 return self.registry.isWriteable() 

584 

585 @contextlib.contextmanager 

586 def transaction(self) -> Iterator[None]: 

587 """Context manager supporting `Butler` transactions. 

588 

589 Transactions can be nested. 
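
Examples
--------
A minimal sketch; ``obj1``, ``obj2``, ``dataId1``, ``dataId2`` and the
dataset type and run names are placeholders for values defined elsewhere.
If either `put` fails, both the registry and datastore changes are rolled
back::

    with butler.transaction():
        butler.put(obj1, "raw", dataId1, run="u/alice/ingest")
        butler.put(obj2, "raw", dataId2, run="u/alice/ingest")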

590 """ 

591 with self.registry.transaction(): 

592 with self.datastore.transaction(): 

593 yield 

594 

595 def _standardizeArgs( 

596 self, 

597 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

598 dataId: Optional[DataId] = None, 

599 for_put: bool = True, 

600 **kwargs: Any, 

601 ) -> Tuple[DatasetType, Optional[DataId]]: 

602 """Standardize the arguments passed to several Butler APIs. 

603 

604 Parameters 

605 ---------- 

606 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

607 When `DatasetRef` the `dataId` should be `None`. 

608 Otherwise the `DatasetType` or name thereof. 

609 dataId : `dict` or `DataCoordinate` 

610 A `dict` of `Dimension` link name, value pairs that label the 

611 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

612 should be provided as the second argument. 

613 for_put : `bool`, optional 

614 If `True` this call is invoked as part of a `Butler.put()`. 

615 Otherwise it is assumed to be part of a `Butler.get()`. This 

616 parameter is only relevant if there is dataset type 

617 inconsistency. 

618 **kwargs 

619 Additional keyword arguments used to augment or construct a 

620 `DataCoordinate`. See `DataCoordinate.standardize` 

621 parameters. 

622 

623 Returns 

624 ------- 

625 datasetType : `DatasetType` 

626 A `DatasetType` instance extracted from ``datasetRefOrType``. 

627 dataId : `dict` or `DataId`, optional 

628 Argument that can be used (along with ``kwargs``) to construct a 

629 `DataId`. 

630 

631 Notes 

632 ----- 

633 Butler APIs that conceptually need a DatasetRef also allow passing a 

634 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

635 keyword arguments that can be used to construct one) separately. This 

636 method accepts those arguments and always returns a true `DatasetType` 

637 and a `DataId` or `dict`. 

638 

639 Standardization of `dict` vs `DataId` is best handled by passing the 

640 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

641 generally similarly flexible. 

642 """ 

643 externalDatasetType: Optional[DatasetType] = None 

644 internalDatasetType: Optional[DatasetType] = None 

645 if isinstance(datasetRefOrType, DatasetRef): 

646 if dataId is not None or kwargs: 

647 raise ValueError("DatasetRef given, cannot use dataId as well") 

648 externalDatasetType = datasetRefOrType.datasetType 

649 dataId = datasetRefOrType.dataId 

650 else: 

651 # Don't check whether DataId is provided, because Registry APIs 

652 # can usually construct a better error message when it wasn't. 

653 if isinstance(datasetRefOrType, DatasetType): 

654 externalDatasetType = datasetRefOrType 

655 else: 

656 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

657 

658 # Check that they are self-consistent 

659 if externalDatasetType is not None: 

660 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

661 if externalDatasetType != internalDatasetType: 

662 # We can allow differences if they are compatible, depending 

663 # on whether this is a get or a put. A get requires that 

664 # the python type associated with the datastore can be 

665 # converted to the user type. A put requires that the user 

666 # supplied python type can be converted to the internal 

667 # type expected by registry. 

668 relevantDatasetType = internalDatasetType 

669 if for_put: 

670 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

671 else: 

672 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

673 relevantDatasetType = externalDatasetType 

674 if not is_compatible: 

675 raise ValueError( 

676 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

677 f"registry definition ({internalDatasetType})" 

678 ) 

679 # Override the internal definition. 

680 internalDatasetType = relevantDatasetType 

681 

682 assert internalDatasetType is not None 

683 return internalDatasetType, dataId 

684 

685 def _rewrite_data_id( 

686 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any 

687 ) -> Tuple[Optional[DataId], Dict[str, Any]]: 

688 """Rewrite a data ID taking into account dimension records. 

689 

690 Take a Data ID and keyword args and rewrite it if necessary to 

691 allow the user to specify dimension records rather than dimension 

692 primary values. 

693 

694 This allows a user to include a dataId dict with keys of 

695 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

696 the integer exposure ID. It also allows a string to be given 

697 for a dimension value rather than the integer ID if that is more 

698 convenient. For example, rather than having to specify the

699 detector with ``detector.full_name``, a string given for ``detector`` 

700 will be interpreted as the full name and converted to the integer 

701 value. 

702 

703 Keyword arguments can also use strings for dimensions like detector 

704 and exposure, but Python does not allow them to include ``.``, and

705 so the ``exposure.day_obs`` syntax cannot be used in a keyword

706 argument. 
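
As a sketch of the equivalence (the dataset type, instrument name and
numeric values here are illustrative and assume matching exposure records
exist)::

    butler.get("raw", exposure=20221201123, instrument="LSSTCam", detector=10)
    butler.get("raw", {"exposure.day_obs": 20221201, "exposure.seq_num": 123},
               instrument="LSSTCam", detector=10)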

707 

708 Parameters 

709 ---------- 

710 dataId : `dict` or `DataCoordinate` 

711 A `dict` of `Dimension` link name, value pairs that will label the 

712 `DatasetRef` within a Collection. 

713 datasetType : `DatasetType` 

714 The dataset type associated with this dataId. Required to 

715 determine the relevant dimensions. 

716 **kwargs 

717 Additional keyword arguments used to augment or construct a 

718 `DataId`. See `DataId` parameters. 

719 

720 Returns 

721 ------- 

722 dataId : `dict` or `DataCoordinate` 

723 The possibly rewritten dataId. If given a `DataCoordinate` and

724 no keyword arguments, the original dataId will be returned 

725 unchanged. 

726 **kwargs : `dict` 

727 Any unused keyword arguments (would normally be empty dict). 

728 """ 

729 # Do nothing if we have a standalone DataCoordinate. 

730 if isinstance(dataId, DataCoordinate) and not kwargs: 

731 return dataId, kwargs 

732 

733 # Process dimension records that are using record information 

734 # rather than ids 

735 newDataId: Dict[str, DataIdValue] = {} 

736 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

737 

738 # if all the dataId comes from keyword parameters we do not need 

739 # to do anything here because they can't be of the form 

740 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

741 if dataId: 

742 for k, v in dataId.items(): 

743 # If we have a Dimension we do not need to do anything 

744 # because it cannot be a compound key. 

745 if isinstance(k, str) and "." in k: 

746 # Someone is using a more human-readable dataId 

747 dimensionName, record = k.split(".", 1) 

748 byRecord[dimensionName][record] = v 

749 elif isinstance(k, Dimension): 

750 newDataId[k.name] = v 

751 else: 

752 newDataId[k] = v 

753 

754 # Go through the updated dataId and check the type in case someone is 

755 # using an alternate key. We have already filtered out the compound 

756 # keys dimensions.record format. 

757 not_dimensions = {} 

758 

759 # Will need to look in the dataId and the keyword arguments 

760 # and will remove them if they need to be fixed or are unrecognized. 

761 for dataIdDict in (newDataId, kwargs): 

762 # Use a list so we can adjust the dict safely in the loop 

763 for dimensionName in list(dataIdDict): 

764 value = dataIdDict[dimensionName] 

765 try: 

766 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

767 except KeyError: 

768 # This is not a real dimension 

769 not_dimensions[dimensionName] = value 

770 del dataIdDict[dimensionName] 

771 continue 

772 

773 # Convert an integral type to an explicit int to simplify 

774 # comparisons here 

775 if isinstance(value, numbers.Integral): 

776 value = int(value) 

777 

778 if not isinstance(value, dimension.primaryKey.getPythonType()): 

779 for alternate in dimension.alternateKeys: 

780 if isinstance(value, alternate.getPythonType()): 

781 byRecord[dimensionName][alternate.name] = value 

782 del dataIdDict[dimensionName] 

783 log.debug( 

784 "Converting dimension %s to %s.%s=%s", 

785 dimensionName, 

786 dimensionName, 

787 alternate.name, 

788 value, 

789 ) 

790 break 

791 else: 

792 log.warning( 

793 "Type mismatch found for value '%r' provided for dimension %s. " 

794 "Could not find matching alternative (primary key has type %s) " 

795 "so attempting to use as-is.", 

796 value, 

797 dimensionName, 

798 dimension.primaryKey.getPythonType(), 

799 ) 

800 

801 # By this point kwargs and newDataId should only include valid 

802 # dimensions. Merge kwargs in to the new dataId and log if there 

803 # are dimensions in both (rather than calling update). 

804 for k, v in kwargs.items(): 

805 if k in newDataId and newDataId[k] != v: 

806 log.debug( 

807 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

808 ) 

809 newDataId[k] = v 

810 # No need to retain any values in kwargs now. 

811 kwargs = {} 

812 

813 # If we have some unrecognized dimensions we have to try to connect 

814 # them to records in other dimensions. This is made more complicated 

815 # by some dimensions having records with clashing names. A mitigation 

816 # is that we can tell by this point which dimensions are missing 

817 # for the DatasetType but this does not work for calibrations 

818 # where additional dimensions can be used to constrain the temporal 

819 # axis. 

820 if not_dimensions: 

821 # Search for all dimensions even if we have been given a value 

822 # explicitly. In some cases records are given as well as the 

823 # actual dimension and this should not be an error if they

824 # match. 

825 mandatoryDimensions = datasetType.dimensions.names # - provided 

826 

827 candidateDimensions: Set[str] = set() 

828 candidateDimensions.update(mandatoryDimensions) 

829 

830 # For calibrations we may well be needing temporal dimensions 

831 # so rather than always including all dimensions in the scan 

832 # restrict things a little. It is still possible for there 

833 # to be confusion over day_obs in visit vs exposure for example. 

834 # If we are not searching calibration collections things may 

835 # fail but they are going to fail anyway because of the 

836 # ambiguity of the dataId...

837 if datasetType.isCalibration(): 

838 for dim in self.registry.dimensions.getStaticDimensions(): 

839 if dim.temporal: 

840 candidateDimensions.add(str(dim)) 

841 

842 # Look up table for the first association with a dimension 

843 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

844 

845 # Keep track of whether an item is associated with multiple 

846 # dimensions. 

847 counter: Counter[str] = Counter() 

848 assigned: Dict[str, Set[str]] = defaultdict(set) 

849 

850 # Go through the missing dimensions and associate the 

851 # given names with records within those dimensions 

852 matched_dims = set() 

853 for dimensionName in candidateDimensions: 

854 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

855 fields = dimension.metadata.names | dimension.uniqueKeys.names 

856 for field in not_dimensions: 

857 if field in fields: 

858 guessedAssociation[dimensionName][field] = not_dimensions[field] 

859 counter[dimensionName] += 1 

860 assigned[field].add(dimensionName) 

861 matched_dims.add(field) 

862 

863 # Calculate the fields that matched nothing. 

864 never_found = set(not_dimensions) - matched_dims 

865 

866 if never_found: 

867 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

868 

869 # There is a chance we have allocated a single dataId item 

870 # to multiple dimensions. Need to decide which should be retained. 

871 # For now assume that the most popular alternative wins. 

872 # This means that day_obs with seq_num will result in 

873 # exposure.day_obs and not visit.day_obs 

874 # Also prefer an explicitly missing dimension over an inferred 

875 # temporal dimension. 

876 for fieldName, assignedDimensions in assigned.items(): 

877 if len(assignedDimensions) > 1: 

878 # Pick the most popular (preferring mandatory dimensions) 

879 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

880 if requiredButMissing: 

881 candidateDimensions = requiredButMissing 

882 else: 

883 candidateDimensions = assignedDimensions 

884 

885 # If this is a choice between visit and exposure and 

886 # neither was a required part of the dataset type, 

887 # (hence in this branch) always prefer exposure over 

888 # visit since exposures are always defined and visits 

889 # are defined from exposures. 

890 if candidateDimensions == {"exposure", "visit"}: 

891 candidateDimensions = {"exposure"} 

892 

893 # Select the relevant items and get a new restricted 

894 # counter. 

895 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

896 duplicatesCounter: Counter[str] = Counter() 

897 duplicatesCounter.update(theseCounts) 

898 

899 # Choose the most common. If they are equally common 

900 # we will pick the one that was found first. 

901 # Returns a list of tuples 

902 selected = duplicatesCounter.most_common(1)[0][0] 

903 

904 log.debug( 

905 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

906 " Removed ambiguity by choosing dimension %s.", 

907 fieldName, 

908 ", ".join(assignedDimensions), 

909 selected, 

910 ) 

911 

912 for candidateDimension in assignedDimensions: 

913 if candidateDimension != selected: 

914 del guessedAssociation[candidateDimension][fieldName] 

915 

916 # Update the record look up dict with the new associations 

917 for dimensionName, values in guessedAssociation.items(): 

918 if values: # A dict might now be empty 

919 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

920 byRecord[dimensionName].update(values) 

921 

922 if byRecord: 

923 # Some record specifiers were found so we need to convert 

924 # them to the Id form 

925 for dimensionName, values in byRecord.items(): 

926 if dimensionName in newDataId: 

927 log.debug( 

928 "DataId specified explicit %s dimension value of %s in addition to" 

929 " general record specifiers for it of %s. Ignoring record information.", 

930 dimensionName, 

931 newDataId[dimensionName], 

932 str(values), 

933 ) 

934 # Get the actual record and compare with these values. 

935 try: 

936 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

937 except DataIdError: 

938 raise ValueError( 

939 f"Could not find dimension '{dimensionName}'" 

940 f" with dataId {newDataId} as part of comparing with" 

941 f" record values {byRecord[dimensionName]}" 

942 ) from None 

943 if len(recs) == 1: 

944 errmsg: List[str] = [] 

945 for k, v in values.items(): 

946 if (recval := getattr(recs[0], k)) != v: 

947 errmsg.append(f"{k}({recval} != {v})") 

948 if errmsg: 

949 raise ValueError( 

950 f"Dimension {dimensionName} in dataId has explicit value" 

951 " inconsistent with records: " + ", ".join(errmsg) 

952 ) 

953 else: 

954 # Multiple matches for an explicit dimension 

955 # should never happen but let downstream complain. 

956 pass 

957 continue 

958 

959 # Build up a WHERE expression 

960 bind = {k: v for k, v in values.items()} 

961 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

962 

963 # Hopefully we get a single record that matches 

964 records = set( 

965 self.registry.queryDimensionRecords( 

966 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

967 ) 

968 ) 

969 

970 if len(records) != 1: 

971 if len(records) > 1: 

972 # visit can have an ambiguous answer without involving 

973 # visit_system. The default visit_system is defined 

974 # by the instrument. 

975 if ( 

976 dimensionName == "visit" 

977 and "visit_system_membership" in self.registry.dimensions 

978 and "visit_system" in self.registry.dimensions["instrument"].metadata 

979 ): 

980 instrument_records = list( 

981 self.registry.queryDimensionRecords( 

982 "instrument", 

983 dataId=newDataId, 

984 **kwargs, 

985 ) 

986 ) 

987 if len(instrument_records) == 1: 

988 visit_system = instrument_records[0].visit_system 

989 if visit_system is None: 

990 # Set to a value that will never match. 

991 visit_system = -1 

992 

993 # Look up each visit in the 

994 # visit_system_membership records. 

995 for rec in records: 

996 membership = list( 

997 self.registry.queryDimensionRecords( 

998 # Use bind to allow zero results. 

999 # This is a fully-specified query. 

1000 "visit_system_membership", 

1001 where="instrument = inst AND visit_system = system AND visit = v", 

1002 bind=dict( 

1003 inst=instrument_records[0].name, system=visit_system, v=rec.id 

1004 ), 

1005 ) 

1006 ) 

1007 if membership: 

1008 # This record is the right answer. 

1009 records = set([rec]) 

1010 break 

1011 

1012 # The ambiguity may have been resolved so check again. 

1013 if len(records) > 1: 

1014 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

1015 for r in records: 

1016 log.debug("- %s", str(r)) 

1017 raise ValueError( 

1018 f"DataId specification for dimension {dimensionName} is not" 

1019 f" uniquely constrained to a single dataset by {values}." 

1020 f" Got {len(records)} results." 

1021 ) 

1022 else: 

1023 raise ValueError( 

1024 f"DataId specification for dimension {dimensionName} matched no" 

1025 f" records when constrained by {values}" 

1026 ) 

1027 

1028 # Get the primary key from the real dimension object 

1029 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

1030 if not isinstance(dimension, Dimension): 

1031 raise RuntimeError( 

1032 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

1033 ) 

1034 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

1035 

1036 return newDataId, kwargs 

1037 

1038 def _findDatasetRef( 

1039 self, 

1040 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1041 dataId: Optional[DataId] = None, 

1042 *, 

1043 collections: Any = None, 

1044 allowUnresolved: bool = False, 

1045 **kwargs: Any, 

1046 ) -> DatasetRef: 

1047 """Shared logic for methods that start with a search for a dataset in 

1048 the registry. 

1049 

1050 Parameters 

1051 ---------- 

1052 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1053 When `DatasetRef` the `dataId` should be `None`. 

1054 Otherwise the `DatasetType` or name thereof. 

1055 dataId : `dict` or `DataCoordinate`, optional 

1056 A `dict` of `Dimension` link name, value pairs that label the 

1057 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1058 should be provided as the first argument. 

1059 collections : Any, optional 

1060 Collections to be searched, overriding ``self.collections``. 

1061 Can be any of the types supported by the ``collections`` argument 

1062 to butler construction. 

1063 allowUnresolved : `bool`, optional 

1064 If `True`, return an unresolved `DatasetRef` if finding a resolved 

1065 one in the `Registry` fails. Defaults to `False`. 

1066 **kwargs 

1067 Additional keyword arguments used to augment or construct a 

1068 `DataId`. See `DataId` parameters. 

1069 

1070 Returns 

1071 ------- 

1072 ref : `DatasetRef` 

1073 A reference to the dataset identified by the given arguments. 

1074 

1075 Raises 

1076 ------ 

1077 LookupError 

1078 Raised if no matching dataset exists in the `Registry` (and 

1079 ``allowUnresolved is False``). 

1080 ValueError 

1081 Raised if a resolved `DatasetRef` was passed as an input, but it 

1082 differs from the one found in the registry. 

1083 TypeError 

1084 Raised if no collections were provided. 

1085 """ 

1086 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1087 if isinstance(datasetRefOrType, DatasetRef): 

1088 idNumber = datasetRefOrType.id 

1089 else: 

1090 idNumber = None 

1091 timespan: Optional[Timespan] = None 

1092 

1093 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1094 

1095 if datasetType.isCalibration(): 

1096 # Because this is a calibration dataset, first try to

1097 # standardize the data ID without restricting the dimensions to 

1098 # those of the dataset type requested, because there may be extra 

1099 # dimensions that provide temporal information for a validity-range 

1100 # lookup. 

1101 dataId = DataCoordinate.standardize( 

1102 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1103 ) 

1104 if dataId.graph.temporal: 

1105 dataId = self.registry.expandDataId(dataId) 

1106 timespan = dataId.timespan 

1107 else: 

1108 # Standardize the data ID to just the dimensions of the dataset 

1109 # type instead of letting registry.findDataset do it, so we get the 

1110 # result even if no dataset is found. 

1111 dataId = DataCoordinate.standardize( 

1112 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1113 ) 

1114 # Always lookup the DatasetRef, even if one is given, to ensure it is 

1115 # present in the current collection. 

1116 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1117 if ref is None: 

1118 if allowUnresolved: 

1119 return DatasetRef(datasetType, dataId) 

1120 else: 

1121 if collections is None: 

1122 collections = self.registry.defaults.collections 

1123 raise LookupError( 

1124 f"Dataset {datasetType.name} with data ID {dataId} " 

1125 f"could not be found in collections {collections}." 

1126 ) 

1127 if idNumber is not None and idNumber != ref.id: 

1128 if collections is None: 

1129 collections = self.registry.defaults.collections 

1130 raise ValueError( 

1131 f"DatasetRef.id provided ({idNumber}) does not match " 

1132 f"id ({ref.id}) in registry in collections {collections}." 

1133 ) 

1134 if datasetType != ref.datasetType: 

1135 # If they differ it is because the user explicitly specified 

1136 # a compatible dataset type to this call rather than using the 

1137 # registry definition. The DatasetRef must therefore be recreated 

1138 # using the user definition such that the expected type is 

1139 # returned. 

1140 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1141 

1142 return ref 

1143 

1144 @transactional 

1145 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef: 

1146 # Docstring inherited. 

1147 (imported_ref,) = self.registry._importDatasets( 

1148 [ref], 

1149 expand=True, 

1150 ) 

1151 if imported_ref.id != ref.getCheckedId(): 

1152 raise RuntimeError("This registry configuration does not support putDirect.") 

1153 self.datastore.put(obj, ref) 

1154 return ref 

1155 

1156 @transactional 

1157 def put( 

1158 self, 

1159 obj: Any, 

1160 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1161 dataId: Optional[DataId] = None, 

1162 *, 

1163 run: Optional[str] = None, 

1164 **kwargs: Any, 

1165 ) -> DatasetRef: 

1166 """Store and register a dataset. 

1167 

1168 Parameters 

1169 ---------- 

1170 obj : `object` 

1171 The dataset. 

1172 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1173 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1174 Otherwise the `DatasetType` or name thereof. 

1175 dataId : `dict` or `DataCoordinate` 

1176 A `dict` of `Dimension` link name, value pairs that label the 

1177 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1178 should be provided as the second argument. 

1179 run : `str`, optional 

1180 The name of the run the dataset should be added to, overriding 

1181 ``self.run``. 

1182 **kwargs 

1183 Additional keyword arguments used to augment or construct a 

1184 `DataCoordinate`. See `DataCoordinate.standardize` 

1185 parameters. 

1186 

1187 Returns 

1188 ------- 

1189 ref : `DatasetRef` 

1190 A reference to the stored dataset, updated with the correct id if 

1191 given. 

1192 

1193 Raises 

1194 ------ 

1195 TypeError 

1196 Raised if the butler is read-only or if no run has been provided. 
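
Examples
--------
A minimal sketch, assuming a writeable butler with a default run and a
registered dataset type; the ``"calexp"`` name and data ID values are
illustrative only::

    ref = butler.put(exposure, "calexp",
                     instrument="HSC", visit=903334, detector=42)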

1197 """ 

1198 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1199 if not self.isWriteable(): 

1200 raise TypeError("Butler is read-only.") 

1201 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1202 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1203 raise ValueError("DatasetRef must not be in registry, must have None id") 

1204 

1205 # Handle dimension records in dataId 

1206 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1207 

1208 # Add Registry Dataset entry. 

1209 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1210 

1211 # For an execution butler the datasets will be pre-defined. 

1212 # If the butler is configured that way datasets should only be inserted 

1213 # if they do not already exist in registry. Trying and catching 

1214 # ConflictingDefinitionError will not work because the transaction 

1215 # will be corrupted. Instead, in this mode always check first. 

1216 ref = None 

1217 ref_is_predefined = False 

1218 if self._allow_put_of_predefined_dataset: 

1219 # Get the matching ref for this run. 

1220 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId) 

1221 

1222 if ref: 

1223 # Must be expanded form for datastore templating 

1224 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

1225 ref = ref.expanded(dataId) 

1226 ref_is_predefined = True 

1227 

1228 if not ref: 

1229 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1230 

1231 # If the ref is predefined it is possible that the datastore also 

1232 # has the record. Asking datastore to put it again will result in 

1233 # the artifact being recreated, overwriting the previous one; the

1234 # subsequent failure to write the record will then cause the artifact

1235 # to be removed. Much safer to ask first before attempting to 

1236 # overwrite. Race conditions should not be an issue for the 

1237 # execution butler environment. 

1238 if ref_is_predefined: 

1239 if self.datastore.knows(ref): 

1240 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")

1241 

1242 self.datastore.put(obj, ref) 

1243 

1244 return ref 

1245 

1246 def getDirect( 

1247 self, 

1248 ref: DatasetRef, 

1249 *, 

1250 parameters: Optional[Dict[str, Any]] = None, 

1251 storageClass: Optional[Union[StorageClass, str]] = None, 

1252 ) -> Any: 

1253 """Retrieve a stored dataset. 

1254 

1255 Unlike `Butler.get`, this method allows datasets outside the Butler's 

1256 collection to be read as long as the `DatasetRef` that identifies them 

1257 can be obtained separately. 

1258 

1259 Parameters 

1260 ---------- 

1261 ref : `DatasetRef` 

1262 Resolved reference to an already stored dataset. 

1263 parameters : `dict` 

1264 Additional StorageClass-defined options to control reading, 

1265 typically used to efficiently read only a subset of the dataset. 

1266 storageClass : `StorageClass` or `str`, optional 

1267 The storage class to be used to override the Python type 

1268 returned by this method. By default the returned type matches 

1269 the dataset type definition for this dataset. Specifying a 

1270 read `StorageClass` can force a different type to be returned. 

1271 This type must be compatible with the original type. 

1272 

1273 Returns 

1274 ------- 

1275 obj : `object` 

1276 The dataset. 

1277 """ 

1278 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1279 

1280 def getDirectDeferred( 

1281 self, 

1282 ref: DatasetRef, 

1283 *, 

1284 parameters: Union[dict, None] = None, 

1285 storageClass: str | StorageClass | None = None, 

1286 ) -> DeferredDatasetHandle: 

1287 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1288 from a resolved `DatasetRef`. 

1289 

1290 Parameters 

1291 ---------- 

1292 ref : `DatasetRef` 

1293 Resolved reference to an already stored dataset. 

1294 parameters : `dict` 

1295 Additional StorageClass-defined options to control reading, 

1296 typically used to efficiently read only a subset of the dataset. 

1297 storageClass : `StorageClass` or `str`, optional 

1298 The storage class to be used to override the Python type 

1299 returned by this method. By default the returned type matches 

1300 the dataset type definition for this dataset. Specifying a 

1301 read `StorageClass` can force a different type to be returned. 

1302 This type must be compatible with the original type. 

1303 

1304 Returns 

1305 ------- 

1306 obj : `DeferredDatasetHandle` 

1307 A handle which can be used to retrieve a dataset at a later time. 

1308 

1309 Raises 

1310 ------ 

1311 AmbiguousDatasetError 

1312 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

1313 """ 

1314 if ref.id is None: 

1315 raise AmbiguousDatasetError( 

1316 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1317 ) 

1318 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1319 

1320 def getDeferred( 

1321 self, 

1322 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1323 dataId: Optional[DataId] = None, 

1324 *, 

1325 parameters: Union[dict, None] = None, 

1326 collections: Any = None, 

1327 storageClass: str | StorageClass | None = None, 

1328 **kwargs: Any, 

1329 ) -> DeferredDatasetHandle: 

1330 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1331 after an immediate registry lookup. 

1332 

1333 Parameters 

1334 ---------- 

1335 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1336 When `DatasetRef` the `dataId` should be `None`. 

1337 Otherwise the `DatasetType` or name thereof. 

1338 dataId : `dict` or `DataCoordinate`, optional 

1339 A `dict` of `Dimension` link name, value pairs that label the 

1340 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1341 should be provided as the first argument. 

1342 parameters : `dict` 

1343 Additional StorageClass-defined options to control reading, 

1344 typically used to efficiently read only a subset of the dataset. 

1345 collections : Any, optional 

1346 Collections to be searched, overriding ``self.collections``. 

1347 Can be any of the types supported by the ``collections`` argument 

1348 to butler construction. 

1349 storageClass : `StorageClass` or `str`, optional 

1350 The storage class to be used to override the Python type 

1351 returned by this method. By default the returned type matches 

1352 the dataset type definition for this dataset. Specifying a 

1353 read `StorageClass` can force a different type to be returned. 

1354 This type must be compatible with the original type. 

1355 **kwargs 

1356 Additional keyword arguments used to augment or construct a 

1357 `DataId`. See `DataId` parameters. 

1358 

1359 Returns 

1360 ------- 

1361 obj : `DeferredDatasetHandle` 

1362 A handle which can be used to retrieve a dataset at a later time. 

1363 

1364 Raises 

1365 ------ 

1366 LookupError 

1367 Raised if no matching dataset exists in the `Registry`.

1369 ValueError 

1370 Raised if a resolved `DatasetRef` was passed as an input, but it 

1371 differs from the one found in the registry. 

1372 TypeError 

1373 Raised if no collections were provided. 
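
Examples
--------
A minimal sketch; the dataset type and data ID values are illustrative. The
registry lookup happens immediately, while the actual read is deferred until
``get`` is called on the handle::

    handle = butler.getDeferred("calexp",
                                instrument="HSC", visit=903334, detector=42)
    calexp = handle.get()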

1374 """ 

1375 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1376 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1377 

1378 def get( 

1379 self, 

1380 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1381 dataId: Optional[DataId] = None, 

1382 *, 

1383 parameters: Optional[Dict[str, Any]] = None, 

1384 collections: Any = None, 

1385 storageClass: Optional[Union[StorageClass, str]] = None, 

1386 **kwargs: Any, 

1387 ) -> Any: 

1388 """Retrieve a stored dataset. 

1389 

1390 Parameters 

1391 ---------- 

1392 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1393 When `DatasetRef` the `dataId` should be `None`. 

1394 Otherwise the `DatasetType` or name thereof. 

1395 dataId : `dict` or `DataCoordinate` 

1396 A `dict` of `Dimension` link name, value pairs that label the 

1397 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1398 should be provided as the first argument. 

1399 parameters : `dict` 

1400 Additional StorageClass-defined options to control reading, 

1401 typically used to efficiently read only a subset of the dataset. 

1402 collections : Any, optional 

1403 Collections to be searched, overriding ``self.collections``. 

1404 Can be any of the types supported by the ``collections`` argument 

1405 to butler construction. 

1406 storageClass : `StorageClass` or `str`, optional 

1407 The storage class to be used to override the Python type 

1408 returned by this method. By default the returned type matches 

1409 the dataset type definition for this dataset. Specifying a 

1410 read `StorageClass` can force a different type to be returned. 

1411 This type must be compatible with the original type. 

1412 **kwargs 

1413 Additional keyword arguments used to augment or construct a 

1414 `DataCoordinate`. See `DataCoordinate.standardize` 

1415 parameters. 

1416 

1417 Returns 

1418 ------- 

1419 obj : `object` 

1420 The dataset. 

1421 

1422 Raises 

1423 ------ 

1424 ValueError 

1425 Raised if a resolved `DatasetRef` was passed as an input, but it 

1426 differs from the one found in the registry. 

1427 LookupError 

1428 Raised if no matching dataset exists in the `Registry`. 

1429 TypeError 

1430 Raised if no collections were provided. 

1431 

1432 Notes 

1433 ----- 

1434 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1435 this method requires that the given data ID include temporal dimensions 

1436 beyond the dimensions of the dataset type itself, in order to find the 

1437 dataset with the appropriate validity range. For example, a "bias" 

1438 dataset with native dimensions ``{instrument, detector}`` could be 

1439 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1440 ``exposure`` is a temporal dimension. 
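
Examples
--------
A minimal sketch; the dataset type, data ID values and collection name
are illustrative and must already exist in the repository::

    bias = butler.get("bias", instrument="HSC", detector=42,
                      exposure=903334, collections="HSC/calib")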

1441 """ 

1442 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1443 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1444 return self.getDirect(ref, parameters=parameters, storageClass=storageClass) 

1445 

1446 def getURIs( 

1447 self, 

1448 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1449 dataId: Optional[DataId] = None, 

1450 *, 

1451 predict: bool = False, 

1452 collections: Any = None, 

1453 run: Optional[str] = None, 

1454 **kwargs: Any, 

1455 ) -> DatasetRefURIs: 

1456 """Returns the URIs associated with the dataset. 

1457 

1458 Parameters 

1459 ---------- 

1460 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1461 When `DatasetRef` the `dataId` should be `None`. 

1462 Otherwise the `DatasetType` or name thereof. 

1463 dataId : `dict` or `DataCoordinate` 

1464 A `dict` of `Dimension` link name, value pairs that label the 

1465 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1466 should be provided as the first argument. 

1467 predict : `bool` 

1468 If `True`, allow URIs to be returned for datasets that have not 

1469 been written. 

1470 collections : Any, optional 

1471 Collections to be searched, overriding ``self.collections``. 

1472 Can be any of the types supported by the ``collections`` argument 

1473 to butler construction. 

1474 run : `str`, optional 

1475 Run to use for predictions, overriding ``self.run``. 

1476 **kwargs 

1477 Additional keyword arguments used to augment or construct a 

1478 `DataCoordinate`. See `DataCoordinate.standardize` 

1479 parameters. 

1480 

1481 Returns 

1482 ------- 

1483 uris : `DatasetRefURIs` 

1484 The URI to the primary artifact associated with this dataset (if 

1485 the dataset was disassembled within the datastore this may be 

1486 `None`), and the URIs to any components associated with the dataset 

1487 artifact (this mapping can be empty if there are no components). 
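
Examples
--------
A hedged sketch; the data ID values and collection name are
illustrative. The result can be unpacked into the primary URI and a
mapping of component URIs::

    primary, components = butler.getURIs("bias", instrument="HSC",
                                         detector=42, exposure=903334,
                                         collections="HSC/calib")
    for name, uri in components.items():
        print(name, uri)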

1488 """ 

1489 ref = self._findDatasetRef( 

1490 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs 

1491 ) 

1492 if ref.id is None: # only possible if predict is True 

1493 if run is None: 

1494 run = self.run 

1495 if run is None: 

1496 raise TypeError("Cannot predict location with run=None.") 

1497 # Lie about ID, because we can't guess it, and only 

1498 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1499 ref = ref.resolved(id=0, run=run) 

1500 return self.datastore.getURIs(ref, predict) 

1501 

1502 def getURI( 

1503 self, 

1504 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1505 dataId: Optional[DataId] = None, 

1506 *, 

1507 predict: bool = False, 

1508 collections: Any = None, 

1509 run: Optional[str] = None, 

1510 **kwargs: Any, 

1511 ) -> ResourcePath: 

1512 """Return the URI to the Dataset. 

1513 

1514 Parameters 

1515 ---------- 

1516 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1517 When `DatasetRef` the `dataId` should be `None`. 

1518 Otherwise the `DatasetType` or name thereof. 

1519 dataId : `dict` or `DataCoordinate` 

1520 A `dict` of `Dimension` link name, value pairs that label the 

1521 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1522 should be provided as the first argument. 

1523 predict : `bool` 

1524 If `True`, allow URIs to be returned for datasets that have not 

1525 been written. 

1526 collections : Any, optional 

1527 Collections to be searched, overriding ``self.collections``. 

1528 Can be any of the types supported by the ``collections`` argument 

1529 to butler construction. 

1530 run : `str`, optional 

1531 Run to use for predictions, overriding ``self.run``. 

1532 **kwargs 

1533 Additional keyword arguments used to augment or construct a 

1534 `DataCoordinate`. See `DataCoordinate.standardize` 

1535 parameters. 

1536 

1537 Returns 

1538 ------- 

1539 uri : `lsst.resources.ResourcePath` 

1540 URI pointing to the Dataset within the datastore. If the 

1541 Dataset does not exist in the datastore, and if ``predict`` is 

1542 `True`, the URI will be a prediction and will include a URI 

1543 fragment "#predicted". 

1544 If the datastore does not have entities that relate well 

1545 to the concept of a URI, the returned URI string will be 

1546 descriptive. The returned URI is not guaranteed to be obtainable. 

1547 

1548 Raises 

1549 ------ 

1550 LookupError 

1551 Raised if a URI has been requested for a dataset that does not 

1552 exist and guessing is not allowed. 

1553 ValueError 

1554 Raised if a resolved `DatasetRef` was passed as an input, but it 

1555 differs from the one found in the registry. 

1556 TypeError 

1557 Raised if no collections were provided. 

1558 RuntimeError 

1559 Raised if a URI is requested for a dataset that consists of 

1560 multiple artifacts. 
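
Examples
--------
A hedged sketch; the data ID values and collection name are
illustrative::

    uri = butler.getURI("bias", instrument="HSC", detector=42,
                        exposure=903334, collections="HSC/calib")
    print(uri)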

1561 """ 

1562 primary, components = self.getURIs( 

1563 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1564 ) 

1565 

1566 if primary is None or components: 

1567 raise RuntimeError( 

1568 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1569 "Use Butler.getURIs() instead." 

1570 ) 

1571 return primary 

1572 

1573 def retrieveArtifacts( 

1574 self, 

1575 refs: Iterable[DatasetRef], 

1576 destination: ResourcePathExpression, 

1577 transfer: str = "auto", 

1578 preserve_path: bool = True, 

1579 overwrite: bool = False, 

1580 ) -> List[ResourcePath]: 

1581 """Retrieve the artifacts associated with the supplied refs. 

1582 

1583 Parameters 

1584 ---------- 

1585 refs : iterable of `DatasetRef` 

1586 The datasets for which artifacts are to be retrieved. 

1587 A single ref can result in multiple artifacts. The refs must 

1588 be resolved. 

1589 destination : `lsst.resources.ResourcePath` or `str` 

1590 Location to write the artifacts. 

1591 transfer : `str`, optional 

1592 Method to use to transfer the artifacts. Must be one of the options 

1593 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1594 "move" is not allowed. 

1595 preserve_path : `bool`, optional 

1596 If `True` the full path of the artifact within the datastore 

1597 is preserved. If `False` the final file component of the path 

1598 is used. 

1599 overwrite : `bool`, optional 

1600 If `True` allow transfers to overwrite existing files at the 

1601 destination. 

1602 

1603 Returns 

1604 ------- 

1605 targets : `list` of `lsst.resources.ResourcePath` 

1606 URIs of file artifacts in destination location. Order is not 

1607 preserved. 

1608 

1609 Notes 

1610 ----- 

1611 For non-file datastores the artifacts written to the destination 

1612 may not match the representation inside the datastore. For example, 

1613 a hierarchical data structure in a NoSQL database may well be stored 

1614 as a JSON file. 
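
Examples
--------
A hedged sketch that copies the artifacts for every "bias" dataset in an
illustrative collection to a local directory::

    refs = butler.registry.queryDatasets("bias", collections="HSC/calib")
    paths = butler.retrieveArtifacts(refs, "/tmp/bias-artifacts",
                                     transfer="copy")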

1615 """ 

1616 return self.datastore.retrieveArtifacts( 

1617 refs, 

1618 ResourcePath(destination), 

1619 transfer=transfer, 

1620 preserve_path=preserve_path, 

1621 overwrite=overwrite, 

1622 ) 

1623 

1624 def datasetExists( 

1625 self, 

1626 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1627 dataId: Optional[DataId] = None, 

1628 *, 

1629 collections: Any = None, 

1630 **kwargs: Any, 

1631 ) -> bool: 

1632 """Return True if the Dataset is actually present in the Datastore. 

1633 

1634 Parameters 

1635 ---------- 

1636 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1637 When `DatasetRef` the `dataId` should be `None`. 

1638 Otherwise the `DatasetType` or name thereof. 

1639 dataId : `dict` or `DataCoordinate` 

1640 A `dict` of `Dimension` link name, value pairs that label the 

1641 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1642 should be provided as the first argument. 

1643 collections : Any, optional 

1644 Collections to be searched, overriding ``self.collections``. 

1645 Can be any of the types supported by the ``collections`` argument 

1646 to butler construction. 

1647 **kwargs 

1648 Additional keyword arguments used to augment or construct a 

1649 `DataCoordinate`. See `DataCoordinate.standardize` 

1650 parameters. 

1651 

1652 Raises 

1653 ------ 

1654 LookupError 

1655 Raised if the dataset is not even present in the Registry. 

1656 ValueError 

1657 Raised if a resolved `DatasetRef` was passed as an input, but it 

1658 differs from the one found in the registry. 

1659 TypeError 

1660 Raised if no collections were provided. 
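
Examples
--------
A hedged sketch; the data ID values and collection name are
illustrative::

    exists = butler.datasetExists("bias", instrument="HSC", detector=42,
                                  exposure=903334,
                                  collections="HSC/calib")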

1661 """ 

1662 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1663 return self.datastore.exists(ref) 

1664 

1665 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1666 """Remove one or more `~CollectionType.RUN` collections and the 

1667 datasets within them. 

1668 

1669 Parameters 

1670 ---------- 

1671 names : `Iterable` [ `str` ] 

1672 The names of the collections to remove. 

1673 unstore : `bool`, optional 

1674 If `True` (default), delete datasets from all datastores in which 

1675 they are present, and attempt to roll back the registry deletions if 

1676 datastore deletions fail (which may not always be possible). If 

1677 `False`, datastore records for these datasets are still removed, 

1678 but any artifacts (e.g. files) will not be. 

1679 

1680 Raises 

1681 ------ 

1682 TypeError 

1683 Raised if one or more collections are not of type 

1684 `~CollectionType.RUN`. 
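
Examples
--------
A hedged sketch removing two illustrative RUN collections together with
their stored artifacts::

    butler.removeRuns(["u/alice/scratch-1", "u/alice/scratch-2"],
                      unstore=True)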

1685 """ 

1686 if not self.isWriteable(): 

1687 raise TypeError("Butler is read-only.") 

1688 names = list(names) 

1689 refs: List[DatasetRef] = [] 

1690 for name in names: 

1691 collectionType = self.registry.getCollectionType(name) 

1692 if collectionType is not CollectionType.RUN: 

1693 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1694 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1695 with self.datastore.transaction(): 

1696 with self.registry.transaction(): 

1697 if unstore: 

1698 self.datastore.trash(refs) 

1699 else: 

1700 self.datastore.forget(refs) 

1701 for name in names: 

1702 self.registry.removeCollection(name) 

1703 if unstore: 

1704 # Point of no return for removing artifacts 

1705 self.datastore.emptyTrash() 

1706 

1707 def pruneCollection( 

1708 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None 

1709 ) -> None: 

1710 """Remove a collection and possibly prune datasets within it. 

1711 

1712 Parameters 

1713 ---------- 

1714 name : `str` 

1715 Name of the collection to remove. If this is a 

1716 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1717 datasets within the collection are not modified unless ``unstore`` 

1718 is `True`. If this is a `~CollectionType.RUN` collection, 

1719 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1720 are fully removed from the data repository. 

1721 purge : `bool`, optional 

1722 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1723 fully removing datasets within them. Requires ``unstore=True`` as 

1724 well as an added precaution against accidental deletion. Must be 

1725 `False` (default) if the collection is not a ``RUN``. 

1726 unstore : `bool`, optional 

1727 If `True`, remove all datasets in the collection from all 

1728 datastores in which they appear. 

1729 unlink : `list` [`str`], optional 

1730 Before removing the given collection, unlink it from these 

1731 parent collections. 

1732 

1733 Raises 

1734 ------ 

1735 TypeError 

1736 Raised if the butler is read-only or arguments are mutually 

1737 inconsistent. 
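
Examples
--------
A hedged sketch; the collection names are illustrative::

    # Fully remove a RUN collection and the datasets within it.
    butler.pruneCollection("u/alice/scratch-run", purge=True,
                           unstore=True)
    # Remove a TAGGED collection, leaving its datasets alone.
    butler.pruneCollection("u/alice/tagged-selection")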

1738 """ 

1739 # See pruneDatasets comments for more information about the logic here; 

1740 # the cases are almost the same, but here we can rely on Registry to 

1741 # take care of everything but Datastore deletion when we remove the 

1742 # collection. 

1743 if not self.isWriteable(): 

1744 raise TypeError("Butler is read-only.") 

1745 collectionType = self.registry.getCollectionType(name) 

1746 if purge and not unstore: 

1747 raise PurgeWithoutUnstorePruneCollectionsError() 

1748 if collectionType is CollectionType.RUN and not purge: 

1749 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1750 if collectionType is not CollectionType.RUN and purge: 

1751 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1752 

1753 def remove(child: str, parent: str) -> None: 

1754 """Remove a child collection from a parent collection.""" 

1755 # Remove child from parent. 

1756 chain = list(self.registry.getCollectionChain(parent)) 

1757 try: 

1758 chain.remove(child) 

1759 except ValueError as e: 

1760 raise RuntimeError(f"{child} is not a child of {parent}") from e 

1761 self.registry.setCollectionChain(parent, chain) 

1762 

1763 with self.datastore.transaction(): 

1764 with self.registry.transaction(): 

1765 if unlink: 

1766 for parent in unlink: 

1767 remove(name, parent) 

1768 if unstore: 

1769 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1770 self.datastore.trash(refs) 

1771 self.registry.removeCollection(name) 

1772 

1773 if unstore: 

1774 # Point of no return for removing artifacts 

1775 self.datastore.emptyTrash() 

1776 

1777 def pruneDatasets( 

1778 self, 

1779 refs: Iterable[DatasetRef], 

1780 *, 

1781 disassociate: bool = True, 

1782 unstore: bool = False, 

1783 tags: Iterable[str] = (), 

1784 purge: bool = False, 

1785 ) -> None: 

1786 # docstring inherited from LimitedButler 

1787 

1788 if not self.isWriteable(): 

1789 raise TypeError("Butler is read-only.") 

1790 if purge: 

1791 if not disassociate: 

1792 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1793 if not unstore: 

1794 raise TypeError("Cannot pass purge=True without unstore=True.") 

1795 elif disassociate: 

1796 tags = tuple(tags) 

1797 if not tags: 

1798 raise TypeError("No tags provided but disassociate=True.") 

1799 for tag in tags: 

1800 collectionType = self.registry.getCollectionType(tag) 

1801 if collectionType is not CollectionType.TAGGED: 

1802 raise TypeError( 

1803 f"Cannot disassociate from collection '{tag}' " 

1804 f"of non-TAGGED type {collectionType.name}." 

1805 ) 

1806 # For an execution butler we want to keep existing UUIDs for the 

1807 # datasets, for that we need to keep them in the collections but 

1808 # remove from datastore. 

1809 if self._allow_put_of_predefined_dataset and purge: 

1810 purge = False 

1811 disassociate = False 

1812 # Transform possibly-single-pass iterable into something we can iterate 

1813 # over multiple times. 

1814 refs = list(refs) 

1815 # Pruning a component of a DatasetRef makes no sense since registry 

1816 # doesn't know about components and datastore might not store 

1817 # components in a separate file 

1818 for ref in refs: 

1819 if ref.datasetType.component(): 

1820 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1821 # We don't need an unreliable Datastore transaction for this, because 

1822 # we've been extra careful to ensure that Datastore.trash only involves 

1823 # mutating the Registry (it can _look_ at Datastore-specific things, 

1824 # but shouldn't change them), and hence all operations here are 

1825 # Registry operations. 

1826 with self.datastore.transaction(): 

1827 with self.registry.transaction(): 

1828 if unstore: 

1829 self.datastore.trash(refs) 

1830 if purge: 

1831 self.registry.removeDatasets(refs) 

1832 elif disassociate: 

1833 assert tags, "Guaranteed by earlier logic in this function." 

1834 for tag in tags: 

1835 self.registry.disassociate(tag, refs) 

1836 # We've exited the Registry transaction, and apparently committed. 

1837 # (if there was an exception, everything rolled back, and it's as if 

1838 # nothing happened - and we never get here). 

1839 # Datastore artifacts are not yet gone, but they're clearly marked 

1840 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1841 # problems we can try again later, and if manual administrative 

1842 # intervention is required, it's pretty clear what that should entail: 

1843 # deleting everything on disk and in private Datastore tables that is 

1844 # in the dataset_location_trash table. 

1845 if unstore: 

1846 # Point of no return for removing artifacts 

1847 self.datastore.emptyTrash() 

1848 

1849 @transactional 

1850 def ingest( 

1851 self, 

1852 *datasets: FileDataset, 

1853 transfer: Optional[str] = "auto", 

1854 run: Optional[str] = None, 

1855 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1856 record_validation_info: bool = True, 

1857 ) -> None: 

1858 """Store and register one or more datasets that already exist on disk. 

1859 

1860 Parameters 

1861 ---------- 

1862 datasets : `FileDataset` 

1863 Each positional argument is a struct containing information about 

1864 a file to be ingested, including its URI (either absolute or 

1865 relative to the datastore root, if applicable), a `DatasetRef`, 

1866 and optionally a formatter class or its fully-qualified string 

1867 name. If a formatter is not provided, the formatter that would be 

1868 used for `put` is assumed. On successful return, all 

1869 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1870 attribute populated and all `FileDataset.formatter` attributes will 

1871 be set to the formatter class used. `FileDataset.path` attributes 

1872 may be modified to put paths in whatever the datastore considers a 

1873 standardized form. 

1874 transfer : `str`, optional 

1875 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1876 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1877 transfer the file. 

1878 run : `str`, optional 

1879 The name of the run ingested datasets should be added to, 

1880 overriding ``self.run``. 

1881 idGenerationMode : `DatasetIdGenEnum`, optional 

1882 Specifies option for generating dataset IDs. By default unique IDs 

1883 are generated for each inserted dataset. 

1884 record_validation_info : `bool`, optional 

1885 If `True`, the default, the datastore can record validation 

1886 information associated with the file. If `False` the datastore 

1887 will not attempt to track any information such as checksums 

1888 or file sizes. This can be useful if such information is tracked 

1889 in an external system or if the file is to be compressed in place. 

1890 It is up to the datastore whether this parameter is relevant. 

1891 

1892 Raises 

1893 ------ 

1894 TypeError 

1895 Raised if the butler is read-only or if no run was provided. 

1896 NotImplementedError 

1897 Raised if the `Datastore` does not support the given transfer mode. 

1898 DatasetTypeNotSupportedError 

1899 Raised if one or more files to be ingested have a dataset type that 

1900 is not supported by the `Datastore`. 

1901 FileNotFoundError 

1902 Raised if one of the given files does not exist. 

1903 FileExistsError 

1904 Raised if transfer is not `None` but the (internal) location the 

1905 file would be moved to is already occupied. 

1906 

1907 Notes 

1908 ----- 

1909 This operation is not fully exception safe: if a database operation 

1910 fails, the given `FileDataset` instances may be only partially updated. 

1911 

1912 It is atomic in terms of database operations (they will either all 

1913 succeed or all fail) providing the database engine implements 

1914 transactions correctly. It will attempt to be atomic in terms of 

1915 filesystem operations as well, but this cannot be implemented 

1916 rigorously for most datastores. 
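
Examples
--------
A hedged sketch ingesting a single pre-existing file. The dataset type
name, data ID values, file path and run name are illustrative and must
make sense for the repository::

    from lsst.daf.butler import DatasetRef, FileDataset

    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(datasetType, {"instrument": "HSC", "detector": 42,
                                   "exposure": 903334})
    butler.ingest(FileDataset(path="/data/raw/file.fits", refs=ref),
                  transfer="copy", run="HSC/raw/all")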

1917 """ 

1918 if not self.isWriteable(): 

1919 raise TypeError("Butler is read-only.") 

1920 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1921 # Reorganize the inputs so they're grouped by DatasetType and then 

1922 # data ID. We also include a list of DatasetRefs for each FileDataset 

1923 # to hold the resolved DatasetRefs returned by the Registry, before 

1924 # it's safe to swap them into FileDataset.refs. 

1925 # Some type annotation aliases to make that clearer: 

1926 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1927 GroupedData = MutableMapping[DatasetType, GroupForType] 

1928 # The actual data structure: 

1929 groupedData: GroupedData = defaultdict(dict) 

1930 # And the nested loop that populates it: 

1931 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1932 # This list intentionally shared across the inner loop, since it's 

1933 # associated with `dataset`. 

1934 resolvedRefs: List[DatasetRef] = [] 

1935 

1936 # Somewhere to store pre-existing refs if we have an 

1937 # execution butler. 

1938 existingRefs: List[DatasetRef] = [] 

1939 

1940 for ref in dataset.refs: 

1941 if ref.dataId in groupedData[ref.datasetType]: 

1942 raise ConflictingDefinitionError( 

1943 f"Ingest conflict. Dataset {dataset.path} has same" 

1944 " DataId as other ingest dataset" 

1945 f" {groupedData[ref.datasetType][ref.dataId][0].path} " 

1946 f" ({ref.dataId})" 

1947 ) 

1948 if self._allow_put_of_predefined_dataset: 

1949 existing_ref = self.registry.findDataset( 

1950 ref.datasetType, dataId=ref.dataId, collections=run 

1951 ) 

1952 if existing_ref: 

1953 if self.datastore.knows(existing_ref): 

1954 raise ConflictingDefinitionError( 

1955 f"Dataset associated with path {dataset.path}" 

1956 f" already exists as {existing_ref}." 

1957 ) 

1958 # Store this ref elsewhere since it already exists 

1959 # and we do not want to remake it but we do want 

1960 # to store it in the datastore. 

1961 existingRefs.append(existing_ref) 

1962 

1963 # Nothing else to do until we have finished 

1964 # iterating. 

1965 continue 

1966 

1967 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1968 

1969 if existingRefs: 

1970 if len(dataset.refs) != len(existingRefs): 

1971 # Keeping track of partially pre-existing datasets is hard 

1972 # and should generally never happen. For now don't allow 

1973 # it. 

1974 raise ConflictingDefinitionError( 

1975 f"For dataset {dataset.path} some dataIds already exist" 

1976 " in registry but others do not. This is not supported." 

1977 ) 

1978 

1979 # Attach the resolved refs if we found them. 

1980 dataset.refs = existingRefs 

1981 

1982 # Now we can bulk-insert into Registry for each DatasetType. 

1983 for datasetType, groupForType in progress.iter_item_chunks( 

1984 groupedData.items(), desc="Bulk-inserting datasets by type" 

1985 ): 

1986 refs = self.registry.insertDatasets( 

1987 datasetType, 

1988 dataIds=groupForType.keys(), 

1989 run=run, 

1990 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1991 idGenerationMode=idGenerationMode, 

1992 ) 

1993 # Append those resolved DatasetRefs to the new lists we set up for 

1994 # them. 

1995 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1996 resolvedRefs.append(ref) 

1997 

1998 # Go back to the original FileDatasets to replace their refs with the 

1999 # new resolved ones. 

2000 for groupForType in progress.iter_chunks( 

2001 groupedData.values(), desc="Reassociating resolved dataset refs with files" 

2002 ): 

2003 for dataset, resolvedRefs in groupForType.values(): 

2004 dataset.refs = resolvedRefs 

2005 

2006 # Bulk-insert everything into Datastore. 

2007 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 

2008 

2009 @contextlib.contextmanager 

2010 def export( 

2011 self, 

2012 *, 

2013 directory: Optional[str] = None, 

2014 filename: Optional[str] = None, 

2015 format: Optional[str] = None, 

2016 transfer: Optional[str] = None, 

2017 ) -> Iterator[RepoExportContext]: 

2018 """Export datasets from the repository represented by this `Butler`. 

2019 

2020 This method is a context manager that returns a helper object 

2021 (`RepoExportContext`) that is used to indicate what information from 

2022 the repository should be exported. 

2023 

2024 Parameters 

2025 ---------- 

2026 directory : `str`, optional 

2027 Directory dataset files should be written to if ``transfer`` is not 

2028 `None`. 

2029 filename : `str`, optional 

2030 Name for the file that will include database information associated 

2031 with the exported datasets. If this is not an absolute path and 

2032 ``directory`` is not `None`, it will be written to ``directory`` 

2033 instead of the current working directory. Defaults to 

2034 "export.{format}". 

2035 format : `str`, optional 

2036 File format for the database information file. If `None`, the 

2037 extension of ``filename`` will be used. 

2038 transfer : `str`, optional 

2039 Transfer mode passed to `Datastore.export`. 

2040 

2041 Raises 

2042 ------ 

2043 TypeError 

2044 Raised if the set of arguments passed is inconsistent. 

2045 

2046 Examples 

2047 -------- 

2048 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

2049 methods are used to provide the iterables over data IDs and/or datasets 

2050 to be exported:: 

2051 

2052 with butler.export(filename="exports.yaml") as export: 

2053 # Export all flats, but none of the dimension element rows 

2054 # (i.e. data ID information) associated with them. 

2055 export.saveDatasets(butler.registry.queryDatasets("flat"), 

2056 elements=()) 

2057 # Export all datasets that start with "deepCoadd_" and all of 

2058 # their associated data ID information. 

2059 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2060 """ 

2061 if directory is None and transfer is not None: 

2062 raise TypeError("Cannot transfer without providing a directory.") 

2063 if transfer == "move": 

2064 raise TypeError("Transfer may not be 'move': export is read-only") 

2065 if format is None: 

2066 if filename is None: 

2067 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2068 else: 

2069 _, format = os.path.splitext(filename) 

2070 if not format: 

2071 raise ValueError("Please specify a file extension to determine export format.") 

2072 format = format[1:] # Strip leading "." 

2073 elif filename is None: 

2074 filename = f"export.{format}" 

2075 if directory is not None: 

2076 filename = os.path.join(directory, filename) 

2077 formats = self._config["repo_transfer_formats"] 

2078 if format not in formats: 

2079 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

2080 BackendClass = get_class_of(formats[format, "export"]) 

2081 with open(filename, "w") as stream: 

2082 backend = BackendClass(stream, universe=self.registry.dimensions) 

2083 try: 

2084 helper = RepoExportContext( 

2085 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

2086 ) 

2087 yield helper 

2088 except BaseException: 

2089 raise 

2090 else: 

2091 helper._finish() 

2092 

2093 def import_( 

2094 self, 

2095 *, 

2096 directory: Optional[str] = None, 

2097 filename: Union[str, TextIO, None] = None, 

2098 format: Optional[str] = None, 

2099 transfer: Optional[str] = None, 

2100 skip_dimensions: Optional[Set] = None, 

2101 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

2102 reuseIds: bool = False, 

2103 ) -> None: 

2104 """Import datasets into this repository that were exported from a 

2105 different butler repository via `~lsst.daf.butler.Butler.export`. 

2106 

2107 Parameters 

2108 ---------- 

2109 directory : `str`, optional 

2110 Directory containing dataset files to import from. If `None`, 

2111 ``filename`` and all dataset file paths specified therein must 

2112 be absolute. 

2113 filename : `str` or `TextIO`, optional 

2114 A stream or name of file that contains database information 

2115 associated with the exported datasets, typically generated by 

2116 `~lsst.daf.butler.Butler.export`. If this is a string (name) and 

2117 is not an absolute path, does not exist in the current working 

2118 directory, and ``directory`` is not `None`, it is assumed to be in 

2119 ``directory``. Defaults to "export.{format}". 

2120 format : `str`, optional 

2121 File format for ``filename``. If `None`, the extension of 

2122 ``filename`` will be used. 

2123 transfer : `str`, optional 

2124 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2125 skip_dimensions : `set`, optional 

2126 Names of dimensions that should be skipped and not imported. 

2127 idGenerationMode : `DatasetIdGenEnum`, optional 

2128 Specifies option for generating dataset IDs when IDs are not 

2129 provided or their type does not match backend type. By default 

2130 unique IDs are generated for each inserted dataset. 

2131 reuseIds : `bool`, optional 

2132 If `True` then forces re-use of imported dataset IDs for integer 

2133 IDs, which are normally generated as auto-incremented; an exception 

2134 will be raised if imported IDs clash with existing ones. This 

2135 option has no effect on the use of globally-unique IDs which are 

2136 always re-used (or generated if integer IDs are being imported). 

2137 

2138 Raises 

2139 ------ 

2140 TypeError 

2141 Raised if the set of arguments passed is inconsistent, or if the 

2142 butler is read-only. 
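
Examples
--------
A hedged sketch importing an export file previously written by
`export`; the directory and filename are illustrative::

    butler.import_(directory="/data/exports", filename="exports.yaml",
                   transfer="copy")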

2143 """ 

2144 if not self.isWriteable(): 

2145 raise TypeError("Butler is read-only.") 

2146 if format is None: 

2147 if filename is None: 

2148 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2149 else: 

2150 _, format = os.path.splitext(filename) # type: ignore 

2151 elif filename is None: 

2152 filename = f"export.{format}" 

2153 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

2154 filename = os.path.join(directory, filename) 

2155 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

2156 

2157 def doImport(importStream: TextIO) -> None: 

2158 backend = BackendClass(importStream, self.registry) 

2159 backend.register() 

2160 with self.transaction(): 

2161 backend.load( 

2162 self.datastore, 

2163 directory=directory, 

2164 transfer=transfer, 

2165 skip_dimensions=skip_dimensions, 

2166 idGenerationMode=idGenerationMode, 

2167 reuseIds=reuseIds, 

2168 ) 

2169 

2170 if isinstance(filename, str): 

2171 with open(filename, "r") as stream: 

2172 doImport(stream) 

2173 else: 

2174 doImport(filename) 

2175 

2176 def transfer_from( 

2177 self, 

2178 source_butler: LimitedButler, 

2179 source_refs: Iterable[DatasetRef], 

2180 transfer: str = "auto", 

2181 id_gen_map: Dict[str, DatasetIdGenEnum] | None = None, 

2182 skip_missing: bool = True, 

2183 register_dataset_types: bool = False, 

2184 transfer_dimensions: bool = False, 

2185 ) -> List[DatasetRef]: 

2186 """Transfer datasets to this Butler from a run in another Butler. 

2187 

2188 Parameters 

2189 ---------- 

2190 source_butler : `LimitedButler` 

2191 Butler from which the datasets are to be transferred. If data IDs 

2192 in ``source_refs`` are not expanded then this has to be a full 

2193 `Butler` whose registry will be used to expand data IDs. 

2194 source_refs : iterable of `DatasetRef` 

2195 Datasets defined in the source butler that should be transferred to 

2196 this butler. 

2197 transfer : `str`, optional 

2198 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2199 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional 

2200 A mapping of dataset type to ID generation mode. Only used if 

2201 the source butler is using integer IDs. Should not be used 

2202 if this receiving butler uses integer IDs. Without this, dataset 

2203 import always uses the unique ID generation mode. 

2204 skip_missing : `bool` 

2205 If `True`, datasets with no datastore artifact associated with 

2206 them are not transferred. If `False` a registry entry will be 

2207 created even if no datastore record is created (and so will 

2208 look equivalent to the dataset being unstored). 

2209 register_dataset_types : `bool` 

2210 If `True` any missing dataset types are registered. Otherwise 

2211 an exception is raised. 

2212 transfer_dimensions : `bool`, optional 

2213 If `True`, dimension record data associated with the new datasets 

2214 will be transferred. 

2215 

2216 Returns 

2217 ------- 

2218 refs : `list` of `DatasetRef` 

2219 The refs added to this Butler. 

2220 

2221 Notes 

2222 ----- 

2223 Requires that any dimension definitions are already present in the 

2224 receiving Butler. The datastore artifact has to exist for a transfer 

2225 to be made but non-existence is not an error. 

2226 

2227 Datasets that already exist in this run will be skipped. 

2228 

2229 The datasets are imported as part of a transaction, although 

2230 dataset types are registered before the transaction is started. 

2231 This means that it is possible for a dataset type to be registered 

2232 even though transfer has failed. 
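
Examples
--------
A hedged sketch transferring all "bias" datasets from another,
illustrative repository into this one::

    source = Butler("/repo/source")
    refs = source.registry.queryDatasets("bias", collections="HSC/calib")
    butler.transfer_from(source, refs, transfer="copy",
                         register_dataset_types=True)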

2233 """ 

2234 if not self.isWriteable(): 

2235 raise TypeError("Butler is read-only.") 

2236 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2237 

2238 # Will iterate through the refs multiple times so need to convert 

2239 # to a list if this isn't a collection. 

2240 if not isinstance(source_refs, collections.abc.Collection): 

2241 source_refs = list(source_refs) 

2242 

2243 original_count = len(source_refs) 

2244 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2245 

2246 if id_gen_map is None: 

2247 id_gen_map = {} 

2248 

2249 # In some situations the datastore artifact may be missing 

2250 # and we do not want that registry entry to be imported. 

2251 # Asking datastore is not sufficient, the records may have been 

2252 # purged, we have to ask for the (predicted) URI and check 

2253 # existence explicitly. Execution butler is set up exactly like 

2254 # this with no datastore records. 

2255 artifact_existence: Dict[ResourcePath, bool] = {} 

2256 if skip_missing: 

2257 dataset_existence = source_butler.datastore.mexists( 

2258 source_refs, artifact_existence=artifact_existence 

2259 ) 

2260 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2261 filtered_count = len(source_refs) 

2262 log.verbose( 

2263 "%d datasets removed because the artifact does not exist. Now have %d.", 

2264 original_count - filtered_count, 

2265 filtered_count, 

2266 ) 

2267 

2268 # Importing requires that we group the refs by dataset type and run 

2269 # before doing the import. 

2270 source_dataset_types = set() 

2271 grouped_refs = defaultdict(list) 

2272 grouped_indices = defaultdict(list) 

2273 for i, ref in enumerate(source_refs): 

2274 grouped_refs[ref.datasetType, ref.run].append(ref) 

2275 grouped_indices[ref.datasetType, ref.run].append(i) 

2276 source_dataset_types.add(ref.datasetType) 

2277 

2278 # Check to see if the dataset type in the source butler has 

2279 # the same definition in the target butler and register missing 

2280 # ones if requested. Registration must happen outside a transaction. 

2281 newly_registered_dataset_types = set() 

2282 for datasetType in source_dataset_types: 

2283 if register_dataset_types: 

2284 # Let this raise immediately if inconsistent. Continuing 

2285 # on to find additional inconsistent dataset types 

2286 # might result in additional unwanted dataset types being 

2287 # registered. 

2288 if self.registry.registerDatasetType(datasetType): 

2289 newly_registered_dataset_types.add(datasetType) 

2290 else: 

2291 # If the dataset type is missing, let it fail immediately. 

2292 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2293 if target_dataset_type != datasetType: 

2294 raise ConflictingDefinitionError( 

2295 "Source butler dataset type differs from definition" 

2296 f" in target butler: {datasetType} !=" 

2297 f" {target_dataset_type}" 

2298 ) 

2299 if newly_registered_dataset_types: 

2300 # We may have registered some even if there were inconsistencies 

2301 # but should let people know (or else remove them again). 

2302 log.log( 

2303 VERBOSE, 

2304 "Registered the following dataset types in the target Butler: %s", 

2305 ", ".join(d.name for d in newly_registered_dataset_types), 

2306 ) 

2307 else: 

2308 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2309 

2310 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2311 if transfer_dimensions: 

2312 # Collect all the dimension records for these refs. 

2313 # All dimensions are to be copied but the list of valid dimensions 

2314 # come from this butler's universe. 

2315 elements = frozenset( 

2316 element 

2317 for element in self.registry.dimensions.getStaticElements() 

2318 if element.hasTable() and element.viewOf is None 

2319 ) 

2320 dataIds = set(ref.dataId for ref in source_refs) 

2321 # This logic comes from saveDataIds. 

2322 for dataId in dataIds: 

2323 # Need an expanded record, if not expanded that we need a full 

2324 # butler with registry (allow mocks with registry too). 

2325 if not dataId.hasRecords(): 

2326 if registry := getattr(source_butler, "registry", None): 

2327 dataId = registry.expandDataId(dataId) 

2328 else: 

2329 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

2330 # If this butler doesn't know about a dimension in the source 

2331 # butler things will break later. 

2332 for record in dataId.records.values(): 

2333 if record is not None and record.definition in elements: 

2334 dimension_records[record.definition].setdefault(record.dataId, record) 

2335 

2336 # The returned refs should be identical for UUIDs. 

2337 # For now must also support integers and so need to retain the 

2338 # newly-created refs from this registry. 

2339 # Pre-size it so we can assign refs into the correct slots 

2340 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs) 

2341 default_id_gen = DatasetIdGenEnum.UNIQUE 

2342 

2343 handled_collections: Set[str] = set() 

2344 

2345 # Do all the importing in a single transaction. 

2346 with self.transaction(): 

2347 if dimension_records: 

2348 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2349 for element, r in dimension_records.items(): 

2350 records = [r[dataId] for dataId in r] 

2351 # Assume that if the record is already present that we can 

2352 # use it without having to check that the record metadata 

2353 # is consistent. 

2354 self.registry.insertDimensionData(element, *records, skip_existing=True) 

2355 

2356 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2357 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2358 ): 

2359 if run not in handled_collections: 

2360 # May need to create output collection. If source butler 

2361 # has a registry, ask for documentation string. 

2362 run_doc = None 

2363 if registry := getattr(source_butler, "registry", None): 

2364 run_doc = registry.getCollectionDocumentation(run) 

2365 registered = self.registry.registerRun(run, doc=run_doc) 

2366 handled_collections.add(run) 

2367 if registered: 

2368 log.log(VERBOSE, "Creating output run %s", run) 

2369 

2370 id_generation_mode = default_id_gen 

2371 if isinstance(refs_to_import[0].id, int): 

2372 # ID generation mode might need to be overridden when 

2373 # targeting UUIDs. 

2374 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen) 

2375 

2376 n_refs = len(refs_to_import) 

2377 log.verbose( 

2378 "Importing %d ref%s of dataset type %s into run %s", 

2379 n_refs, 

2380 "" if n_refs == 1 else "s", 

2381 datasetType.name, 

2382 run, 

2383 ) 

2384 

2385 # No way to know if this butler's registry uses UUID. 

2386 # We have to trust the caller on this. If it fails they will 

2387 # have to change their approach. We can't catch the exception 

2388 # and retry with unique because that will mess up the 

2389 # transaction handling. We aren't allowed to ask the registry 

2390 # manager what type of ID it is using. 

2391 imported_refs = self.registry._importDatasets( 

2392 refs_to_import, idGenerationMode=id_generation_mode, expand=False 

2393 ) 

2394 

2395 # Map them into the correct slots to match the initial order 

2396 for i, ref in zip(grouped_indices[datasetType, run], imported_refs): 

2397 transferred_refs_tmp[i] = ref 

2398 

2399 # Mypy insists that we might have None in here so we have to make 

2400 # that explicit by assigning to a new variable and filtering out 

2401 # something that won't be there. 

2402 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None] 

2403 

2404 # Check consistency 

2405 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given" 

2406 

2407 log.verbose("Imported %d datasets into destination butler", len(transferred_refs)) 

2408 

2409 # The transferred refs need to be reordered to match the original 

2410 # ordering given by the caller. Without this the datastore transfer 

2411 # will be broken. 

2412 

2413 # Ask the datastore to transfer. The datastore has to check that 

2414 # the source datastore is compatible with the target datastore. 

2415 self.datastore.transfer_from( 

2416 source_butler.datastore, 

2417 source_refs, 

2418 local_refs=transferred_refs, 

2419 transfer=transfer, 

2420 artifact_existence=artifact_existence, 

2421 ) 

2422 

2423 return transferred_refs 

2424 

2425 def validateConfiguration( 

2426 self, 

2427 logFailures: bool = False, 

2428 datasetTypeNames: Optional[Iterable[str]] = None, 

2429 ignore: Iterable[str] | None = None, 

2430 ) -> None: 

2431 """Validate butler configuration. 

2432 

2433 Checks that each `DatasetType` can be stored in the `Datastore`. 

2434 

2435 Parameters 

2436 ---------- 

2437 logFailures : `bool`, optional 

2438 If `True`, output a log message for every validation error 

2439 detected. 

2440 datasetTypeNames : iterable of `str`, optional 

2441 The `DatasetType` names that should be checked. This allows 

2442 only a subset to be selected. 

2443 ignore : iterable of `str`, optional 

2444 Names of DatasetTypes to skip over. This can be used to skip 

2445 known problems. If a named `DatasetType` corresponds to a 

2446 composite, all components of that `DatasetType` will also be 

2447 ignored. 

2448 

2449 Raises 

2450 ------ 

2451 ButlerValidationError 

2452 Raised if there is some inconsistency with how this Butler 

2453 is configured. 
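
Examples
--------
A hedged sketch that checks an illustrative subset of dataset types and
logs any validation problems::

    butler.validateConfiguration(logFailures=True,
                                 datasetTypeNames=["bias", "flat"])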

2454 """ 

2455 if datasetTypeNames: 

2456 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2457 else: 

2458 datasetTypes = list(self.registry.queryDatasetTypes()) 

2459 

2460 # filter out anything from the ignore list 

2461 if ignore: 

2462 ignore = set(ignore) 

2463 datasetTypes = [ 

2464 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2465 ] 

2466 else: 

2467 ignore = set() 

2468 

2469 # Find all the registered instruments 

2470 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2471 

2472 # For each datasetType that has an instrument dimension, create 

2473 # a DatasetRef for each defined instrument 

2474 datasetRefs = [] 

2475 

2476 for datasetType in datasetTypes: 

2477 if "instrument" in datasetType.dimensions: 

2478 for instrument in instruments: 

2479 datasetRef = DatasetRef( 

2480 datasetType, {"instrument": instrument}, conform=False # type: ignore 

2481 ) 

2482 datasetRefs.append(datasetRef) 

2483 

2484 entities: List[Union[DatasetType, DatasetRef]] = [] 

2485 entities.extend(datasetTypes) 

2486 entities.extend(datasetRefs) 

2487 

2488 datastoreErrorStr = None 

2489 try: 

2490 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2491 except ValidationError as e: 

2492 datastoreErrorStr = str(e) 

2493 

2494 # Also check that the LookupKeys used by the datastores match 

2495 # registry and storage class definitions 

2496 keys = self.datastore.getLookupKeys() 

2497 

2498 failedNames = set() 

2499 failedDataId = set() 

2500 for key in keys: 

2501 if key.name is not None: 

2502 if key.name in ignore: 

2503 continue 

2504 

2505 # skip if specific datasetType names were requested and this 

2506 # name does not match 

2507 if datasetTypeNames and key.name not in datasetTypeNames: 

2508 continue 

2509 

2510 # See if it is a StorageClass or a DatasetType 

2511 if key.name in self.storageClasses: 

2512 pass 

2513 else: 

2514 try: 

2515 self.registry.getDatasetType(key.name) 

2516 except KeyError: 

2517 if logFailures: 

2518 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2519 failedNames.add(key) 

2520 else: 

2521 # Dimensions are checked for consistency when the Butler 

2522 # is created and rendezvoused with a universe. 

2523 pass 

2524 

2525 # Check that the instrument is a valid instrument 

2526 # Currently only support instrument so check for that 

2527 if key.dataId: 

2528 dataIdKeys = set(key.dataId) 

2529 if {"instrument"} != dataIdKeys: 

2530 if logFailures: 

2531 log.critical("Key '%s' has unsupported DataId override", key) 

2532 failedDataId.add(key) 

2533 elif key.dataId["instrument"] not in instruments: 

2534 if logFailures: 

2535 log.critical("Key '%s' has unknown instrument", key) 

2536 failedDataId.add(key) 

2537 

2538 messages = [] 

2539 

2540 if datastoreErrorStr: 

2541 messages.append(datastoreErrorStr) 

2542 

2543 for failed, msg in ( 

2544 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2545 (failedDataId, "Keys with bad DataId entries: "), 

2546 ): 

2547 if failed: 

2548 msg += ", ".join(str(k) for k in failed) 

2549 messages.append(msg) 

2550 

2551 if messages: 

2552 raise ValidationError(";\n".join(messages)) 

2553 

2554 @property 

2555 def collections(self) -> Sequence[str]: 

2556 """The collections to search by default, in order 

2557 (`Sequence` [ `str` ]). 

2558 

2559 This is an alias for ``self.registry.defaults.collections``. It cannot 

2560 be set directly in isolation, but all defaults may be changed together 

2561 by assigning a new `RegistryDefaults` instance to 

2562 ``self.registry.defaults``. 

2563 """ 

2564 return self.registry.defaults.collections 

2565 

2566 @property 

2567 def run(self) -> Optional[str]: 

2568 """Name of the run this butler writes outputs to by default (`str` or 

2569 `None`). 

2570 

2571 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2572 directly in isolation, but all defaults may be changed together by 

2573 assigning a new `RegistryDefaults` instance to 

2574 ``self.registry.defaults``. 

2575 """ 

2576 return self.registry.defaults.run 

2577 

2578 @property 

2579 def dimensions(self) -> DimensionUniverse: 

2580 # Docstring inherited. 

2581 return self.registry.dimensions 

2582 

2583 registry: Registry 

2584 """The object that manages dataset metadata and relationships (`Registry`). 

2585 

2586 Most operations that don't involve reading or writing butler datasets are 

2587 accessible only via `Registry` methods. 

2588 """ 

2589 

2590 datastore: Datastore 

2591 """The object that manages actual dataset storage (`Datastore`). 

2592 

2593 Direct user access to the datastore should rarely be necessary; the primary 

2594 exception is the case where a `Datastore` implementation provides extra 

2595 functionality beyond what the base class defines. 

2596 """ 

2597 

2598 storageClasses: StorageClassFactory 

2599 """An object that maps known storage class names to objects that fully 

2600 describe them (`StorageClassFactory`). 

2601 """ 

2602 

2603 _allow_put_of_predefined_dataset: bool 

2604 """Allow a put to succeed even if there is already a registry entry for it 

2605 but not a datastore record. (`bool`)."""