Coverage for python/lsst/daf/butler/_butler.py: 9%

666 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-07-14 15:54 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36import collections.abc 

37import contextlib 

38import logging 

39import numbers 

40import os 

41from collections import defaultdict 

42from typing import ( 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59from lsst.resources import ResourcePath, ResourcePathExpression 

60from lsst.utils import doImportType 

61from lsst.utils.introspection import get_class_of 

62from lsst.utils.logging import VERBOSE, getLogger 

63 

64from ._butlerConfig import ButlerConfig 

65from ._butlerRepoIndex import ButlerRepoIndex 

66from ._deferredDatasetHandle import DeferredDatasetHandle 

67from ._limited_butler import LimitedButler 

68from .core import ( 

69 AmbiguousDatasetError, 

70 Config, 

71 ConfigSubset, 

72 DataCoordinate, 

73 DataId, 

74 DataIdValue, 

75 DatasetRef, 

76 DatasetRefURIs, 

77 DatasetType, 

78 Datastore, 

79 Dimension, 

80 DimensionConfig, 

81 DimensionUniverse, 

82 FileDataset, 

83 Progress, 

84 StorageClassFactory, 

85 Timespan, 

86 ValidationError, 

87) 

88from .core.repoRelocation import BUTLER_ROOT_TAG 

89from .core.utils import transactional 

90from .registry import ( 

91 CollectionSearch, 

92 CollectionType, 

93 ConflictingDefinitionError, 

94 DataIdError, 

95 DatasetIdGenEnum, 

96 Registry, 

97 RegistryConfig, 

98 RegistryDefaults, 

99) 

100from .transfers import RepoExportContext 

101 

102log = getLogger(__name__) 

103 

104 

105class ButlerValidationError(ValidationError): 

106 """There is a problem with the Butler configuration.""" 

107 

108 pass 

109 

110 

111class PruneCollectionsArgsError(TypeError): 

112 """Base class for errors relating to Butler.pruneCollections input 

113 arguments. 

114 """ 

115 

116 pass 

117 

118 

119class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

120 """Raised when purge and unstore are both required to be True, and 

121 purge is True but unstore is False. 

122 """ 

123 

124 def __init__(self) -> None: 

125 super().__init__("Cannot pass purge=True without unstore=True.") 

126 

127 

128class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

129 """Raised when pruning a RUN collection but purge is False.""" 

130 

131 def __init__(self, collectionType: CollectionType): 

132 self.collectionType = collectionType 

133 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

134 

135 

136class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

137 """Raised when purge is True but is not supported for the given 

138 collection.""" 

139 

140 def __init__(self, collectionType: CollectionType): 

141 self.collectionType = collectionType 

142 super().__init__( 

143 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True." 

144 ) 

145 

146 

147class Butler(LimitedButler): 

148 """Main entry point for the data access system. 

149 

150 Parameters 

151 ---------- 

152 config : `ButlerConfig`, `Config` or `str`, optional. 

153 Configuration. Anything acceptable to the 

154 `ButlerConfig` constructor. If a directory path 

155 is given the configuration will be read from a ``butler.yaml`` file in 

156 that location. If `None` is given default values will be used. 

157 butler : `Butler`, optional. 

158 If provided, construct a new Butler that uses the same registry and 

159 datastore as the given one, but with the given collection and run. 

160 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

161 arguments. 

162 collections : `str` or `Iterable` [ `str` ], optional 

163 An expression specifying the collections to be searched (in order) when 

164 reading datasets. 

165 This may be a `str` collection name or an iterable thereof. 

166 See :ref:`daf_butler_collection_expressions` for more information. 

167 These collections are not registered automatically and must be 

168 manually registered before they are used by any method, but they may be 

169 manually registered after the `Butler` is initialized. 

170 run : `str`, optional 

171 Name of the `~CollectionType.RUN` collection new datasets should be 

172 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

173 ``collections`` will be set to ``[run]``. If not `None`, this 

174 collection will automatically be registered. If this is not set (and 

175 ``writeable`` is not set either), a read-only butler will be created. 

176 searchPaths : `list` of `str`, optional 

177 Directory paths to search when calculating the full Butler 

178 configuration. Not used if the supplied config is already a 

179 `ButlerConfig`. 

180 writeable : `bool`, optional 

181 Explicitly sets whether the butler supports write operations. If not 

182 provided, a read-write butler is created if any of ``run``, ``tags``, 

183 or ``chains`` is non-empty. 

184 inferDefaults : `bool`, optional 

185 If `True` (default) infer default data ID values from the values 

186 present in the datasets in ``collections``: if all collections have the 

187 same value (or no value) for a governor dimension, that value will be 

188 the default for that dimension. Nonexistent collections are ignored. 

189 If a default value is provided explicitly for a governor dimension via 

190 ``**kwargs``, no default will be inferred for that dimension. 

191 **kwargs : `str` 

192 Default data ID key-value pairs. These may only identify "governor" 

193 dimensions like ``instrument`` and ``skymap``. 

194 

195 Examples 

196 -------- 

197 While there are many ways to control exactly how a `Butler` interacts with 

198 the collections in its `Registry`, the most common cases are still simple. 

199 

200 For a read-only `Butler` that searches one collection, do:: 

201 

202 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

203 

204 For a read-write `Butler` that writes to and reads from a 

205 `~CollectionType.RUN` collection:: 

206 

207 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

208 

209 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

210 because we want to write to one `~CollectionType.RUN` collection but read 

211 from several others (as well):: 

212 

213 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

214 collections=["u/alice/DM-50000/a", 

215 "u/bob/DM-49998", 

216 "HSC/defaults"]) 

217 

218 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

219 Datasets will be read first from that run (since it appears first in the 

220 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

221 

222 Finally, one can always create a `Butler` with no collections:: 

223 

224 butler = Butler("/path/to/repo", writeable=True) 

225 

226 This can be extremely useful when you just want to use ``butler.registry``, 

227 e.g. for inserting dimension data or managing collections, or when the 

228 collections you want to use with the butler are not consistent. 

229 Passing ``writeable`` explicitly here is only necessary if you want to be 

230 able to make changes to the repo - usually the value for ``writeable`` can 

231 be guessed from the collection arguments provided, but it defaults to 

232 `False` when there are not collection arguments. 

233 """ 

234 

235 def __init__( 

236 self, 

237 config: Union[Config, str, None] = None, 

238 *, 

239 butler: Optional[Butler] = None, 

240 collections: Any = None, 

241 run: Optional[str] = None, 

242 searchPaths: Optional[List[str]] = None, 

243 writeable: Optional[bool] = None, 

244 inferDefaults: bool = True, 

245 **kwargs: str, 

246 ): 

247 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

248 # Load registry, datastore, etc. from config or existing butler. 

249 if butler is not None: 

250 if config is not None or searchPaths is not None or writeable is not None: 

251 raise TypeError( 

252 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

253 ) 

254 self.registry = butler.registry.copy(defaults) 

255 self.datastore = butler.datastore 

256 self.storageClasses = butler.storageClasses 

257 self._config: ButlerConfig = butler._config 

258 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

259 else: 

260 # Can only look for strings in the known repos list. 

261 if isinstance(config, str) and config in self.get_known_repos(): 

262 config = str(self.get_repo_uri(config)) 

263 try: 

264 self._config = ButlerConfig(config, searchPaths=searchPaths) 

265 except FileNotFoundError as e: 

266 if known := self.get_known_repos(): 

267 aliases = f"(known aliases: {', '.join(known)})" 

268 else: 

269 aliases = "(no known aliases)" 

270 raise FileNotFoundError(f"{e} {aliases}") from e 

271 self._config = ButlerConfig(config, searchPaths=searchPaths) 

272 try: 

273 if "root" in self._config: 

274 butlerRoot = self._config["root"] 

275 else: 

276 butlerRoot = self._config.configDir 

277 if writeable is None: 

278 writeable = run is not None 

279 self.registry = Registry.fromConfig( 

280 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

281 ) 

282 self.datastore = Datastore.fromConfig( 

283 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

284 ) 

285 self.storageClasses = StorageClassFactory() 

286 self.storageClasses.addFromConfig(self._config) 

287 self._allow_put_of_predefined_dataset = self._config.get( 

288 "allow_put_of_predefined_dataset", False 

289 ) 

290 except Exception: 

291 # Failures here usually mean that configuration is incomplete, 

292 # just issue an error message which includes config file URI. 

293 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

294 raise 

295 

296 if "run" in self._config or "collection" in self._config: 

297 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

298 

299 GENERATION: ClassVar[int] = 3 

300 """This is a Generation 3 Butler. 

301 

302 This attribute may be removed in the future, once the Generation 2 Butler 

303 interface has been fully retired; it should only be used in transitional 

304 code. 

305 """ 

306 

307 @classmethod 

308 def get_repo_uri(cls, label: str) -> ResourcePath: 

309 """Look up the label in a butler repository index. 

310 

311 Parameters 

312 ---------- 

313 label : `str` 

314 Label of the Butler repository to look up. 

315 

316 Returns 

317 ------- 

318 uri : `lsst.resources.ResourcePath` 

319 URI to the Butler repository associated with the given label. 

320 

321 Raises 

322 ------ 

323 KeyError 

324 Raised if the label is not found in the index, or if an index 

325 can not be found at all. 

326 

327 Notes 

328 ----- 

329 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

330 information is discovered. 

331 """ 

332 return ButlerRepoIndex.get_repo_uri(label) 

333 

334 @classmethod 

335 def get_known_repos(cls) -> Set[str]: 

336 """Retrieve the list of known repository labels. 

337 

338 Returns 

339 ------- 

340 repos : `set` of `str` 

341 All the known labels. Can be empty if no index can be found. 

342 

343 Notes 

344 ----- 

345 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

346 information is discovered. 

347 """ 

348 return ButlerRepoIndex.get_known_repos() 

349 

350 @staticmethod 

351 def makeRepo( 

352 root: ResourcePathExpression, 

353 config: Union[Config, str, None] = None, 

354 dimensionConfig: Union[Config, str, None] = None, 

355 standalone: bool = False, 

356 searchPaths: Optional[List[str]] = None, 

357 forceConfigRoot: bool = True, 

358 outfile: Optional[ResourcePathExpression] = None, 

359 overwrite: bool = False, 

360 ) -> Config: 

361 """Create an empty data repository by adding a butler.yaml config 

362 to a repository root directory. 

363 

364 Parameters 

365 ---------- 

366 root : `lsst.resources.ResourcePathExpression` 

367 Path or URI to the root location of the new repository. Will be 

368 created if it does not exist. 

369 config : `Config` or `str`, optional 

370 Configuration to write to the repository, after setting any 

371 root-dependent Registry or Datastore config options. Can not 

372 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

373 configuration will be used. Root-dependent config options 

374 specified in this config are overwritten if ``forceConfigRoot`` 

375 is `True`. 

376 dimensionConfig : `Config` or `str`, optional 

377 Configuration for dimensions, will be used to initialize registry 

378 database. 

379 standalone : `bool` 

380 If True, write all expanded defaults, not just customized or 

381 repository-specific settings. 

382 This (mostly) decouples the repository from the default 

383 configuration, insulating it from changes to the defaults (which 

384 may be good or bad, depending on the nature of the changes). 

385 Future *additions* to the defaults will still be picked up when 

386 initializing `Butlers` to repos created with ``standalone=True``. 

387 searchPaths : `list` of `str`, optional 

388 Directory paths to search when calculating the full butler 

389 configuration. 

390 forceConfigRoot : `bool`, optional 

391 If `False`, any values present in the supplied ``config`` that 

392 would normally be reset are not overridden and will appear 

393 directly in the output config. This allows non-standard overrides 

394 of the root directory for a datastore or registry to be given. 

395 If this parameter is `True` the values for ``root`` will be 

396 forced into the resulting config if appropriate. 

397 outfile : `lss.resources.ResourcePathExpression`, optional 

398 If not-`None`, the output configuration will be written to this 

399 location rather than into the repository itself. Can be a URI 

400 string. Can refer to a directory that will be used to write 

401 ``butler.yaml``. 

402 overwrite : `bool`, optional 

403 Create a new configuration file even if one already exists 

404 in the specified output location. Default is to raise 

405 an exception. 

406 

407 Returns 

408 ------- 

409 config : `Config` 

410 The updated `Config` instance written to the repo. 

411 

412 Raises 

413 ------ 

414 ValueError 

415 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

416 regular Config (as these subclasses would make it impossible to 

417 support ``standalone=False``). 

418 FileExistsError 

419 Raised if the output config file already exists. 

420 os.error 

421 Raised if the directory does not exist, exists but is not a 

422 directory, or cannot be created. 

423 

424 Notes 

425 ----- 

426 Note that when ``standalone=False`` (the default), the configuration 

427 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

428 construct the repository should also be used to construct any Butlers 

429 to avoid configuration inconsistencies. 

430 """ 

431 if isinstance(config, (ButlerConfig, ConfigSubset)): 

432 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

433 

434 # Ensure that the root of the repository exists or can be made 

435 root_uri = ResourcePath(root, forceDirectory=True) 

436 root_uri.mkdir() 

437 

438 config = Config(config) 

439 

440 # If we are creating a new repo from scratch with relative roots, 

441 # do not propagate an explicit root from the config file 

442 if "root" in config: 

443 del config["root"] 

444 

445 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

446 imported_class = doImportType(full["datastore", "cls"]) 

447 if not issubclass(imported_class, Datastore): 

448 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

449 datastoreClass: Type[Datastore] = imported_class 

450 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

451 

452 # if key exists in given config, parse it, otherwise parse the defaults 

453 # in the expanded config 

454 if config.get(("registry", "db")): 

455 registryConfig = RegistryConfig(config) 

456 else: 

457 registryConfig = RegistryConfig(full) 

458 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

459 if defaultDatabaseUri is not None: 

460 Config.updateParameters( 

461 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

462 ) 

463 else: 

464 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

465 

466 if standalone: 

467 config.merge(full) 

468 else: 

469 # Always expand the registry.managers section into the per-repo 

470 # config, because after the database schema is created, it's not 

471 # allowed to change anymore. Note that in the standalone=True 

472 # branch, _everything_ in the config is expanded, so there's no 

473 # need to special case this. 

474 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

475 configURI: ResourcePathExpression 

476 if outfile is not None: 

477 # When writing to a separate location we must include 

478 # the root of the butler repo in the config else it won't know 

479 # where to look. 

480 config["root"] = root_uri.geturl() 

481 configURI = outfile 

482 else: 

483 configURI = root_uri 

484 config.dumpToUri(configURI, overwrite=overwrite) 

485 

486 # Create Registry and populate tables 

487 registryConfig = RegistryConfig(config.get("registry")) 

488 dimensionConfig = DimensionConfig(dimensionConfig) 

489 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

490 

491 log.verbose("Wrote new Butler configuration file to %s", configURI) 

492 

493 return config 

494 

495 @classmethod 

496 def _unpickle( 

497 cls, 

498 config: ButlerConfig, 

499 collections: Optional[CollectionSearch], 

500 run: Optional[str], 

501 defaultDataId: Dict[str, str], 

502 writeable: bool, 

503 ) -> Butler: 

504 """Callable used to unpickle a Butler. 

505 

506 We prefer not to use ``Butler.__init__`` directly so we can force some 

507 of its many arguments to be keyword-only (note that ``__reduce__`` 

508 can only invoke callables with positional arguments). 

509 

510 Parameters 

511 ---------- 

512 config : `ButlerConfig` 

513 Butler configuration, already coerced into a true `ButlerConfig` 

514 instance (and hence after any search paths for overrides have been 

515 utilized). 

516 collections : `CollectionSearch` 

517 Names of the default collections to read from. 

518 run : `str`, optional 

519 Name of the default `~CollectionType.RUN` collection to write to. 

520 defaultDataId : `dict` [ `str`, `str` ] 

521 Default data ID values. 

522 writeable : `bool` 

523 Whether the Butler should support write operations. 

524 

525 Returns 

526 ------- 

527 butler : `Butler` 

528 A new `Butler` instance. 

529 """ 

530 # MyPy doesn't recognize that the kwargs below are totally valid; it 

531 # seems to think '**defaultDataId* is a _positional_ argument! 

532 return cls( 

533 config=config, 

534 collections=collections, 

535 run=run, 

536 writeable=writeable, 

537 **defaultDataId, # type: ignore 

538 ) 

539 

540 def __reduce__(self) -> tuple: 

541 """Support pickling.""" 

542 return ( 

543 Butler._unpickle, 

544 ( 

545 self._config, 

546 self.collections, 

547 self.run, 

548 self.registry.defaults.dataId.byName(), 

549 self.registry.isWriteable(), 

550 ), 

551 ) 

552 

553 def __str__(self) -> str: 

554 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

555 self.collections, self.run, self.datastore, self.registry 

556 ) 

557 

558 def isWriteable(self) -> bool: 

559 """Return `True` if this `Butler` supports write operations.""" 

560 return self.registry.isWriteable() 

561 

562 @contextlib.contextmanager 

563 def transaction(self) -> Iterator[None]: 

564 """Context manager supporting `Butler` transactions. 

565 

566 Transactions can be nested. 

567 """ 

568 with self.registry.transaction(): 

569 with self.datastore.transaction(): 

570 yield 

571 

572 def _standardizeArgs( 

573 self, 

574 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

575 dataId: Optional[DataId] = None, 

576 for_put: bool = True, 

577 **kwargs: Any, 

578 ) -> Tuple[DatasetType, Optional[DataId]]: 

579 """Standardize the arguments passed to several Butler APIs. 

580 

581 Parameters 

582 ---------- 

583 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

584 When `DatasetRef` the `dataId` should be `None`. 

585 Otherwise the `DatasetType` or name thereof. 

586 dataId : `dict` or `DataCoordinate` 

587 A `dict` of `Dimension` link name, value pairs that label the 

588 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

589 should be provided as the second argument. 

590 for_put : `bool`, optional 

591 If `True` this call is invoked as part of a `Butler.put()`. 

592 Otherwise it is assumed to be part of a `Butler.get()`. This 

593 parameter is only relevant if there is dataset type 

594 inconsistency. 

595 **kwargs 

596 Additional keyword arguments used to augment or construct a 

597 `DataCoordinate`. See `DataCoordinate.standardize` 

598 parameters. 

599 

600 Returns 

601 ------- 

602 datasetType : `DatasetType` 

603 A `DatasetType` instance extracted from ``datasetRefOrType``. 

604 dataId : `dict` or `DataId`, optional 

605 Argument that can be used (along with ``kwargs``) to construct a 

606 `DataId`. 

607 

608 Notes 

609 ----- 

610 Butler APIs that conceptually need a DatasetRef also allow passing a 

611 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

612 keyword arguments that can be used to construct one) separately. This 

613 method accepts those arguments and always returns a true `DatasetType` 

614 and a `DataId` or `dict`. 

615 

616 Standardization of `dict` vs `DataId` is best handled by passing the 

617 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

618 generally similarly flexible. 

619 """ 

620 externalDatasetType: Optional[DatasetType] = None 

621 internalDatasetType: Optional[DatasetType] = None 

622 if isinstance(datasetRefOrType, DatasetRef): 

623 if dataId is not None or kwargs: 

624 raise ValueError("DatasetRef given, cannot use dataId as well") 

625 externalDatasetType = datasetRefOrType.datasetType 

626 dataId = datasetRefOrType.dataId 

627 else: 

628 # Don't check whether DataId is provided, because Registry APIs 

629 # can usually construct a better error message when it wasn't. 

630 if isinstance(datasetRefOrType, DatasetType): 

631 externalDatasetType = datasetRefOrType 

632 else: 

633 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

634 

635 # Check that they are self-consistent 

636 if externalDatasetType is not None: 

637 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

638 if externalDatasetType != internalDatasetType: 

639 # We can allow differences if they are compatible, depending 

640 # on whether this is a get or a put. A get requires that 

641 # the python type associated with the datastore can be 

642 # converted to the user type. A put requires that the user 

643 # supplied python type can be converted to the internal 

644 # type expected by registry. 

645 relevantDatasetType = internalDatasetType 

646 if for_put: 

647 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

648 else: 

649 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

650 relevantDatasetType = externalDatasetType 

651 if not is_compatible: 

652 raise ValueError( 

653 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

654 f"registry definition ({internalDatasetType})" 

655 ) 

656 # Override the internal definition. 

657 internalDatasetType = relevantDatasetType 

658 

659 assert internalDatasetType is not None 

660 return internalDatasetType, dataId 

661 

662 def _rewrite_data_id( 

663 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any 

664 ) -> Tuple[Optional[DataId], Dict[str, Any]]: 

665 """Rewrite a data ID taking into account dimension records. 

666 

667 Take a Data ID and keyword args and rewrite it if necessary to 

668 allow the user to specify dimension records rather than dimension 

669 primary values. 

670 

671 This allows a user to include a dataId dict with keys of 

672 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

673 the integer exposure ID. It also allows a string to be given 

674 for a dimension value rather than the integer ID if that is more 

675 convenient. For example, rather than having to specifyin the 

676 detector with ``detector.full_name``, a string given for ``detector`` 

677 will be interpreted as the full name and converted to the integer 

678 value. 

679 

680 Keyword arguments can also use strings for dimensions like detector 

681 and exposure but python does not allow them to include ``.`` and 

682 so the ``exposure.day_obs`` syntax can not be used in a keyword 

683 argument. 

684 

685 Parameters 

686 ---------- 

687 dataId : `dict` or `DataCoordinate` 

688 A `dict` of `Dimension` link name, value pairs that will label the 

689 `DatasetRef` within a Collection. 

690 datasetType : `DatasetType` 

691 The dataset type associated with this dataId. Required to 

692 determine the relevant dimensions. 

693 **kwargs 

694 Additional keyword arguments used to augment or construct a 

695 `DataId`. See `DataId` parameters. 

696 

697 Returns 

698 ------- 

699 dataId : `dict` or `DataCoordinate` 

700 The, possibly rewritten, dataId. If given a `DataCoordinate` and 

701 no keyword arguments, the original dataId will be returned 

702 unchanged. 

703 **kwargs : `dict` 

704 Any unused keyword arguments (would normally be empty dict). 

705 """ 

706 # Do nothing if we have a standalone DataCoordinate. 

707 if isinstance(dataId, DataCoordinate) and not kwargs: 

708 return dataId, kwargs 

709 

710 # Process dimension records that are using record information 

711 # rather than ids 

712 newDataId: Dict[str, DataIdValue] = {} 

713 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

714 

715 # if all the dataId comes from keyword parameters we do not need 

716 # to do anything here because they can't be of the form 

717 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

718 if dataId: 

719 for k, v in dataId.items(): 

720 # If we have a Dimension we do not need to do anything 

721 # because it cannot be a compound key. 

722 if isinstance(k, str) and "." in k: 

723 # Someone is using a more human-readable dataId 

724 dimensionName, record = k.split(".", 1) 

725 byRecord[dimensionName][record] = v 

726 elif isinstance(k, Dimension): 

727 newDataId[k.name] = v 

728 else: 

729 newDataId[k] = v 

730 

731 # Go through the updated dataId and check the type in case someone is 

732 # using an alternate key. We have already filtered out the compound 

733 # keys dimensions.record format. 

734 not_dimensions = {} 

735 

736 # Will need to look in the dataId and the keyword arguments 

737 # and will remove them if they need to be fixed or are unrecognized. 

738 for dataIdDict in (newDataId, kwargs): 

739 # Use a list so we can adjust the dict safely in the loop 

740 for dimensionName in list(dataIdDict): 

741 value = dataIdDict[dimensionName] 

742 try: 

743 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

744 except KeyError: 

745 # This is not a real dimension 

746 not_dimensions[dimensionName] = value 

747 del dataIdDict[dimensionName] 

748 continue 

749 

750 # Convert an integral type to an explicit int to simplify 

751 # comparisons here 

752 if isinstance(value, numbers.Integral): 

753 value = int(value) 

754 

755 if not isinstance(value, dimension.primaryKey.getPythonType()): 

756 for alternate in dimension.alternateKeys: 

757 if isinstance(value, alternate.getPythonType()): 

758 byRecord[dimensionName][alternate.name] = value 

759 del dataIdDict[dimensionName] 

760 log.debug( 

761 "Converting dimension %s to %s.%s=%s", 

762 dimensionName, 

763 dimensionName, 

764 alternate.name, 

765 value, 

766 ) 

767 break 

768 else: 

769 log.warning( 

770 "Type mismatch found for value '%r' provided for dimension %s. " 

771 "Could not find matching alternative (primary key has type %s) " 

772 "so attempting to use as-is.", 

773 value, 

774 dimensionName, 

775 dimension.primaryKey.getPythonType(), 

776 ) 

777 

778 # By this point kwargs and newDataId should only include valid 

779 # dimensions. Merge kwargs in to the new dataId and log if there 

780 # are dimensions in both (rather than calling update). 

781 for k, v in kwargs.items(): 

782 if k in newDataId and newDataId[k] != v: 

783 log.debug( 

784 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

785 ) 

786 newDataId[k] = v 

787 # No need to retain any values in kwargs now. 

788 kwargs = {} 

789 

790 # If we have some unrecognized dimensions we have to try to connect 

791 # them to records in other dimensions. This is made more complicated 

792 # by some dimensions having records with clashing names. A mitigation 

793 # is that we can tell by this point which dimensions are missing 

794 # for the DatasetType but this does not work for calibrations 

795 # where additional dimensions can be used to constrain the temporal 

796 # axis. 

797 if not_dimensions: 

798 # Search for all dimensions even if we have been given a value 

799 # explicitly. In some cases records are given as well as the 

800 # actually dimension and this should not be an error if they 

801 # match. 

802 mandatoryDimensions = datasetType.dimensions.names # - provided 

803 

804 candidateDimensions: Set[str] = set() 

805 candidateDimensions.update(mandatoryDimensions) 

806 

807 # For calibrations we may well be needing temporal dimensions 

808 # so rather than always including all dimensions in the scan 

809 # restrict things a little. It is still possible for there 

810 # to be confusion over day_obs in visit vs exposure for example. 

811 # If we are not searching calibration collections things may 

812 # fail but they are going to fail anyway because of the 

813 # ambiguousness of the dataId... 

814 if datasetType.isCalibration(): 

815 for dim in self.registry.dimensions.getStaticDimensions(): 

816 if dim.temporal: 

817 candidateDimensions.add(str(dim)) 

818 

819 # Look up table for the first association with a dimension 

820 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

821 

822 # Keep track of whether an item is associated with multiple 

823 # dimensions. 

824 counter: Counter[str] = Counter() 

825 assigned: Dict[str, Set[str]] = defaultdict(set) 

826 

827 # Go through the missing dimensions and associate the 

828 # given names with records within those dimensions 

829 matched_dims = set() 

830 for dimensionName in candidateDimensions: 

831 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

832 fields = dimension.metadata.names | dimension.uniqueKeys.names 

833 for field in not_dimensions: 

834 if field in fields: 

835 guessedAssociation[dimensionName][field] = not_dimensions[field] 

836 counter[dimensionName] += 1 

837 assigned[field].add(dimensionName) 

838 matched_dims.add(field) 

839 

840 # Calculate the fields that matched nothing. 

841 never_found = set(not_dimensions) - matched_dims 

842 

843 if never_found: 

844 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

845 

846 # There is a chance we have allocated a single dataId item 

847 # to multiple dimensions. Need to decide which should be retained. 

848 # For now assume that the most popular alternative wins. 

849 # This means that day_obs with seq_num will result in 

850 # exposure.day_obs and not visit.day_obs 

851 # Also prefer an explicitly missing dimension over an inferred 

852 # temporal dimension. 

853 for fieldName, assignedDimensions in assigned.items(): 

854 if len(assignedDimensions) > 1: 

855 # Pick the most popular (preferring mandatory dimensions) 

856 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

857 if requiredButMissing: 

858 candidateDimensions = requiredButMissing 

859 else: 

860 candidateDimensions = assignedDimensions 

861 

862 # If this is a choice between visit and exposure and 

863 # neither was a required part of the dataset type, 

864 # (hence in this branch) always prefer exposure over 

865 # visit since exposures are always defined and visits 

866 # are defined from exposures. 

867 if candidateDimensions == {"exposure", "visit"}: 

868 candidateDimensions = {"exposure"} 

869 

870 # Select the relevant items and get a new restricted 

871 # counter. 

872 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

873 duplicatesCounter: Counter[str] = Counter() 

874 duplicatesCounter.update(theseCounts) 

875 

876 # Choose the most common. If they are equally common 

877 # we will pick the one that was found first. 

878 # Returns a list of tuples 

879 selected = duplicatesCounter.most_common(1)[0][0] 

880 

881 log.debug( 

882 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

883 " Removed ambiguity by choosing dimension %s.", 

884 fieldName, 

885 ", ".join(assignedDimensions), 

886 selected, 

887 ) 

888 

889 for candidateDimension in assignedDimensions: 

890 if candidateDimension != selected: 

891 del guessedAssociation[candidateDimension][fieldName] 

892 

893 # Update the record look up dict with the new associations 

894 for dimensionName, values in guessedAssociation.items(): 

895 if values: # A dict might now be empty 

896 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

897 byRecord[dimensionName].update(values) 

898 

899 if byRecord: 

900 # Some record specifiers were found so we need to convert 

901 # them to the Id form 

902 for dimensionName, values in byRecord.items(): 

903 if dimensionName in newDataId: 

904 log.debug( 

905 "DataId specified explicit %s dimension value of %s in addition to" 

906 " general record specifiers for it of %s. Ignoring record information.", 

907 dimensionName, 

908 newDataId[dimensionName], 

909 str(values), 

910 ) 

911 # Get the actual record and compare with these values. 

912 try: 

913 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

914 except DataIdError: 

915 raise ValueError( 

916 f"Could not find dimension '{dimensionName}'" 

917 f" with dataId {newDataId} as part of comparing with" 

918 f" record values {byRecord[dimensionName]}" 

919 ) from None 

920 if len(recs) == 1: 

921 errmsg: List[str] = [] 

922 for k, v in values.items(): 

923 if (recval := getattr(recs[0], k)) != v: 

924 errmsg.append(f"{k}({recval} != {v})") 

925 if errmsg: 

926 raise ValueError( 

927 f"Dimension {dimensionName} in dataId has explicit value" 

928 " inconsistent with records: " + ", ".join(errmsg) 

929 ) 

930 else: 

931 # Multiple matches for an explicit dimension 

932 # should never happen but let downstream complain. 

933 pass 

934 continue 

935 

936 # Build up a WHERE expression 

937 bind = {k: v for k, v in values.items()} 

938 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

939 

940 # Hopefully we get a single record that matches 

941 records = set( 

942 self.registry.queryDimensionRecords( 

943 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

944 ) 

945 ) 

946 

947 if len(records) != 1: 

948 if len(records) > 1: 

949 # visit can have an ambiguous answer without involving 

950 # visit_system. The default visit_system is defined 

951 # by the instrument. 

952 if ( 

953 dimensionName == "visit" 

954 and "visit_system_membership" in self.registry.dimensions 

955 and "visit_system" 

956 in self.registry.dimensions["instrument"].metadata # type: ignore 

957 ): 

958 instrument_records = list( 

959 self.registry.queryDimensionRecords( 

960 "instrument", 

961 dataId=newDataId, 

962 **kwargs, 

963 ) 

964 ) 

965 if len(instrument_records) == 1: 

966 visit_system = instrument_records[0].visit_system 

967 if visit_system is None: 

968 # Set to a value that will never match. 

969 visit_system = -1 

970 

971 # Look up each visit in the 

972 # visit_system_membership records. 

973 for rec in records: 

974 membership = list( 

975 self.registry.queryDimensionRecords( 

976 # Use bind to allow zero results. 

977 # This is a fully-specified query. 

978 "visit_system_membership", 

979 where="instrument = inst AND visit_system = system AND visit = v", 

980 bind=dict( 

981 inst=instrument_records[0].name, system=visit_system, v=rec.id 

982 ), 

983 ) 

984 ) 

985 if membership: 

986 # This record is the right answer. 

987 records = set([rec]) 

988 break 

989 

990 # The ambiguity may have been resolved so check again. 

991 if len(records) > 1: 

992 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

993 for r in records: 

994 log.debug("- %s", str(r)) 

995 raise ValueError( 

996 f"DataId specification for dimension {dimensionName} is not" 

997 f" uniquely constrained to a single dataset by {values}." 

998 f" Got {len(records)} results." 

999 ) 

1000 else: 

1001 raise ValueError( 

1002 f"DataId specification for dimension {dimensionName} matched no" 

1003 f" records when constrained by {values}" 

1004 ) 

1005 

1006 # Get the primary key from the real dimension object 

1007 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

1008 if not isinstance(dimension, Dimension): 

1009 raise RuntimeError( 

1010 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

1011 ) 

1012 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

1013 

1014 return newDataId, kwargs 

1015 

1016 def _findDatasetRef( 

1017 self, 

1018 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1019 dataId: Optional[DataId] = None, 

1020 *, 

1021 collections: Any = None, 

1022 allowUnresolved: bool = False, 

1023 **kwargs: Any, 

1024 ) -> DatasetRef: 

1025 """Shared logic for methods that start with a search for a dataset in 

1026 the registry. 

1027 

1028 Parameters 

1029 ---------- 

1030 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1031 When `DatasetRef` the `dataId` should be `None`. 

1032 Otherwise the `DatasetType` or name thereof. 

1033 dataId : `dict` or `DataCoordinate`, optional 

1034 A `dict` of `Dimension` link name, value pairs that label the 

1035 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1036 should be provided as the first argument. 

1037 collections : Any, optional 

1038 Collections to be searched, overriding ``self.collections``. 

1039 Can be any of the types supported by the ``collections`` argument 

1040 to butler construction. 

1041 allowUnresolved : `bool`, optional 

1042 If `True`, return an unresolved `DatasetRef` if finding a resolved 

1043 one in the `Registry` fails. Defaults to `False`. 

1044 **kwargs 

1045 Additional keyword arguments used to augment or construct a 

1046 `DataId`. See `DataId` parameters. 

1047 

1048 Returns 

1049 ------- 

1050 ref : `DatasetRef` 

1051 A reference to the dataset identified by the given arguments. 

1052 

1053 Raises 

1054 ------ 

1055 LookupError 

1056 Raised if no matching dataset exists in the `Registry` (and 

1057 ``allowUnresolved is False``). 

1058 ValueError 

1059 Raised if a resolved `DatasetRef` was passed as an input, but it 

1060 differs from the one found in the registry. 

1061 TypeError 

1062 Raised if no collections were provided. 

1063 """ 

1064 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1065 if isinstance(datasetRefOrType, DatasetRef): 

1066 idNumber = datasetRefOrType.id 

1067 else: 

1068 idNumber = None 

1069 timespan: Optional[Timespan] = None 

1070 

1071 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1072 

1073 if datasetType.isCalibration(): 

1074 # Because this is a calibration dataset, first try to make a 

1075 # standardize the data ID without restricting the dimensions to 

1076 # those of the dataset type requested, because there may be extra 

1077 # dimensions that provide temporal information for a validity-range 

1078 # lookup. 

1079 dataId = DataCoordinate.standardize( 

1080 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1081 ) 

1082 if dataId.graph.temporal: 

1083 dataId = self.registry.expandDataId(dataId) 

1084 timespan = dataId.timespan 

1085 else: 

1086 # Standardize the data ID to just the dimensions of the dataset 

1087 # type instead of letting registry.findDataset do it, so we get the 

1088 # result even if no dataset is found. 

1089 dataId = DataCoordinate.standardize( 

1090 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1091 ) 

1092 # Always lookup the DatasetRef, even if one is given, to ensure it is 

1093 # present in the current collection. 

1094 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1095 if ref is None: 

1096 if allowUnresolved: 

1097 return DatasetRef(datasetType, dataId) 

1098 else: 

1099 if collections is None: 

1100 collections = self.registry.defaults.collections 

1101 raise LookupError( 

1102 f"Dataset {datasetType.name} with data ID {dataId} " 

1103 f"could not be found in collections {collections}." 

1104 ) 

1105 if idNumber is not None and idNumber != ref.id: 

1106 if collections is None: 

1107 collections = self.registry.defaults.collections 

1108 raise ValueError( 

1109 f"DatasetRef.id provided ({idNumber}) does not match " 

1110 f"id ({ref.id}) in registry in collections {collections}." 

1111 ) 

1112 if datasetType != ref.datasetType: 

1113 # If they differ it is because the user explicitly specified 

1114 # a compatible dataset type to this call rather than using the 

1115 # registry definition. The DatasetRef must therefore be recreated 

1116 # using the user definition such that the expected type is 

1117 # returned. 

1118 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1119 

1120 return ref 

1121 

1122 @transactional 

1123 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef: 

1124 # Docstring inherited. 

1125 (imported_ref,) = self.registry._importDatasets( 

1126 [ref], 

1127 expand=True, 

1128 ) 

1129 if imported_ref.id != ref.getCheckedId(): 

1130 raise RuntimeError("This registry configuration does not support putDirect.") 

1131 self.datastore.put(obj, ref) 

1132 return ref 

1133 

1134 @transactional 

1135 def put( 

1136 self, 

1137 obj: Any, 

1138 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1139 dataId: Optional[DataId] = None, 

1140 *, 

1141 run: Optional[str] = None, 

1142 **kwargs: Any, 

1143 ) -> DatasetRef: 

1144 """Store and register a dataset. 

1145 

1146 Parameters 

1147 ---------- 

1148 obj : `object` 

1149 The dataset. 

1150 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1151 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1152 Otherwise the `DatasetType` or name thereof. 

1153 dataId : `dict` or `DataCoordinate` 

1154 A `dict` of `Dimension` link name, value pairs that label the 

1155 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1156 should be provided as the second argument. 

1157 run : `str`, optional 

1158 The name of the run the dataset should be added to, overriding 

1159 ``self.run``. 

1160 **kwargs 

1161 Additional keyword arguments used to augment or construct a 

1162 `DataCoordinate`. See `DataCoordinate.standardize` 

1163 parameters. 

1164 

1165 Returns 

1166 ------- 

1167 ref : `DatasetRef` 

1168 A reference to the stored dataset, updated with the correct id if 

1169 given. 

1170 

1171 Raises 

1172 ------ 

1173 TypeError 

1174 Raised if the butler is read-only or if no run has been provided. 

1175 """ 

1176 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1177 if not self.isWriteable(): 

1178 raise TypeError("Butler is read-only.") 

1179 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1180 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1181 raise ValueError("DatasetRef must not be in registry, must have None id") 

1182 

1183 # Handle dimension records in dataId 

1184 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1185 

1186 # Add Registry Dataset entry. 

1187 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1188 

1189 # For an execution butler the datasets will be pre-defined. 

1190 # If the butler is configured that way datasets should only be inserted 

1191 # if they do not already exist in registry. Trying and catching 

1192 # ConflictingDefinitionError will not work because the transaction 

1193 # will be corrupted. Instead, in this mode always check first. 

1194 ref = None 

1195 ref_is_predefined = False 

1196 if self._allow_put_of_predefined_dataset: 

1197 # Get the matching ref for this run. 

1198 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId) 

1199 

1200 if ref: 

1201 # Must be expanded form for datastore templating 

1202 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

1203 ref = ref.expanded(dataId) 

1204 ref_is_predefined = True 

1205 

1206 if not ref: 

1207 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1208 

1209 # If the ref is predefined it is possible that the datastore also 

1210 # has the record. Asking datastore to put it again will result in 

1211 # the artifact being recreated, overwriting previous, then will cause 

1212 # a failure in writing the record which will cause the artifact 

1213 # to be removed. Much safer to ask first before attempting to 

1214 # overwrite. Race conditions should not be an issue for the 

1215 # execution butler environment. 

1216 if ref_is_predefined: 

1217 if self.datastore.knows(ref): 

1218 raise ConflictingDefinitionError(f"Dataset associated {ref} already exists.") 

1219 

1220 self.datastore.put(obj, ref) 

1221 

1222 return ref 

1223 

1224 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any: 

1225 """Retrieve a stored dataset. 

1226 

1227 Unlike `Butler.get`, this method allows datasets outside the Butler's 

1228 collection to be read as long as the `DatasetRef` that identifies them 

1229 can be obtained separately. 

1230 

1231 Parameters 

1232 ---------- 

1233 ref : `DatasetRef` 

1234 Resolved reference to an already stored dataset. 

1235 parameters : `dict` 

1236 Additional StorageClass-defined options to control reading, 

1237 typically used to efficiently read only a subset of the dataset. 

1238 

1239 Returns 

1240 ------- 

1241 obj : `object` 

1242 The dataset. 

1243 """ 

1244 return self.datastore.get(ref, parameters=parameters) 

1245 

1246 def getDirectDeferred( 

1247 self, ref: DatasetRef, *, parameters: Union[dict, None] = None 

1248 ) -> DeferredDatasetHandle: 

1249 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1250 from a resolved `DatasetRef`. 

1251 

1252 Parameters 

1253 ---------- 

1254 ref : `DatasetRef` 

1255 Resolved reference to an already stored dataset. 

1256 parameters : `dict` 

1257 Additional StorageClass-defined options to control reading, 

1258 typically used to efficiently read only a subset of the dataset. 

1259 

1260 Returns 

1261 ------- 

1262 obj : `DeferredDatasetHandle` 

1263 A handle which can be used to retrieve a dataset at a later time. 

1264 

1265 Raises 

1266 ------ 

1267 AmbiguousDatasetError 

1268 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

1269 """ 

1270 if ref.id is None: 

1271 raise AmbiguousDatasetError( 

1272 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1273 ) 

1274 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1275 

1276 def getDeferred( 

1277 self, 

1278 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1279 dataId: Optional[DataId] = None, 

1280 *, 

1281 parameters: Union[dict, None] = None, 

1282 collections: Any = None, 

1283 **kwargs: Any, 

1284 ) -> DeferredDatasetHandle: 

1285 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1286 after an immediate registry lookup. 

1287 

1288 Parameters 

1289 ---------- 

1290 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1291 When `DatasetRef` the `dataId` should be `None`. 

1292 Otherwise the `DatasetType` or name thereof. 

1293 dataId : `dict` or `DataCoordinate`, optional 

1294 A `dict` of `Dimension` link name, value pairs that label the 

1295 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1296 should be provided as the first argument. 

1297 parameters : `dict` 

1298 Additional StorageClass-defined options to control reading, 

1299 typically used to efficiently read only a subset of the dataset. 

1300 collections : Any, optional 

1301 Collections to be searched, overriding ``self.collections``. 

1302 Can be any of the types supported by the ``collections`` argument 

1303 to butler construction. 

1304 **kwargs 

1305 Additional keyword arguments used to augment or construct a 

1306 `DataId`. See `DataId` parameters. 

1307 

1308 Returns 

1309 ------- 

1310 obj : `DeferredDatasetHandle` 

1311 A handle which can be used to retrieve a dataset at a later time. 

1312 

1313 Raises 

1314 ------ 

1315 LookupError 

1316 Raised if no matching dataset exists in the `Registry` (and 

1317 ``allowUnresolved is False``). 

1318 ValueError 

1319 Raised if a resolved `DatasetRef` was passed as an input, but it 

1320 differs from the one found in the registry. 

1321 TypeError 

1322 Raised if no collections were provided. 

1323 """ 

1324 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1325 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1326 

1327 def get( 

1328 self, 

1329 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1330 dataId: Optional[DataId] = None, 

1331 *, 

1332 parameters: Optional[Dict[str, Any]] = None, 

1333 collections: Any = None, 

1334 **kwargs: Any, 

1335 ) -> Any: 

1336 """Retrieve a stored dataset. 

1337 

1338 Parameters 

1339 ---------- 

1340 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1341 When `DatasetRef` the `dataId` should be `None`. 

1342 Otherwise the `DatasetType` or name thereof. 

1343 dataId : `dict` or `DataCoordinate` 

1344 A `dict` of `Dimension` link name, value pairs that label the 

1345 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1346 should be provided as the first argument. 

1347 parameters : `dict` 

1348 Additional StorageClass-defined options to control reading, 

1349 typically used to efficiently read only a subset of the dataset. 

1350 collections : Any, optional 

1351 Collections to be searched, overriding ``self.collections``. 

1352 Can be any of the types supported by the ``collections`` argument 

1353 to butler construction. 

1354 **kwargs 

1355 Additional keyword arguments used to augment or construct a 

1356 `DataCoordinate`. See `DataCoordinate.standardize` 

1357 parameters. 

1358 

1359 Returns 

1360 ------- 

1361 obj : `object` 

1362 The dataset. 

1363 

1364 Raises 

1365 ------ 

1366 ValueError 

1367 Raised if a resolved `DatasetRef` was passed as an input, but it 

1368 differs from the one found in the registry. 

1369 LookupError 

1370 Raised if no matching dataset exists in the `Registry`. 

1371 TypeError 

1372 Raised if no collections were provided. 

1373 

1374 Notes 

1375 ----- 

1376 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1377 this method requires that the given data ID include temporal dimensions 

1378 beyond the dimensions of the dataset type itself, in order to find the 

1379 dataset with the appropriate validity range. For example, a "bias" 

1380 dataset with native dimensions ``{instrument, detector}`` could be 

1381 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1382 ``exposure`` is a temporal dimension. 

1383 """ 

1384 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1385 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1386 return self.getDirect(ref, parameters=parameters) 

1387 

1388 def getURIs( 

1389 self, 

1390 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1391 dataId: Optional[DataId] = None, 

1392 *, 

1393 predict: bool = False, 

1394 collections: Any = None, 

1395 run: Optional[str] = None, 

1396 **kwargs: Any, 

1397 ) -> DatasetRefURIs: 

1398 """Returns the URIs associated with the dataset. 

1399 

1400 Parameters 

1401 ---------- 

1402 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1403 When `DatasetRef` the `dataId` should be `None`. 

1404 Otherwise the `DatasetType` or name thereof. 

1405 dataId : `dict` or `DataCoordinate` 

1406 A `dict` of `Dimension` link name, value pairs that label the 

1407 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1408 should be provided as the first argument. 

1409 predict : `bool` 

1410 If `True`, allow URIs to be returned of datasets that have not 

1411 been written. 

1412 collections : Any, optional 

1413 Collections to be searched, overriding ``self.collections``. 

1414 Can be any of the types supported by the ``collections`` argument 

1415 to butler construction. 

1416 run : `str`, optional 

1417 Run to use for predictions, overriding ``self.run``. 

1418 **kwargs 

1419 Additional keyword arguments used to augment or construct a 

1420 `DataCoordinate`. See `DataCoordinate.standardize` 

1421 parameters. 

1422 

1423 Returns 

1424 ------- 

1425 uris : `DatasetRefURIs` 

1426 The URI to the primary artifact associated with this dataset (if 

1427 the dataset was disassembled within the datastore this may be 

1428 `None`), and the URIs to any components associated with the dataset 

1429 artifact. (can be empty if there are no components). 

1430 """ 

1431 ref = self._findDatasetRef( 

1432 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs 

1433 ) 

1434 if ref.id is None: # only possible if predict is True 

1435 if run is None: 

1436 run = self.run 

1437 if run is None: 

1438 raise TypeError("Cannot predict location with run=None.") 

1439 # Lie about ID, because we can't guess it, and only 

1440 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1441 ref = ref.resolved(id=0, run=run) 

1442 return self.datastore.getURIs(ref, predict) 

1443 

1444 def getURI( 

1445 self, 

1446 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1447 dataId: Optional[DataId] = None, 

1448 *, 

1449 predict: bool = False, 

1450 collections: Any = None, 

1451 run: Optional[str] = None, 

1452 **kwargs: Any, 

1453 ) -> ResourcePath: 

1454 """Return the URI to the Dataset. 

1455 

1456 Parameters 

1457 ---------- 

1458 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1459 When `DatasetRef` the `dataId` should be `None`. 

1460 Otherwise the `DatasetType` or name thereof. 

1461 dataId : `dict` or `DataCoordinate` 

1462 A `dict` of `Dimension` link name, value pairs that label the 

1463 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1464 should be provided as the first argument. 

1465 predict : `bool` 

1466 If `True`, allow URIs to be returned of datasets that have not 

1467 been written. 

1468 collections : Any, optional 

1469 Collections to be searched, overriding ``self.collections``. 

1470 Can be any of the types supported by the ``collections`` argument 

1471 to butler construction. 

1472 run : `str`, optional 

1473 Run to use for predictions, overriding ``self.run``. 

1474 **kwargs 

1475 Additional keyword arguments used to augment or construct a 

1476 `DataCoordinate`. See `DataCoordinate.standardize` 

1477 parameters. 

1478 

1479 Returns 

1480 ------- 

1481 uri : `lsst.resources.ResourcePath` 

1482 URI pointing to the Dataset within the datastore. If the 

1483 Dataset does not exist in the datastore, and if ``predict`` is 

1484 `True`, the URI will be a prediction and will include a URI 

1485 fragment "#predicted". 

1486 If the datastore does not have entities that relate well 

1487 to the concept of a URI the returned URI string will be 

1488 descriptive. The returned URI is not guaranteed to be obtainable. 

1489 

1490 Raises 

1491 ------ 

1492 LookupError 

1493 A URI has been requested for a dataset that does not exist and 

1494 guessing is not allowed. 

1495 ValueError 

1496 Raised if a resolved `DatasetRef` was passed as an input, but it 

1497 differs from the one found in the registry. 

1498 TypeError 

1499 Raised if no collections were provided. 

1500 RuntimeError 

1501 Raised if a URI is requested for a dataset that consists of 

1502 multiple artifacts. 

1503 """ 

1504 primary, components = self.getURIs( 

1505 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1506 ) 

1507 

1508 if primary is None or components: 

1509 raise RuntimeError( 

1510 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1511 "Use Butler.getURIs() instead." 

1512 ) 

1513 return primary 

1514 

1515 def retrieveArtifacts( 

1516 self, 

1517 refs: Iterable[DatasetRef], 

1518 destination: ResourcePathExpression, 

1519 transfer: str = "auto", 

1520 preserve_path: bool = True, 

1521 overwrite: bool = False, 

1522 ) -> List[ResourcePath]: 

1523 """Retrieve the artifacts associated with the supplied refs. 

1524 

1525 Parameters 

1526 ---------- 

1527 refs : iterable of `DatasetRef` 

1528 The datasets for which artifacts are to be retrieved. 

1529 A single ref can result in multiple artifacts. The refs must 

1530 be resolved. 

1531 destination : `lsst.resources.ResourcePath` or `str` 

1532 Location to write the artifacts. 

1533 transfer : `str`, optional 

1534 Method to use to transfer the artifacts. Must be one of the options 

1535 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1536 "move" is not allowed. 

1537 preserve_path : `bool`, optional 

1538 If `True` the full path of the artifact within the datastore 

1539 is preserved. If `False` the final file component of the path 

1540 is used. 

1541 overwrite : `bool`, optional 

1542 If `True` allow transfers to overwrite existing files at the 

1543 destination. 

1544 

1545 Returns 

1546 ------- 

1547 targets : `list` of `lsst.resources.ResourcePath` 

1548 URIs of file artifacts in destination location. Order is not 

1549 preserved. 

1550 

1551 Notes 

1552 ----- 

1553 For non-file datastores the artifacts written to the destination 

1554 may not match the representation inside the datastore. For example 

1555 a hierarchical data structure in a NoSQL database may well be stored 

1556 as a JSON file. 

1557 """ 

1558 return self.datastore.retrieveArtifacts( 

1559 refs, 

1560 ResourcePath(destination), 

1561 transfer=transfer, 

1562 preserve_path=preserve_path, 

1563 overwrite=overwrite, 

1564 ) 

1565 

1566 def datasetExists( 

1567 self, 

1568 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1569 dataId: Optional[DataId] = None, 

1570 *, 

1571 collections: Any = None, 

1572 **kwargs: Any, 

1573 ) -> bool: 

1574 """Return True if the Dataset is actually present in the Datastore. 

1575 

1576 Parameters 

1577 ---------- 

1578 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1579 When `DatasetRef` the `dataId` should be `None`. 

1580 Otherwise the `DatasetType` or name thereof. 

1581 dataId : `dict` or `DataCoordinate` 

1582 A `dict` of `Dimension` link name, value pairs that label the 

1583 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1584 should be provided as the first argument. 

1585 collections : Any, optional 

1586 Collections to be searched, overriding ``self.collections``. 

1587 Can be any of the types supported by the ``collections`` argument 

1588 to butler construction. 

1589 **kwargs 

1590 Additional keyword arguments used to augment or construct a 

1591 `DataCoordinate`. See `DataCoordinate.standardize` 

1592 parameters. 

1593 

1594 Raises 

1595 ------ 

1596 LookupError 

1597 Raised if the dataset is not even present in the Registry. 

1598 ValueError 

1599 Raised if a resolved `DatasetRef` was passed as an input, but it 

1600 differs from the one found in the registry. 

1601 TypeError 

1602 Raised if no collections were provided. 

1603 """ 

1604 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1605 return self.datastore.exists(ref) 

1606 

1607 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1608 """Remove one or more `~CollectionType.RUN` collections and the 

1609 datasets within them. 

1610 

1611 Parameters 

1612 ---------- 

1613 names : `Iterable` [ `str` ] 

1614 The names of the collections to remove. 

1615 unstore : `bool`, optional 

1616 If `True` (default), delete datasets from all datastores in which 

1617 they are present, and attempt to rollback the registry deletions if 

1618 datastore deletions fail (which may not always be possible). If 

1619 `False`, datastore records for these datasets are still removed, 

1620 but any artifacts (e.g. files) will not be. 

1621 

1622 Raises 

1623 ------ 

1624 TypeError 

1625 Raised if one or more collections are not of type 

1626 `~CollectionType.RUN`. 

1627 """ 

1628 if not self.isWriteable(): 

1629 raise TypeError("Butler is read-only.") 

1630 names = list(names) 

1631 refs: List[DatasetRef] = [] 

1632 for name in names: 

1633 collectionType = self.registry.getCollectionType(name) 

1634 if collectionType is not CollectionType.RUN: 

1635 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1636 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1637 with self.registry.transaction(): 

1638 if unstore: 

1639 self.datastore.trash(refs) 

1640 else: 

1641 self.datastore.forget(refs) 

1642 for name in names: 

1643 self.registry.removeCollection(name) 

1644 if unstore: 

1645 # Point of no return for removing artifacts 

1646 self.datastore.emptyTrash() 

1647 

1648 def pruneCollection( 

1649 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None 

1650 ) -> None: 

1651 """Remove a collection and possibly prune datasets within it. 

1652 

1653 Parameters 

1654 ---------- 

1655 name : `str` 

1656 Name of the collection to remove. If this is a 

1657 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1658 datasets within the collection are not modified unless ``unstore`` 

1659 is `True`. If this is a `~CollectionType.RUN` collection, 

1660 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1661 are fully removed from the data repository. 

1662 purge : `bool`, optional 

1663 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1664 fully removing datasets within them. Requires ``unstore=True`` as 

1665 well as an added precaution against accidental deletion. Must be 

1666 `False` (default) if the collection is not a ``RUN``. 

1667 unstore: `bool`, optional 

1668 If `True`, remove all datasets in the collection from all 

1669 datastores in which they appear. 

1670 unlink: `list` [`str`], optional 

1671 Before removing the given `collection` unlink it from from these 

1672 parent collections. 

1673 

1674 Raises 

1675 ------ 

1676 TypeError 

1677 Raised if the butler is read-only or arguments are mutually 

1678 inconsistent. 

1679 """ 

1680 # See pruneDatasets comments for more information about the logic here; 

1681 # the cases are almost the same, but here we can rely on Registry to 

1682 # take care everything but Datastore deletion when we remove the 

1683 # collection. 

1684 if not self.isWriteable(): 

1685 raise TypeError("Butler is read-only.") 

1686 collectionType = self.registry.getCollectionType(name) 

1687 if purge and not unstore: 

1688 raise PurgeWithoutUnstorePruneCollectionsError() 

1689 if collectionType is CollectionType.RUN and not purge: 

1690 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1691 if collectionType is not CollectionType.RUN and purge: 

1692 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1693 

1694 def remove(child: str, parent: str) -> None: 

1695 """Remove a child collection from a parent collection.""" 

1696 # Remove child from parent. 

1697 chain = list(self.registry.getCollectionChain(parent)) 

1698 try: 

1699 chain.remove(name) 

1700 except ValueError as e: 

1701 raise RuntimeError(f"{name} is not a child of {parent}") from e 

1702 self.registry.setCollectionChain(parent, chain) 

1703 

1704 with self.registry.transaction(): 

1705 if unlink: 

1706 for parent in unlink: 

1707 remove(name, parent) 

1708 if unstore: 

1709 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1710 self.datastore.trash(refs) 

1711 self.registry.removeCollection(name) 

1712 

1713 if unstore: 

1714 # Point of no return for removing artifacts 

1715 self.datastore.emptyTrash() 

1716 

1717 def pruneDatasets( 

1718 self, 

1719 refs: Iterable[DatasetRef], 

1720 *, 

1721 disassociate: bool = True, 

1722 unstore: bool = False, 

1723 tags: Iterable[str] = (), 

1724 purge: bool = False, 

1725 ) -> None: 

1726 # docstring inherited from LimitedButler 

1727 

1728 if not self.isWriteable(): 

1729 raise TypeError("Butler is read-only.") 

1730 if purge: 

1731 if not disassociate: 

1732 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1733 if not unstore: 

1734 raise TypeError("Cannot pass purge=True without unstore=True.") 

1735 elif disassociate: 

1736 tags = tuple(tags) 

1737 if not tags: 

1738 raise TypeError("No tags provided but disassociate=True.") 

1739 for tag in tags: 

1740 collectionType = self.registry.getCollectionType(tag) 

1741 if collectionType is not CollectionType.TAGGED: 

1742 raise TypeError( 

1743 f"Cannot disassociate from collection '{tag}' " 

1744 f"of non-TAGGED type {collectionType.name}." 

1745 ) 

1746 # For an execution butler we want to keep existing UUIDs for the 

1747 # datasets, for that we need to keep them in the collections but 

1748 # remove from datastore. 

1749 if self._allow_put_of_predefined_dataset and purge: 

1750 purge = False 

1751 disassociate = False 

1752 # Transform possibly-single-pass iterable into something we can iterate 

1753 # over multiple times. 

1754 refs = list(refs) 

1755 # Pruning a component of a DatasetRef makes no sense since registry 

1756 # doesn't know about components and datastore might not store 

1757 # components in a separate file 

1758 for ref in refs: 

1759 if ref.datasetType.component(): 

1760 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1761 # We don't need an unreliable Datastore transaction for this, because 

1762 # we've been extra careful to ensure that Datastore.trash only involves 

1763 # mutating the Registry (it can _look_ at Datastore-specific things, 

1764 # but shouldn't change them), and hence all operations here are 

1765 # Registry operations. 

1766 with self.registry.transaction(): 

1767 if unstore: 

1768 self.datastore.trash(refs) 

1769 if purge: 

1770 self.registry.removeDatasets(refs) 

1771 elif disassociate: 

1772 assert tags, "Guaranteed by earlier logic in this function." 

1773 for tag in tags: 

1774 self.registry.disassociate(tag, refs) 

1775 # We've exited the Registry transaction, and apparently committed. 

1776 # (if there was an exception, everything rolled back, and it's as if 

1777 # nothing happened - and we never get here). 

1778 # Datastore artifacts are not yet gone, but they're clearly marked 

1779 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1780 # problems we can try again later, and if manual administrative 

1781 # intervention is required, it's pretty clear what that should entail: 

1782 # deleting everything on disk and in private Datastore tables that is 

1783 # in the dataset_location_trash table. 

1784 if unstore: 

1785 # Point of no return for removing artifacts 

1786 self.datastore.emptyTrash() 

1787 

1788 @transactional 

1789 def ingest( 

1790 self, 

1791 *datasets: FileDataset, 

1792 transfer: Optional[str] = "auto", 

1793 run: Optional[str] = None, 

1794 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1795 record_validation_info: bool = True, 

1796 ) -> None: 

1797 """Store and register one or more datasets that already exist on disk. 

1798 

1799 Parameters 

1800 ---------- 

1801 datasets : `FileDataset` 

1802 Each positional argument is a struct containing information about 

1803 a file to be ingested, including its URI (either absolute or 

1804 relative to the datastore root, if applicable), a `DatasetRef`, 

1805 and optionally a formatter class or its fully-qualified string 

1806 name. If a formatter is not provided, the formatter that would be 

1807 used for `put` is assumed. On successful return, all 

1808 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1809 attribute populated and all `FileDataset.formatter` attributes will 

1810 be set to the formatter class used. `FileDataset.path` attributes 

1811 may be modified to put paths in whatever the datastore considers a 

1812 standardized form. 

1813 transfer : `str`, optional 

1814 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1815 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1816 transfer the file. 

1817 run : `str`, optional 

1818 The name of the run ingested datasets should be added to, 

1819 overriding ``self.run``. 

1820 idGenerationMode : `DatasetIdGenEnum`, optional 

1821 Specifies option for generating dataset IDs. By default unique IDs 

1822 are generated for each inserted dataset. 

1823 record_validation_info : `bool`, optional 

1824 If `True`, the default, the datastore can record validation 

1825 information associated with the file. If `False` the datastore 

1826 will not attempt to track any information such as checksums 

1827 or file sizes. This can be useful if such information is tracked 

1828 in an external system or if the file is to be compressed in place. 

1829 It is up to the datastore whether this parameter is relevant. 

1830 

1831 Raises 

1832 ------ 

1833 TypeError 

1834 Raised if the butler is read-only or if no run was provided. 

1835 NotImplementedError 

1836 Raised if the `Datastore` does not support the given transfer mode. 

1837 DatasetTypeNotSupportedError 

1838 Raised if one or more files to be ingested have a dataset type that 

1839 is not supported by the `Datastore`.. 

1840 FileNotFoundError 

1841 Raised if one of the given files does not exist. 

1842 FileExistsError 

1843 Raised if transfer is not `None` but the (internal) location the 

1844 file would be moved to is already occupied. 

1845 

1846 Notes 

1847 ----- 

1848 This operation is not fully exception safe: if a database operation 

1849 fails, the given `FileDataset` instances may be only partially updated. 

1850 

1851 It is atomic in terms of database operations (they will either all 

1852 succeed or all fail) providing the database engine implements 

1853 transactions correctly. It will attempt to be atomic in terms of 

1854 filesystem operations as well, but this cannot be implemented 

1855 rigorously for most datastores. 

1856 """ 

1857 if not self.isWriteable(): 

1858 raise TypeError("Butler is read-only.") 

1859 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1860 # Reorganize the inputs so they're grouped by DatasetType and then 

1861 # data ID. We also include a list of DatasetRefs for each FileDataset 

1862 # to hold the resolved DatasetRefs returned by the Registry, before 

1863 # it's safe to swap them into FileDataset.refs. 

1864 # Some type annotation aliases to make that clearer: 

1865 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1866 GroupedData = MutableMapping[DatasetType, GroupForType] 

1867 # The actual data structure: 

1868 groupedData: GroupedData = defaultdict(dict) 

1869 # And the nested loop that populates it: 

1870 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1871 # This list intentionally shared across the inner loop, since it's 

1872 # associated with `dataset`. 

1873 resolvedRefs: List[DatasetRef] = [] 

1874 

1875 # Somewhere to store pre-existing refs if we have an 

1876 # execution butler. 

1877 existingRefs: List[DatasetRef] = [] 

1878 

1879 for ref in dataset.refs: 

1880 if ref.dataId in groupedData[ref.datasetType]: 

1881 raise ConflictingDefinitionError( 

1882 f"Ingest conflict. Dataset {dataset.path} has same" 

1883 " DataId as other ingest dataset" 

1884 f" {groupedData[ref.datasetType][ref.dataId][0].path} " 

1885 f" ({ref.dataId})" 

1886 ) 

1887 if self._allow_put_of_predefined_dataset: 

1888 existing_ref = self.registry.findDataset( 

1889 ref.datasetType, dataId=ref.dataId, collections=run 

1890 ) 

1891 if existing_ref: 

1892 if self.datastore.knows(existing_ref): 

1893 raise ConflictingDefinitionError( 

1894 f"Dataset associated with path {dataset.path}" 

1895 f" already exists as {existing_ref}." 

1896 ) 

1897 # Store this ref elsewhere since it already exists 

1898 # and we do not want to remake it but we do want 

1899 # to store it in the datastore. 

1900 existingRefs.append(existing_ref) 

1901 

1902 # Nothing else to do until we have finished 

1903 # iterating. 

1904 continue 

1905 

1906 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1907 

1908 if existingRefs: 

1909 

1910 if len(dataset.refs) != len(existingRefs): 

1911 # Keeping track of partially pre-existing datasets is hard 

1912 # and should generally never happen. For now don't allow 

1913 # it. 

1914 raise ConflictingDefinitionError( 

1915 f"For dataset {dataset.path} some dataIds already exist" 

1916 " in registry but others do not. This is not supported." 

1917 ) 

1918 

1919 # Attach the resolved refs if we found them. 

1920 dataset.refs = existingRefs 

1921 

1922 # Now we can bulk-insert into Registry for each DatasetType. 

1923 for datasetType, groupForType in progress.iter_item_chunks( 

1924 groupedData.items(), desc="Bulk-inserting datasets by type" 

1925 ): 

1926 refs = self.registry.insertDatasets( 

1927 datasetType, 

1928 dataIds=groupForType.keys(), 

1929 run=run, 

1930 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1931 idGenerationMode=idGenerationMode, 

1932 ) 

1933 # Append those resolved DatasetRefs to the new lists we set up for 

1934 # them. 

1935 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1936 resolvedRefs.append(ref) 

1937 

1938 # Go back to the original FileDatasets to replace their refs with the 

1939 # new resolved ones. 

1940 for groupForType in progress.iter_chunks( 

1941 groupedData.values(), desc="Reassociating resolved dataset refs with files" 

1942 ): 

1943 for dataset, resolvedRefs in groupForType.values(): 

1944 dataset.refs = resolvedRefs 

1945 

1946 # Bulk-insert everything into Datastore. 

1947 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 

1948 

1949 @contextlib.contextmanager 

1950 def export( 

1951 self, 

1952 *, 

1953 directory: Optional[str] = None, 

1954 filename: Optional[str] = None, 

1955 format: Optional[str] = None, 

1956 transfer: Optional[str] = None, 

1957 ) -> Iterator[RepoExportContext]: 

1958 """Export datasets from the repository represented by this `Butler`. 

1959 

1960 This method is a context manager that returns a helper object 

1961 (`RepoExportContext`) that is used to indicate what information from 

1962 the repository should be exported. 

1963 

1964 Parameters 

1965 ---------- 

1966 directory : `str`, optional 

1967 Directory dataset files should be written to if ``transfer`` is not 

1968 `None`. 

1969 filename : `str`, optional 

1970 Name for the file that will include database information associated 

1971 with the exported datasets. If this is not an absolute path and 

1972 ``directory`` is not `None`, it will be written to ``directory`` 

1973 instead of the current working directory. Defaults to 

1974 "export.{format}". 

1975 format : `str`, optional 

1976 File format for the database information file. If `None`, the 

1977 extension of ``filename`` will be used. 

1978 transfer : `str`, optional 

1979 Transfer mode passed to `Datastore.export`. 

1980 

1981 Raises 

1982 ------ 

1983 TypeError 

1984 Raised if the set of arguments passed is inconsistent. 

1985 

1986 Examples 

1987 -------- 

1988 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1989 methods are used to provide the iterables over data IDs and/or datasets 

1990 to be exported:: 

1991 

1992 with butler.export("exports.yaml") as export: 

1993 # Export all flats, but none of the dimension element rows 

1994 # (i.e. data ID information) associated with them. 

1995 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1996 elements=()) 

1997 # Export all datasets that start with "deepCoadd_" and all of 

1998 # their associated data ID information. 

1999 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2000 """ 

2001 if directory is None and transfer is not None: 

2002 raise TypeError("Cannot transfer without providing a directory.") 

2003 if transfer == "move": 

2004 raise TypeError("Transfer may not be 'move': export is read-only") 

2005 if format is None: 

2006 if filename is None: 

2007 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2008 else: 

2009 _, format = os.path.splitext(filename) 

2010 elif filename is None: 

2011 filename = f"export.{format}" 

2012 if directory is not None: 

2013 filename = os.path.join(directory, filename) 

2014 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"]) 

2015 with open(filename, "w") as stream: 

2016 backend = BackendClass(stream, universe=self.registry.dimensions) 

2017 try: 

2018 helper = RepoExportContext( 

2019 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

2020 ) 

2021 yield helper 

2022 except BaseException: 

2023 raise 

2024 else: 

2025 helper._finish() 

2026 

2027 def import_( 

2028 self, 

2029 *, 

2030 directory: Optional[str] = None, 

2031 filename: Union[str, TextIO, None] = None, 

2032 format: Optional[str] = None, 

2033 transfer: Optional[str] = None, 

2034 skip_dimensions: Optional[Set] = None, 

2035 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

2036 reuseIds: bool = False, 

2037 ) -> None: 

2038 """Import datasets into this repository that were exported from a 

2039 different butler repository via `~lsst.daf.butler.Butler.export`. 

2040 

2041 Parameters 

2042 ---------- 

2043 directory : `str`, optional 

2044 Directory containing dataset files to import from. If `None`, 

2045 ``filename`` and all dataset file paths specified therein must 

2046 be absolute. 

2047 filename : `str` or `TextIO`, optional 

2048 A stream or name of file that contains database information 

2049 associated with the exported datasets, typically generated by 

2050 `~lsst.daf.butler.Butler.export`. If this a string (name) and 

2051 is not an absolute path, does not exist in the current working 

2052 directory, and ``directory`` is not `None`, it is assumed to be in 

2053 ``directory``. Defaults to "export.{format}". 

2054 format : `str`, optional 

2055 File format for ``filename``. If `None`, the extension of 

2056 ``filename`` will be used. 

2057 transfer : `str`, optional 

2058 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2059 skip_dimensions : `set`, optional 

2060 Names of dimensions that should be skipped and not imported. 

2061 idGenerationMode : `DatasetIdGenEnum`, optional 

2062 Specifies option for generating dataset IDs when IDs are not 

2063 provided or their type does not match backend type. By default 

2064 unique IDs are generated for each inserted dataset. 

2065 reuseIds : `bool`, optional 

2066 If `True` then forces re-use of imported dataset IDs for integer 

2067 IDs which are normally generated as auto-incremented; exception 

2068 will be raised if imported IDs clash with existing ones. This 

2069 option has no effect on the use of globally-unique IDs which are 

2070 always re-used (or generated if integer IDs are being imported). 

2071 

2072 Raises 

2073 ------ 

2074 TypeError 

2075 Raised if the set of arguments passed is inconsistent, or if the 

2076 butler is read-only. 

2077 """ 

2078 if not self.isWriteable(): 

2079 raise TypeError("Butler is read-only.") 

2080 if format is None: 

2081 if filename is None: 

2082 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2083 else: 

2084 _, format = os.path.splitext(filename) # type: ignore 

2085 elif filename is None: 

2086 filename = f"export.{format}" 

2087 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

2088 filename = os.path.join(directory, filename) 

2089 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

2090 

2091 def doImport(importStream: TextIO) -> None: 

2092 backend = BackendClass(importStream, self.registry) 

2093 backend.register() 

2094 with self.transaction(): 

2095 backend.load( 

2096 self.datastore, 

2097 directory=directory, 

2098 transfer=transfer, 

2099 skip_dimensions=skip_dimensions, 

2100 idGenerationMode=idGenerationMode, 

2101 reuseIds=reuseIds, 

2102 ) 

2103 

2104 if isinstance(filename, str): 

2105 with open(filename, "r") as stream: 

2106 doImport(stream) 

2107 else: 

2108 doImport(filename) 

2109 

2110 def transfer_from( 

2111 self, 

2112 source_butler: Butler, 

2113 source_refs: Iterable[DatasetRef], 

2114 transfer: str = "auto", 

2115 id_gen_map: Dict[str, DatasetIdGenEnum] = None, 

2116 skip_missing: bool = True, 

2117 register_dataset_types: bool = False, 

2118 ) -> List[DatasetRef]: 

2119 """Transfer datasets to this Butler from a run in another Butler. 

2120 

2121 Parameters 

2122 ---------- 

2123 source_butler : `Butler` 

2124 Butler from which the datasets are to be transferred. 

2125 source_refs : iterable of `DatasetRef` 

2126 Datasets defined in the source butler that should be transferred to 

2127 this butler. 

2128 transfer : `str`, optional 

2129 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2130 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional 

2131 A mapping of dataset type to ID generation mode. Only used if 

2132 the source butler is using integer IDs. Should not be used 

2133 if this receiving butler uses integer IDs. Without this dataset 

2134 import always uses unique. 

2135 skip_missing : `bool` 

2136 If `True`, datasets with no datastore artifact associated with 

2137 them are not transferred. If `False` a registry entry will be 

2138 created even if no datastore record is created (and so will 

2139 look equivalent to the dataset being unstored). 

2140 register_dataset_types : `bool` 

2141 If `True` any missing dataset types are registered. Otherwise 

2142 an exception is raised. 

2143 

2144 Returns 

2145 ------- 

2146 refs : `list` of `DatasetRef` 

2147 The refs added to this Butler. 

2148 

2149 Notes 

2150 ----- 

2151 Requires that any dimension definitions are already present in the 

2152 receiving Butler. The datastore artifact has to exist for a transfer 

2153 to be made but non-existence is not an error. 

2154 

2155 Datasets that already exist in this run will be skipped. 

2156 

2157 The datasets are imported as part of a transaction, although 

2158 dataset types are registered before the transaction is started. 

2159 This means that it is possible for a dataset type to be registered 

2160 even though transfer has failed. 

2161 """ 

2162 if not self.isWriteable(): 

2163 raise TypeError("Butler is read-only.") 

2164 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2165 

2166 # Will iterate through the refs multiple times so need to convert 

2167 # to a list if this isn't a collection. 

2168 if not isinstance(source_refs, collections.abc.Collection): 

2169 source_refs = list(source_refs) 

2170 

2171 original_count = len(source_refs) 

2172 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2173 

2174 if id_gen_map is None: 

2175 id_gen_map = {} 

2176 

2177 # In some situations the datastore artifact may be missing 

2178 # and we do not want that registry entry to be imported. 

2179 # Asking datastore is not sufficient, the records may have been 

2180 # purged, we have to ask for the (predicted) URI and check 

2181 # existence explicitly. Execution butler is set up exactly like 

2182 # this with no datastore records. 

2183 artifact_existence: Dict[ResourcePath, bool] = {} 

2184 if skip_missing: 

2185 dataset_existence = source_butler.datastore.mexists( 

2186 source_refs, artifact_existence=artifact_existence 

2187 ) 

2188 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2189 filtered_count = len(source_refs) 

2190 log.verbose( 

2191 "%d datasets removed because the artifact does not exist. Now have %d.", 

2192 original_count - filtered_count, 

2193 filtered_count, 

2194 ) 

2195 

2196 # Importing requires that we group the refs by dataset type and run 

2197 # before doing the import. 

2198 source_dataset_types = set() 

2199 grouped_refs = defaultdict(list) 

2200 grouped_indices = defaultdict(list) 

2201 for i, ref in enumerate(source_refs): 

2202 grouped_refs[ref.datasetType, ref.run].append(ref) 

2203 grouped_indices[ref.datasetType, ref.run].append(i) 

2204 source_dataset_types.add(ref.datasetType) 

2205 

2206 # Check to see if the dataset type in the source butler has 

2207 # the same definition in the target butler and register missing 

2208 # ones if requested. Registration must happen outside a transaction. 

2209 newly_registered_dataset_types = set() 

2210 for datasetType in source_dataset_types: 

2211 if register_dataset_types: 

2212 # Let this raise immediately if inconsistent. Continuing 

2213 # on to find additional inconsistent dataset types 

2214 # might result in additional unwanted dataset types being 

2215 # registered. 

2216 if self.registry.registerDatasetType(datasetType): 

2217 newly_registered_dataset_types.add(datasetType) 

2218 else: 

2219 # If the dataset type is missing, let it fail immediately. 

2220 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2221 if target_dataset_type != datasetType: 

2222 raise ConflictingDefinitionError( 

2223 "Source butler dataset type differs from definition" 

2224 f" in target butler: {datasetType} !=" 

2225 f" {target_dataset_type}" 

2226 ) 

2227 if newly_registered_dataset_types: 

2228 # We may have registered some even if there were inconsistencies 

2229 # but should let people know (or else remove them again). 

2230 log.log( 

2231 VERBOSE, 

2232 "Registered the following dataset types in the target Butler: %s", 

2233 ", ".join(d.name for d in newly_registered_dataset_types), 

2234 ) 

2235 else: 

2236 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2237 

2238 # The returned refs should be identical for UUIDs. 

2239 # For now must also support integers and so need to retain the 

2240 # newly-created refs from this registry. 

2241 # Pre-size it so we can assign refs into the correct slots 

2242 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs) 

2243 default_id_gen = DatasetIdGenEnum.UNIQUE 

2244 

2245 handled_collections: Set[str] = set() 

2246 

2247 # Do all the importing in a single transaction. 

2248 with self.transaction(): 

2249 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2250 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2251 ): 

2252 if run not in handled_collections: 

2253 run_doc = source_butler.registry.getCollectionDocumentation(run) 

2254 registered = self.registry.registerRun(run, doc=run_doc) 

2255 handled_collections.add(run) 

2256 if registered: 

2257 log.log(VERBOSE, "Creating output run %s", run) 

2258 

2259 id_generation_mode = default_id_gen 

2260 if isinstance(refs_to_import[0].id, int): 

2261 # ID generation mode might need to be overridden when 

2262 # targetting UUID 

2263 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen) 

2264 

2265 n_refs = len(refs_to_import) 

2266 log.verbose( 

2267 "Importing %d ref%s of dataset type %s into run %s", 

2268 n_refs, 

2269 "" if n_refs == 1 else "s", 

2270 datasetType.name, 

2271 run, 

2272 ) 

2273 

2274 # No way to know if this butler's registry uses UUID. 

2275 # We have to trust the caller on this. If it fails they will 

2276 # have to change their approach. We can't catch the exception 

2277 # and retry with unique because that will mess up the 

2278 # transaction handling. We aren't allowed to ask the registry 

2279 # manager what type of ID it is using. 

2280 imported_refs = self.registry._importDatasets( 

2281 refs_to_import, idGenerationMode=id_generation_mode, expand=False 

2282 ) 

2283 

2284 # Map them into the correct slots to match the initial order 

2285 for i, ref in zip(grouped_indices[datasetType, run], imported_refs): 

2286 transferred_refs_tmp[i] = ref 

2287 

2288 # Mypy insists that we might have None in here so we have to make 

2289 # that explicit by assigning to a new variable and filtering out 

2290 # something that won't be there. 

2291 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None] 

2292 

2293 # Check consistency 

2294 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given" 

2295 

2296 log.verbose("Imported %d datasets into destination butler", len(transferred_refs)) 

2297 

2298 # The transferred refs need to be reordered to match the original 

2299 # ordering given by the caller. Without this the datastore transfer 

2300 # will be broken. 

2301 

2302 # Ask the datastore to transfer. The datastore has to check that 

2303 # the source datastore is compatible with the target datastore. 

2304 self.datastore.transfer_from( 

2305 source_butler.datastore, 

2306 source_refs, 

2307 local_refs=transferred_refs, 

2308 transfer=transfer, 

2309 artifact_existence=artifact_existence, 

2310 ) 

2311 

2312 return transferred_refs 

2313 

2314 def validateConfiguration( 

2315 self, 

2316 logFailures: bool = False, 

2317 datasetTypeNames: Optional[Iterable[str]] = None, 

2318 ignore: Iterable[str] = None, 

2319 ) -> None: 

2320 """Validate butler configuration. 

2321 

2322 Checks that each `DatasetType` can be stored in the `Datastore`. 

2323 

2324 Parameters 

2325 ---------- 

2326 logFailures : `bool`, optional 

2327 If `True`, output a log message for every validation error 

2328 detected. 

2329 datasetTypeNames : iterable of `str`, optional 

2330 The `DatasetType` names that should be checked. This allows 

2331 only a subset to be selected. 

2332 ignore : iterable of `str`, optional 

2333 Names of DatasetTypes to skip over. This can be used to skip 

2334 known problems. If a named `DatasetType` corresponds to a 

2335 composite, all components of that `DatasetType` will also be 

2336 ignored. 

2337 

2338 Raises 

2339 ------ 

2340 ButlerValidationError 

2341 Raised if there is some inconsistency with how this Butler 

2342 is configured. 

2343 """ 

2344 if datasetTypeNames: 

2345 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2346 else: 

2347 datasetTypes = list(self.registry.queryDatasetTypes()) 

2348 

2349 # filter out anything from the ignore list 

2350 if ignore: 

2351 ignore = set(ignore) 

2352 datasetTypes = [ 

2353 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2354 ] 

2355 else: 

2356 ignore = set() 

2357 

2358 # Find all the registered instruments 

2359 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2360 

2361 # For each datasetType that has an instrument dimension, create 

2362 # a DatasetRef for each defined instrument 

2363 datasetRefs = [] 

2364 

2365 for datasetType in datasetTypes: 

2366 if "instrument" in datasetType.dimensions: 

2367 for instrument in instruments: 

2368 datasetRef = DatasetRef( 

2369 datasetType, {"instrument": instrument}, conform=False # type: ignore 

2370 ) 

2371 datasetRefs.append(datasetRef) 

2372 

2373 entities: List[Union[DatasetType, DatasetRef]] = [] 

2374 entities.extend(datasetTypes) 

2375 entities.extend(datasetRefs) 

2376 

2377 datastoreErrorStr = None 

2378 try: 

2379 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2380 except ValidationError as e: 

2381 datastoreErrorStr = str(e) 

2382 

2383 # Also check that the LookupKeys used by the datastores match 

2384 # registry and storage class definitions 

2385 keys = self.datastore.getLookupKeys() 

2386 

2387 failedNames = set() 

2388 failedDataId = set() 

2389 for key in keys: 

2390 if key.name is not None: 

2391 if key.name in ignore: 

2392 continue 

2393 

2394 # skip if specific datasetType names were requested and this 

2395 # name does not match 

2396 if datasetTypeNames and key.name not in datasetTypeNames: 

2397 continue 

2398 

2399 # See if it is a StorageClass or a DatasetType 

2400 if key.name in self.storageClasses: 

2401 pass 

2402 else: 

2403 try: 

2404 self.registry.getDatasetType(key.name) 

2405 except KeyError: 

2406 if logFailures: 

2407 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2408 failedNames.add(key) 

2409 else: 

2410 # Dimensions are checked for consistency when the Butler 

2411 # is created and rendezvoused with a universe. 

2412 pass 

2413 

2414 # Check that the instrument is a valid instrument 

2415 # Currently only support instrument so check for that 

2416 if key.dataId: 

2417 dataIdKeys = set(key.dataId) 

2418 if set(["instrument"]) != dataIdKeys: 

2419 if logFailures: 

2420 log.critical("Key '%s' has unsupported DataId override", key) 

2421 failedDataId.add(key) 

2422 elif key.dataId["instrument"] not in instruments: 

2423 if logFailures: 

2424 log.critical("Key '%s' has unknown instrument", key) 

2425 failedDataId.add(key) 

2426 

2427 messages = [] 

2428 

2429 if datastoreErrorStr: 

2430 messages.append(datastoreErrorStr) 

2431 

2432 for failed, msg in ( 

2433 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2434 (failedDataId, "Keys with bad DataId entries: "), 

2435 ): 

2436 if failed: 

2437 msg += ", ".join(str(k) for k in failed) 

2438 messages.append(msg) 

2439 

2440 if messages: 

2441 raise ValidationError(";\n".join(messages)) 

2442 

2443 @property 

2444 def collections(self) -> CollectionSearch: 

2445 """The collections to search by default, in order (`CollectionSearch`). 

2446 

2447 This is an alias for ``self.registry.defaults.collections``. It cannot 

2448 be set directly in isolation, but all defaults may be changed together 

2449 by assigning a new `RegistryDefaults` instance to 

2450 ``self.registry.defaults``. 

2451 """ 

2452 return self.registry.defaults.collections 

2453 

2454 @property 

2455 def run(self) -> Optional[str]: 

2456 """Name of the run this butler writes outputs to by default (`str` or 

2457 `None`). 

2458 

2459 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2460 directly in isolation, but all defaults may be changed together by 

2461 assigning a new `RegistryDefaults` instance to 

2462 ``self.registry.defaults``. 

2463 """ 

2464 return self.registry.defaults.run 

2465 

2466 @property 

2467 def dimensions(self) -> DimensionUniverse: 

2468 # Docstring inherited. 

2469 return self.registry.dimensions 

2470 

2471 registry: Registry 

2472 """The object that manages dataset metadata and relationships (`Registry`). 

2473 

2474 Most operations that don't involve reading or writing butler datasets are 

2475 accessible only via `Registry` methods. 

2476 """ 

2477 

2478 datastore: Datastore 

2479 """The object that manages actual dataset storage (`Datastore`). 

2480 

2481 Direct user access to the datastore should rarely be necessary; the primary 

2482 exception is the case where a `Datastore` implementation provides extra 

2483 functionality beyond what the base class defines. 

2484 """ 

2485 

2486 storageClasses: StorageClassFactory 

2487 """An object that maps known storage class names to objects that fully 

2488 describe them (`StorageClassFactory`). 

2489 """ 

2490 

2491 _allow_put_of_predefined_dataset: bool 

2492 """Allow a put to succeed even if there is already a registry entry for it 

2493 but not a datastore record. (`bool`)."""