# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
Butler top level classes.
"""
from __future__ import annotations

__all__ = (
    "Butler",
    "ButlerValidationError",
    "PruneCollectionsArgsError",
    "PurgeWithoutUnstorePruneCollectionsError",
    "RunWithoutPurgePruneCollectionsError",
    "PurgeUnsupportedPruneCollectionsError",
)

import collections.abc
import contextlib
import logging
import numbers
import os
from collections import defaultdict
from typing import (
    Any,
    ClassVar,
    Counter,
    Dict,
    Iterable,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    TextIO,
    Tuple,
    Type,
    Union,
)

from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils import doImportType
from lsst.utils.introspection import get_class_of
from lsst.utils.logging import VERBOSE, getLogger

from ._butlerConfig import ButlerConfig
from ._butlerRepoIndex import ButlerRepoIndex
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._limited_butler import LimitedButler
from .core import (
    AmbiguousDatasetError,
    Config,
    ConfigSubset,
    DataCoordinate,
    DataId,
    DataIdValue,
    DatasetRef,
    DatasetType,
    Datastore,
    Dimension,
    DimensionConfig,
    DimensionUniverse,
    FileDataset,
    Progress,
    StorageClassFactory,
    Timespan,
    ValidationError,
)
from .core.repoRelocation import BUTLER_ROOT_TAG
from .core.utils import transactional
from .registry import (
    CollectionSearch,
    CollectionType,
    ConflictingDefinitionError,
    DataIdError,
    DatasetIdGenEnum,
    Registry,
    RegistryConfig,
    RegistryDefaults,
)
from .transfers import RepoExportContext

log = getLogger(__name__)


class ButlerValidationError(ValidationError):
    """There is a problem with the Butler configuration."""

    pass


class PruneCollectionsArgsError(TypeError):
    """Base class for errors relating to Butler.pruneCollections input
    arguments.
    """

    pass


class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
    """Raised when purge and unstore are both required to be True, and
    purge is True but unstore is False.
    """

    def __init__(self) -> None:
        super().__init__("Cannot pass purge=True without unstore=True.")


class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
    """Raised when pruning a RUN collection but purge is False."""

    def __init__(self, collectionType: CollectionType):
        self.collectionType = collectionType
        super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")


class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
    """Raised when purge is True but is not supported for the given
    collection."""

    def __init__(self, collectionType: CollectionType):
        self.collectionType = collectionType
        super().__init__(
            f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True."
        )


class Butler(LimitedButler):
    """Main entry point for the data access system.

    Parameters
    ----------
    config : `ButlerConfig`, `Config` or `str`, optional
        Configuration. Anything acceptable to the
        `ButlerConfig` constructor. If a directory path
        is given the configuration will be read from a ``butler.yaml`` file in
        that location. If `None` is given default values will be used.
    butler : `Butler`, optional
        If provided, construct a new Butler that uses the same registry and
        datastore as the given one, but with the given collection and run.
        Incompatible with the ``config``, ``searchPaths``, and ``writeable``
        arguments.
    collections : `str` or `Iterable` [ `str` ], optional
        An expression specifying the collections to be searched (in order) when
        reading datasets.
        This may be a `str` collection name or an iterable thereof.
        See :ref:`daf_butler_collection_expressions` for more information.
        These collections are not registered automatically and must be
        manually registered before they are used by any method, but they may be
        manually registered after the `Butler` is initialized.
    run : `str`, optional
        Name of the `~CollectionType.RUN` collection new datasets should be
        inserted into. If ``collections`` is `None` and ``run`` is not `None`,
        ``collections`` will be set to ``[run]``. If not `None`, this
        collection will automatically be registered. If this is not set (and
        ``writeable`` is not set either), a read-only butler will be created.
    searchPaths : `list` of `str`, optional
        Directory paths to search when calculating the full Butler
        configuration. Not used if the supplied config is already a
        `ButlerConfig`.
    writeable : `bool`, optional
        Explicitly sets whether the butler supports write operations. If not
        provided, a read-write butler is created if any of ``run``, ``tags``,
        or ``chains`` is non-empty.
    inferDefaults : `bool`, optional
        If `True` (default) infer default data ID values from the values
        present in the datasets in ``collections``: if all collections have the
        same value (or no value) for a governor dimension, that value will be
        the default for that dimension. Nonexistent collections are ignored.
        If a default value is provided explicitly for a governor dimension via
        ``**kwargs``, no default will be inferred for that dimension.
    **kwargs : `str`
        Default data ID key-value pairs. These may only identify "governor"
        dimensions like ``instrument`` and ``skymap``.

    Examples
    --------
    While there are many ways to control exactly how a `Butler` interacts with
    the collections in its `Registry`, the most common cases are still simple.

    For a read-only `Butler` that searches one collection, do::

        butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])

    For a read-write `Butler` that writes to and reads from a
    `~CollectionType.RUN` collection::

        butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")

    The `Butler` passed to a ``PipelineTask`` is often much more complex,
    because we want to write to one `~CollectionType.RUN` collection but read
    from several others (as well)::

        butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
                        collections=["u/alice/DM-50000/a",
                                     "u/bob/DM-49998",
                                     "HSC/defaults"])

    This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
    Datasets will be read first from that run (since it appears first in the
    chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.

    Finally, one can always create a `Butler` with no collections::

        butler = Butler("/path/to/repo", writeable=True)

    This can be extremely useful when you just want to use ``butler.registry``,
    e.g. for inserting dimension data or managing collections, or when the
    collections you want to use with the butler are not consistent.
    Passing ``writeable`` explicitly here is only necessary if you want to be
    able to make changes to the repo - usually the value for ``writeable`` can
    be guessed from the collection arguments provided, but it defaults to
    `False` when there are no collection arguments.
    """

    def __init__(
        self,
        config: Union[Config, str, None] = None,
        *,
        butler: Optional[Butler] = None,
        collections: Any = None,
        run: Optional[str] = None,
        searchPaths: Optional[List[str]] = None,
        writeable: Optional[bool] = None,
        inferDefaults: bool = True,
        **kwargs: str,
    ):
        defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
        # Load registry, datastore, etc. from config or existing butler.
        if butler is not None:
            if config is not None or searchPaths is not None or writeable is not None:
                raise TypeError(
                    "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
                )
            self.registry = butler.registry.copy(defaults)
            self.datastore = butler.datastore
            self.storageClasses = butler.storageClasses
            self._config: ButlerConfig = butler._config
            self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
        else:
            # Can only look for strings in the known repos list.
            if isinstance(config, str) and config in self.get_known_repos():
                config = str(self.get_repo_uri(config))
            try:
                self._config = ButlerConfig(config, searchPaths=searchPaths)
            except FileNotFoundError as e:
                if known := self.get_known_repos():
                    aliases = f"(known aliases: {', '.join(known)})"
                else:
                    aliases = "(no known aliases)"
                raise FileNotFoundError(f"{e} {aliases}") from e
            self._config = ButlerConfig(config, searchPaths=searchPaths)
            try:
                if "root" in self._config:
                    butlerRoot = self._config["root"]
                else:
                    butlerRoot = self._config.configDir
                if writeable is None:
                    writeable = run is not None
                self.registry = Registry.fromConfig(
                    self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
                )
                self.datastore = Datastore.fromConfig(
                    self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
                )
                self.storageClasses = StorageClassFactory()
                self.storageClasses.addFromConfig(self._config)
                self._allow_put_of_predefined_dataset = self._config.get(
                    "allow_put_of_predefined_dataset", False
                )
            except Exception:
                # Failures here usually mean that configuration is incomplete,
                # just issue an error message which includes config file URI.
                log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
                raise

        if "run" in self._config or "collection" in self._config:
            raise ValueError("Passing a run or collection via configuration is no longer supported.")

    GENERATION: ClassVar[int] = 3
    """This is a Generation 3 Butler.

    This attribute may be removed in the future, once the Generation 2 Butler
    interface has been fully retired; it should only be used in transitional
    code.
    """

    @classmethod
    def get_repo_uri(cls, label: str) -> ResourcePath:
        """Look up the label in a butler repository index.

        Parameters
        ----------
        label : `str`
            Label of the Butler repository to look up.

        Returns
        -------
        uri : `lsst.resources.ResourcePath`
            URI to the Butler repository associated with the given label.

        Raises
        ------
        KeyError
            Raised if the label is not found in the index, or if an index
            can not be found at all.

        Notes
        -----
        See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
        information is discovered.
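
        Examples
        --------
        A minimal sketch; ``"main"`` is an illustrative label that would have
        to exist in the repository index::

            uri = Butler.get_repo_uri("main")
            butler = Butler(str(uri))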

        """
        return ButlerRepoIndex.get_repo_uri(label)

    @classmethod
    def get_known_repos(cls) -> Set[str]:
        """Retrieve the list of known repository labels.

        Returns
        -------
        repos : `set` of `str`
            All the known labels. Can be empty if no index can be found.

        Notes
        -----
        See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
        information is discovered.
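
        Examples
        --------
        A minimal sketch of checking the index before opening a repository;
        the chosen label is illustrative::

            labels = Butler.get_known_repos()
            if "main" in labels:
                butler = Butler("main")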

        """
        return ButlerRepoIndex.get_known_repos()

    @staticmethod
    def makeRepo(
        root: ResourcePathExpression,
        config: Union[Config, str, None] = None,
        dimensionConfig: Union[Config, str, None] = None,
        standalone: bool = False,
        searchPaths: Optional[List[str]] = None,
        forceConfigRoot: bool = True,
        outfile: Optional[ResourcePathExpression] = None,
        overwrite: bool = False,
    ) -> Config:
        """Create an empty data repository by adding a butler.yaml config
        to a repository root directory.

        Parameters
        ----------
        root : `lsst.resources.ResourcePathExpression`
            Path or URI to the root location of the new repository. Will be
            created if it does not exist.
        config : `Config` or `str`, optional
            Configuration to write to the repository, after setting any
            root-dependent Registry or Datastore config options. Can not
            be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
            configuration will be used. Root-dependent config options
            specified in this config are overwritten if ``forceConfigRoot``
            is `True`.
        dimensionConfig : `Config` or `str`, optional
            Configuration for dimensions, will be used to initialize registry
            database.
        standalone : `bool`
            If True, write all expanded defaults, not just customized or
            repository-specific settings.
            This (mostly) decouples the repository from the default
            configuration, insulating it from changes to the defaults (which
            may be good or bad, depending on the nature of the changes).
            Future *additions* to the defaults will still be picked up when
            initializing `Butlers` to repos created with ``standalone=True``.
        searchPaths : `list` of `str`, optional
            Directory paths to search when calculating the full butler
            configuration.
        forceConfigRoot : `bool`, optional
            If `False`, any values present in the supplied ``config`` that
            would normally be reset are not overridden and will appear
            directly in the output config. This allows non-standard overrides
            of the root directory for a datastore or registry to be given.
            If this parameter is `True` the values for ``root`` will be
            forced into the resulting config if appropriate.
        outfile : `lsst.resources.ResourcePathExpression`, optional
            If not-`None`, the output configuration will be written to this
            location rather than into the repository itself. Can be a URI
            string. Can refer to a directory that will be used to write
            ``butler.yaml``.
        overwrite : `bool`, optional
            Create a new configuration file even if one already exists
            in the specified output location. Default is to raise
            an exception.

        Returns
        -------
        config : `Config`
            The updated `Config` instance written to the repo.

        Raises
        ------
        ValueError
            Raised if a ButlerConfig or ConfigSubset is passed instead of a
            regular Config (as these subclasses would make it impossible to
            support ``standalone=False``).
        FileExistsError
            Raised if the output config file already exists.
        os.error
            Raised if the directory does not exist, exists but is not a
            directory, or cannot be created.

        Notes
        -----
        Note that when ``standalone=False`` (the default), the configuration
        search path (see `ConfigSubset.defaultSearchPaths`) that was used to
        construct the repository should also be used to construct any Butlers
        to avoid configuration inconsistencies.
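
        Examples
        --------
        A minimal sketch of creating a new repository with default
        configuration and then opening it for writing; the path is an
        illustrative placeholder::

            Butler.makeRepo("/path/to/new/repo")
            butler = Butler("/path/to/new/repo", writeable=True)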

        """
        if isinstance(config, (ButlerConfig, ConfigSubset)):
            raise ValueError("makeRepo must be passed a regular Config without defaults applied.")

        # Ensure that the root of the repository exists or can be made
        root_uri = ResourcePath(root, forceDirectory=True)
        root_uri.mkdir()

        config = Config(config)

        # If we are creating a new repo from scratch with relative roots,
        # do not propagate an explicit root from the config file
        if "root" in config:
            del config["root"]

        full = ButlerConfig(config, searchPaths=searchPaths)  # this applies defaults
        imported_class = doImportType(full["datastore", "cls"])
        if not issubclass(imported_class, Datastore):
            raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
        datastoreClass: Type[Datastore] = imported_class
        datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)

        # if key exists in given config, parse it, otherwise parse the defaults
        # in the expanded config
        if config.get(("registry", "db")):
            registryConfig = RegistryConfig(config)
        else:
            registryConfig = RegistryConfig(full)
        defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
        if defaultDatabaseUri is not None:
            Config.updateParameters(
                RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
            )
        else:
            Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)

        if standalone:
            config.merge(full)
        else:
            # Always expand the registry.managers section into the per-repo
            # config, because after the database schema is created, it's not
            # allowed to change anymore. Note that in the standalone=True
            # branch, _everything_ in the config is expanded, so there's no
            # need to special case this.
            Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
        configURI: ResourcePathExpression
        if outfile is not None:
            # When writing to a separate location we must include
            # the root of the butler repo in the config else it won't know
            # where to look.
            config["root"] = root_uri.geturl()
            configURI = outfile
        else:
            configURI = root_uri
        config.dumpToUri(configURI, overwrite=overwrite)

        # Create Registry and populate tables
        registryConfig = RegistryConfig(config.get("registry"))
        dimensionConfig = DimensionConfig(dimensionConfig)
        Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri)

        log.verbose("Wrote new Butler configuration file to %s", configURI)

        return config

    @classmethod
    def _unpickle(
        cls,
        config: ButlerConfig,
        collections: Optional[CollectionSearch],
        run: Optional[str],
        defaultDataId: Dict[str, str],
        writeable: bool,
    ) -> Butler:
        """Callable used to unpickle a Butler.

        We prefer not to use ``Butler.__init__`` directly so we can force some
        of its many arguments to be keyword-only (note that ``__reduce__``
        can only invoke callables with positional arguments).

        Parameters
        ----------
        config : `ButlerConfig`
            Butler configuration, already coerced into a true `ButlerConfig`
            instance (and hence after any search paths for overrides have been
            utilized).
        collections : `CollectionSearch`
            Names of the default collections to read from.
        run : `str`, optional
            Name of the default `~CollectionType.RUN` collection to write to.
        defaultDataId : `dict` [ `str`, `str` ]
            Default data ID values.
        writeable : `bool`
            Whether the Butler should support write operations.

        Returns
        -------
        butler : `Butler`
            A new `Butler` instance.
        """
        # MyPy doesn't recognize that the kwargs below are totally valid; it
        # seems to think ``**defaultDataId`` is a _positional_ argument!
        return cls(
            config=config,
            collections=collections,
            run=run,
            writeable=writeable,
            **defaultDataId,  # type: ignore
        )

    def __reduce__(self) -> tuple:
        """Support pickling."""
        return (
            Butler._unpickle,
            (
                self._config,
                self.collections,
                self.run,
                self.registry.defaults.dataId.byName(),
                self.registry.isWriteable(),
            ),
        )

    def __str__(self) -> str:
        return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
            self.collections, self.run, self.datastore, self.registry
        )

    def isWriteable(self) -> bool:
        """Return `True` if this `Butler` supports write operations."""
        return self.registry.isWriteable()

    @contextlib.contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager supporting `Butler` transactions.

        Transactions can be nested.
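
        Examples
        --------
        A minimal sketch; the dataset type name and data ID are illustrative.
        If the body raises, both the registry and datastore changes made
        inside the block are rolled back::

            with butler.transaction():
                butler.put(catalog, "src", dataId)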

        """
        with self.registry.transaction():
            with self.datastore.transaction():
                yield

    def _standardizeArgs(
        self,
        datasetRefOrType: Union[DatasetRef, DatasetType, str],
        dataId: Optional[DataId] = None,
        for_put: bool = True,
        **kwargs: Any,
    ) -> Tuple[DatasetType, Optional[DataId]]:
        """Standardize the arguments passed to several Butler APIs.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        for_put : `bool`, optional
            If `True` this call is invoked as part of a `Butler.put()`.
            Otherwise it is assumed to be part of a `Butler.get()`. This
            parameter is only relevant if there is dataset type
            inconsistency.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        datasetType : `DatasetType`
            A `DatasetType` instance extracted from ``datasetRefOrType``.
        dataId : `dict` or `DataId`, optional
            Argument that can be used (along with ``kwargs``) to construct a
            `DataId`.

        Notes
        -----
        Butler APIs that conceptually need a DatasetRef also allow passing a
        `DatasetType` (or the name of one) and a `DataId` (or a dict and
        keyword arguments that can be used to construct one) separately. This
        method accepts those arguments and always returns a true `DatasetType`
        and a `DataId` or `dict`.

        Standardization of `dict` vs `DataId` is best handled by passing the
        returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
        generally similarly flexible.
        """
        externalDatasetType: Optional[DatasetType] = None
        internalDatasetType: Optional[DatasetType] = None
        if isinstance(datasetRefOrType, DatasetRef):
            if dataId is not None or kwargs:
                raise ValueError("DatasetRef given, cannot use dataId as well")
            externalDatasetType = datasetRefOrType.datasetType
            dataId = datasetRefOrType.dataId
        else:
            # Don't check whether DataId is provided, because Registry APIs
            # can usually construct a better error message when it wasn't.
            if isinstance(datasetRefOrType, DatasetType):
                externalDatasetType = datasetRefOrType
            else:
                internalDatasetType = self.registry.getDatasetType(datasetRefOrType)

        # Check that they are self-consistent
        if externalDatasetType is not None:
            internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
            if externalDatasetType != internalDatasetType:
                # We can allow differences if they are compatible, depending
                # on whether this is a get or a put. A get requires that
                # the python type associated with the datastore can be
                # converted to the user type. A put requires that the user
                # supplied python type can be converted to the internal
                # type expected by registry.
                relevantDatasetType = internalDatasetType
                if for_put:
                    is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
                else:
                    is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
                    relevantDatasetType = externalDatasetType
                if not is_compatible:
                    raise ValueError(
                        f"Supplied dataset type ({externalDatasetType}) inconsistent with "
                        f"registry definition ({internalDatasetType})"
                    )
                # Override the internal definition.
                internalDatasetType = relevantDatasetType

        assert internalDatasetType is not None
        return internalDatasetType, dataId

    def _rewrite_data_id(
        self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
    ) -> Tuple[Optional[DataId], Dict[str, Any]]:
        """Rewrite a data ID taking into account dimension records.

        Take a Data ID and keyword args and rewrite it if necessary to
        allow the user to specify dimension records rather than dimension
        primary values.

        This allows a user to include a dataId dict with keys of
        ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
        the integer exposure ID. It also allows a string to be given
        for a dimension value rather than the integer ID if that is more
        convenient. For example, rather than having to specify the
        detector with ``detector.full_name``, a string given for ``detector``
        will be interpreted as the full name and converted to the integer
        value.

        Keyword arguments can also use strings for dimensions like detector
        and exposure but python does not allow them to include ``.`` and
        so the ``exposure.day_obs`` syntax can not be used in a keyword
        argument.

        Parameters
        ----------
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that will label the
            `DatasetRef` within a Collection.
        datasetType : `DatasetType`
            The dataset type associated with this dataId. Required to
            determine the relevant dimensions.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        dataId : `dict` or `DataCoordinate`
            The dataId, possibly rewritten. If given a `DataCoordinate` and
            no keyword arguments, the original dataId will be returned
            unchanged.
        **kwargs : `dict`
            Any unused keyword arguments (would normally be empty dict).
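
        Examples
        --------
        A minimal sketch of the user-facing syntax this rewriting enables;
        the instrument, exposure, and detector values are illustrative and
        must exist in the repository::

            dataId = {
                "instrument": "LSSTCam",
                "exposure.day_obs": 20220101,  # record field instead of ID
                "exposure.seq_num": 42,
                "detector": "R22_S11",  # alternate (string) key for detector
            }
            raw = butler.get("raw", dataId=dataId)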

        """
        # Do nothing if we have a standalone DataCoordinate.
        if isinstance(dataId, DataCoordinate) and not kwargs:
            return dataId, kwargs

        # Process dimension records that are using record information
        # rather than ids
        newDataId: Dict[str, DataIdValue] = {}
        byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)

        # if all the dataId comes from keyword parameters we do not need
        # to do anything here because they can't be of the form
        # exposure.obs_id because a "." is not allowed in a keyword parameter.
        if dataId:
            for k, v in dataId.items():
                # If we have a Dimension we do not need to do anything
                # because it cannot be a compound key.
                if isinstance(k, str) and "." in k:
                    # Someone is using a more human-readable dataId
                    dimensionName, record = k.split(".", 1)
                    byRecord[dimensionName][record] = v
                elif isinstance(k, Dimension):
                    newDataId[k.name] = v
                else:
                    newDataId[k] = v

        # Go through the updated dataId and check the type in case someone is
        # using an alternate key. We have already filtered out the compound
        # keys dimensions.record format.
        not_dimensions = {}

        # Will need to look in the dataId and the keyword arguments
        # and will remove them if they need to be fixed or are unrecognized.
        for dataIdDict in (newDataId, kwargs):
            # Use a list so we can adjust the dict safely in the loop
            for dimensionName in list(dataIdDict):
                value = dataIdDict[dimensionName]
                try:
                    dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                except KeyError:
                    # This is not a real dimension
                    not_dimensions[dimensionName] = value
                    del dataIdDict[dimensionName]
                    continue

                # Convert an integral type to an explicit int to simplify
                # comparisons here
                if isinstance(value, numbers.Integral):
                    value = int(value)

                if not isinstance(value, dimension.primaryKey.getPythonType()):
                    for alternate in dimension.alternateKeys:
                        if isinstance(value, alternate.getPythonType()):
                            byRecord[dimensionName][alternate.name] = value
                            del dataIdDict[dimensionName]
                            log.debug(
                                "Converting dimension %s to %s.%s=%s",
                                dimensionName,
                                dimensionName,
                                alternate.name,
                                value,
                            )
                            break
                    else:
                        log.warning(
                            "Type mismatch found for value '%r' provided for dimension %s. "
                            "Could not find matching alternative (primary key has type %s) "
                            "so attempting to use as-is.",
                            value,
                            dimensionName,
                            dimension.primaryKey.getPythonType(),
                        )

        # By this point kwargs and newDataId should only include valid
        # dimensions. Merge kwargs in to the new dataId and log if there
        # are dimensions in both (rather than calling update).
        for k, v in kwargs.items():
            if k in newDataId and newDataId[k] != v:
                log.debug(
                    "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
                )
            newDataId[k] = v
        # No need to retain any values in kwargs now.
        kwargs = {}

        # If we have some unrecognized dimensions we have to try to connect
        # them to records in other dimensions. This is made more complicated
        # by some dimensions having records with clashing names. A mitigation
        # is that we can tell by this point which dimensions are missing
        # for the DatasetType but this does not work for calibrations
        # where additional dimensions can be used to constrain the temporal
        # axis.
        if not_dimensions:
            # Search for all dimensions even if we have been given a value
            # explicitly. In some cases records are given as well as the
            # actual dimension and this should not be an error if they
            # match.
            mandatoryDimensions = datasetType.dimensions.names  # - provided

            candidateDimensions: Set[str] = set()
            candidateDimensions.update(mandatoryDimensions)

            # For calibrations we may well be needing temporal dimensions
            # so rather than always including all dimensions in the scan
            # restrict things a little. It is still possible for there
            # to be confusion over day_obs in visit vs exposure for example.
            # If we are not searching calibration collections things may
            # fail but they are going to fail anyway because of the
            # ambiguousness of the dataId...
            if datasetType.isCalibration():
                for dim in self.registry.dimensions.getStaticDimensions():
                    if dim.temporal:
                        candidateDimensions.add(str(dim))

            # Look up table for the first association with a dimension
            guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)

            # Keep track of whether an item is associated with multiple
            # dimensions.
            counter: Counter[str] = Counter()
            assigned: Dict[str, Set[str]] = defaultdict(set)

            # Go through the missing dimensions and associate the
            # given names with records within those dimensions
            matched_dims = set()
            for dimensionName in candidateDimensions:
                dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                fields = dimension.metadata.names | dimension.uniqueKeys.names
                for field in not_dimensions:
                    if field in fields:
                        guessedAssociation[dimensionName][field] = not_dimensions[field]
                        counter[dimensionName] += 1
                        assigned[field].add(dimensionName)
                        matched_dims.add(field)

            # Calculate the fields that matched nothing.
            never_found = set(not_dimensions) - matched_dims

            if never_found:
                raise ValueError(f"Unrecognized keyword args given: {never_found}")

            # There is a chance we have allocated a single dataId item
            # to multiple dimensions. Need to decide which should be retained.
            # For now assume that the most popular alternative wins.
            # This means that day_obs with seq_num will result in
            # exposure.day_obs and not visit.day_obs
            # Also prefer an explicitly missing dimension over an inferred
            # temporal dimension.
            for fieldName, assignedDimensions in assigned.items():
                if len(assignedDimensions) > 1:
                    # Pick the most popular (preferring mandatory dimensions)
                    requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
                    if requiredButMissing:
                        candidateDimensions = requiredButMissing
                    else:
                        candidateDimensions = assignedDimensions

                        # If this is a choice between visit and exposure and
                        # neither was a required part of the dataset type,
                        # (hence in this branch) always prefer exposure over
                        # visit since exposures are always defined and visits
                        # are defined from exposures.
                        if candidateDimensions == {"exposure", "visit"}:
                            candidateDimensions = {"exposure"}

                    # Select the relevant items and get a new restricted
                    # counter.
                    theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
                    duplicatesCounter: Counter[str] = Counter()
                    duplicatesCounter.update(theseCounts)

                    # Choose the most common. If they are equally common
                    # we will pick the one that was found first.
                    # Returns a list of tuples
                    selected = duplicatesCounter.most_common(1)[0][0]

                    log.debug(
                        "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
                        " Removed ambiguity by choosing dimension %s.",
                        fieldName,
                        ", ".join(assignedDimensions),
                        selected,
                    )

                    for candidateDimension in assignedDimensions:
                        if candidateDimension != selected:
                            del guessedAssociation[candidateDimension][fieldName]

            # Update the record look up dict with the new associations
            for dimensionName, values in guessedAssociation.items():
                if values:  # A dict might now be empty
                    log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
                    byRecord[dimensionName].update(values)

        if byRecord:
            # Some record specifiers were found so we need to convert
            # them to the Id form
            for dimensionName, values in byRecord.items():
                if dimensionName in newDataId:
                    log.debug(
                        "DataId specified explicit %s dimension value of %s in addition to"
                        " general record specifiers for it of %s. Ignoring record information.",
                        dimensionName,
                        newDataId[dimensionName],
                        str(values),
                    )
                    # Get the actual record and compare with these values.
                    try:
                        recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
                    except DataIdError:
                        raise ValueError(
                            f"Could not find dimension '{dimensionName}'"
                            f" with dataId {newDataId} as part of comparing with"
                            f" record values {byRecord[dimensionName]}"
                        ) from None
                    if len(recs) == 1:
                        errmsg: List[str] = []
                        for k, v in values.items():
                            if (recval := getattr(recs[0], k)) != v:
                                errmsg.append(f"{k}({recval} != {v})")
                        if errmsg:
                            raise ValueError(
                                f"Dimension {dimensionName} in dataId has explicit value"
                                " inconsistent with records: " + ", ".join(errmsg)
                            )
                    else:
                        # Multiple matches for an explicit dimension
                        # should never happen but let downstream complain.
                        pass
                    continue

                # Build up a WHERE expression
                bind = {k: v for k, v in values.items()}
                where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)

                # Hopefully we get a single record that matches
                records = set(
                    self.registry.queryDimensionRecords(
                        dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
                    )
                )

                if len(records) != 1:
                    if len(records) > 1:
                        # visit can have an ambiguous answer without involving
                        # visit_system. The default visit_system is defined
                        # by the instrument.
                        if (
                            dimensionName == "visit"
                            and "visit_system_membership" in self.registry.dimensions
                            and "visit_system"
                            in self.registry.dimensions["instrument"].metadata  # type: ignore
                        ):
                            instrument_records = list(
                                self.registry.queryDimensionRecords(
                                    "instrument",
                                    dataId=newDataId,
                                    **kwargs,
                                )
                            )
                            if len(instrument_records) == 1:
                                visit_system = instrument_records[0].visit_system
                                if visit_system is None:
                                    # Set to a value that will never match.
                                    visit_system = -1

                                # Look up each visit in the
                                # visit_system_membership records.
                                for rec in records:
                                    membership = list(
                                        self.registry.queryDimensionRecords(
                                            # Use bind to allow zero results.
                                            # This is a fully-specified query.
                                            "visit_system_membership",
                                            where="instrument = inst AND visit_system = system AND visit = v",
                                            bind=dict(
                                                inst=instrument_records[0].name, system=visit_system, v=rec.id
                                            ),
                                        )
                                    )
                                    if membership:
                                        # This record is the right answer.
                                        records = set([rec])
                                        break

                        # The ambiguity may have been resolved so check again.
                        if len(records) > 1:
                            log.debug("Received %d records from constraints of %s", len(records), str(values))
                            for r in records:
                                log.debug("- %s", str(r))
                            raise ValueError(
                                f"DataId specification for dimension {dimensionName} is not"
                                f" uniquely constrained to a single dataset by {values}."
                                f" Got {len(records)} results."
                            )
                    else:
                        raise ValueError(
                            f"DataId specification for dimension {dimensionName} matched no"
                            f" records when constrained by {values}"
                        )

                # Get the primary key from the real dimension object
                dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                if not isinstance(dimension, Dimension):
                    raise RuntimeError(
                        f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
                    )
                newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)

        return newDataId, kwargs

    def _findDatasetRef(
        self,
        datasetRefOrType: Union[DatasetRef, DatasetType, str],
        dataId: Optional[DataId] = None,
        *,
        collections: Any = None,
        allowUnresolved: bool = False,
        **kwargs: Any,
    ) -> DatasetRef:
        """Shared logic for methods that start with a search for a dataset in
        the registry.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        allowUnresolved : `bool`, optional
            If `True`, return an unresolved `DatasetRef` if finding a resolved
            one in the `Registry` fails. Defaults to `False`.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the dataset identified by the given arguments.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry` (and
            ``allowUnresolved is False``).
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
        if isinstance(datasetRefOrType, DatasetRef):
            idNumber = datasetRefOrType.id
        else:
            idNumber = None
        timespan: Optional[Timespan] = None

        dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)

        if datasetType.isCalibration():
            # Because this is a calibration dataset, first try to
            # standardize the data ID without restricting the dimensions to
            # those of the dataset type requested, because there may be extra
            # dimensions that provide temporal information for a validity-range
            # lookup.
            dataId = DataCoordinate.standardize(
                dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
            )
            if dataId.graph.temporal:
                dataId = self.registry.expandDataId(dataId)
                timespan = dataId.timespan
        else:
            # Standardize the data ID to just the dimensions of the dataset
            # type instead of letting registry.findDataset do it, so we get the
            # result even if no dataset is found.
            dataId = DataCoordinate.standardize(
                dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
            )
        # Always lookup the DatasetRef, even if one is given, to ensure it is
        # present in the current collection.
        ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
        if ref is None:
            if allowUnresolved:
                return DatasetRef(datasetType, dataId)
            else:
                if collections is None:
                    collections = self.registry.defaults.collections
                raise LookupError(
                    f"Dataset {datasetType.name} with data ID {dataId} "
                    f"could not be found in collections {collections}."
                )
        if idNumber is not None and idNumber != ref.id:
            if collections is None:
                collections = self.registry.defaults.collections
            raise ValueError(
                f"DatasetRef.id provided ({idNumber}) does not match "
                f"id ({ref.id}) in registry in collections {collections}."
            )
        if datasetType != ref.datasetType:
            # If they differ it is because the user explicitly specified
            # a compatible dataset type to this call rather than using the
            # registry definition. The DatasetRef must therefore be recreated
            # using the user definition such that the expected type is
            # returned.
            ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)

        return ref

    @transactional
    def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
        # Docstring inherited.
        (imported_ref,) = self.registry._importDatasets(
            [ref],
            expand=True,
        )
        if imported_ref.id != ref.getCheckedId():
            raise RuntimeError("This registry configuration does not support putDirect.")
        self.datastore.put(obj, ref)
        return ref

    @transactional
    def put(
        self,
        obj: Any,
        datasetRefOrType: Union[DatasetRef, DatasetType, str],
        dataId: Optional[DataId] = None,
        *,
        run: Optional[str] = None,
        **kwargs: Any,
    ) -> DatasetRef:
        """Store and register a dataset.

        Parameters
        ----------
        obj : `object`
            The dataset.
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        run : `str`, optional
            The name of the run the dataset should be added to, overriding
            ``self.run``.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the stored dataset, updated with the correct id if
            given.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or if no run has been provided.
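
        Examples
        --------
        A minimal sketch; the run, dataset type name, and data ID values are
        illustrative and the dataset type must already be registered::

            butler = Butler("/path/to/repo", run="u/alice/processing")
            ref = butler.put(catalog, "src", visit=903334, detector=20,
                             instrument="HSC")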

        """
        log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
        if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
            raise ValueError("DatasetRef must not be in registry, must have None id")

        # Handle dimension records in dataId
        dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)

        # Add Registry Dataset entry.
        dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)

        # For an execution butler the datasets will be pre-defined.
        # If the butler is configured that way datasets should only be inserted
        # if they do not already exist in registry. Trying and catching
        # ConflictingDefinitionError will not work because the transaction
        # will be corrupted. Instead, in this mode always check first.
        ref = None
        ref_is_predefined = False
        if self._allow_put_of_predefined_dataset:
            # Get the matching ref for this run.
            ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId)

            if ref:
                # Must be expanded form for datastore templating
                dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
                ref = ref.expanded(dataId)
                ref_is_predefined = True

        if not ref:
            (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])

        # If the ref is predefined it is possible that the datastore also
        # has the record. Asking datastore to put it again will result in
        # the artifact being recreated, overwriting the previous one; the
        # subsequent failure to write the record would then cause the artifact
        # to be removed. Much safer to ask first before attempting to
        # overwrite. Race conditions should not be an issue for the
        # execution butler environment.
        if ref_is_predefined:
            if self.datastore.knows(ref):
                raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")

        self.datastore.put(obj, ref)

        return ref

    def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
        """Retrieve a stored dataset.

        Unlike `Butler.get`, this method allows datasets outside the Butler's
        collection to be read as long as the `DatasetRef` that identifies them
        can be obtained separately.

        Parameters
        ----------
        ref : `DatasetRef`
            Resolved reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `object`
            The dataset.
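
        Examples
        --------
        A minimal sketch; the dataset type, data ID, and collection are
        illustrative::

            ref = butler.registry.findDataset(
                "calexp", visit=903334, detector=20, instrument="HSC",
                collections="HSC/runs/RC2",
            )
            calexp = butler.getDirect(ref)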

        """
        return self.datastore.get(ref, parameters=parameters)

    def getDirectDeferred(
        self, ref: DatasetRef, *, parameters: Union[dict, None] = None
    ) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
        from a resolved `DatasetRef`.

        Parameters
        ----------
        ref : `DatasetRef`
            Resolved reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id is None``, i.e. the reference is unresolved.
        """
        if ref.id is None:
            raise AmbiguousDatasetError(
                f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
            )
        return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)

    def getDeferred(
        self,
        datasetRefOrType: Union[DatasetRef, DatasetType, str],
        dataId: Optional[DataId] = None,
        *,
        parameters: Union[dict, None] = None,
        collections: Any = None,
        **kwargs: Any,
    ) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
        after an immediate registry lookup.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry` (and
            ``allowUnresolved is False``).
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
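
        Examples
        --------
        A minimal sketch; the dataset type and data ID are illustrative. The
        registry lookup happens immediately, but nothing is read from the
        datastore until ``handle.get()`` is called::

            handle = butler.getDeferred("calexp", visit=903334, detector=20,
                                        instrument="HSC")
            calexp = handle.get()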

        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
        return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)

    def get(
        self,
        datasetRefOrType: Union[DatasetRef, DatasetType, str],
        dataId: Optional[DataId] = None,
        *,
        parameters: Optional[Dict[str, Any]] = None,
        collections: Any = None,
        **kwargs: Any,
    ) -> Any:
        """Retrieve a stored dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        obj : `object`
            The dataset.

        Raises
        ------
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        LookupError
            Raised if no matching dataset exists in the `Registry`.
        TypeError
            Raised if no collections were provided.

        Notes
        -----
        When looking up datasets in a `~CollectionType.CALIBRATION` collection,
        this method requires that the given data ID include temporal dimensions
        beyond the dimensions of the dataset type itself, in order to find the
        dataset with the appropriate validity range. For example, a "bias"
        dataset with native dimensions ``{instrument, detector}`` could be
        fetched with a ``{instrument, detector, exposure}`` data ID, because
        ``exposure`` is a temporal dimension.
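
        Examples
        --------
        A minimal sketch; the dataset type, data ID, and collection are
        illustrative::

            butler = Butler("/path/to/repo", collections="HSC/runs/RC2")
            calexp = butler.get("calexp", visit=903334, detector=20,
                                instrument="HSC")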

1382 """ 

1383 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1384 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1385 return self.getDirect(ref, parameters=parameters) 

1386 

1387 def getURIs( 

1388 self, 

1389 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1390 dataId: Optional[DataId] = None, 

1391 *, 

1392 predict: bool = False, 

1393 collections: Any = None, 

1394 run: Optional[str] = None, 

1395 **kwargs: Any, 

1396 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]: 

1397 """Returns the URIs associated with the dataset. 

1398 

1399 Parameters 

1400 ---------- 

1401 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1402 When `DatasetRef` the `dataId` should be `None`. 

1403 Otherwise the `DatasetType` or name thereof. 

1404 dataId : `dict` or `DataCoordinate` 

1405 A `dict` of `Dimension` link name, value pairs that label the 

1406 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1407 should be provided as the first argument. 

1408 predict : `bool` 

1409 If `True`, allow URIs to be returned of datasets that have not 

1410 been written. 

1411 collections : Any, optional 

1412 Collections to be searched, overriding ``self.collections``. 

1413 Can be any of the types supported by the ``collections`` argument 

1414 to butler construction. 

1415 run : `str`, optional 

1416 Run to use for predictions, overriding ``self.run``. 

1417 **kwargs 

1418 Additional keyword arguments used to augment or construct a 

1419 `DataCoordinate`. See `DataCoordinate.standardize` 

1420 parameters. 

1421 

1422 Returns 

1423 ------- 

1424 primary : `lsst.resources.ResourcePath` 

1425 The URI to the primary artifact associated with this dataset. 

1426 If the dataset was disassembled within the datastore this 

1427 may be `None`. 

1428 components : `dict` 

1429 URIs to any components associated with the dataset artifact. 

1430 Can be empty if there are no components. 

1431 """ 

1432 ref = self._findDatasetRef( 

1433 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs 

1434 ) 

1435 if ref.id is None: # only possible if predict is True 

1436 if run is None: 

1437 run = self.run 

1438 if run is None: 

1439 raise TypeError("Cannot predict location with run=None.") 

1440 # Lie about ID, because we can't guess it, and only 

1441 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1442 ref = ref.resolved(id=0, run=run) 

1443 return self.datastore.getURIs(ref, predict) 

1444 

1445 def getURI( 

1446 self, 

1447 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1448 dataId: Optional[DataId] = None, 

1449 *, 

1450 predict: bool = False, 

1451 collections: Any = None, 

1452 run: Optional[str] = None, 

1453 **kwargs: Any, 

1454 ) -> ResourcePath: 

1455 """Return the URI to the Dataset. 

1456 

1457 Parameters 

1458 ---------- 

1459 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1460 When `DatasetRef` the `dataId` should be `None`. 

1461 Otherwise the `DatasetType` or name thereof. 

1462 dataId : `dict` or `DataCoordinate` 

1463 A `dict` of `Dimension` link name, value pairs that label the 

1464 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1465 should be provided as the first argument. 

1466 predict : `bool` 

1467 If `True`, allow URIs to be returned for datasets that have not 

1468 yet been written. 

1469 collections : Any, optional 

1470 Collections to be searched, overriding ``self.collections``. 

1471 Can be any of the types supported by the ``collections`` argument 

1472 to butler construction. 

1473 run : `str`, optional 

1474 Run to use for predictions, overriding ``self.run``. 

1475 **kwargs 

1476 Additional keyword arguments used to augment or construct a 

1477 `DataCoordinate`. See `DataCoordinate.standardize` 

1478 parameters. 

1479 

1480 Returns 

1481 ------- 

1482 uri : `lsst.resources.ResourcePath` 

1483 URI pointing to the Dataset within the datastore. If the 

1484 Dataset does not exist in the datastore, and if ``predict`` is 

1485 `True`, the URI will be a prediction and will include a URI 

1486 fragment "#predicted". 

1487 If the datastore does not have entities that relate well 

1488 to the concept of a URI the returned URI string will be 

1489 descriptive. The returned URI is not guaranteed to be obtainable. 

1490 

1491 Raises 

1492 ------ 

1493 LookupError 

1494 Raised if a URI has been requested for a dataset that does not 

1495 exist and guessing is not allowed. 

1496 ValueError 

1497 Raised if a resolved `DatasetRef` was passed as an input, but it 

1498 differs from the one found in the registry. 

1499 TypeError 

1500 Raised if no collections were provided. 

1501 RuntimeError 

1502 Raised if a URI is requested for a dataset that consists of 

1503 multiple artifacts. 

1504 """ 

1505 primary, components = self.getURIs( 

1506 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1507 ) 

1508 

1509 if primary is None or components: 

1510 raise RuntimeError( 

1511 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1512 "Use Butler.getURIs() instead." 

1513 ) 

1514 return primary 

1515 

1516 def retrieveArtifacts( 

1517 self, 

1518 refs: Iterable[DatasetRef], 

1519 destination: ResourcePathExpression, 

1520 transfer: str = "auto", 

1521 preserve_path: bool = True, 

1522 overwrite: bool = False, 

1523 ) -> List[ResourcePath]: 

1524 """Retrieve the artifacts associated with the supplied refs. 

1525 

1526 Parameters 

1527 ---------- 

1528 refs : iterable of `DatasetRef` 

1529 The datasets for which artifacts are to be retrieved. 

1530 A single ref can result in multiple artifacts. The refs must 

1531 be resolved. 

1532 destination : `lsst.resources.ResourcePath` or `str` 

1533 Location to write the artifacts. 

1534 transfer : `str`, optional 

1535 Method to use to transfer the artifacts. Must be one of the options 

1536 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1537 "move" is not allowed. 

1538 preserve_path : `bool`, optional 

1539 If `True` the full path of the artifact within the datastore 

1540 is preserved. If `False` the final file component of the path 

1541 is used. 

1542 overwrite : `bool`, optional 

1543 If `True` allow transfers to overwrite existing files at the 

1544 destination. 

1545 

1546 Returns 

1547 ------- 

1548 targets : `list` of `lsst.resources.ResourcePath` 

1549 URIs of file artifacts in destination location. Order is not 

1550 preserved. 

1551 

1552 Notes 

1553 ----- 

1554 For non-file datastores the artifacts written to the destination 

1555 may not match the representation inside the datastore. For example, 

1556 a hierarchical data structure in a NoSQL database may well be stored 

1557 as a JSON file. 
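Examples
--------
A minimal sketch; the dataset type, collection, and destination are
illustrative only::

    refs = butler.registry.queryDatasets("raw", collections="HSC/raw/all")
    paths = butler.retrieveArtifacts(refs, "/tmp/raw-export/", transfer="copy")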

1558 """ 

1559 return self.datastore.retrieveArtifacts( 

1560 refs, 

1561 ResourcePath(destination), 

1562 transfer=transfer, 

1563 preserve_path=preserve_path, 

1564 overwrite=overwrite, 

1565 ) 

1566 

1567 def datasetExists( 

1568 self, 

1569 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1570 dataId: Optional[DataId] = None, 

1571 *, 

1572 collections: Any = None, 

1573 **kwargs: Any, 

1574 ) -> bool: 

1575 """Return True if the Dataset is actually present in the Datastore. 

1576 

1577 Parameters 

1578 ---------- 

1579 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1580 When `DatasetRef` the `dataId` should be `None`. 

1581 Otherwise the `DatasetType` or name thereof. 

1582 dataId : `dict` or `DataCoordinate` 

1583 A `dict` of `Dimension` link name, value pairs that label the 

1584 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1585 should be provided as the first argument. 

1586 collections : Any, optional 

1587 Collections to be searched, overriding ``self.collections``. 

1588 Can be any of the types supported by the ``collections`` argument 

1589 to butler construction. 

1590 **kwargs 

1591 Additional keyword arguments used to augment or construct a 

1592 `DataCoordinate`. See `DataCoordinate.standardize` 

1593 parameters. 

1594 

1595 Raises 

1596 ------ 

1597 LookupError 

1598 Raised if the dataset is not even present in the Registry. 

1599 ValueError 

1600 Raised if a resolved `DatasetRef` was passed as an input, but it 

1601 differs from the one found in the registry. 

1602 TypeError 

1603 Raised if no collections were provided. 
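Examples
--------
A minimal sketch; the dataset type, data ID, and collection are
hypothetical::

    if butler.datasetExists("calexp", instrument="HSC", visit=903334,
                            detector=42, collections="HSC/runs/test"):
        ...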

1604 """ 

1605 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1606 return self.datastore.exists(ref) 

1607 

1608 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1609 """Remove one or more `~CollectionType.RUN` collections and the 

1610 datasets within them. 

1611 

1612 Parameters 

1613 ---------- 

1614 names : `Iterable` [ `str` ] 

1615 The names of the collections to remove. 

1616 unstore : `bool`, optional 

1617 If `True` (default), delete datasets from all datastores in which 

1618 they are present, and attempt to roll back the registry deletions if 

1619 datastore deletions fail (which may not always be possible). If 

1620 `False`, datastore records for these datasets are still removed, 

1621 but any artifacts (e.g. files) will not be. 

1622 

1623 Raises 

1624 ------ 

1625 TypeError 

1626 Raised if one or more collections are not of type 

1627 `~CollectionType.RUN`. 

1628 """ 

1629 if not self.isWriteable(): 

1630 raise TypeError("Butler is read-only.") 

1631 names = list(names) 

1632 refs: List[DatasetRef] = [] 

1633 for name in names: 

1634 collectionType = self.registry.getCollectionType(name) 

1635 if collectionType is not CollectionType.RUN: 

1636 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1637 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1638 with self.registry.transaction(): 

1639 if unstore: 

1640 self.datastore.trash(refs) 

1641 else: 

1642 self.datastore.forget(refs) 

1643 for name in names: 

1644 self.registry.removeCollection(name) 

1645 if unstore: 

1646 # Point of no return for removing artifacts 

1647 self.datastore.emptyTrash() 

1648 

1649 def pruneCollection( 

1650 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None 

1651 ) -> None: 

1652 """Remove a collection and possibly prune datasets within it. 

1653 

1654 Parameters 

1655 ---------- 

1656 name : `str` 

1657 Name of the collection to remove. If this is a 

1658 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1659 datasets within the collection are not modified unless ``unstore`` 

1660 is `True`. If this is a `~CollectionType.RUN` collection, 

1661 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1662 are fully removed from the data repository. 

1663 purge : `bool`, optional 

1664 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1665 fully removing datasets within them. Requires ``unstore=True`` as 

1666 well, as an added precaution against accidental deletion. Must be 

1667 `False` (default) if the collection is not a ``RUN``. 

1668 unstore : `bool`, optional 

1669 If `True`, remove all datasets in the collection from all 

1670 datastores in which they appear. 

1671 unlink : `list` [ `str` ], optional 

1672 Before removing the given collection, unlink it from these 

1673 parent collections. 

1674 

1675 Raises 

1676 ------ 

1677 TypeError 

1678 Raised if the butler is read-only or arguments are mutually 

1679 inconsistent. 
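Examples
--------
A minimal sketch that fully removes a (hypothetical) RUN collection and
its datasets::

    butler.pruneCollection("u/someone/scratch", purge=True, unstore=True)

Removing a (hypothetical) TAGGED collection without touching the
datasets it tags::

    butler.pruneCollection("u/someone/tagged-selection")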

1680 """ 

1681 # See pruneDatasets comments for more information about the logic here; 

1682 # the cases are almost the same, but here we can rely on Registry to 

1683 # take care of everything but Datastore deletion when we remove the 

1684 # collection. 

1685 if not self.isWriteable(): 

1686 raise TypeError("Butler is read-only.") 

1687 collectionType = self.registry.getCollectionType(name) 

1688 if purge and not unstore: 

1689 raise PurgeWithoutUnstorePruneCollectionsError() 

1690 if collectionType is CollectionType.RUN and not purge: 

1691 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1692 if collectionType is not CollectionType.RUN and purge: 

1693 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1694 

1695 def remove(child: str, parent: str) -> None: 

1696 """Remove a child collection from a parent collection.""" 

1697 # Remove child from parent. 

1698 chain = list(self.registry.getCollectionChain(parent)) 

1699 try: 

1700 chain.remove(child) 

1701 except ValueError as e: 

1702 raise RuntimeError(f"{child} is not a child of {parent}") from e 

1703 self.registry.setCollectionChain(parent, chain) 

1704 

1705 with self.registry.transaction(): 

1706 if unlink: 

1707 for parent in unlink: 

1708 remove(name, parent) 

1709 if unstore: 

1710 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1711 self.datastore.trash(refs) 

1712 self.registry.removeCollection(name) 

1713 

1714 if unstore: 

1715 # Point of no return for removing artifacts 

1716 self.datastore.emptyTrash() 

1717 

1718 def pruneDatasets( 

1719 self, 

1720 refs: Iterable[DatasetRef], 

1721 *, 

1722 disassociate: bool = True, 

1723 unstore: bool = False, 

1724 tags: Iterable[str] = (), 

1725 purge: bool = False, 

1726 run: Optional[str] = None, 

1727 ) -> None: 

1728 """Remove one or more datasets from a collection and/or storage. 

1729 

1730 Parameters 

1731 ---------- 

1732 refs : `~collections.abc.Iterable` of `DatasetRef` 

1733 Datasets to prune. These must be "resolved" references (not just 

1734 a `DatasetType` and data ID). 

1735 disassociate : `bool`, optional 

1736 Disassociate pruned datasets from ``tags``, or from all collections 

1737 if ``purge=True``. 

1738 unstore : `bool`, optional 

1739 If `True` (`False` is default) remove these datasets from all 

1740 datastores known to this butler. Note that this will make it 

1741 impossible to retrieve these datasets even via other collections. 

1742 Datasets that are already not stored are ignored by this option. 

1743 tags : `Iterable` [ `str` ], optional 

1744 `~CollectionType.TAGGED` collections to disassociate the datasets 

1745 from. Ignored if ``disassociate`` is `False` or ``purge`` is 

1746 `True`. 

1747 purge : `bool`, optional 

1748 If `True` (`False` is default), completely remove the dataset from 

1749 the `Registry`. To prevent accidental deletions, ``purge`` may 

1750 only be `True` if all of the following conditions are met: 

1751 

1752 - All given datasets are in the given run; 

1753 - ``disassociate`` is `True`; 

1754 - ``unstore`` is `True`. 

1755 

1756 This mode may remove provenance information from datasets other 

1757 than those provided, and should be used with extreme care. 

1758 

1759 Raises 

1760 ------ 

1761 TypeError 

1762 Raised if the butler is read-only, if no collection was provided, 

1763 or the conditions for ``purge=True`` were not met. 
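Examples
--------
A minimal sketch that fully removes some (hypothetical) datasets from the
repository; ``disassociate`` defaults to `True`::

    refs = butler.registry.queryDatasets(
        "calexp", collections="u/someone/scratch"
    )
    butler.pruneDatasets(refs, unstore=True, purge=True)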

1764 """ 

1765 if not self.isWriteable(): 

1766 raise TypeError("Butler is read-only.") 

1767 if purge: 

1768 if not disassociate: 

1769 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1770 if not unstore: 

1771 raise TypeError("Cannot pass purge=True without unstore=True.") 

1772 elif disassociate: 

1773 tags = tuple(tags) 

1774 if not tags: 

1775 raise TypeError("No tags provided but disassociate=True.") 

1776 for tag in tags: 

1777 collectionType = self.registry.getCollectionType(tag) 

1778 if collectionType is not CollectionType.TAGGED: 

1779 raise TypeError( 

1780 f"Cannot disassociate from collection '{tag}' " 

1781 f"of non-TAGGED type {collectionType.name}." 

1782 ) 

1783 # Transform possibly-single-pass iterable into something we can iterate 

1784 # over multiple times. 

1785 refs = list(refs) 

1786 # Pruning a component of a DatasetRef makes no sense since registry 

1787 # doesn't know about components and datastore might not store 

1788 # components in a separate file 

1789 for ref in refs: 

1790 if ref.datasetType.component(): 

1791 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1792 # We don't need an unreliable Datastore transaction for this, because 

1793 # we've been extra careful to ensure that Datastore.trash only involves 

1794 # mutating the Registry (it can _look_ at Datastore-specific things, 

1795 # but shouldn't change them), and hence all operations here are 

1796 # Registry operations. 

1797 with self.registry.transaction(): 

1798 if unstore: 

1799 self.datastore.trash(refs) 

1800 if purge: 

1801 self.registry.removeDatasets(refs) 

1802 elif disassociate: 

1803 assert tags, "Guaranteed by earlier logic in this function." 

1804 for tag in tags: 

1805 self.registry.disassociate(tag, refs) 

1806 # We've exited the Registry transaction, and apparently committed. 

1807 # (if there was an exception, everything rolled back, and it's as if 

1808 # nothing happened - and we never get here). 

1809 # Datastore artifacts are not yet gone, but they're clearly marked 

1810 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1811 # problems we can try again later, and if manual administrative 

1812 # intervention is required, it's pretty clear what that should entail: 

1813 # deleting everything on disk and in private Datastore tables that is 

1814 # in the dataset_location_trash table. 

1815 if unstore: 

1816 # Point of no return for removing artifacts 

1817 self.datastore.emptyTrash() 

1818 

1819 @transactional 

1820 def ingest( 

1821 self, 

1822 *datasets: FileDataset, 

1823 transfer: Optional[str] = "auto", 

1824 run: Optional[str] = None, 

1825 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1826 record_validation_info: bool = True, 

1827 ) -> None: 

1828 """Store and register one or more datasets that already exist on disk. 

1829 

1830 Parameters 

1831 ---------- 

1832 datasets : `FileDataset` 

1833 Each positional argument is a struct containing information about 

1834 a file to be ingested, including its URI (either absolute or 

1835 relative to the datastore root, if applicable), a `DatasetRef`, 

1836 and optionally a formatter class or its fully-qualified string 

1837 name. If a formatter is not provided, the formatter that would be 

1838 used for `put` is assumed. On successful return, all 

1839 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1840 attribute populated and all `FileDataset.formatter` attributes will 

1841 be set to the formatter class used. `FileDataset.path` attributes 

1842 may be modified to put paths in whatever the datastore considers a 

1843 standardized form. 

1844 transfer : `str`, optional 

1845 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1846 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1847 transfer the file. 

1848 run : `str`, optional 

1849 The name of the run ingested datasets should be added to, 

1850 overriding ``self.run``. 

1851 idGenerationMode : `DatasetIdGenEnum`, optional 

1852 Specifies option for generating dataset IDs. By default unique IDs 

1853 are generated for each inserted dataset. 

1854 record_validation_info : `bool`, optional 

1855 If `True`, the default, the datastore can record validation 

1856 information associated with the file. If `False` the datastore 

1857 will not attempt to track any information such as checksums 

1858 or file sizes. This can be useful if such information is tracked 

1859 in an external system or if the file is to be compressed in place. 

1860 It is up to the datastore whether this parameter is relevant. 

1861 

1862 Raises 

1863 ------ 

1864 TypeError 

1865 Raised if the butler is read-only or if no run was provided. 

1866 NotImplementedError 

1867 Raised if the `Datastore` does not support the given transfer mode. 

1868 DatasetTypeNotSupportedError 

1869 Raised if one or more files to be ingested have a dataset type that 

1870 is not supported by the `Datastore`. 

1871 FileNotFoundError 

1872 Raised if one of the given files does not exist. 

1873 FileExistsError 

1874 Raised if transfer is not `None` but the (internal) location the 

1875 file would be moved to is already occupied. 

1876 

1877 Notes 

1878 ----- 

1879 This operation is not fully exception safe: if a database operation 

1880 fails, the given `FileDataset` instances may be only partially updated. 

1881 

1882 It is atomic in terms of database operations (they will either all 

1883 succeed or all fail) provided the database engine implements 

1884 transactions correctly. It will attempt to be atomic in terms of 

1885 filesystem operations as well, but this cannot be implemented 

1886 rigorously for most datastores. 
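Examples
--------
A minimal sketch; the dataset type, data ID, file path, and run name are
all hypothetical::

    from lsst.daf.butler import DatasetRef, FileDataset

    # Ensure the (hypothetical) output run exists.
    butler.registry.registerRun("HSC/raw/all")
    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(
        datasetType, {"instrument": "HSC", "exposure": 903334, "detector": 42}
    )
    butler.ingest(
        FileDataset(path="/data/HSC-903334-42.fits", refs=[ref]),
        transfer="copy",
        run="HSC/raw/all",
    )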

1887 """ 

1888 if not self.isWriteable(): 

1889 raise TypeError("Butler is read-only.") 

1890 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1891 # Reorganize the inputs so they're grouped by DatasetType and then 

1892 # data ID. We also include a list of DatasetRefs for each FileDataset 

1893 # to hold the resolved DatasetRefs returned by the Registry, before 

1894 # it's safe to swap them into FileDataset.refs. 

1895 # Some type annotation aliases to make that clearer: 

1896 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1897 GroupedData = MutableMapping[DatasetType, GroupForType] 

1898 # The actual data structure: 

1899 groupedData: GroupedData = defaultdict(dict) 

1900 # And the nested loop that populates it: 

1901 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1902 # This list intentionally shared across the inner loop, since it's 

1903 # associated with `dataset`. 

1904 resolvedRefs: List[DatasetRef] = [] 

1905 

1906 # Somewhere to store pre-existing refs if we have an 

1907 # execution butler. 

1908 existingRefs: List[DatasetRef] = [] 

1909 

1910 for ref in dataset.refs: 

1911 if ref.dataId in groupedData[ref.datasetType]: 

1912 raise ConflictingDefinitionError( 

1913 f"Ingest conflict. Dataset {dataset.path} has same" 

1914 " DataId as other ingest dataset" 

1915 f" {groupedData[ref.datasetType][ref.dataId][0].path} " 

1916 f" ({ref.dataId})" 

1917 ) 

1918 if self._allow_put_of_predefined_dataset: 

1919 existing_ref = self.registry.findDataset( 

1920 ref.datasetType, dataId=ref.dataId, collections=run 

1921 ) 

1922 if existing_ref: 

1923 if self.datastore.knows(existing_ref): 

1924 raise ConflictingDefinitionError( 

1925 f"Dataset associated with path {dataset.path}" 

1926 f" already exists as {existing_ref}." 

1927 ) 

1928 # Store this ref elsewhere since it already exists 

1929 # and we do not want to remake it but we do want 

1930 # to store it in the datastore. 

1931 existingRefs.append(existing_ref) 

1932 

1933 # Nothing else to do until we have finished 

1934 # iterating. 

1935 continue 

1936 

1937 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1938 

1939 if existingRefs: 

1940 

1941 if len(dataset.refs) != len(existingRefs): 

1942 # Keeping track of partially pre-existing datasets is hard 

1943 # and should generally never happen. For now don't allow 

1944 # it. 

1945 raise ConflictingDefinitionError( 

1946 f"For dataset {dataset.path} some dataIds already exist" 

1947 " in registry but others do not. This is not supported." 

1948 ) 

1949 

1950 # Attach the resolved refs if we found them. 

1951 dataset.refs = existingRefs 

1952 

1953 # Now we can bulk-insert into Registry for each DatasetType. 

1954 for datasetType, groupForType in progress.iter_item_chunks( 

1955 groupedData.items(), desc="Bulk-inserting datasets by type" 

1956 ): 

1957 refs = self.registry.insertDatasets( 

1958 datasetType, 

1959 dataIds=groupForType.keys(), 

1960 run=run, 

1961 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1962 idGenerationMode=idGenerationMode, 

1963 ) 

1964 # Append those resolved DatasetRefs to the new lists we set up for 

1965 # them. 

1966 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1967 resolvedRefs.append(ref) 

1968 

1969 # Go back to the original FileDatasets to replace their refs with the 

1970 # new resolved ones. 

1971 for groupForType in progress.iter_chunks( 

1972 groupedData.values(), desc="Reassociating resolved dataset refs with files" 

1973 ): 

1974 for dataset, resolvedRefs in groupForType.values(): 

1975 dataset.refs = resolvedRefs 

1976 

1977 # Bulk-insert everything into Datastore. 

1978 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 

1979 

1980 @contextlib.contextmanager 

1981 def export( 

1982 self, 

1983 *, 

1984 directory: Optional[str] = None, 

1985 filename: Optional[str] = None, 

1986 format: Optional[str] = None, 

1987 transfer: Optional[str] = None, 

1988 ) -> Iterator[RepoExportContext]: 

1989 """Export datasets from the repository represented by this `Butler`. 

1990 

1991 This method is a context manager that returns a helper object 

1992 (`RepoExportContext`) that is used to indicate what information from 

1993 the repository should be exported. 

1994 

1995 Parameters 

1996 ---------- 

1997 directory : `str`, optional 

1998 Directory dataset files should be written to if ``transfer`` is not 

1999 `None`. 

2000 filename : `str`, optional 

2001 Name for the file that will include database information associated 

2002 with the exported datasets. If this is not an absolute path and 

2003 ``directory`` is not `None`, it will be written to ``directory`` 

2004 instead of the current working directory. Defaults to 

2005 "export.{format}". 

2006 format : `str`, optional 

2007 File format for the database information file. If `None`, the 

2008 extension of ``filename`` will be used. 

2009 transfer : `str`, optional 

2010 Transfer mode passed to `Datastore.export`. 

2011 

2012 Raises 

2013 ------ 

2014 TypeError 

2015 Raised if the set of arguments passed is inconsistent. 

2016 

2017 Examples 

2018 -------- 

2019 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

2020 methods are used to provide the iterables over data IDs and/or datasets 

2021 to be exported:: 

2022 

2023 with butler.export(filename="exports.yaml") as export: 

2024 # Export all flats, but none of the dimension element rows 

2025 # (i.e. data ID information) associated with them. 

2026 export.saveDatasets(butler.registry.queryDatasets("flat"), 

2027 elements=()) 

2028 # Export all datasets that start with "deepCoadd_" and all of 

2029 # their associated data ID information. 

2030 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2031 """ 

2032 if directory is None and transfer is not None: 

2033 raise TypeError("Cannot transfer without providing a directory.") 

2034 if transfer == "move": 

2035 raise TypeError("Transfer may not be 'move': export is read-only") 

2036 if format is None: 

2037 if filename is None: 

2038 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2039 else: 

2040 _, format = os.path.splitext(filename) 

2041 elif filename is None: 

2042 filename = f"export.{format}" 

2043 if directory is not None: 

2044 filename = os.path.join(directory, filename) 

2045 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"]) 

2046 with open(filename, "w") as stream: 

2047 backend = BackendClass(stream, universe=self.registry.dimensions) 

2048 try: 

2049 helper = RepoExportContext( 

2050 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

2051 ) 

2052 yield helper 

2053 except BaseException: 

2054 raise 

2055 else: 

2056 helper._finish() 

2057 

2058 def import_( 

2059 self, 

2060 *, 

2061 directory: Optional[str] = None, 

2062 filename: Union[str, TextIO, None] = None, 

2063 format: Optional[str] = None, 

2064 transfer: Optional[str] = None, 

2065 skip_dimensions: Optional[Set] = None, 

2066 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

2067 reuseIds: bool = False, 

2068 ) -> None: 

2069 """Import datasets into this repository that were exported from a 

2070 different butler repository via `~lsst.daf.butler.Butler.export`. 

2071 

2072 Parameters 

2073 ---------- 

2074 directory : `str`, optional 

2075 Directory containing dataset files to import from. If `None`, 

2076 ``filename`` and all dataset file paths specified therein must 

2077 be absolute. 

2078 filename : `str` or `TextIO`, optional 

2079 A stream or name of file that contains database information 

2080 associated with the exported datasets, typically generated by 

2082 `~lsst.daf.butler.Butler.export`. If this is a string (name) and 

2082 is not an absolute path, does not exist in the current working 

2083 directory, and ``directory`` is not `None`, it is assumed to be in 

2084 ``directory``. Defaults to "export.{format}". 

2085 format : `str`, optional 

2086 File format for ``filename``. If `None`, the extension of 

2087 ``filename`` will be used. 

2088 transfer : `str`, optional 

2089 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2090 skip_dimensions : `set`, optional 

2091 Names of dimensions that should be skipped and not imported. 

2092 idGenerationMode : `DatasetIdGenEnum`, optional 

2093 Specifies option for generating dataset IDs when IDs are not 

2094 provided or their type does not match backend type. By default 

2095 unique IDs are generated for each inserted dataset. 

2096 reuseIds : `bool`, optional 

2097 If `True` then forces re-use of imported dataset IDs for integer 

2098 IDs which are normally generated as auto-incremented; exception 

2099 will be raised if imported IDs clash with existing ones. This 

2100 option has no effect on the use of globally-unique IDs which are 

2101 always re-used (or generated if integer IDs are being imported). 

2102 

2103 Raises 

2104 ------ 

2105 TypeError 

2106 Raised if the set of arguments passed is inconsistent, or if the 

2107 butler is read-only. 
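Examples
--------
A minimal sketch, assuming an export file previously produced by
`~lsst.daf.butler.Butler.export`; the paths are illustrative only::

    butler.import_(
        directory="/path/to/exported/data",
        filename="export.yaml",
        transfer="copy",
    )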

2108 """ 

2109 if not self.isWriteable(): 

2110 raise TypeError("Butler is read-only.") 

2111 if format is None: 

2112 if filename is None: 

2113 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2114 else: 

2115 _, format = os.path.splitext(filename) # type: ignore 

2116 elif filename is None: 

2117 filename = f"export.{format}" 

2118 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

2119 filename = os.path.join(directory, filename) 

2120 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

2121 

2122 def doImport(importStream: TextIO) -> None: 

2123 backend = BackendClass(importStream, self.registry) 

2124 backend.register() 

2125 with self.transaction(): 

2126 backend.load( 

2127 self.datastore, 

2128 directory=directory, 

2129 transfer=transfer, 

2130 skip_dimensions=skip_dimensions, 

2131 idGenerationMode=idGenerationMode, 

2132 reuseIds=reuseIds, 

2133 ) 

2134 

2135 if isinstance(filename, str): 

2136 with open(filename, "r") as stream: 

2137 doImport(stream) 

2138 else: 

2139 doImport(filename) 

2140 

2141 def transfer_from( 

2142 self, 

2143 source_butler: Butler, 

2144 source_refs: Iterable[DatasetRef], 

2145 transfer: str = "auto", 

2146 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None, 

2147 skip_missing: bool = True, 

2148 register_dataset_types: bool = False, 

2149 ) -> List[DatasetRef]: 

2150 """Transfer datasets to this Butler from a run in another Butler. 

2151 

2152 Parameters 

2153 ---------- 

2154 source_butler : `Butler` 

2155 Butler from which the datasets are to be transferred. 

2156 source_refs : iterable of `DatasetRef` 

2157 Datasets defined in the source butler that should be transferred to 

2158 this butler. 

2159 transfer : `str`, optional 

2160 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2161 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional 

2162 A mapping of dataset type to ID generation mode. Only used if 

2163 the source butler is using integer IDs. Should not be used 

2164 if this receiving butler uses integer IDs. If not given, dataset 

2165 import always uses `DatasetIdGenEnum.UNIQUE`. 

2166 skip_missing : `bool` 

2167 If `True`, datasets with no datastore artifact associated with 

2168 them are not transferred. If `False` a registry entry will be 

2169 created even if no datastore record is created (and so will 

2170 look equivalent to the dataset being unstored). 

2171 register_dataset_types : `bool` 

2172 If `True` any missing dataset types are registered. Otherwise 

2173 an exception is raised. 

2174 

2175 Returns 

2176 ------- 

2177 refs : `list` of `DatasetRef` 

2178 The refs added to this Butler. 

2179 

2180 Notes 

2181 ----- 

2182 Requires that any dimension definitions are already present in the 

2183 receiving Butler. The datastore artifact has to exist for a transfer 

2184 to be made but non-existence is not an error. 

2185 

2186 Datasets that already exist in this run will be skipped. 

2187 

2188 The datasets are imported as part of a transaction, although 

2189 dataset types are registered before the transaction is started. 

2190 This means that it is possible for a dataset type to be registered 

2191 even though transfer has failed. 
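Examples
--------
A minimal sketch; the source repository path, dataset type, and
collection name are hypothetical::

    source = Butler("/path/to/source/repo")
    refs = source.registry.queryDatasets("calexp", collections="HSC/runs/RC2")
    butler.transfer_from(
        source, refs, transfer="copy", register_dataset_types=True
    )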

2192 """ 

2193 if not self.isWriteable(): 

2194 raise TypeError("Butler is read-only.") 

2195 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2196 

2197 # Will iterate through the refs multiple times so need to convert 

2198 # to a list if this isn't a collection. 

2199 if not isinstance(source_refs, collections.abc.Collection): 

2200 source_refs = list(source_refs) 

2201 

2202 original_count = len(source_refs) 

2203 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2204 

2205 if id_gen_map is None: 

2206 id_gen_map = {} 

2207 

2208 # In some situations the datastore artifact may be missing 

2209 # and we do not want that registry entry to be imported. 

2210 # Asking datastore is not sufficient, the records may have been 

2211 # purged, we have to ask for the (predicted) URI and check 

2212 # existence explicitly. Execution butler is set up exactly like 

2213 # this with no datastore records. 

2214 artifact_existence: Dict[ResourcePath, bool] = {} 

2215 if skip_missing: 

2216 dataset_existence = source_butler.datastore.mexists( 

2217 source_refs, artifact_existence=artifact_existence 

2218 ) 

2219 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2220 filtered_count = len(source_refs) 

2221 log.verbose( 

2222 "%d datasets removed because the artifact does not exist. Now have %d.", 

2223 original_count - filtered_count, 

2224 filtered_count, 

2225 ) 

2226 

2227 # Importing requires that we group the refs by dataset type and run 

2228 # before doing the import. 

2229 source_dataset_types = set() 

2230 grouped_refs = defaultdict(list) 

2231 grouped_indices = defaultdict(list) 

2232 for i, ref in enumerate(source_refs): 

2233 grouped_refs[ref.datasetType, ref.run].append(ref) 

2234 grouped_indices[ref.datasetType, ref.run].append(i) 

2235 source_dataset_types.add(ref.datasetType) 

2236 

2237 # Check to see if the dataset type in the source butler has 

2238 # the same definition in the target butler and register missing 

2239 # ones if requested. Registration must happen outside a transaction. 

2240 newly_registered_dataset_types = set() 

2241 for datasetType in source_dataset_types: 

2242 if register_dataset_types: 

2243 # Let this raise immediately if inconsistent. Continuing 

2244 # on to find additional inconsistent dataset types 

2245 # might result in additional unwanted dataset types being 

2246 # registered. 

2247 if self.registry.registerDatasetType(datasetType): 

2248 newly_registered_dataset_types.add(datasetType) 

2249 else: 

2250 # If the dataset type is missing, let it fail immediately. 

2251 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2252 if target_dataset_type != datasetType: 

2253 raise ConflictingDefinitionError( 

2254 "Source butler dataset type differs from definition" 

2255 f" in target butler: {datasetType} !=" 

2256 f" {target_dataset_type}" 

2257 ) 

2258 if newly_registered_dataset_types: 

2259 # We may have registered some even if there were inconsistencies 

2260 # but should let people know (or else remove them again). 

2261 log.log( 

2262 VERBOSE, 

2263 "Registered the following dataset types in the target Butler: %s", 

2264 ", ".join(d.name for d in newly_registered_dataset_types), 

2265 ) 

2266 else: 

2267 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2268 

2269 # The returned refs should be identical for UUIDs. 

2270 # For now must also support integers and so need to retain the 

2271 # newly-created refs from this registry. 

2272 # Pre-size it so we can assign refs into the correct slots 

2273 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs) 

2274 default_id_gen = DatasetIdGenEnum.UNIQUE 

2275 

2276 handled_collections: Set[str] = set() 

2277 

2278 # Do all the importing in a single transaction. 

2279 with self.transaction(): 

2280 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2281 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2282 ): 

2283 if run not in handled_collections: 

2284 run_doc = source_butler.registry.getCollectionDocumentation(run) 

2285 registered = self.registry.registerRun(run, doc=run_doc) 

2286 handled_collections.add(run) 

2287 if registered: 

2288 log.log(VERBOSE, "Creating output run %s", run) 

2289 

2290 id_generation_mode = default_id_gen 

2291 if isinstance(refs_to_import[0].id, int): 

2292 # ID generation mode might need to be overridden when 

2293 # targeting UUID 

2294 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen) 

2295 

2296 n_refs = len(refs_to_import) 

2297 log.verbose( 

2298 "Importing %d ref%s of dataset type %s into run %s", 

2299 n_refs, 

2300 "" if n_refs == 1 else "s", 

2301 datasetType.name, 

2302 run, 

2303 ) 

2304 

2305 # No way to know if this butler's registry uses UUID. 

2306 # We have to trust the caller on this. If it fails they will 

2307 # have to change their approach. We can't catch the exception 

2308 # and retry with unique because that will mess up the 

2309 # transaction handling. We aren't allowed to ask the registry 

2310 # manager what type of ID it is using. 

2311 imported_refs = self.registry._importDatasets( 

2312 refs_to_import, idGenerationMode=id_generation_mode, expand=False 

2313 ) 

2314 

2315 # Map them into the correct slots to match the initial order 

2316 for i, ref in zip(grouped_indices[datasetType, run], imported_refs): 

2317 transferred_refs_tmp[i] = ref 

2318 

2319 # Mypy insists that we might have None in here so we have to make 

2320 # that explicit by assigning to a new variable and filtering out 

2321 # something that won't be there. 

2322 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None] 

2323 

2324 # Check consistency 

2325 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given" 

2326 

2327 log.verbose("Imported %d datasets into destination butler", len(transferred_refs)) 

2328 

2329 # The transferred refs need to be reordered to match the original 

2330 # ordering given by the caller. Without this the datastore transfer 

2331 # will be broken. 

2332 

2333 # Ask the datastore to transfer. The datastore has to check that 

2334 # the source datastore is compatible with the target datastore. 

2335 self.datastore.transfer_from( 

2336 source_butler.datastore, 

2337 source_refs, 

2338 local_refs=transferred_refs, 

2339 transfer=transfer, 

2340 artifact_existence=artifact_existence, 

2341 ) 

2342 

2343 return transferred_refs 

2344 

2345 def validateConfiguration( 

2346 self, 

2347 logFailures: bool = False, 

2348 datasetTypeNames: Optional[Iterable[str]] = None, 

2349 ignore: Optional[Iterable[str]] = None, 

2350 ) -> None: 

2351 """Validate butler configuration. 

2352 

2353 Checks that each `DatasetType` can be stored in the `Datastore`. 

2354 

2355 Parameters 

2356 ---------- 

2357 logFailures : `bool`, optional 

2358 If `True`, output a log message for every validation error 

2359 detected. 

2360 datasetTypeNames : iterable of `str`, optional 

2361 The `DatasetType` names that should be checked. This allows 

2362 only a subset to be selected. 

2363 ignore : iterable of `str`, optional 

2364 Names of DatasetTypes to skip over. This can be used to skip 

2365 known problems. If a named `DatasetType` corresponds to a 

2366 composite, all components of that `DatasetType` will also be 

2367 ignored. 

2368 

2369 Raises 

2370 ------ 

2371 ButlerValidationError 

2372 Raised if there is some inconsistency with how this Butler 

2373 is configured. 
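Examples
--------
A typical invocation, logging each problem and skipping a (hypothetical)
known-bad dataset type::

    butler.validateConfiguration(logFailures=True, ignore=["unvalidated_type"])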

2374 """ 

2375 if datasetTypeNames: 

2376 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2377 else: 

2378 datasetTypes = list(self.registry.queryDatasetTypes()) 

2379 

2380 # filter out anything from the ignore list 

2381 if ignore: 

2382 ignore = set(ignore) 

2383 datasetTypes = [ 

2384 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2385 ] 

2386 else: 

2387 ignore = set() 

2388 

2389 # Find all the registered instruments 

2390 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2391 

2392 # For each datasetType that has an instrument dimension, create 

2393 # a DatasetRef for each defined instrument 

2394 datasetRefs = [] 

2395 

2396 for datasetType in datasetTypes: 

2397 if "instrument" in datasetType.dimensions: 

2398 for instrument in instruments: 

2399 datasetRef = DatasetRef( 

2400 datasetType, {"instrument": instrument}, conform=False # type: ignore 

2401 ) 

2402 datasetRefs.append(datasetRef) 

2403 

2404 entities: List[Union[DatasetType, DatasetRef]] = [] 

2405 entities.extend(datasetTypes) 

2406 entities.extend(datasetRefs) 

2407 

2408 datastoreErrorStr = None 

2409 try: 

2410 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2411 except ValidationError as e: 

2412 datastoreErrorStr = str(e) 

2413 

2414 # Also check that the LookupKeys used by the datastores match 

2415 # registry and storage class definitions 

2416 keys = self.datastore.getLookupKeys() 

2417 

2418 failedNames = set() 

2419 failedDataId = set() 

2420 for key in keys: 

2421 if key.name is not None: 

2422 if key.name in ignore: 

2423 continue 

2424 

2425 # skip if specific datasetType names were requested and this 

2426 # name does not match 

2427 if datasetTypeNames and key.name not in datasetTypeNames: 

2428 continue 

2429 

2430 # See if it is a StorageClass or a DatasetType 

2431 if key.name in self.storageClasses: 

2432 pass 

2433 else: 

2434 try: 

2435 self.registry.getDatasetType(key.name) 

2436 except KeyError: 

2437 if logFailures: 

2438 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2439 failedNames.add(key) 

2440 else: 

2441 # Dimensions are checked for consistency when the Butler 

2442 # is created and rendezvoused with a universe. 

2443 pass 

2444 

2445 # Check that the instrument is a valid instrument 

2446 # Currently only support instrument so check for that 

2447 if key.dataId: 

2448 dataIdKeys = set(key.dataId) 

2449 if set(["instrument"]) != dataIdKeys: 

2450 if logFailures: 

2451 log.critical("Key '%s' has unsupported DataId override", key) 

2452 failedDataId.add(key) 

2453 elif key.dataId["instrument"] not in instruments: 

2454 if logFailures: 

2455 log.critical("Key '%s' has unknown instrument", key) 

2456 failedDataId.add(key) 

2457 

2458 messages = [] 

2459 

2460 if datastoreErrorStr: 

2461 messages.append(datastoreErrorStr) 

2462 

2463 for failed, msg in ( 

2464 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2465 (failedDataId, "Keys with bad DataId entries: "), 

2466 ): 

2467 if failed: 

2468 msg += ", ".join(str(k) for k in failed) 

2469 messages.append(msg) 

2470 

2471 if messages: 

2472 raise ValidationError(";\n".join(messages)) 

2473 

2474 @property 

2475 def collections(self) -> CollectionSearch: 

2476 """The collections to search by default, in order (`CollectionSearch`). 

2477 

2478 This is an alias for ``self.registry.defaults.collections``. It cannot 

2479 be set directly in isolation, but all defaults may be changed together 

2480 by assigning a new `RegistryDefaults` instance to 

2481 ``self.registry.defaults``. 
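A sketch of replacing the defaults; the collection and run names are
hypothetical::

    from lsst.daf.butler.registry import RegistryDefaults

    butler.registry.defaults = RegistryDefaults(
        collections=["HSC/defaults"], run="u/someone/processing"
    )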

2482 """ 

2483 return self.registry.defaults.collections 

2484 

2485 @property 

2486 def run(self) -> Optional[str]: 

2487 """Name of the run this butler writes outputs to by default (`str` or 

2488 `None`). 

2489 

2490 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2491 directly in isolation, but all defaults may be changed together by 

2492 assigning a new `RegistryDefaults` instance to 

2493 ``self.registry.defaults``. 

2494 """ 

2495 return self.registry.defaults.run 

2496 

2497 @property 

2498 def dimensions(self) -> DimensionUniverse: 

2499 # Docstring inherited. 

2500 return self.registry.dimensions 

2501 

2502 registry: Registry 

2503 """The object that manages dataset metadata and relationships (`Registry`). 

2504 

2505 Most operations that don't involve reading or writing butler datasets are 

2506 accessible only via `Registry` methods. 

2507 """ 

2508 

2509 datastore: Datastore 

2510 """The object that manages actual dataset storage (`Datastore`). 

2511 

2512 Direct user access to the datastore should rarely be necessary; the primary 

2513 exception is the case where a `Datastore` implementation provides extra 

2514 functionality beyond what the base class defines. 

2515 """ 

2516 

2517 storageClasses: StorageClassFactory 

2518 """An object that maps known storage class names to objects that fully 

2519 describe them (`StorageClassFactory`). 

2520 """ 

2521 

2522 _allow_put_of_predefined_dataset: bool 

2523 """Allow a put to succeed even if there is already a registry entry for it 

2524 but not a datastore record. (`bool`)."""