Coverage for python/lsst/daf/butler/_butler.py: 10%

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36import collections.abc 

37import contextlib 

38import logging 

39import numbers 

40import os 

41from collections import defaultdict 

42from typing import ( 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59try: 

60 import boto3 

61except ImportError: 

62 boto3 = None 

63 

64from lsst.utils import doImportType 

65from lsst.utils.introspection import get_class_of 

66from lsst.utils.logging import VERBOSE, getLogger 

67 

68from ._butlerConfig import ButlerConfig 

69from ._butlerRepoIndex import ButlerRepoIndex 

70from ._deferredDatasetHandle import DeferredDatasetHandle 

71from .core import ( 

72 AmbiguousDatasetError, 

73 ButlerURI, 

74 Config, 

75 ConfigSubset, 

76 DataCoordinate, 

77 DataId, 

78 DataIdValue, 

79 DatasetRef, 

80 DatasetType, 

81 Datastore, 

82 Dimension, 

83 DimensionConfig, 

84 FileDataset, 

85 Progress, 

86 StorageClassFactory, 

87 Timespan, 

88 ValidationError, 

89) 

90from .core.repoRelocation import BUTLER_ROOT_TAG 

91from .core.utils import transactional 

92from .registry import ( 

93 CollectionSearch, 

94 CollectionType, 

95 ConflictingDefinitionError, 

96 DatasetIdGenEnum, 

97 Registry, 

98 RegistryConfig, 

99 RegistryDefaults, 

100) 

101from .transfers import RepoExportContext 

102 

103log = getLogger(__name__) 

104 

105 

106class ButlerValidationError(ValidationError): 

107 """There is a problem with the Butler configuration.""" 

108 

109 pass 

110 

111 

112class PruneCollectionsArgsError(TypeError): 

113 """Base class for errors relating to Butler.pruneCollections input 

114 arguments. 

115 """ 

116 

117 pass 

118 

119 

120class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

121 """Raised when ``purge`` is `True` but ``unstore`` is `False`;

122 both must be `True` together.

123 """ 

124 

125 def __init__(self) -> None: 

126 super().__init__("Cannot pass purge=True without unstore=True.") 

127 

128 

129class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

130 """Raised when pruning a RUN collection but purge is False.""" 

131 

132 def __init__(self, collectionType: CollectionType): 

133 self.collectionType = collectionType 

134 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

135 

136 

137class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

138 """Raised when ``purge`` is `True` but purging is not supported for

139 the given collection type."""

140 

141 def __init__(self, collectionType: CollectionType): 

142 self.collectionType = collectionType 

143 super().__init__( 

144 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True." 

145 ) 

146 

147 

148class Butler: 

149 """Main entry point for the data access system. 

150 

151 Parameters 

152 ---------- 

153 config : `ButlerConfig`, `Config` or `str`, optional

154 Configuration. Anything acceptable to the 

155 `ButlerConfig` constructor. If a directory path 

156 is given the configuration will be read from a ``butler.yaml`` file in 

157 that location. If `None` is given default values will be used. 

158 butler : `Butler`, optional

159 If provided, construct a new Butler that uses the same registry and 

160 datastore as the given one, but with the given collection and run. 

161 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

162 arguments. 

163 collections : `str` or `Iterable` [ `str` ], optional 

164 An expression specifying the collections to be searched (in order) when 

165 reading datasets. 

166 This may be a `str` collection name or an iterable thereof. 

167 See :ref:`daf_butler_collection_expressions` for more information. 

168 These collections are not registered automatically and must be 

169 manually registered before they are used by any method, but they may be 

170 manually registered after the `Butler` is initialized. 

171 run : `str`, optional 

172 Name of the `~CollectionType.RUN` collection new datasets should be 

173 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

174 ``collections`` will be set to ``[run]``. If not `None`, this 

175 collection will automatically be registered. If this is not set (and 

176 ``writeable`` is not set either), a read-only butler will be created. 

177 searchPaths : `list` of `str`, optional 

178 Directory paths to search when calculating the full Butler 

179 configuration. Not used if the supplied config is already a 

180 `ButlerConfig`. 

181 writeable : `bool`, optional 

182 Explicitly sets whether the butler supports write operations. If not 

183 provided, a read-write butler is created if ``run`` is not `None`;

184 otherwise a read-only butler is created.

185 inferDefaults : `bool`, optional 

186 If `True` (default) infer default data ID values from the values 

187 present in the datasets in ``collections``: if all collections have the 

188 same value (or no value) for a governor dimension, that value will be 

189 the default for that dimension. Nonexistent collections are ignored. 

190 If a default value is provided explicitly for a governor dimension via 

191 ``**kwargs``, no default will be inferred for that dimension. 

192 **kwargs : `str` 

193 Default data ID key-value pairs. These may only identify "governor" 

194 dimensions like ``instrument`` and ``skymap``. 

195 

196 Examples 

197 -------- 

198 While there are many ways to control exactly how a `Butler` interacts with 

199 the collections in its `Registry`, the most common cases are still simple. 

200 

201 For a read-only `Butler` that searches one collection, do:: 

202 

203 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

204 

205 For a read-write `Butler` that writes to and reads from a 

206 `~CollectionType.RUN` collection:: 

207 

208 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

209 

210 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

211 because we want to write to one `~CollectionType.RUN` collection but read 

212 from several others (as well):: 

213 

214 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

215 collections=["u/alice/DM-50000/a", 

216 "u/bob/DM-49998", 

217 "HSC/defaults"]) 

218 

219 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

220 Datasets will be read first from that run (since it appears first in the 

221 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

222 

223 Finally, one can always create a `Butler` with no collections:: 

224 

225 butler = Butler("/path/to/repo", writeable=True) 

226 

227 This can be extremely useful when you just want to use ``butler.registry``, 

228 e.g. for inserting dimension data or managing collections, or when the 

229 collections you want to use with the butler are not consistent. 

230 Passing ``writeable`` explicitly here is only necessary if you want to be 

231 able to make changes to the repo - usually the value for ``writeable`` can 

232 be guessed from the collection arguments provided, but it defaults to 

233 `False` when there are no collection arguments.

234 """ 
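
# A minimal construction sketch tying the defaults described above together;
# the repository path, collection name, governor values ("HSC",
# "hsc_rings_v1"), and the "calexp" dataset type are hypothetical.
from lsst.daf.butler import Butler

butler = Butler(
    "/path/to/repo",
    collections="HSC/defaults",
    instrument="HSC",        # governor dimensions become default data ID values
    skymap="hsc_rings_v1",
)
# "instrument" and "skymap" no longer need to be repeated in each data ID.
calexp = butler.get("calexp", visit=903334, detector=20)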

235 

236 def __init__( 

237 self, 

238 config: Union[Config, str, None] = None, 

239 *, 

240 butler: Optional[Butler] = None, 

241 collections: Any = None, 

242 run: Optional[str] = None, 

243 searchPaths: Optional[List[str]] = None, 

244 writeable: Optional[bool] = None, 

245 inferDefaults: bool = True, 

246 **kwargs: str, 

247 ): 

248 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

249 # Load registry, datastore, etc. from config or existing butler. 

250 if butler is not None: 

251 if config is not None or searchPaths is not None or writeable is not None: 

252 raise TypeError( 

253 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

254 ) 

255 self.registry = butler.registry.copy(defaults) 

256 self.datastore = butler.datastore 

257 self.storageClasses = butler.storageClasses 

258 self._config: ButlerConfig = butler._config 

259 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

260 else: 

261 self._config = ButlerConfig(config, searchPaths=searchPaths) 

262 try: 

263 if "root" in self._config: 

264 butlerRoot = self._config["root"] 

265 else: 

266 butlerRoot = self._config.configDir 

267 if writeable is None: 

268 writeable = run is not None 

269 self.registry = Registry.fromConfig( 

270 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

271 ) 

272 self.datastore = Datastore.fromConfig( 

273 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

274 ) 

275 self.storageClasses = StorageClassFactory() 

276 self.storageClasses.addFromConfig(self._config) 

277 self._allow_put_of_predefined_dataset = self._config.get( 

278 "allow_put_of_predefined_dataset", False 

279 ) 

280 except Exception: 

281 # Failures here usually mean that configuration is incomplete, 

282 # just issue an error message which includes config file URI. 

283 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

284 raise 

285 

286 if "run" in self._config or "collection" in self._config: 

287 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

288 

289 GENERATION: ClassVar[int] = 3 

290 """This is a Generation 3 Butler. 

291 

292 This attribute may be removed in the future, once the Generation 2 Butler 

293 interface has been fully retired; it should only be used in transitional 

294 code. 

295 """ 

296 

297 @classmethod 

298 def get_repo_uri(cls, label: str) -> ButlerURI: 

299 """Look up the label in a butler repository index. 

300 

301 Parameters 

302 ---------- 

303 label : `str` 

304 Label of the Butler repository to look up. 

305 

306 Returns 

307 ------- 

308 uri : `ButlerURI` 

309 URI to the Butler repository associated with the given label. 

310 

311 Raises 

312 ------ 

313 KeyError 

314 Raised if the label is not found in the index, or if an index 

315 can not be found at all. 

316 

317 Notes 

318 ----- 

319 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

320 information is discovered. 

321 """ 

322 return ButlerRepoIndex.get_repo_uri(label) 

323 

324 @classmethod 

325 def get_known_repos(cls) -> Set[str]: 

326 """Retrieve the list of known repository labels. 

327 

328 Returns 

329 ------- 

330 repos : `set` of `str` 

331 All the known labels. Can be empty if no index can be found. 

332 

333 Notes 

334 ----- 

335 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

336 information is discovered. 

337 """ 

338 return ButlerRepoIndex.get_known_repos() 
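
# A short sketch of the repository-index helpers above; the label "main" is
# an assumption and will only resolve if it exists in the configured index.
from lsst.daf.butler import Butler

for label in Butler.get_known_repos():
    print(label, Butler.get_repo_uri(label))

try:
    uri = Butler.get_repo_uri("main")
except KeyError:
    uri = None  # no index configured, or the label is unknown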

339 

340 @staticmethod 

341 def makeRepo( 

342 root: str, 

343 config: Union[Config, str, None] = None, 

344 dimensionConfig: Union[Config, str, None] = None, 

345 standalone: bool = False, 

346 searchPaths: Optional[List[str]] = None, 

347 forceConfigRoot: bool = True, 

348 outfile: Optional[str] = None, 

349 overwrite: bool = False, 

350 ) -> Config: 

351 """Create an empty data repository by adding a butler.yaml config 

352 to a repository root directory. 

353 

354 Parameters 

355 ---------- 

356 root : `str` or `ButlerURI` 

357 Path or URI to the root location of the new repository. Will be 

358 created if it does not exist. 

359 config : `Config` or `str`, optional 

360 Configuration to write to the repository, after setting any 

361 root-dependent Registry or Datastore config options. Can not 

362 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

363 configuration will be used. Root-dependent config options 

364 specified in this config are overwritten if ``forceConfigRoot`` 

365 is `True`. 

366 dimensionConfig : `Config` or `str`, optional 

367 Configuration for dimensions, will be used to initialize registry 

368 database. 

369 standalone : `bool` 

370 If True, write all expanded defaults, not just customized or 

371 repository-specific settings. 

372 This (mostly) decouples the repository from the default 

373 configuration, insulating it from changes to the defaults (which 

374 may be good or bad, depending on the nature of the changes). 

375 Future *additions* to the defaults will still be picked up when 

376 initializing `Butlers` to repos created with ``standalone=True``. 

377 searchPaths : `list` of `str`, optional 

378 Directory paths to search when calculating the full butler 

379 configuration. 

380 forceConfigRoot : `bool`, optional 

381 If `False`, any values present in the supplied ``config`` that 

382 would normally be reset are not overridden and will appear 

383 directly in the output config. This allows non-standard overrides 

384 of the root directory for a datastore or registry to be given. 

385 If this parameter is `True` the values for ``root`` will be 

386 forced into the resulting config if appropriate. 

387 outfile : `str`, optional 

388 If not-`None`, the output configuration will be written to this 

389 location rather than into the repository itself. Can be a URI 

390 string. Can refer to a directory that will be used to write 

391 ``butler.yaml``. 

392 overwrite : `bool`, optional 

393 Create a new configuration file even if one already exists 

394 in the specified output location. Default is to raise 

395 an exception. 

396 

397 Returns 

398 ------- 

399 config : `Config` 

400 The updated `Config` instance written to the repo. 

401 

402 Raises 

403 ------ 

404 ValueError 

405 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

406 regular Config (as these subclasses would make it impossible to 

407 support ``standalone=False``). 

408 FileExistsError 

409 Raised if the output config file already exists. 

410 os.error 

411 Raised if the directory does not exist, exists but is not a 

412 directory, or cannot be created. 

413 

414 Notes 

415 ----- 

416 Note that when ``standalone=False`` (the default), the configuration 

417 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

418 construct the repository should also be used to construct any Butlers 

419 to avoid configuration inconsistencies. 

420 """ 

421 if isinstance(config, (ButlerConfig, ConfigSubset)): 

422 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

423 

424 # Ensure that the root of the repository exists or can be made 

425 uri = ButlerURI(root, forceDirectory=True) 

426 uri.mkdir() 

427 

428 config = Config(config) 

429 

430 # If we are creating a new repo from scratch with relative roots, 

431 # do not propagate an explicit root from the config file 

432 if "root" in config: 

433 del config["root"] 

434 

435 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

436 imported_class = doImportType(full["datastore", "cls"]) 

437 if not issubclass(imported_class, Datastore): 

438 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

439 datastoreClass: Type[Datastore] = imported_class 

440 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

441 

442 # if key exists in given config, parse it, otherwise parse the defaults 

443 # in the expanded config 

444 if config.get(("registry", "db")): 

445 registryConfig = RegistryConfig(config) 

446 else: 

447 registryConfig = RegistryConfig(full) 

448 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

449 if defaultDatabaseUri is not None: 

450 Config.updateParameters( 

451 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

452 ) 

453 else: 

454 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

455 

456 if standalone: 

457 config.merge(full) 

458 else: 

459 # Always expand the registry.managers section into the per-repo 

460 # config, because after the database schema is created, it's not 

461 # allowed to change anymore. Note that in the standalone=True 

462 # branch, _everything_ in the config is expanded, so there's no 

463 # need to special case this. 

464 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

465 configURI: Union[str, ButlerURI] 

466 if outfile is not None: 

467 # When writing to a separate location we must include 

468 # the root of the butler repo in the config else it won't know 

469 # where to look. 

470 config["root"] = uri.geturl() 

471 configURI = outfile 

472 else: 

473 configURI = uri 

474 config.dumpToUri(configURI, overwrite=overwrite) 

475 

476 # Create Registry and populate tables 

477 registryConfig = RegistryConfig(config.get("registry")) 

478 dimensionConfig = DimensionConfig(dimensionConfig) 

479 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root) 

480 

481 log.verbose("Wrote new Butler configuration file to %s", configURI) 

482 

483 return config 
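
# Sketch of creating a new repository with makeRepo() and then opening it;
# the root path and run name are hypothetical, and the default configuration
# and dimension universe are used.
from lsst.daf.butler import Butler

repo_config = Butler.makeRepo("/path/to/new/repo")
butler = Butler("/path/to/new/repo", run="u/alice/ingest")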

484 

485 @classmethod 

486 def _unpickle( 

487 cls, 

488 config: ButlerConfig, 

489 collections: Optional[CollectionSearch], 

490 run: Optional[str], 

491 defaultDataId: Dict[str, str], 

492 writeable: bool, 

493 ) -> Butler: 

494 """Callable used to unpickle a Butler. 

495 

496 We prefer not to use ``Butler.__init__`` directly so we can force some 

497 of its many arguments to be keyword-only (note that ``__reduce__`` 

498 can only invoke callables with positional arguments). 

499 

500 Parameters 

501 ---------- 

502 config : `ButlerConfig` 

503 Butler configuration, already coerced into a true `ButlerConfig` 

504 instance (and hence after any search paths for overrides have been 

505 utilized). 

506 collections : `CollectionSearch` 

507 Names of the default collections to read from. 

508 run : `str`, optional 

509 Name of the default `~CollectionType.RUN` collection to write to. 

510 defaultDataId : `dict` [ `str`, `str` ] 

511 Default data ID values. 

512 writeable : `bool` 

513 Whether the Butler should support write operations. 

514 

515 Returns 

516 ------- 

517 butler : `Butler` 

518 A new `Butler` instance. 

519 """ 

520 # MyPy doesn't recognize that the kwargs below are totally valid; it 

521 # seems to think ``**defaultDataId`` is a _positional_ argument! 

522 return cls( 

523 config=config, 

524 collections=collections, 

525 run=run, 

526 writeable=writeable, 

527 **defaultDataId, # type: ignore 

528 ) 

529 

530 def __reduce__(self) -> tuple: 

531 """Support pickling.""" 

532 return ( 

533 Butler._unpickle, 

534 ( 

535 self._config, 

536 self.collections, 

537 self.run, 

538 self.registry.defaults.dataId.byName(), 

539 self.registry.isWriteable(), 

540 ), 

541 ) 

542 

543 def __str__(self) -> str: 

544 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

545 self.collections, self.run, self.datastore, self.registry 

546 ) 

547 

548 def isWriteable(self) -> bool: 

549 """Return `True` if this `Butler` supports write operations.""" 

550 return self.registry.isWriteable() 

551 

552 @contextlib.contextmanager 

553 def transaction(self) -> Iterator[None]: 

554 """Context manager supporting `Butler` transactions. 

555 

556 Transactions can be nested. 

557 """ 

558 with self.registry.transaction(): 

559 with self.datastore.transaction(): 

560 yield 
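
# Sketch of grouping writes atomically with the context manager above; the
# repo path, run name, dataset type "src", and the in-memory catalogs
# ``catalog_a``/``catalog_b`` are hypothetical.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", run="u/alice/test")
with butler.transaction():
    # If either put raises, both registry and datastore changes roll back.
    butler.put(catalog_a, "src", visit=903334, detector=20)
    butler.put(catalog_b, "src", visit=903334, detector=21)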

561 

562 def _standardizeArgs( 

563 self, 

564 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

565 dataId: Optional[DataId] = None, 

566 **kwargs: Any, 

567 ) -> Tuple[DatasetType, Optional[DataId]]: 

568 """Standardize the arguments passed to several Butler APIs. 

569 

570 Parameters 

571 ---------- 

572 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

573 When `DatasetRef` is provided, the `dataId` should be `None`. 

574 Otherwise the `DatasetType` or name thereof. 

575 dataId : `dict` or `DataCoordinate` 

576 A `dict` of `Dimension` link name, value pairs that label the 

577 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

578 should be provided as the first argument. 

579 **kwargs 

580 Additional keyword arguments used to augment or construct a 

581 `DataCoordinate`. See `DataCoordinate.standardize` 

582 parameters. 

583 

584 Returns 

585 ------- 

586 datasetType : `DatasetType` 

587 A `DatasetType` instance extracted from ``datasetRefOrType``. 

588 dataId : `dict` or `DataId`, optional 

589 Argument that can be used (along with ``kwargs``) to construct a 

590 `DataId`. 

591 

592 Notes 

593 ----- 

594 Butler APIs that conceptually need a DatasetRef also allow passing a 

595 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

596 keyword arguments that can be used to construct one) separately. This 

597 method accepts those arguments and always returns a true `DatasetType` 

598 and a `DataId` or `dict`. 

599 

600 Standardization of `dict` vs `DataId` is best handled by passing the 

601 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

602 generally similarly flexible. 

603 """ 

604 externalDatasetType: Optional[DatasetType] = None 

605 internalDatasetType: Optional[DatasetType] = None 

606 if isinstance(datasetRefOrType, DatasetRef): 

607 if dataId is not None or kwargs: 

608 raise ValueError("DatasetRef given, cannot use dataId as well") 

609 externalDatasetType = datasetRefOrType.datasetType 

610 dataId = datasetRefOrType.dataId 

611 else: 

612 # Don't check whether DataId is provided, because Registry APIs 

613 # can usually construct a better error message when it wasn't. 

614 if isinstance(datasetRefOrType, DatasetType): 

615 externalDatasetType = datasetRefOrType 

616 else: 

617 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

618 

619 # Check that they are self-consistent 

620 if externalDatasetType is not None: 

621 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

622 if externalDatasetType != internalDatasetType: 

623 raise ValueError( 

624 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

625 f"registry definition ({internalDatasetType})" 

626 ) 

627 

628 assert internalDatasetType is not None 

629 return internalDatasetType, dataId 

630 

631 def _rewrite_data_id( 

632 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any 

633 ) -> Tuple[Optional[DataId], Dict[str, Any]]: 

634 """Rewrite a data ID taking into account dimension records. 

635 

636 Take a Data ID and keyword args and rewrite it if necessary to 

637 allow the user to specify dimension records rather than dimension 

638 primary values. 

639 

640 This allows a user to include a dataId dict with keys of 

641 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

642 the integer exposure ID. It also allows a string to be given 

643 for a dimension value rather than the integer ID if that is more 

644 convenient. For example, rather than having to specify the 

645 detector with ``detector.full_name``, a string given for ``detector`` 

646 will be interpreted as the full name and converted to the integer 

647 value. 

648 

649 Keyword arguments can also use strings for dimensions like detector 

650 and exposure but python does not allow them to include ``.`` and 

651 so the ``exposure.day_obs`` syntax can not be used in a keyword 

652 argument. 

653 

654 Parameters 

655 ---------- 

656 dataId : `dict` or `DataCoordinate` 

657 A `dict` of `Dimension` link name, value pairs that will label the 

658 `DatasetRef` within a Collection. 

659 datasetType : `DatasetType` 

660 The dataset type associated with this dataId. Required to 

661 determine the relevant dimensions. 

662 **kwargs 

663 Additional keyword arguments used to augment or construct a 

664 `DataId`. See `DataId` parameters. 

665 

666 Returns 

667 ------- 

668 dataId : `dict` or `DataCoordinate` 

669 The dataId, possibly rewritten. If given a `DataCoordinate` and 

670 no keyword arguments, the original dataId will be returned 

671 unchanged. 

672 **kwargs : `dict` 

673 Any unused keyword arguments. 

674 """ 

675 # Do nothing if we have a standalone DataCoordinate. 

676 if isinstance(dataId, DataCoordinate) and not kwargs: 

677 return dataId, kwargs 

678 

679 # Process dimension records that are using record information 

680 # rather than ids 

681 newDataId: Dict[str, DataIdValue] = {} 

682 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

683 

684 # if all the dataId comes from keyword parameters we do not need 

685 # to do anything here because they can't be of the form 

686 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

687 if dataId: 

688 for k, v in dataId.items(): 

689 # If we have a Dimension we do not need to do anything 

690 # because it cannot be a compound key. 

691 if isinstance(k, str) and "." in k: 

692 # Someone is using a more human-readable dataId 

693 dimensionName, record = k.split(".", 1) 

694 byRecord[dimensionName][record] = v 

695 elif isinstance(k, Dimension): 

696 newDataId[k.name] = v 

697 else: 

698 newDataId[k] = v 

699 

700 # Go through the updated dataId and check the type in case someone is 

701 # using an alternate key. We have already filtered out the compound 

702 # keys dimensions.record format. 

703 not_dimensions = {} 

704 

705 # Will need to look in the dataId and the keyword arguments 

706 # and will remove them if they need to be fixed or are unrecognized. 

707 for dataIdDict in (newDataId, kwargs): 

708 # Use a list so we can adjust the dict safely in the loop 

709 for dimensionName in list(dataIdDict): 

710 value = dataIdDict[dimensionName] 

711 try: 

712 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

713 except KeyError: 

714 # This is not a real dimension 

715 not_dimensions[dimensionName] = value 

716 del dataIdDict[dimensionName] 

717 continue 

718 

719 # Convert an integral type to an explicit int to simplify 

720 # comparisons here 

721 if isinstance(value, numbers.Integral): 

722 value = int(value) 

723 

724 if not isinstance(value, dimension.primaryKey.getPythonType()): 

725 for alternate in dimension.alternateKeys: 

726 if isinstance(value, alternate.getPythonType()): 

727 byRecord[dimensionName][alternate.name] = value 

728 del dataIdDict[dimensionName] 

729 log.debug( 

730 "Converting dimension %s to %s.%s=%s", 

731 dimensionName, 

732 dimensionName, 

733 alternate.name, 

734 value, 

735 ) 

736 break 

737 else: 

738 log.warning( 

739 "Type mismatch found for value '%r' provided for dimension %s. " 

740 "Could not find matching alternative (primary key has type %s) " 

741 "so attempting to use as-is.", 

742 value, 

743 dimensionName, 

744 dimension.primaryKey.getPythonType(), 

745 ) 

746 

747 # If we have some unrecognized dimensions we have to try to connect 

748 # them to records in other dimensions. This is made more complicated 

749 # by some dimensions having records with clashing names. A mitigation 

750 # is that we can tell by this point which dimensions are missing 

751 # for the DatasetType but this does not work for calibrations 

752 # where additional dimensions can be used to constrain the temporal 

753 # axis. 

754 if not_dimensions: 

755 # Calculate missing dimensions 

756 provided = set(newDataId) | set(kwargs) | set(byRecord) 

757 missingDimensions = datasetType.dimensions.names - provided 

758 

759 # For calibrations we may well be needing temporal dimensions 

760 # so rather than always including all dimensions in the scan 

761 # restrict things a little. It is still possible for there 

762 # to be confusion over day_obs in visit vs exposure for example. 

763 # If we are not searching calibration collections things may 

764 # fail but they are going to fail anyway because of the 

765 # ambiguity of the dataId... 

766 candidateDimensions: Set[str] = set() 

767 candidateDimensions.update(missingDimensions) 

768 if datasetType.isCalibration(): 

769 for dim in self.registry.dimensions.getStaticDimensions(): 

770 if dim.temporal: 

771 candidateDimensions.add(str(dim)) 

772 

773 # Look up table for the first association with a dimension 

774 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

775 

776 # Keep track of whether an item is associated with multiple 

777 # dimensions. 

778 counter: Counter[str] = Counter() 

779 assigned: Dict[str, Set[str]] = defaultdict(set) 

780 

781 # Go through the missing dimensions and associate the 

782 # given names with records within those dimensions 

783 matched_dims = set() 

784 for dimensionName in candidateDimensions: 

785 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

786 fields = dimension.metadata.names | dimension.uniqueKeys.names 

787 for field in not_dimensions: 

788 if field in fields: 

789 guessedAssociation[dimensionName][field] = not_dimensions[field] 

790 counter[dimensionName] += 1 

791 assigned[field].add(dimensionName) 

792 matched_dims.add(field) 

793 

794 # Calculate the fields that matched nothing. 

795 never_found = set(not_dimensions) - matched_dims 

796 

797 if never_found: 

798 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

799 

800 # There is a chance we have allocated a single dataId item 

801 # to multiple dimensions. Need to decide which should be retained. 

802 # For now assume that the most popular alternative wins. 

803 # This means that day_obs with seq_num will result in 

804 # exposure.day_obs and not visit.day_obs 

805 # Also prefer an explicitly missing dimension over an inferred 

806 # temporal dimension. 

807 for fieldName, assignedDimensions in assigned.items(): 

808 if len(assignedDimensions) > 1: 

809 # Pick the most popular (preferring mandatory dimensions) 

810 requiredButMissing = assignedDimensions.intersection(missingDimensions) 

811 if requiredButMissing: 

812 candidateDimensions = requiredButMissing 

813 else: 

814 candidateDimensions = assignedDimensions 

815 

816 # Select the relevant items and get a new restricted 

817 # counter. 

818 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

819 duplicatesCounter: Counter[str] = Counter() 

820 duplicatesCounter.update(theseCounts) 

821 

822 # Choose the most common. If they are equally common 

823 # we will pick the one that was found first. 

824 # Returns a list of tuples 

825 selected = duplicatesCounter.most_common(1)[0][0] 

826 

827 log.debug( 

828 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

829 " Removed ambiguity by choosing dimension %s.", 

830 fieldName, 

831 ", ".join(assignedDimensions), 

832 selected, 

833 ) 

834 

835 for candidateDimension in assignedDimensions: 

836 if candidateDimension != selected: 

837 del guessedAssociation[candidateDimension][fieldName] 

838 

839 # Update the record look up dict with the new associations 

840 for dimensionName, values in guessedAssociation.items(): 

841 if values: # A dict might now be empty 

842 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

843 byRecord[dimensionName].update(values) 

844 

845 if byRecord: 

846 # Some record specifiers were found so we need to convert 

847 # them to the Id form 

848 for dimensionName, values in byRecord.items(): 

849 if dimensionName in newDataId: 

850 log.warning( 

851 "DataId specified explicit %s dimension value of %s in addition to" 

852 " general record specifiers for it of %s. Ignoring record information.", 

853 dimensionName, 

854 newDataId[dimensionName], 

855 str(values), 

856 ) 

857 continue 

858 

859 # Build up a WHERE expression 

860 bind = {k: v for k, v in values.items()} 

861 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

862 

863 # Hopefully we get a single record that matches 

864 records = set( 

865 self.registry.queryDimensionRecords( 

866 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

867 ) 

868 ) 

869 

870 if len(records) != 1: 

871 if len(records) > 1: 

872 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

873 for r in records: 

874 log.debug("- %s", str(r)) 

875 raise RuntimeError( 

876 f"DataId specification for dimension {dimensionName} is not" 

877 f" uniquely constrained to a single dataset by {values}." 

878 f" Got {len(records)} results." 

879 ) 

880 raise RuntimeError( 

881 f"DataId specification for dimension {dimensionName} matched no" 

882 f" records when constrained by {values}" 

883 ) 

884 

885 # Get the primary key from the real dimension object 

886 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

887 if not isinstance(dimension, Dimension): 

888 raise RuntimeError( 

889 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

890 ) 

891 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

892 

893 # We have modified the dataId so need to switch to it 

894 dataId = newDataId 

895 

896 return dataId, kwargs 
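
# Illustration of the rewriting described above, assuming ``butler`` is an
# existing Butler; the "raw" dataset type and all values are hypothetical.
# Record keys and alternate string keys are resolved to primary dimension
# values before the registry lookup.
raw = butler.get(
    "raw",
    {"exposure.day_obs": 20210405, "exposure.seq_num": 17},
    detector="R22_S11",   # full-name string resolved to the integer detector ID
    instrument="LSSTCam",
)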

897 

898 def _findDatasetRef( 

899 self, 

900 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

901 dataId: Optional[DataId] = None, 

902 *, 

903 collections: Any = None, 

904 allowUnresolved: bool = False, 

905 **kwargs: Any, 

906 ) -> DatasetRef: 

907 """Shared logic for methods that start with a search for a dataset in 

908 the registry. 

909 

910 Parameters 

911 ---------- 

912 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

913 When `DatasetRef` is provided, the `dataId` should be `None`. 

914 Otherwise the `DatasetType` or name thereof. 

915 dataId : `dict` or `DataCoordinate`, optional 

916 A `dict` of `Dimension` link name, value pairs that label the 

917 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

918 should be provided as the first argument. 

919 collections : Any, optional 

920 Collections to be searched, overriding ``self.collections``. 

921 Can be any of the types supported by the ``collections`` argument 

922 to butler construction. 

923 allowUnresolved : `bool`, optional 

924 If `True`, return an unresolved `DatasetRef` if finding a resolved 

925 one in the `Registry` fails. Defaults to `False`. 

926 **kwargs 

927 Additional keyword arguments used to augment or construct a 

928 `DataId`. See `DataId` parameters. 

929 

930 Returns 

931 ------- 

932 ref : `DatasetRef` 

933 A reference to the dataset identified by the given arguments. 

934 

935 Raises 

936 ------ 

937 LookupError 

938 Raised if no matching dataset exists in the `Registry` (and 

939 ``allowUnresolved is False``). 

940 ValueError 

941 Raised if a resolved `DatasetRef` was passed as an input, but it 

942 differs from the one found in the registry. 

943 TypeError 

944 Raised if no collections were provided. 

945 """ 

946 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

947 if isinstance(datasetRefOrType, DatasetRef): 

948 idNumber = datasetRefOrType.id 

949 else: 

950 idNumber = None 

951 timespan: Optional[Timespan] = None 

952 

953 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

954 

955 if datasetType.isCalibration(): 

956 # Because this is a calibration dataset, first try to 

957 # standardize the data ID without restricting the dimensions to 

958 # those of the dataset type requested, because there may be extra 

959 # dimensions that provide temporal information for a validity-range 

960 # lookup. 

961 dataId = DataCoordinate.standardize( 

962 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

963 ) 

964 if dataId.graph.temporal: 

965 dataId = self.registry.expandDataId(dataId) 

966 timespan = dataId.timespan 

967 else: 

968 # Standardize the data ID to just the dimensions of the dataset 

969 # type instead of letting registry.findDataset do it, so we get the 

970 # result even if no dataset is found. 

971 dataId = DataCoordinate.standardize( 

972 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

973 ) 

974 # Always lookup the DatasetRef, even if one is given, to ensure it is 

975 # present in the current collection. 

976 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

977 if ref is None: 

978 if allowUnresolved: 

979 return DatasetRef(datasetType, dataId) 

980 else: 

981 if collections is None: 

982 collections = self.registry.defaults.collections 

983 raise LookupError( 

984 f"Dataset {datasetType.name} with data ID {dataId} " 

985 f"could not be found in collections {collections}." 

986 ) 

987 if idNumber is not None and idNumber != ref.id: 

988 if collections is None: 

989 collections = self.registry.defaults.collections 

990 raise ValueError( 

991 f"DatasetRef.id provided ({idNumber}) does not match " 

992 f"id ({ref.id}) in registry in collections {collections}." 

993 ) 

994 return ref 

995 

996 @transactional 

997 def put( 

998 self, 

999 obj: Any, 

1000 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1001 dataId: Optional[DataId] = None, 

1002 *, 

1003 run: Optional[str] = None, 

1004 **kwargs: Any, 

1005 ) -> DatasetRef: 

1006 """Store and register a dataset. 

1007 

1008 Parameters 

1009 ---------- 

1010 obj : `object` 

1011 The dataset. 

1012 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1013 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1014 Otherwise the `DatasetType` or name thereof. 

1015 dataId : `dict` or `DataCoordinate` 

1016 A `dict` of `Dimension` link name, value pairs that label the 

1017 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1018 should be provided as the second argument. 

1019 run : `str`, optional 

1020 The name of the run the dataset should be added to, overriding 

1021 ``self.run``. 

1022 **kwargs 

1023 Additional keyword arguments used to augment or construct a 

1024 `DataCoordinate`. See `DataCoordinate.standardize` 

1025 parameters. 

1026 

1027 Returns 

1028 ------- 

1029 ref : `DatasetRef` 

1030 A reference to the stored dataset, updated with the correct id if 

1031 given. 

1032 

1033 Raises 

1034 ------ 

1035 TypeError 

1036 Raised if the butler is read-only or if no run has been provided. 

1037 """ 

1038 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1039 if not self.isWriteable(): 

1040 raise TypeError("Butler is read-only.") 

1041 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1042 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1043 raise ValueError("DatasetRef must not be in registry, must have None id") 

1044 

1045 # Handle dimension records in dataId 

1046 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1047 

1048 # Add Registry Dataset entry. 

1049 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1050 

1051 # For an execution butler the datasets will be pre-defined. 

1052 # If the butler is configured that way datasets should only be inserted 

1053 # if they do not already exist in registry. Trying and catching 

1054 # ConflictingDefinitionError will not work because the transaction 

1055 # will be corrupted. Instead, in this mode always check first. 

1056 ref = None 

1057 ref_is_predefined = False 

1058 if self._allow_put_of_predefined_dataset: 

1059 # Get the matching ref for this run. 

1060 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId) 

1061 

1062 if ref: 

1063 # Must be expanded form for datastore templating 

1064 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

1065 ref = ref.expanded(dataId) 

1066 ref_is_predefined = True 

1067 

1068 if not ref: 

1069 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1070 

1071 # If the ref is predefined it is possible that the datastore also 

1072 # has the record. Asking datastore to put it again will result in 

1073 # the artifact being recreated, overwriting the previous one; the 

1074 # subsequent failure to write the record will then cause the artifact 

1075 # to be removed. Much safer to ask first before attempting to 

1076 # overwrite. Race conditions should not be an issue for the 

1077 # execution butler environment. 

1078 if ref_is_predefined: 

1079 if self.datastore.knows(ref): 

1080 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.") 

1081 

1082 self.datastore.put(obj, ref) 

1083 

1084 return ref 
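
# Hedged sketch of a put() call with the signature above, assuming ``butler``
# is writeable; the dataset type, data ID values, and run name are
# hypothetical, and ``exposure_image`` stands in for any object matching the
# dataset type's storage class.
ref = butler.put(
    exposure_image,
    "calexp",
    visit=903334,
    detector=20,
    run="u/alice/DM-50000/a",
)
# The returned DatasetRef is resolved and can be reused with getDirect().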

1085 

1086 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any: 

1087 """Retrieve a stored dataset. 

1088 

1089 Unlike `Butler.get`, this method allows datasets outside the Butler's 

1090 collection to be read as long as the `DatasetRef` that identifies them 

1091 can be obtained separately. 

1092 

1093 Parameters 

1094 ---------- 

1095 ref : `DatasetRef` 

1096 Resolved reference to an already stored dataset. 

1097 parameters : `dict` 

1098 Additional StorageClass-defined options to control reading, 

1099 typically used to efficiently read only a subset of the dataset. 

1100 

1101 Returns 

1102 ------- 

1103 obj : `object` 

1104 The dataset. 

1105 """ 

1106 return self.datastore.get(ref, parameters=parameters) 
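
# Sketch of reading via a resolved reference from a registry query, as
# described above; the collection, data ID, and the "bbox" parameter name are
# hypothetical (parameters are storage-class specific), and exactly one
# matching dataset is assumed.
(ref,) = butler.registry.queryDatasets(
    "calexp",
    collections="u/bob/DM-49998",
    dataId={"instrument": "HSC", "visit": 903334, "detector": 20},
)
cutout = butler.getDirect(ref, parameters={"bbox": bbox})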

1107 

1108 def getDirectDeferred( 

1109 self, ref: DatasetRef, *, parameters: Union[dict, None] = None 

1110 ) -> DeferredDatasetHandle: 

1111 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1112 from a resolved `DatasetRef`. 

1113 

1114 Parameters 

1115 ---------- 

1116 ref : `DatasetRef` 

1117 Resolved reference to an already stored dataset. 

1118 parameters : `dict` 

1119 Additional StorageClass-defined options to control reading, 

1120 typically used to efficiently read only a subset of the dataset. 

1121 

1122 Returns 

1123 ------- 

1124 obj : `DeferredDatasetHandle` 

1125 A handle which can be used to retrieve a dataset at a later time. 

1126 

1127 Raises 

1128 ------ 

1129 AmbiguousDatasetError 

1130 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

1131 """ 

1132 if ref.id is None: 

1133 raise AmbiguousDatasetError( 

1134 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1135 ) 

1136 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1137 

1138 def getDeferred( 

1139 self, 

1140 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1141 dataId: Optional[DataId] = None, 

1142 *, 

1143 parameters: Union[dict, None] = None, 

1144 collections: Any = None, 

1145 **kwargs: Any, 

1146 ) -> DeferredDatasetHandle: 

1147 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1148 after an immediate registry lookup. 

1149 

1150 Parameters 

1151 ---------- 

1152 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1153 When `DatasetRef` is provided, the `dataId` should be `None`. 

1154 Otherwise the `DatasetType` or name thereof. 

1155 dataId : `dict` or `DataCoordinate`, optional 

1156 A `dict` of `Dimension` link name, value pairs that label the 

1157 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1158 should be provided as the first argument. 

1159 parameters : `dict` 

1160 Additional StorageClass-defined options to control reading, 

1161 typically used to efficiently read only a subset of the dataset. 

1162 collections : Any, optional 

1163 Collections to be searched, overriding ``self.collections``. 

1164 Can be any of the types supported by the ``collections`` argument 

1165 to butler construction. 

1166 **kwargs 

1167 Additional keyword arguments used to augment or construct a 

1168 `DataId`. See `DataId` parameters. 

1169 

1170 Returns 

1171 ------- 

1172 obj : `DeferredDatasetHandle` 

1173 A handle which can be used to retrieve a dataset at a later time. 

1174 

1175 Raises 

1176 ------ 

1177 LookupError 

1178 Raised if no matching dataset exists in the `Registry` in the 

1179 default or specified collections. 

1180 ValueError 

1181 Raised if a resolved `DatasetRef` was passed as an input, but it 

1182 differs from the one found in the registry. 

1183 TypeError 

1184 Raised if no collections were provided. 

1185 """ 

1186 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1187 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 
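
# Sketch of deferring a read: the registry lookup happens now, but the
# dataset is only fetched when requested from the handle. Dataset type and
# data ID values are hypothetical.
handle = butler.getDeferred("calexp", visit=903334, detector=20)
# ... later, possibly in code that only needs the handle ...
calexp = handle.get()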

1188 

1189 def get( 

1190 self, 

1191 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1192 dataId: Optional[DataId] = None, 

1193 *, 

1194 parameters: Optional[Dict[str, Any]] = None, 

1195 collections: Any = None, 

1196 **kwargs: Any, 

1197 ) -> Any: 

1198 """Retrieve a stored dataset. 

1199 

1200 Parameters 

1201 ---------- 

1202 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1203 When `DatasetRef` is provided, the `dataId` should be `None`. 

1204 Otherwise the `DatasetType` or name thereof. 

1205 dataId : `dict` or `DataCoordinate` 

1206 A `dict` of `Dimension` link name, value pairs that label the 

1207 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1208 should be provided as the first argument. 

1209 parameters : `dict` 

1210 Additional StorageClass-defined options to control reading, 

1211 typically used to efficiently read only a subset of the dataset. 

1212 collections : Any, optional 

1213 Collections to be searched, overriding ``self.collections``. 

1214 Can be any of the types supported by the ``collections`` argument 

1215 to butler construction. 

1216 **kwargs 

1217 Additional keyword arguments used to augment or construct a 

1218 `DataCoordinate`. See `DataCoordinate.standardize` 

1219 parameters. 

1220 

1221 Returns 

1222 ------- 

1223 obj : `object` 

1224 The dataset. 

1225 

1226 Raises 

1227 ------ 

1228 ValueError 

1229 Raised if a resolved `DatasetRef` was passed as an input, but it 

1230 differs from the one found in the registry. 

1231 LookupError 

1232 Raised if no matching dataset exists in the `Registry`. 

1233 TypeError 

1234 Raised if no collections were provided. 

1235 

1236 Notes 

1237 ----- 

1238 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1239 this method requires that the given data ID include temporal dimensions 

1240 beyond the dimensions of the dataset type itself, in order to find the 

1241 dataset with the appropriate validity range. For example, a "bias" 

1242 dataset with native dimensions ``{instrument, detector}`` could be 

1243 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1244 ``exposure`` is a temporal dimension. 

1245 """ 

1246 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1247 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1248 return self.getDirect(ref, parameters=parameters) 
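
# Sketch of the calibration lookup described in the Notes above: the extra
# ``exposure`` dimension supplies the temporal information used for the
# validity-range search. Collection and data ID values are hypothetical.
bias = butler.get(
    "bias",
    instrument="HSC",
    detector=42,
    exposure=903334,
    collections="HSC/calib",
)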

1249 

1250 def getURIs( 

1251 self, 

1252 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1253 dataId: Optional[DataId] = None, 

1254 *, 

1255 predict: bool = False, 

1256 collections: Any = None, 

1257 run: Optional[str] = None, 

1258 **kwargs: Any, 

1259 ) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1260 """Return the URIs associated with the dataset. 

1261 

1262 Parameters 

1263 ---------- 

1264 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1265 When `DatasetRef` is provided, the `dataId` should be `None`. 

1266 Otherwise the `DatasetType` or name thereof. 

1267 dataId : `dict` or `DataCoordinate` 

1268 A `dict` of `Dimension` link name, value pairs that label the 

1269 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1270 should be provided as the first argument. 

1271 predict : `bool` 

1272 If `True`, allow URIs to be returned of datasets that have not 

1273 been written. 

1274 collections : Any, optional 

1275 Collections to be searched, overriding ``self.collections``. 

1276 Can be any of the types supported by the ``collections`` argument 

1277 to butler construction. 

1278 run : `str`, optional 

1279 Run to use for predictions, overriding ``self.run``. 

1280 **kwargs 

1281 Additional keyword arguments used to augment or construct a 

1282 `DataCoordinate`. See `DataCoordinate.standardize` 

1283 parameters. 

1284 

1285 Returns 

1286 ------- 

1287 primary : `ButlerURI` 

1288 The URI to the primary artifact associated with this dataset. 

1289 If the dataset was disassembled within the datastore this 

1290 may be `None`. 

1291 components : `dict` 

1292 URIs to any components associated with the dataset artifact. 

1293 Can be empty if there are no components. 

1294 """ 

1295 ref = self._findDatasetRef( 

1296 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs 

1297 ) 

1298 if ref.id is None: # only possible if predict is True 

1299 if run is None: 

1300 run = self.run 

1301 if run is None: 

1302 raise TypeError("Cannot predict location with run=None.") 

1303 # Lie about ID, because we can't guess it, and only 

1304 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1305 ref = ref.resolved(id=0, run=run) 

1306 return self.datastore.getURIs(ref, predict) 

1307 

1308 def getURI( 

1309 self, 

1310 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1311 dataId: Optional[DataId] = None, 

1312 *, 

1313 predict: bool = False, 

1314 collections: Any = None, 

1315 run: Optional[str] = None, 

1316 **kwargs: Any, 

1317 ) -> ButlerURI: 

1318 """Return the URI to the Dataset. 

1319 

1320 Parameters 

1321 ---------- 

1322 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1323 When `DatasetRef` is provided, the `dataId` should be `None`. 

1324 Otherwise the `DatasetType` or name thereof. 

1325 dataId : `dict` or `DataCoordinate` 

1326 A `dict` of `Dimension` link name, value pairs that label the 

1327 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1328 should be provided as the first argument. 

1329 predict : `bool` 

1330 If `True`, allow URIs to be returned of datasets that have not 

1331 been written. 

1332 collections : Any, optional 

1333 Collections to be searched, overriding ``self.collections``. 

1334 Can be any of the types supported by the ``collections`` argument 

1335 to butler construction. 

1336 run : `str`, optional 

1337 Run to use for predictions, overriding ``self.run``. 

1338 **kwargs 

1339 Additional keyword arguments used to augment or construct a 

1340 `DataCoordinate`. See `DataCoordinate.standardize` 

1341 parameters. 

1342 

1343 Returns 

1344 ------- 

1345 uri : `ButlerURI` 

1346 URI pointing to the Dataset within the datastore. If the 

1347 Dataset does not exist in the datastore, and if ``predict`` is 

1348 `True`, the URI will be a prediction and will include a URI 

1349 fragment "#predicted". 

1350 If the datastore does not have entities that relate well 

1351 to the concept of a URI the returned URI string will be 

1352 descriptive. The returned URI is not guaranteed to be obtainable. 

1353 

1354 Raises 

1355 ------ 

1356 LookupError 

1357 Raised if a URI has been requested for a dataset that does not 

1358 exist and guessing is not allowed. 

1359 ValueError 

1360 Raised if a resolved `DatasetRef` was passed as an input, but it 

1361 differs from the one found in the registry. 

1362 TypeError 

1363 Raised if no collections were provided. 

1364 RuntimeError 

1365 Raised if a URI is requested for a dataset that consists of 

1366 multiple artifacts. 

1367 """ 

1368 primary, components = self.getURIs( 

1369 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1370 ) 

1371 

1372 if primary is None or components: 

1373 raise RuntimeError( 

1374 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1375 "Use Butler.getURIs() instead." 

1376 ) 

1377 return primary 
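
# Sketch of the URI accessors above; names and values are hypothetical.
# getURI() raises RuntimeError for disassembled datasets, in which case
# getURIs() returns the per-component mapping instead.
primary, components = butler.getURIs("calexp", visit=903334, detector=20)
uri = butler.getURI("calexp", visit=903334, detector=20)
# A location can also be predicted before the dataset exists:
future_uri = butler.getURI(
    "calexp", visit=903334, detector=21, predict=True, run="u/alice/DM-50000/a"
)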

1378 

1379 def retrieveArtifacts( 

1380 self, 

1381 refs: Iterable[DatasetRef], 

1382 destination: Union[str, ButlerURI], 

1383 transfer: str = "auto", 

1384 preserve_path: bool = True, 

1385 overwrite: bool = False, 

1386 ) -> List[ButlerURI]: 

1387 """Retrieve the artifacts associated with the supplied refs. 

1388 

1389 Parameters 

1390 ---------- 

1391 refs : iterable of `DatasetRef` 

1392 The datasets for which artifacts are to be retrieved. 

1393 A single ref can result in multiple artifacts. The refs must 

1394 be resolved. 

1395 destination : `ButlerURI` or `str` 

1396 Location to write the artifacts. 

1397 transfer : `str`, optional 

1398 Method to use to transfer the artifacts. Must be one of the options 

1399 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

1400 preserve_path : `bool`, optional 

1401 If `True` the full path of the artifact within the datastore 

1402 is preserved. If `False` the final file component of the path 

1403 is used. 

1404 overwrite : `bool`, optional 

1405 If `True` allow transfers to overwrite existing files at the 

1406 destination. 

1407 

1408 Returns 

1409 ------- 

1410 targets : `list` of `ButlerURI` 

1411 URIs of file artifacts in destination location. Order is not 

1412 preserved. 

1413 

1414 Notes 

1415 ----- 

1416 For non-file datastores the artifacts written to the destination 

1417 may not match the representation inside the datastore. For example 

1418 a hierarchical data structure in a NoSQL database may well be stored 

1419 as a JSON file. 

1420 """ 

1421 return self.datastore.retrieveArtifacts( 

1422 refs, ButlerURI(destination), transfer=transfer, preserve_path=preserve_path, overwrite=overwrite 

1423 ) 
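
# Sketch of exporting the file artifacts behind a query result with the
# method above; the dataset type, collection, and destination are
# hypothetical.
refs = butler.registry.queryDatasets("calexp", collections="u/alice/DM-50000/a")
paths = butler.retrieveArtifacts(
    refs, "/tmp/export", transfer="copy", preserve_path=True
)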

1424 

1425 def datasetExists( 

1426 self, 

1427 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1428 dataId: Optional[DataId] = None, 

1429 *, 

1430 collections: Any = None, 

1431 **kwargs: Any, 

1432 ) -> bool: 

1433 """Return True if the Dataset is actually present in the Datastore. 

1434 

1435 Parameters 

1436 ---------- 

1437 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1438 When `DatasetRef` is provided, the `dataId` should be `None`. 

1439 Otherwise the `DatasetType` or name thereof. 

1440 dataId : `dict` or `DataCoordinate` 

1441 A `dict` of `Dimension` link name, value pairs that label the 

1442 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1443 should be provided as the first argument. 

1444 collections : Any, optional 

1445 Collections to be searched, overriding ``self.collections``. 

1446 Can be any of the types supported by the ``collections`` argument 

1447 to butler construction. 

1448 **kwargs 

1449 Additional keyword arguments used to augment or construct a 

1450 `DataCoordinate`. See `DataCoordinate.standardize` 

1451 parameters. 

1452 

1453 Raises 

1454 ------ 

1455 LookupError 

1456 Raised if the dataset is not even present in the Registry. 

1457 ValueError 

1458 Raised if a resolved `DatasetRef` was passed as an input, but it 

1459 differs from the one found in the registry. 

1460 TypeError 

1461 Raised if no collections were provided. 

1462 """ 

1463 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1464 return self.datastore.exists(ref) 

1465 
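Because the registry is consulted before the datastore, a dataset that is registered but has no stored artifact returns `False`, while a dataset unknown to the searched collections raises `LookupError`. A sketch, assuming ``ref`` is a resolved `DatasetRef` obtained from an earlier query::

    try:
        stored = butler.datasetExists(ref)
    except LookupError:
        # Not even registered in the searched collections.
        stored = False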

1466 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1467 """Remove one or more `~CollectionType.RUN` collections and the 

1468 datasets within them. 

1469 

1470 Parameters 

1471 ---------- 

1472 names : `Iterable` [ `str` ] 

1473 The names of the collections to remove. 

1474 unstore : `bool`, optional 

1475 If `True` (default), delete datasets from all datastores in which 

1476 they are present, and attempt to roll back the registry deletions if 

1477 datastore deletions fail (which may not always be possible). If 

1478 `False`, datastore records for these datasets are still removed, 

1479 but any artifacts (e.g. files) will not be. 

1480 

1481 Raises 

1482 ------ 

1483 TypeError 

1484 Raised if one or more collections are not of type 

1485 `~CollectionType.RUN`. 

1486 """ 

1487 if not self.isWriteable(): 

1488 raise TypeError("Butler is read-only.") 

1489 names = list(names) 

1490 refs: List[DatasetRef] = [] 

1491 for name in names: 

1492 collectionType = self.registry.getCollectionType(name) 

1493 if collectionType is not CollectionType.RUN: 

1494 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1495 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1496 with self.registry.transaction(): 

1497 if unstore: 

1498 self.datastore.trash(refs) 

1499 else: 

1500 self.datastore.forget(refs) 

1501 for name in names: 

1502 self.registry.removeCollection(name) 

1503 if unstore: 

1504 # Point of no return for removing artifacts 

1505 self.datastore.emptyTrash() 

1506 
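A sketch of deleting scratch output runs in bulk; the repository path and run names are hypothetical, and with ``unstore=True`` the underlying artifacts are deleted as well, so the operation is irreversible::

    butler = Butler("/path/to/repo", writeable=True)
    butler.removeRuns(
        ["u/someone/scratch-1", "u/someone/scratch-2"],  # hypothetical runs
        unstore=True,
    )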

1507 def pruneCollection( 

1508 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None 

1509 ) -> None: 

1510 """Remove a collection and possibly prune datasets within it. 

1511 

1512 Parameters 

1513 ---------- 

1514 name : `str` 

1515 Name of the collection to remove. If this is a 

1516 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1517 datasets within the collection are not modified unless ``unstore`` 

1518 is `True`. If this is a `~CollectionType.RUN` collection, 

1519 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1520 are fully removed from the data repository. 

1521 purge : `bool`, optional 

1522 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1523 fully removing datasets within them. Requires ``unstore=True`` as 

1524 well as an added precaution against accidental deletion. Must be 

1525 `False` (default) if the collection is not a ``RUN``. 

1526 unstore : `bool`, optional 

1527 If `True`, remove all datasets in the collection from all 

1528 datastores in which they appear. 

1529 unlink : `list` [`str`], optional 

1530 Before removing the given collection, unlink it from these 

1531 parent collections. 

1532 

1533 Raises 

1534 ------ 

1535 TypeError 

1536 Raised if the butler is read-only or arguments are mutually 

1537 inconsistent. 

1538 """ 

1539 # See pruneDatasets comments for more information about the logic here; 

1540 # the cases are almost the same, but here we can rely on Registry to 

1541 # take care of everything but Datastore deletion when we remove the 

1542 # collection. 

1543 if not self.isWriteable(): 

1544 raise TypeError("Butler is read-only.") 

1545 collectionType = self.registry.getCollectionType(name) 

1546 if purge and not unstore: 

1547 raise PurgeWithoutUnstorePruneCollectionsError() 

1548 if collectionType is CollectionType.RUN and not purge: 

1549 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1550 if collectionType is not CollectionType.RUN and purge: 

1551 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1552 

1553 def remove(child: str, parent: str) -> None: 

1554 """Remove a child collection from a parent collection.""" 

1555 # Remove child from parent. 

1556 chain = list(self.registry.getCollectionChain(parent)) 

1557 try: 

1558 chain.remove(child) 

1559 except ValueError as e: 

1560 raise RuntimeError(f"{child} is not a child of {parent}") from e 

1561 self.registry.setCollectionChain(parent, chain) 

1562 

1563 with self.registry.transaction(): 

1564 if unlink: 

1565 for parent in unlink: 

1566 remove(name, parent) 

1567 if unstore: 

1568 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1569 self.datastore.trash(refs) 

1570 self.registry.removeCollection(name) 

1571 

1572 if unstore: 

1573 # Point of no return for removing artifacts 

1574 self.datastore.emptyTrash() 

1575 
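The required flags depend on the collection type: a ``RUN`` collection must be removed with ``purge=True, unstore=True``, whereas ``TAGGED`` or ``CHAINED`` collections can simply be dropped. A sketch with hypothetical collection names::

    # Fully remove a RUN collection and every dataset inside it.
    butler.pruneCollection("u/someone/old-run", purge=True, unstore=True)

    # Drop a TAGGED collection, first unlinking it from a parent CHAINED one.
    butler.pruneCollection("my-tag", unlink=["my-chain"])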

1576 def pruneDatasets( 

1577 self, 

1578 refs: Iterable[DatasetRef], 

1579 *, 

1580 disassociate: bool = True, 

1581 unstore: bool = False, 

1582 tags: Iterable[str] = (), 

1583 purge: bool = False, 

1584 run: Optional[str] = None, 

1585 ) -> None: 

1586 """Remove one or more datasets from a collection and/or storage. 

1587 

1588 Parameters 

1589 ---------- 

1590 refs : `~collections.abc.Iterable` of `DatasetRef` 

1591 Datasets to prune. These must be "resolved" references (not just 

1592 a `DatasetType` and data ID). 

1593 disassociate : `bool`, optional 

1594 Disassociate pruned datasets from ``tags``, or from all collections 

1595 if ``purge=True``. 

1596 unstore : `bool`, optional 

1597 If `True` (`False` is default) remove these datasets from all 

1598 datastores known to this butler. Note that this will make it 

1599 impossible to retrieve these datasets even via other collections. 

1600 Datasets that are not currently stored are ignored by this option. 

1601 tags : `Iterable` [ `str` ], optional 

1602 `~CollectionType.TAGGED` collections to disassociate the datasets 

1603 from. Ignored if ``disassociate`` is `False` or ``purge`` is 

1604 `True`. 

1605 purge : `bool`, optional 

1606 If `True` (`False` is default), completely remove the dataset from 

1607 the `Registry`. To prevent accidental deletions, ``purge`` may 

1608 only be `True` if all of the following conditions are met: 

1609 

1610 - All given datasets are in the given run; 

1611 - ``disassociate`` is `True`; 

1612 - ``unstore`` is `True`. 

1613 

1614 This mode may remove provenance information from datasets other 

1615 than those provided, and should be used with extreme care. 

1616 

1617 Raises 

1618 ------ 

1619 TypeError 

1620 Raised if the butler is read-only, if no collection was provided, 

1621 or the conditions for ``purge=True`` were not met. 

1622 """ 

1623 if not self.isWriteable(): 

1624 raise TypeError("Butler is read-only.") 

1625 if purge: 

1626 if not disassociate: 

1627 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1628 if not unstore: 

1629 raise TypeError("Cannot pass purge=True without unstore=True.") 

1630 elif disassociate: 

1631 tags = tuple(tags) 

1632 if not tags: 

1633 raise TypeError("No tags provided but disassociate=True.") 

1634 for tag in tags: 

1635 collectionType = self.registry.getCollectionType(tag) 

1636 if collectionType is not CollectionType.TAGGED: 

1637 raise TypeError( 

1638 f"Cannot disassociate from collection '{tag}' " 

1639 f"of non-TAGGED type {collectionType.name}." 

1640 ) 

1641 # Transform possibly-single-pass iterable into something we can iterate 

1642 # over multiple times. 

1643 refs = list(refs) 

1644 # Pruning a component of a DatasetRef makes no sense since registry 

1645 # doesn't know about components and datastore might not store 

1646 # components in a separate file 

1647 for ref in refs: 

1648 if ref.datasetType.component(): 

1649 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1650 # We don't need an unreliable Datastore transaction for this, because 

1651 # we've been extra careful to ensure that Datastore.trash only involves 

1652 # mutating the Registry (it can _look_ at Datastore-specific things, 

1653 # but shouldn't change them), and hence all operations here are 

1654 # Registry operations. 

1655 with self.registry.transaction(): 

1656 if unstore: 

1657 self.datastore.trash(refs) 

1658 if purge: 

1659 self.registry.removeDatasets(refs) 

1660 elif disassociate: 

1661 assert tags, "Guaranteed by earlier logic in this function." 

1662 for tag in tags: 

1663 self.registry.disassociate(tag, refs) 

1664 # We've exited the Registry transaction, and apparently committed. 

1665 # (if there was an exception, everything rolled back, and it's as if 

1666 # nothing happened - and we never get here). 

1667 # Datastore artifacts are not yet gone, but they're clearly marked 

1668 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1669 # problems we can try again later, and if manual administrative 

1670 # intervention is required, it's pretty clear what that should entail: 

1671 # deleting everything on disk and in private Datastore tables that is 

1672 # in the dataset_location_trash table. 

1673 if unstore: 

1674 # Point of no return for removing artifacts 

1675 self.datastore.emptyTrash() 

1676 
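Two common patterns are untagging datasets and purging them from a run entirely; the dataset type, run, and tag names below are hypothetical::

    refs = list(butler.registry.queryDatasets("coadd", collections="u/someone/run"))

    # Remove the datasets from a TAGGED collection without touching storage.
    butler.pruneDatasets(refs, disassociate=True, tags=["my-tag"], unstore=False)

    # Permanently delete the datasets from registry and datastore.
    butler.pruneDatasets(refs, purge=True, run="u/someone/run", unstore=True)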

1677 @transactional 

1678 def ingest( 

1679 self, 

1680 *datasets: FileDataset, 

1681 transfer: Optional[str] = "auto", 

1682 run: Optional[str] = None, 

1683 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1684 ) -> None: 

1685 """Store and register one or more datasets that already exist on disk. 

1686 

1687 Parameters 

1688 ---------- 

1689 datasets : `FileDataset` 

1690 Each positional argument is a struct containing information about 

1691 a file to be ingested, including its URI (either absolute or 

1692 relative to the datastore root, if applicable), a `DatasetRef`, 

1693 and optionally a formatter class or its fully-qualified string 

1694 name. If a formatter is not provided, the formatter that would be 

1695 used for `put` is assumed. On successful return, all 

1696 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1697 attribute populated and all `FileDataset.formatter` attributes will 

1698 be set to the formatter class used. `FileDataset.path` attributes 

1699 may be modified to put paths in whatever the datastore considers a 

1700 standardized form. 

1701 transfer : `str`, optional 

1702 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1703 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1704 transfer the file. 

1705 run : `str`, optional 

1706 The name of the run ingested datasets should be added to, 

1707 overriding ``self.run``. 

1708 idGenerationMode : `DatasetIdGenEnum`, optional 

1709 Specifies option for generating dataset IDs. By default unique IDs 

1710 are generated for each inserted dataset. 

1711 

1712 Raises 

1713 ------ 

1714 TypeError 

1715 Raised if the butler is read-only or if no run was provided. 

1716 NotImplementedError 

1717 Raised if the `Datastore` does not support the given transfer mode. 

1718 DatasetTypeNotSupportedError 

1719 Raised if one or more files to be ingested have a dataset type that 

1720 is not supported by the `Datastore`. 

1721 FileNotFoundError 

1722 Raised if one of the given files does not exist. 

1723 FileExistsError 

1724 Raised if transfer is not `None` but the (internal) location the 

1725 file would be moved to is already occupied. 

1726 

1727 Notes 

1728 ----- 

1729 This operation is not fully exception safe: if a database operation 

1730 fails, the given `FileDataset` instances may be only partially updated. 

1731 

1732 It is atomic in terms of database operations (they will either all 

1733 succeed or all fail), provided that the database engine implements 

1734 transactions correctly. It will attempt to be atomic in terms of 

1735 filesystem operations as well, but this cannot be implemented 

1736 rigorously for most datastores. 

1737 """ 

1738 if not self.isWriteable(): 

1739 raise TypeError("Butler is read-only.") 

1740 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1741 # Reorganize the inputs so they're grouped by DatasetType and then 

1742 # data ID. We also include a list of DatasetRefs for each FileDataset 

1743 # to hold the resolved DatasetRefs returned by the Registry, before 

1744 # it's safe to swap them into FileDataset.refs. 

1745 # Some type annotation aliases to make that clearer: 

1746 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1747 GroupedData = MutableMapping[DatasetType, GroupForType] 

1748 # The actual data structure: 

1749 groupedData: GroupedData = defaultdict(dict) 

1750 # And the nested loop that populates it: 

1751 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1752 # This list intentionally shared across the inner loop, since it's 

1753 # associated with `dataset`. 

1754 resolvedRefs: List[DatasetRef] = [] 

1755 

1756 # Somewhere to store pre-existing refs if we have an 

1757 # execution butler. 

1758 existingRefs: List[DatasetRef] = [] 

1759 

1760 for ref in dataset.refs: 

1761 if ref.dataId in groupedData[ref.datasetType]: 

1762 raise ConflictingDefinitionError( 

1763 f"Ingest conflict. Dataset {dataset.path} has same" 

1764 " DataId as other ingest dataset" 

1765 f" {groupedData[ref.datasetType][ref.dataId][0].path} " 

1766 f" ({ref.dataId})" 

1767 ) 

1768 if self._allow_put_of_predefined_dataset: 

1769 existing_ref = self.registry.findDataset( 

1770 ref.datasetType, dataId=ref.dataId, collections=run 

1771 ) 

1772 if existing_ref: 

1773 if self.datastore.knows(existing_ref): 

1774 raise ConflictingDefinitionError( 

1775 f"Dataset associated with path {dataset.path}" 

1776 f" already exists as {existing_ref}." 

1777 ) 

1778 # Store this ref elsewhere since it already exists 

1779 # and we do not want to remake it but we do want 

1780 # to store it in the datastore. 

1781 existingRefs.append(existing_ref) 

1782 

1783 # Nothing else to do until we have finished 

1784 # iterating. 

1785 continue 

1786 

1787 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1788 

1789 if existingRefs: 

1790 

1791 if len(dataset.refs) != len(existingRefs): 

1792 # Keeping track of partially pre-existing datasets is hard 

1793 # and should generally never happen. For now don't allow 

1794 # it. 

1795 raise ConflictingDefinitionError( 

1796 f"For dataset {dataset.path} some dataIds already exist" 

1797 " in registry but others do not. This is not supported." 

1798 ) 

1799 

1800 # Attach the resolved refs if we found them. 

1801 dataset.refs = existingRefs 

1802 

1803 # Now we can bulk-insert into Registry for each DatasetType. 

1804 for datasetType, groupForType in progress.iter_item_chunks( 

1805 groupedData.items(), desc="Bulk-inserting datasets by type" 

1806 ): 

1807 refs = self.registry.insertDatasets( 

1808 datasetType, 

1809 dataIds=groupForType.keys(), 

1810 run=run, 

1811 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1812 idGenerationMode=idGenerationMode, 

1813 ) 

1814 # Append those resolved DatasetRefs to the new lists we set up for 

1815 # them. 

1816 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1817 resolvedRefs.append(ref) 

1818 

1819 # Go back to the original FileDatasets to replace their refs with the 

1820 # new resolved ones. 

1821 for groupForType in progress.iter_chunks( 

1822 groupedData.values(), desc="Reassociating resolved dataset refs with files" 

1823 ): 

1824 for dataset, resolvedRefs in groupForType.values(): 

1825 dataset.refs = resolvedRefs 

1826 

1827 # Bulk-insert everything into Datastore. 

1828 self.datastore.ingest(*datasets, transfer=transfer) 

1829 
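A sketch of ingesting a file that already exists on disk; the path, dataset type, data ID, and run name are hypothetical, and the dataset type is assumed to be registered already::

    from lsst.daf.butler import DatasetRef, FileDataset

    # "raw" and the data ID values are illustrative.
    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(datasetType, {"instrument": "HSC", "exposure": 903334,
                                   "detector": 16})
    butler.ingest(
        FileDataset(path="/data/incoming/HSC-903334-016.fits", refs=[ref]),
        transfer="symlink",
        run="HSC/raw/all",
    )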

1830 @contextlib.contextmanager 

1831 def export( 

1832 self, 

1833 *, 

1834 directory: Optional[str] = None, 

1835 filename: Optional[str] = None, 

1836 format: Optional[str] = None, 

1837 transfer: Optional[str] = None, 

1838 ) -> Iterator[RepoExportContext]: 

1839 """Export datasets from the repository represented by this `Butler`. 

1840 

1841 This method is a context manager that returns a helper object 

1842 (`RepoExportContext`) that is used to indicate what information from 

1843 the repository should be exported. 

1844 

1845 Parameters 

1846 ---------- 

1847 directory : `str`, optional 

1848 Directory dataset files should be written to if ``transfer`` is not 

1849 `None`. 

1850 filename : `str`, optional 

1851 Name for the file that will include database information associated 

1852 with the exported datasets. If this is not an absolute path and 

1853 ``directory`` is not `None`, it will be written to ``directory`` 

1854 instead of the current working directory. Defaults to 

1855 "export.{format}". 

1856 format : `str`, optional 

1857 File format for the database information file. If `None`, the 

1858 extension of ``filename`` will be used. 

1859 transfer : `str`, optional 

1860 Transfer mode passed to `Datastore.export`. 

1861 

1862 Raises 

1863 ------ 

1864 TypeError 

1865 Raised if the set of arguments passed is inconsistent. 

1866 

1867 Examples 

1868 -------- 

1869 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1870 methods are used to provide the iterables over data IDs and/or datasets 

1871 to be exported:: 

1872 

1873 with butler.export(filename="exports.yaml") as export: 

1874 # Export all flats, but none of the dimension element rows 

1875 # (i.e. data ID information) associated with them. 

1876 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1877 elements=()) 

1878 # Export all datasets that start with "deepCoadd_" and all of 

1879 # their associated data ID information. 

1880 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

1881 """ 

1882 if directory is None and transfer is not None: 

1883 raise TypeError("Cannot transfer without providing a directory.") 

1884 if transfer == "move": 

1885 raise TypeError("Transfer may not be 'move': export is read-only") 

1886 if format is None: 

1887 if filename is None: 

1888 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1889 else: 

1890 _, format = os.path.splitext(filename) 

1891 elif filename is None: 

1892 filename = f"export.{format}" 

1893 if directory is not None: 

1894 filename = os.path.join(directory, filename) 

1895 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"]) 

1896 with open(filename, "w") as stream: 

1897 backend = BackendClass(stream) 

1898 try: 

1899 helper = RepoExportContext( 

1900 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

1901 ) 

1902 yield helper 

1903 except BaseException: 

1904 raise 

1905 else: 

1906 helper._finish() 

1907 

1908 def import_( 

1909 self, 

1910 *, 

1911 directory: Optional[str] = None, 

1912 filename: Union[str, TextIO, None] = None, 

1913 format: Optional[str] = None, 

1914 transfer: Optional[str] = None, 

1915 skip_dimensions: Optional[Set] = None, 

1916 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1917 reuseIds: bool = False, 

1918 ) -> None: 

1919 """Import datasets into this repository that were exported from a 

1920 different butler repository via `~lsst.daf.butler.Butler.export`. 

1921 

1922 Parameters 

1923 ---------- 

1924 directory : `str`, optional 

1925 Directory containing dataset files to import from. If `None`, 

1926 ``filename`` and all dataset file paths specified therein must 

1927 be absolute. 

1928 filename : `str` or `TextIO`, optional 

1929 A stream or name of file that contains database information 

1930 associated with the exported datasets, typically generated by 

1931 `~lsst.daf.butler.Butler.export`. If this is a string (name) and 

1932 is not an absolute path, does not exist in the current working 

1933 directory, and ``directory`` is not `None`, it is assumed to be in 

1934 ``directory``. Defaults to "export.{format}". 

1935 format : `str`, optional 

1936 File format for ``filename``. If `None`, the extension of 

1937 ``filename`` will be used. 

1938 transfer : `str`, optional 

1939 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

1940 skip_dimensions : `set`, optional 

1941 Names of dimensions that should be skipped and not imported. 

1942 idGenerationMode : `DatasetIdGenEnum`, optional 

1943 Specifies option for generating dataset IDs when IDs are not 

1944 provided or their type does not match backend type. By default 

1945 unique IDs are generated for each inserted dataset. 

1946 reuseIds : `bool`, optional 

1947 If `True`, force re-use of imported dataset IDs for integer 

1948 IDs, which are normally generated as auto-incremented; an exception 

1949 will be raised if imported IDs clash with existing ones. This 

1950 option has no effect on the use of globally-unique IDs which are 

1951 always re-used (or generated if integer IDs are being imported). 

1952 

1953 Raises 

1954 ------ 

1955 TypeError 

1956 Raised if the set of arguments passed is inconsistent, or if the 

1957 butler is read-only. 

1958 """ 

1959 if not self.isWriteable(): 

1960 raise TypeError("Butler is read-only.") 

1961 if format is None: 

1962 if filename is None: 

1963 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1964 else: 

1965 _, format = os.path.splitext(filename) # type: ignore 

1966 elif filename is None: 

1967 filename = f"export.{format}" 

1968 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

1969 filename = os.path.join(directory, filename) 

1970 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

1971 

1972 def doImport(importStream: TextIO) -> None: 

1973 backend = BackendClass(importStream, self.registry) 

1974 backend.register() 

1975 with self.transaction(): 

1976 backend.load( 

1977 self.datastore, 

1978 directory=directory, 

1979 transfer=transfer, 

1980 skip_dimensions=skip_dimensions, 

1981 idGenerationMode=idGenerationMode, 

1982 reuseIds=reuseIds, 

1983 ) 

1984 

1985 if isinstance(filename, str): 

1986 with open(filename, "r") as stream: 

1987 doImport(stream) 

1988 else: 

1989 doImport(filename) 

1990 
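A sketch of round-tripping datasets between repositories via `export` and `import_`; the repository paths, export directory, and query are hypothetical::

    source = Butler("/path/to/source-repo")
    with source.export(directory="/tmp/export", filename="export.yaml",
                       transfer="copy") as export:
        export.saveDatasets(source.registry.queryDatasets("flat", collections=...))

    target = Butler("/path/to/target-repo", writeable=True)
    target.import_(directory="/tmp/export", filename="export.yaml",
                   transfer="auto")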

1991 def transfer_from( 

1992 self, 

1993 source_butler: Butler, 

1994 source_refs: Iterable[DatasetRef], 

1995 transfer: str = "auto", 

1996 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None, 

1997 skip_missing: bool = True, 

1998 register_dataset_types: bool = False, 

1999 ) -> List[DatasetRef]: 

2000 """Transfer datasets to this Butler from a run in another Butler. 

2001 

2002 Parameters 

2003 ---------- 

2004 source_butler : `Butler` 

2005 Butler from which the datasets are to be transferred. 

2006 source_refs : iterable of `DatasetRef` 

2007 Datasets defined in the source butler that should be transferred to 

2008 this butler. 

2009 transfer : `str`, optional 

2010 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2011 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional 

2012 A mapping of dataset type to ID generation mode. Only used if 

2013 the source butler is using integer IDs. Should not be used 

2014 if this receiving butler uses integer IDs. Without this, dataset 

2015 import always uses `DatasetIdGenEnum.UNIQUE`. 

2016 skip_missing : `bool` 

2017 If `True`, datasets with no datastore artifact associated with 

2018 them are not transferred. If `False` a registry entry will be 

2019 created even if no datastore record is created (and so will 

2020 look equivalent to the dataset being unstored). 

2021 register_dataset_types : `bool` 

2022 If `True` any missing dataset types are registered. Otherwise 

2023 an exception is raised. 

2024 

2025 Returns 

2026 ------- 

2027 refs : `list` of `DatasetRef` 

2028 The refs added to this Butler. 

2029 

2030 Notes 

2031 ----- 

2032 Requires that any dimension definitions are already present in the 

2033 receiving Butler. The datastore artifact has to exist for a transfer 

2034 to be made but non-existence is not an error. 

2035 

2036 Datasets that already exist in this run will be skipped. 

2037 

2038 The datasets are imported as part of a transaction, although 

2039 dataset types are registered before the transaction is started. 

2040 This means that it is possible for a dataset type to be registered 

2041 even though transfer has failed. 

2042 """ 

2043 if not self.isWriteable(): 

2044 raise TypeError("Butler is read-only.") 

2045 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2046 

2047 # Will iterate through the refs multiple times so need to convert 

2048 # to a list if this isn't a collection. 

2049 if not isinstance(source_refs, collections.abc.Collection): 

2050 source_refs = list(source_refs) 

2051 

2052 original_count = len(source_refs) 

2053 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2054 

2055 if id_gen_map is None: 

2056 id_gen_map = {} 

2057 

2058 # In some situations the datastore artifact may be missing 

2059 # and we do not want that registry entry to be imported. 

2060 # Asking datastore is not sufficient, the records may have been 

2061 # purged, we have to ask for the (predicted) URI and check 

2062 # existence explicitly. Execution butler is set up exactly like 

2063 # this with no datastore records. 

2064 artifact_existence: Dict[ButlerURI, bool] = {} 

2065 if skip_missing: 

2066 dataset_existence = source_butler.datastore.mexists( 

2067 source_refs, artifact_existence=artifact_existence 

2068 ) 

2069 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2070 filtered_count = len(source_refs) 

2071 log.verbose( 

2072 "%d datasets removed because the artifact does not exist. Now have %d.", 

2073 original_count - filtered_count, 

2074 filtered_count, 

2075 ) 

2076 

2077 # Importing requires that we group the refs by dataset type and run 

2078 # before doing the import. 

2079 source_dataset_types = set() 

2080 grouped_refs = defaultdict(list) 

2081 grouped_indices = defaultdict(list) 

2082 for i, ref in enumerate(source_refs): 

2083 grouped_refs[ref.datasetType, ref.run].append(ref) 

2084 grouped_indices[ref.datasetType, ref.run].append(i) 

2085 source_dataset_types.add(ref.datasetType) 

2086 

2087 # Check to see if the dataset type in the source butler has 

2088 # the same definition in the target butler and register missing 

2089 # ones if requested. Registration must happen outside a transaction. 

2090 newly_registered_dataset_types = set() 

2091 for datasetType in source_dataset_types: 

2092 if register_dataset_types: 

2093 # Let this raise immediately if inconsistent. Continuing 

2094 # on to find additional inconsistent dataset types 

2095 # might result in additional unwanted dataset types being 

2096 # registered. 

2097 if self.registry.registerDatasetType(datasetType): 

2098 newly_registered_dataset_types.add(datasetType) 

2099 else: 

2100 # If the dataset type is missing, let it fail immediately. 

2101 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2102 if target_dataset_type != datasetType: 

2103 raise ConflictingDefinitionError( 

2104 "Source butler dataset type differs from definition" 

2105 f" in target butler: {datasetType} !=" 

2106 f" {target_dataset_type}" 

2107 ) 

2108 if newly_registered_dataset_types: 

2109 # We may have registered some even if there were inconsistencies 

2110 # but should let people know (or else remove them again). 

2111 log.log( 

2112 VERBOSE, 

2113 "Registered the following dataset types in the target Butler: %s", 

2114 ", ".join(d.name for d in newly_registered_dataset_types), 

2115 ) 

2116 else: 

2117 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2118 

2119 # The returned refs should be identical for UUIDs. 

2121 # For now we must also support integers and so need to retain the 

2121 # newly-created refs from this registry. 

2122 # Pre-size it so we can assign refs into the correct slots 

2123 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs) 

2124 default_id_gen = DatasetIdGenEnum.UNIQUE 

2125 

2126 handled_collections: Set[str] = set() 

2127 

2128 # Do all the importing in a single transaction. 

2129 with self.transaction(): 

2130 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2131 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2132 ): 

2133 if run not in handled_collections: 

2134 run_doc = source_butler.registry.getCollectionDocumentation(run) 

2135 registered = self.registry.registerRun(run, doc=run_doc) 

2136 handled_collections.add(run) 

2137 if registered: 

2138 log.log(VERBOSE, "Creating output run %s", run) 

2139 

2140 id_generation_mode = default_id_gen 

2141 if isinstance(refs_to_import[0].id, int): 

2142 # ID generation mode might need to be overridden when 

2143 # targeting UUID 

2144 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen) 

2145 

2146 n_refs = len(refs_to_import) 

2147 log.verbose( 

2148 "Importing %d ref%s of dataset type %s into run %s", 

2149 n_refs, 

2150 "" if n_refs == 1 else "s", 

2151 datasetType.name, 

2152 run, 

2153 ) 

2154 

2155 # No way to know if this butler's registry uses UUID. 

2156 # We have to trust the caller on this. If it fails they will 

2157 # have to change their approach. We can't catch the exception 

2158 # and retry with unique because that will mess up the 

2159 # transaction handling. We aren't allowed to ask the registry 

2160 # manager what type of ID it is using. 

2161 imported_refs = self.registry._importDatasets( 

2162 refs_to_import, idGenerationMode=id_generation_mode, expand=False 

2163 ) 

2164 

2165 # Map them into the correct slots to match the initial order 

2166 for i, ref in zip(grouped_indices[datasetType, run], imported_refs): 

2167 transferred_refs_tmp[i] = ref 

2168 

2169 # Mypy insists that we might have None in here so we have to make 

2170 # that explicit by assigning to a new variable and filtering out 

2171 # something that won't be there. 

2172 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None] 

2173 

2174 # Check consistency 

2175 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given" 

2176 

2177 log.verbose("Imported %d datasets into destination butler", len(transferred_refs)) 

2178 

2179 # The transferred refs were slotted into place above to match the 

2180 # original ordering given by the caller; without that the datastore 

2181 # transfer would be broken. 

2182 

2183 # Ask the datastore to transfer. The datastore has to check that 

2184 # the source datastore is compatible with the target datastore. 

2185 self.datastore.transfer_from( 

2186 source_butler.datastore, 

2187 source_refs, 

2188 local_refs=transferred_refs, 

2189 transfer=transfer, 

2190 artifact_existence=artifact_existence, 

2191 ) 

2192 

2193 return transferred_refs 

2194 
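A sketch of transferring selected datasets directly between two butlers; the repository paths, dataset type, and collection name are hypothetical::

    source = Butler("/path/to/source-repo")
    target = Butler("/path/to/target-repo", writeable=True)

    refs = source.registry.queryDatasets("calexp", collections="HSC/runs/RC2")
    transferred = target.transfer_from(
        source,
        refs,
        transfer="copy",
        register_dataset_types=True,  # create any missing dataset types
    )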

2195 def validateConfiguration( 

2196 self, 

2197 logFailures: bool = False, 

2198 datasetTypeNames: Optional[Iterable[str]] = None, 

2199 ignore: Optional[Iterable[str]] = None, 

2200 ) -> None: 

2201 """Validate butler configuration. 

2202 

2203 Checks that each `DatasetType` can be stored in the `Datastore`. 

2204 

2205 Parameters 

2206 ---------- 

2207 logFailures : `bool`, optional 

2208 If `True`, output a log message for every validation error 

2209 detected. 

2210 datasetTypeNames : iterable of `str`, optional 

2211 The `DatasetType` names that should be checked. This allows 

2212 only a subset to be selected. 

2213 ignore : iterable of `str`, optional 

2214 Names of DatasetTypes to skip over. This can be used to skip 

2215 known problems. If a named `DatasetType` corresponds to a 

2216 composite, all components of that `DatasetType` will also be 

2217 ignored. 

2218 

2219 Raises 

2220 ------ 

2221 ButlerValidationError 

2222 Raised if there is some inconsistency with how this Butler 

2223 is configured. 

2224 """ 

2225 if datasetTypeNames: 

2226 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2227 else: 

2228 datasetTypes = list(self.registry.queryDatasetTypes()) 

2229 

2230 # filter out anything from the ignore list 

2231 if ignore: 

2232 ignore = set(ignore) 

2233 datasetTypes = [ 

2234 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2235 ] 

2236 else: 

2237 ignore = set() 

2238 

2239 # Find all the registered instruments 

2240 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2241 

2242 # For each datasetType that has an instrument dimension, create 

2243 # a DatasetRef for each defined instrument 

2244 datasetRefs = [] 

2245 

2246 for datasetType in datasetTypes: 

2247 if "instrument" in datasetType.dimensions: 

2248 for instrument in instruments: 

2249 datasetRef = DatasetRef( 

2250 datasetType, {"instrument": instrument}, conform=False # type: ignore 

2251 ) 

2252 datasetRefs.append(datasetRef) 

2253 

2254 entities: List[Union[DatasetType, DatasetRef]] = [] 

2255 entities.extend(datasetTypes) 

2256 entities.extend(datasetRefs) 

2257 

2258 datastoreErrorStr = None 

2259 try: 

2260 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2261 except ValidationError as e: 

2262 datastoreErrorStr = str(e) 

2263 

2264 # Also check that the LookupKeys used by the datastores match 

2265 # registry and storage class definitions 

2266 keys = self.datastore.getLookupKeys() 

2267 

2268 failedNames = set() 

2269 failedDataId = set() 

2270 for key in keys: 

2271 if key.name is not None: 

2272 if key.name in ignore: 

2273 continue 

2274 

2275 # skip if specific datasetType names were requested and this 

2276 # name does not match 

2277 if datasetTypeNames and key.name not in datasetTypeNames: 

2278 continue 

2279 

2280 # See if it is a StorageClass or a DatasetType 

2281 if key.name in self.storageClasses: 

2282 pass 

2283 else: 

2284 try: 

2285 self.registry.getDatasetType(key.name) 

2286 except KeyError: 

2287 if logFailures: 

2288 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2289 failedNames.add(key) 

2290 else: 

2291 # Dimensions are checked for consistency when the Butler 

2292 # is created and rendezvoused with a universe. 

2293 pass 

2294 

2295 # Check that the instrument is a valid instrument 

2296 # Currently only support instrument so check for that 

2297 if key.dataId: 

2298 dataIdKeys = set(key.dataId) 

2299 if set(["instrument"]) != dataIdKeys: 

2300 if logFailures: 

2301 log.critical("Key '%s' has unsupported DataId override", key) 

2302 failedDataId.add(key) 

2303 elif key.dataId["instrument"] not in instruments: 

2304 if logFailures: 

2305 log.critical("Key '%s' has unknown instrument", key) 

2306 failedDataId.add(key) 

2307 

2308 messages = [] 

2309 

2310 if datastoreErrorStr: 

2311 messages.append(datastoreErrorStr) 

2312 

2313 for failed, msg in ( 

2314 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2315 (failedDataId, "Keys with bad DataId entries: "), 

2316 ): 

2317 if failed: 

2318 msg += ", ".join(str(k) for k in failed) 

2319 messages.append(msg) 

2320 

2321 if messages: 

2322 raise ValidationError(";\n".join(messages)) 

2323 
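Validation is usually run after changing datastore or registry configuration; a sketch, where the ignored dataset type name is illustrative::

    from lsst.daf.butler import ValidationError

    try:
        butler.validateConfiguration(logFailures=True, ignore=["packages"])
    except ValidationError as err:
        print(f"Butler configuration problems:\n{err}")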

2324 @property 

2325 def collections(self) -> CollectionSearch: 

2326 """The collections to search by default, in order (`CollectionSearch`). 

2327 

2328 This is an alias for ``self.registry.defaults.collections``. It cannot 

2329 be set directly in isolation, but all defaults may be changed together 

2330 by assigning a new `RegistryDefaults` instance to 

2331 ``self.registry.defaults``. 

2332 """ 

2333 return self.registry.defaults.collections 

2334 

2335 @property 

2336 def run(self) -> Optional[str]: 

2337 """Name of the run this butler writes outputs to by default (`str` or 

2338 `None`). 

2339 

2340 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2341 directly in isolation, but all defaults may be changed together by 

2342 assigning a new `RegistryDefaults` instance to 

2343 ``self.registry.defaults``. 

2344 """ 

2345 return self.registry.defaults.run 

2346 
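Because ``collections`` and ``run`` are read-only aliases, changing the defaults means assigning a complete `RegistryDefaults` instance; the collection and run names here are hypothetical::

    from lsst.daf.butler.registry import RegistryDefaults

    butler.registry.defaults = RegistryDefaults(
        collections=["HSC/defaults"], run="u/someone/new-run"
    )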

2347 registry: Registry 

2348 """The object that manages dataset metadata and relationships (`Registry`). 

2349 

2350 Most operations that don't involve reading or writing butler datasets are 

2351 accessible only via `Registry` methods. 

2352 """ 

2353 

2354 datastore: Datastore 

2355 """The object that manages actual dataset storage (`Datastore`). 

2356 

2357 Direct user access to the datastore should rarely be necessary; the primary 

2358 exception is the case where a `Datastore` implementation provides extra 

2359 functionality beyond what the base class defines. 

2360 """ 

2361 

2362 storageClasses: StorageClassFactory 

2363 """An object that maps known storage class names to objects that fully 

2364 describe them (`StorageClassFactory`). 

2365 """ 

2366 

2367 _allow_put_of_predefined_dataset: bool 

2368 """Allow a put to succeed even if there is already a registry entry for it 

2369 but not a datastore record. (`bool`)."""