Coverage for python/lsst/daf/butler/_butler.py: 10%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

598 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36import collections.abc 

37from collections import defaultdict 

38import contextlib 

39import logging 

40import numbers 

41import os 

42from typing import ( 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59try: 

60 import boto3 

61except ImportError: 

62 boto3 = None 

63 

64from lsst.utils import doImportType 

65from lsst.utils.introspection import get_class_of 

66from lsst.utils.logging import getLogger, VERBOSE 

67from .core import ( 

68 AmbiguousDatasetError, 

69 ButlerURI, 

70 Config, 

71 ConfigSubset, 

72 DataCoordinate, 

73 DataId, 

74 DataIdValue, 

75 DatasetRef, 

76 DatasetType, 

77 Datastore, 

78 Dimension, 

79 DimensionConfig, 

80 FileDataset, 

81 Progress, 

82 StorageClassFactory, 

83 Timespan, 

84 ValidationError, 

85) 

86from .core.repoRelocation import BUTLER_ROOT_TAG 

87from .core.utils import transactional 

88from ._deferredDatasetHandle import DeferredDatasetHandle 

89from ._butlerConfig import ButlerConfig 

90from ._butlerRepoIndex import ButlerRepoIndex 

91from .registry import ( 

92 Registry, 

93 RegistryConfig, 

94 RegistryDefaults, 

95 CollectionSearch, 

96 CollectionType, 

97 ConflictingDefinitionError, 

98 DatasetIdGenEnum, 

99) 

100from .transfers import RepoExportContext 

101 

102log = getLogger(__name__) 

103 

104 

105class ButlerValidationError(ValidationError): 

106 """There is a problem with the Butler configuration.""" 

107 pass 

108 

109 

110class PruneCollectionsArgsError(TypeError): 

111 """Base class for errors relating to Butler.pruneCollections input 

112 arguments. 

113 """ 

114 pass 

115 

116 

117class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

118 """Raised when purge and unstore are both required to be True, and 

119 purge is True but unstore is False. 

120 """ 

121 

122 def __init__(self) -> None: 

123 super().__init__("Cannot pass purge=True without unstore=True.") 

124 

125 

126class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

127 """Raised when pruning a RUN collection but purge is False.""" 

128 

129 def __init__(self, collectionType: CollectionType): 

130 self.collectionType = collectionType 

131 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

132 

133 

134class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

135 """Raised when purge is True but is not supported for the given 

136 collection.""" 

137 

138 def __init__(self, collectionType: CollectionType): 

139 self.collectionType = collectionType 

140 super().__init__( 

141 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.") 

142 

143 

144class Butler: 

145 """Main entry point for the data access system. 

146 

147 Parameters 

148 ---------- 

149 config : `ButlerConfig`, `Config` or `str`, optional. 

150 Configuration. Anything acceptable to the 

151 `ButlerConfig` constructor. If a directory path 

152 is given the configuration will be read from a ``butler.yaml`` file in 

153 that location. If `None` is given default values will be used. 

154 butler : `Butler`, optional. 

155 If provided, construct a new Butler that uses the same registry and 

156 datastore as the given one, but with the given collection and run. 

157 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

158 arguments. 

159 collections : `str` or `Iterable` [ `str` ], optional 

160 An expression specifying the collections to be searched (in order) when 

161 reading datasets. 

162 This may be a `str` collection name or an iterable thereof. 

163 See :ref:`daf_butler_collection_expressions` for more information. 

164 These collections are not registered automatically and must be 

165 manually registered before they are used by any method, but they may be 

166 manually registered after the `Butler` is initialized. 

167 run : `str`, optional 

168 Name of the `~CollectionType.RUN` collection new datasets should be 

169 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

170 ``collections`` will be set to ``[run]``. If not `None`, this 

171 collection will automatically be registered. If this is not set (and 

172 ``writeable`` is not set either), a read-only butler will be created. 

173 searchPaths : `list` of `str`, optional 

174 Directory paths to search when calculating the full Butler 

175 configuration. Not used if the supplied config is already a 

176 `ButlerConfig`. 

177 writeable : `bool`, optional 

178 Explicitly sets whether the butler supports write operations. If not 

179 provided, a read-write butler is created if any of ``run``, ``tags``, 

180 or ``chains`` is non-empty. 

181 inferDefaults : `bool`, optional 

182 If `True` (default) infer default data ID values from the values 

183 present in the datasets in ``collections``: if all collections have the 

184 same value (or no value) for a governor dimension, that value will be 

185 the default for that dimension. Nonexistent collections are ignored. 

186 If a default value is provided explicitly for a governor dimension via 

187 ``**kwargs``, no default will be inferred for that dimension. 

188 **kwargs : `str` 

189 Default data ID key-value pairs. These may only identify "governor" 

190 dimensions like ``instrument`` and ``skymap``. 

191 

192 Examples 

193 -------- 

194 While there are many ways to control exactly how a `Butler` interacts with 

195 the collections in its `Registry`, the most common cases are still simple. 

196 

197 For a read-only `Butler` that searches one collection, do:: 

198 

199 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

200 

201 For a read-write `Butler` that writes to and reads from a 

202 `~CollectionType.RUN` collection:: 

203 

204 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

205 

206 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

207 because we want to write to one `~CollectionType.RUN` collection but read 

208 from several others (as well):: 

209 

210 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

211 collections=["u/alice/DM-50000/a", 

212 "u/bob/DM-49998", 

213 "HSC/defaults"]) 

214 

215 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

216 Datasets will be read first from that run (since it appears first in the 

217 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

218 

219 Finally, one can always create a `Butler` with no collections:: 

220 

221 butler = Butler("/path/to/repo", writeable=True) 

222 

223 This can be extremely useful when you just want to use ``butler.registry``, 

224 e.g. for inserting dimension data or managing collections, or when the 

225 collections you want to use with the butler are not consistent. 

226 Passing ``writeable`` explicitly here is only necessary if you want to be 

227 able to make changes to the repo - usually the value for ``writeable`` can 

228 be guessed from the collection arguments provided, but it defaults to 

229 `False` when there are not collection arguments. 

230 """ 

231 def __init__(self, config: Union[Config, str, None] = None, *, 

232 butler: Optional[Butler] = None, 

233 collections: Any = None, 

234 run: Optional[str] = None, 

235 searchPaths: Optional[List[str]] = None, 

236 writeable: Optional[bool] = None, 

237 inferDefaults: bool = True, 

238 **kwargs: str, 

239 ): 

240 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

241 # Load registry, datastore, etc. from config or existing butler. 

242 if butler is not None: 

243 if config is not None or searchPaths is not None or writeable is not None: 

244 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' " 

245 "arguments with 'butler' argument.") 

246 self.registry = butler.registry.copy(defaults) 

247 self.datastore = butler.datastore 

248 self.storageClasses = butler.storageClasses 

249 self._config: ButlerConfig = butler._config 

250 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

251 else: 

252 self._config = ButlerConfig(config, searchPaths=searchPaths) 

253 try: 

254 if "root" in self._config: 

255 butlerRoot = self._config["root"] 

256 else: 

257 butlerRoot = self._config.configDir 

258 if writeable is None: 

259 writeable = run is not None 

260 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable, 

261 defaults=defaults) 

262 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(), 

263 butlerRoot=butlerRoot) 

264 self.storageClasses = StorageClassFactory() 

265 self.storageClasses.addFromConfig(self._config) 

266 self._allow_put_of_predefined_dataset = self._config.get("allow_put_of_predefined_dataset", 

267 False) 

268 except Exception: 

269 # Failures here usually mean that configuration is incomplete, 

270 # just issue an error message which includes config file URI. 

271 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

272 raise 

273 

274 if "run" in self._config or "collection" in self._config: 

275 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

276 

277 GENERATION: ClassVar[int] = 3 

278 """This is a Generation 3 Butler. 

279 

280 This attribute may be removed in the future, once the Generation 2 Butler 

281 interface has been fully retired; it should only be used in transitional 

282 code. 

283 """ 

284 

285 @classmethod 

286 def get_repo_uri(cls, label: str) -> ButlerURI: 

287 """Look up the label in a butler repository index. 

288 

289 Parameters 

290 ---------- 

291 label : `str` 

292 Label of the Butler repository to look up. 

293 

294 Returns 

295 ------- 

296 uri : `ButlerURI` 

297 URI to the Butler repository associated with the given label. 

298 

299 Raises 

300 ------ 

301 KeyError 

302 Raised if the label is not found in the index, or if an index 

303 can not be found at all. 

304 

305 Notes 

306 ----- 

307 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

308 information is discovered. 

309 """ 

310 return ButlerRepoIndex.get_repo_uri(label) 

311 

312 @classmethod 

313 def get_known_repos(cls) -> Set[str]: 

314 """Retrieve the list of known repository labels. 

315 

316 Returns 

317 ------- 

318 repos : `set` of `str` 

319 All the known labels. Can be empty if no index can be found. 

320 

321 Notes 

322 ----- 

323 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

324 information is discovered. 

325 """ 

326 return ButlerRepoIndex.get_known_repos() 

327 

328 @staticmethod 

329 def makeRepo(root: str, config: Union[Config, str, None] = None, 

330 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False, 

331 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True, 

332 outfile: Optional[str] = None, overwrite: bool = False) -> Config: 

333 """Create an empty data repository by adding a butler.yaml config 

334 to a repository root directory. 

335 

336 Parameters 

337 ---------- 

338 root : `str` or `ButlerURI` 

339 Path or URI to the root location of the new repository. Will be 

340 created if it does not exist. 

341 config : `Config` or `str`, optional 

342 Configuration to write to the repository, after setting any 

343 root-dependent Registry or Datastore config options. Can not 

344 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

345 configuration will be used. Root-dependent config options 

346 specified in this config are overwritten if ``forceConfigRoot`` 

347 is `True`. 

348 dimensionConfig : `Config` or `str`, optional 

349 Configuration for dimensions, will be used to initialize registry 

350 database. 

351 standalone : `bool` 

352 If True, write all expanded defaults, not just customized or 

353 repository-specific settings. 

354 This (mostly) decouples the repository from the default 

355 configuration, insulating it from changes to the defaults (which 

356 may be good or bad, depending on the nature of the changes). 

357 Future *additions* to the defaults will still be picked up when 

358 initializing `Butlers` to repos created with ``standalone=True``. 

359 searchPaths : `list` of `str`, optional 

360 Directory paths to search when calculating the full butler 

361 configuration. 

362 forceConfigRoot : `bool`, optional 

363 If `False`, any values present in the supplied ``config`` that 

364 would normally be reset are not overridden and will appear 

365 directly in the output config. This allows non-standard overrides 

366 of the root directory for a datastore or registry to be given. 

367 If this parameter is `True` the values for ``root`` will be 

368 forced into the resulting config if appropriate. 

369 outfile : `str`, optional 

370 If not-`None`, the output configuration will be written to this 

371 location rather than into the repository itself. Can be a URI 

372 string. Can refer to a directory that will be used to write 

373 ``butler.yaml``. 

374 overwrite : `bool`, optional 

375 Create a new configuration file even if one already exists 

376 in the specified output location. Default is to raise 

377 an exception. 

378 

379 Returns 

380 ------- 

381 config : `Config` 

382 The updated `Config` instance written to the repo. 

383 

384 Raises 

385 ------ 

386 ValueError 

387 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

388 regular Config (as these subclasses would make it impossible to 

389 support ``standalone=False``). 

390 FileExistsError 

391 Raised if the output config file already exists. 

392 os.error 

393 Raised if the directory does not exist, exists but is not a 

394 directory, or cannot be created. 

395 

396 Notes 

397 ----- 

398 Note that when ``standalone=False`` (the default), the configuration 

399 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

400 construct the repository should also be used to construct any Butlers 

401 to avoid configuration inconsistencies. 

402 """ 

403 if isinstance(config, (ButlerConfig, ConfigSubset)): 

404 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

405 

406 # Ensure that the root of the repository exists or can be made 

407 uri = ButlerURI(root, forceDirectory=True) 

408 uri.mkdir() 

409 

410 config = Config(config) 

411 

412 # If we are creating a new repo from scratch with relative roots, 

413 # do not propagate an explicit root from the config file 

414 if "root" in config: 

415 del config["root"] 

416 

417 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

418 imported_class = doImportType(full["datastore", "cls"]) 

419 if not issubclass(imported_class, Datastore): 

420 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

421 datastoreClass: Type[Datastore] = imported_class 

422 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

423 

424 # if key exists in given config, parse it, otherwise parse the defaults 

425 # in the expanded config 

426 if config.get(("registry", "db")): 

427 registryConfig = RegistryConfig(config) 

428 else: 

429 registryConfig = RegistryConfig(full) 

430 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

431 if defaultDatabaseUri is not None: 

432 Config.updateParameters(RegistryConfig, config, full, 

433 toUpdate={"db": defaultDatabaseUri}, 

434 overwrite=forceConfigRoot) 

435 else: 

436 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), 

437 overwrite=forceConfigRoot) 

438 

439 if standalone: 

440 config.merge(full) 

441 else: 

442 # Always expand the registry.managers section into the per-repo 

443 # config, because after the database schema is created, it's not 

444 # allowed to change anymore. Note that in the standalone=True 

445 # branch, _everything_ in the config is expanded, so there's no 

446 # need to special case this. 

447 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

448 configURI: Union[str, ButlerURI] 

449 if outfile is not None: 

450 # When writing to a separate location we must include 

451 # the root of the butler repo in the config else it won't know 

452 # where to look. 

453 config["root"] = uri.geturl() 

454 configURI = outfile 

455 else: 

456 configURI = uri 

457 config.dumpToUri(configURI, overwrite=overwrite) 

458 

459 # Create Registry and populate tables 

460 registryConfig = RegistryConfig(config.get("registry")) 

461 dimensionConfig = DimensionConfig(dimensionConfig) 

462 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root) 

463 

464 log.verbose("Wrote new Butler configuration file to %s", configURI) 

465 

466 return config 

467 

468 @classmethod 

469 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str], 

470 defaultDataId: Dict[str, str], writeable: bool) -> Butler: 

471 """Callable used to unpickle a Butler. 

472 

473 We prefer not to use ``Butler.__init__`` directly so we can force some 

474 of its many arguments to be keyword-only (note that ``__reduce__`` 

475 can only invoke callables with positional arguments). 

476 

477 Parameters 

478 ---------- 

479 config : `ButlerConfig` 

480 Butler configuration, already coerced into a true `ButlerConfig` 

481 instance (and hence after any search paths for overrides have been 

482 utilized). 

483 collections : `CollectionSearch` 

484 Names of the default collections to read from. 

485 run : `str`, optional 

486 Name of the default `~CollectionType.RUN` collection to write to. 

487 defaultDataId : `dict` [ `str`, `str` ] 

488 Default data ID values. 

489 writeable : `bool` 

490 Whether the Butler should support write operations. 

491 

492 Returns 

493 ------- 

494 butler : `Butler` 

495 A new `Butler` instance. 

496 """ 

497 # MyPy doesn't recognize that the kwargs below are totally valid; it 

498 # seems to think '**defaultDataId* is a _positional_ argument! 

499 return cls(config=config, collections=collections, run=run, writeable=writeable, 

500 **defaultDataId) # type: ignore 

501 

502 def __reduce__(self) -> tuple: 

503 """Support pickling. 

504 """ 

505 return (Butler._unpickle, (self._config, self.collections, self.run, 

506 self.registry.defaults.dataId.byName(), 

507 self.registry.isWriteable())) 

508 

509 def __str__(self) -> str: 

510 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

511 self.collections, self.run, self.datastore, self.registry) 

512 

513 def isWriteable(self) -> bool: 

514 """Return `True` if this `Butler` supports write operations. 

515 """ 

516 return self.registry.isWriteable() 

517 

518 @contextlib.contextmanager 

519 def transaction(self) -> Iterator[None]: 

520 """Context manager supporting `Butler` transactions. 

521 

522 Transactions can be nested. 

523 """ 

524 with self.registry.transaction(): 

525 with self.datastore.transaction(): 

526 yield 

527 

528 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

529 dataId: Optional[DataId] = None, **kwargs: Any 

530 ) -> Tuple[DatasetType, Optional[DataId]]: 

531 """Standardize the arguments passed to several Butler APIs. 

532 

533 Parameters 

534 ---------- 

535 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

536 When `DatasetRef` the `dataId` should be `None`. 

537 Otherwise the `DatasetType` or name thereof. 

538 dataId : `dict` or `DataCoordinate` 

539 A `dict` of `Dimension` link name, value pairs that label the 

540 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

541 should be provided as the second argument. 

542 **kwargs 

543 Additional keyword arguments used to augment or construct a 

544 `DataCoordinate`. See `DataCoordinate.standardize` 

545 parameters. 

546 

547 Returns 

548 ------- 

549 datasetType : `DatasetType` 

550 A `DatasetType` instance extracted from ``datasetRefOrType``. 

551 dataId : `dict` or `DataId`, optional 

552 Argument that can be used (along with ``kwargs``) to construct a 

553 `DataId`. 

554 

555 Notes 

556 ----- 

557 Butler APIs that conceptually need a DatasetRef also allow passing a 

558 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

559 keyword arguments that can be used to construct one) separately. This 

560 method accepts those arguments and always returns a true `DatasetType` 

561 and a `DataId` or `dict`. 

562 

563 Standardization of `dict` vs `DataId` is best handled by passing the 

564 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

565 generally similarly flexible. 

566 """ 

567 externalDatasetType: Optional[DatasetType] = None 

568 internalDatasetType: Optional[DatasetType] = None 

569 if isinstance(datasetRefOrType, DatasetRef): 

570 if dataId is not None or kwargs: 

571 raise ValueError("DatasetRef given, cannot use dataId as well") 

572 externalDatasetType = datasetRefOrType.datasetType 

573 dataId = datasetRefOrType.dataId 

574 else: 

575 # Don't check whether DataId is provided, because Registry APIs 

576 # can usually construct a better error message when it wasn't. 

577 if isinstance(datasetRefOrType, DatasetType): 

578 externalDatasetType = datasetRefOrType 

579 else: 

580 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

581 

582 # Check that they are self-consistent 

583 if externalDatasetType is not None: 

584 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

585 if externalDatasetType != internalDatasetType: 

586 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

587 f"registry definition ({internalDatasetType})") 

588 

589 assert internalDatasetType is not None 

590 return internalDatasetType, dataId 

591 

592 def _rewrite_data_id(self, dataId: Optional[DataId], datasetType: DatasetType, 

593 **kwargs: Any) -> Tuple[Optional[DataId], Dict[str, Any]]: 

594 """Rewrite a data ID taking into account dimension records. 

595 

596 Take a Data ID and keyword args and rewrite it if necessary to 

597 allow the user to specify dimension records rather than dimension 

598 primary values. 

599 

600 This allows a user to include a dataId dict with keys of 

601 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

602 the integer exposure ID. It also allows a string to be given 

603 for a dimension value rather than the integer ID if that is more 

604 convenient. For example, rather than having to specifyin the 

605 detector with ``detector.full_name``, a string given for ``detector`` 

606 will be interpreted as the full name and converted to the integer 

607 value. 

608 

609 Keyword arguments can also use strings for dimensions like detector 

610 and exposure but python does not allow them to include ``.`` and 

611 so the ``exposure.day_obs`` syntax can not be used in a keyword 

612 argument. 

613 

614 Parameters 

615 ---------- 

616 dataId : `dict` or `DataCoordinate` 

617 A `dict` of `Dimension` link name, value pairs that will label the 

618 `DatasetRef` within a Collection. 

619 datasetType : `DatasetType` 

620 The dataset type associated with this dataId. Required to 

621 determine the relevant dimensions. 

622 **kwargs 

623 Additional keyword arguments used to augment or construct a 

624 `DataId`. See `DataId` parameters. 

625 

626 Returns 

627 ------- 

628 dataId : `dict` or `DataCoordinate` 

629 The, possibly rewritten, dataId. If given a `DataCoordinate` and 

630 no keyword arguments, the original dataId will be returned 

631 unchanged. 

632 **kwargs : `dict` 

633 Any unused keyword arguments. 

634 """ 

635 # Do nothing if we have a standalone DataCoordinate. 

636 if isinstance(dataId, DataCoordinate) and not kwargs: 

637 return dataId, kwargs 

638 

639 # Process dimension records that are using record information 

640 # rather than ids 

641 newDataId: Dict[str, DataIdValue] = {} 

642 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

643 

644 # if all the dataId comes from keyword parameters we do not need 

645 # to do anything here because they can't be of the form 

646 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

647 if dataId: 

648 for k, v in dataId.items(): 

649 # If we have a Dimension we do not need to do anything 

650 # because it cannot be a compound key. 

651 if isinstance(k, str) and "." in k: 

652 # Someone is using a more human-readable dataId 

653 dimensionName, record = k.split(".", 1) 

654 byRecord[dimensionName][record] = v 

655 elif isinstance(k, Dimension): 

656 newDataId[k.name] = v 

657 else: 

658 newDataId[k] = v 

659 

660 # Go through the updated dataId and check the type in case someone is 

661 # using an alternate key. We have already filtered out the compound 

662 # keys dimensions.record format. 

663 not_dimensions = {} 

664 

665 # Will need to look in the dataId and the keyword arguments 

666 # and will remove them if they need to be fixed or are unrecognized. 

667 for dataIdDict in (newDataId, kwargs): 

668 # Use a list so we can adjust the dict safely in the loop 

669 for dimensionName in list(dataIdDict): 

670 value = dataIdDict[dimensionName] 

671 try: 

672 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

673 except KeyError: 

674 # This is not a real dimension 

675 not_dimensions[dimensionName] = value 

676 del dataIdDict[dimensionName] 

677 continue 

678 

679 # Convert an integral type to an explicit int to simplify 

680 # comparisons here 

681 if isinstance(value, numbers.Integral): 

682 value = int(value) 

683 

684 if not isinstance(value, dimension.primaryKey.getPythonType()): 

685 for alternate in dimension.alternateKeys: 

686 if isinstance(value, alternate.getPythonType()): 

687 byRecord[dimensionName][alternate.name] = value 

688 del dataIdDict[dimensionName] 

689 log.debug("Converting dimension %s to %s.%s=%s", 

690 dimensionName, dimensionName, alternate.name, value) 

691 break 

692 else: 

693 log.warning("Type mismatch found for value '%r' provided for dimension %s. " 

694 "Could not find matching alternative (primary key has type %s) " 

695 "so attempting to use as-is.", 

696 value, dimensionName, dimension.primaryKey.getPythonType()) 

697 

698 # If we have some unrecognized dimensions we have to try to connect 

699 # them to records in other dimensions. This is made more complicated 

700 # by some dimensions having records with clashing names. A mitigation 

701 # is that we can tell by this point which dimensions are missing 

702 # for the DatasetType but this does not work for calibrations 

703 # where additional dimensions can be used to constrain the temporal 

704 # axis. 

705 if not_dimensions: 

706 # Calculate missing dimensions 

707 provided = set(newDataId) | set(kwargs) | set(byRecord) 

708 missingDimensions = datasetType.dimensions.names - provided 

709 

710 # For calibrations we may well be needing temporal dimensions 

711 # so rather than always including all dimensions in the scan 

712 # restrict things a little. It is still possible for there 

713 # to be confusion over day_obs in visit vs exposure for example. 

714 # If we are not searching calibration collections things may 

715 # fail but they are going to fail anyway because of the 

716 # ambiguousness of the dataId... 

717 candidateDimensions: Set[str] = set() 

718 candidateDimensions.update(missingDimensions) 

719 if datasetType.isCalibration(): 

720 for dim in self.registry.dimensions.getStaticDimensions(): 

721 if dim.temporal: 

722 candidateDimensions.add(str(dim)) 

723 

724 # Look up table for the first association with a dimension 

725 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

726 

727 # Keep track of whether an item is associated with multiple 

728 # dimensions. 

729 counter: Counter[str] = Counter() 

730 assigned: Dict[str, Set[str]] = defaultdict(set) 

731 

732 # Go through the missing dimensions and associate the 

733 # given names with records within those dimensions 

734 for dimensionName in candidateDimensions: 

735 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

736 fields = dimension.metadata.names | dimension.uniqueKeys.names 

737 for field in not_dimensions: 

738 if field in fields: 

739 guessedAssociation[dimensionName][field] = not_dimensions[field] 

740 counter[dimensionName] += 1 

741 assigned[field].add(dimensionName) 

742 

743 # There is a chance we have allocated a single dataId item 

744 # to multiple dimensions. Need to decide which should be retained. 

745 # For now assume that the most popular alternative wins. 

746 # This means that day_obs with seq_num will result in 

747 # exposure.day_obs and not visit.day_obs 

748 # Also prefer an explicitly missing dimension over an inferred 

749 # temporal dimension. 

750 for fieldName, assignedDimensions in assigned.items(): 

751 if len(assignedDimensions) > 1: 

752 # Pick the most popular (preferring mandatory dimensions) 

753 requiredButMissing = assignedDimensions.intersection(missingDimensions) 

754 if requiredButMissing: 

755 candidateDimensions = requiredButMissing 

756 else: 

757 candidateDimensions = assignedDimensions 

758 

759 # Select the relevant items and get a new restricted 

760 # counter. 

761 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

762 duplicatesCounter: Counter[str] = Counter() 

763 duplicatesCounter.update(theseCounts) 

764 

765 # Choose the most common. If they are equally common 

766 # we will pick the one that was found first. 

767 # Returns a list of tuples 

768 selected = duplicatesCounter.most_common(1)[0][0] 

769 

770 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

771 " Removed ambiguity by choosing dimension %s.", 

772 fieldName, ", ".join(assignedDimensions), selected) 

773 

774 for candidateDimension in assignedDimensions: 

775 if candidateDimension != selected: 

776 del guessedAssociation[candidateDimension][fieldName] 

777 

778 # Update the record look up dict with the new associations 

779 for dimensionName, values in guessedAssociation.items(): 

780 if values: # A dict might now be empty 

781 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", 

782 dimensionName, values) 

783 byRecord[dimensionName].update(values) 

784 

785 if byRecord: 

786 # Some record specifiers were found so we need to convert 

787 # them to the Id form 

788 for dimensionName, values in byRecord.items(): 

789 if dimensionName in newDataId: 

790 log.warning("DataId specified explicit %s dimension value of %s in addition to" 

791 " general record specifiers for it of %s. Ignoring record information.", 

792 dimensionName, newDataId[dimensionName], str(values)) 

793 continue 

794 

795 # Build up a WHERE expression 

796 bind = {k: v for k, v in values.items()} 

797 where = " AND ".join(f"{dimensionName}.{k} = {k}" 

798 for k in bind) 

799 

800 # Hopefully we get a single record that matches 

801 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId, 

802 where=where, bind=bind, **kwargs)) 

803 

804 if len(records) != 1: 

805 if len(records) > 1: 

806 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

807 for r in records: 

808 log.debug("- %s", str(r)) 

809 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not" 

810 f" uniquely constrained to a single dataset by {values}." 

811 f" Got {len(records)} results.") 

812 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no" 

813 f" records when constrained by {values}") 

814 

815 # Get the primary key from the real dimension object 

816 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

817 if not isinstance(dimension, Dimension): 

818 raise RuntimeError( 

819 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

820 ) 

821 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

822 

823 # We have modified the dataId so need to switch to it 

824 dataId = newDataId 

825 

826 return dataId, kwargs 

827 

828 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

829 dataId: Optional[DataId] = None, *, 

830 collections: Any = None, 

831 allowUnresolved: bool = False, 

832 **kwargs: Any) -> DatasetRef: 

833 """Shared logic for methods that start with a search for a dataset in 

834 the registry. 

835 

836 Parameters 

837 ---------- 

838 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

839 When `DatasetRef` the `dataId` should be `None`. 

840 Otherwise the `DatasetType` or name thereof. 

841 dataId : `dict` or `DataCoordinate`, optional 

842 A `dict` of `Dimension` link name, value pairs that label the 

843 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

844 should be provided as the first argument. 

845 collections : Any, optional 

846 Collections to be searched, overriding ``self.collections``. 

847 Can be any of the types supported by the ``collections`` argument 

848 to butler construction. 

849 allowUnresolved : `bool`, optional 

850 If `True`, return an unresolved `DatasetRef` if finding a resolved 

851 one in the `Registry` fails. Defaults to `False`. 

852 **kwargs 

853 Additional keyword arguments used to augment or construct a 

854 `DataId`. See `DataId` parameters. 

855 

856 Returns 

857 ------- 

858 ref : `DatasetRef` 

859 A reference to the dataset identified by the given arguments. 

860 

861 Raises 

862 ------ 

863 LookupError 

864 Raised if no matching dataset exists in the `Registry` (and 

865 ``allowUnresolved is False``). 

866 ValueError 

867 Raised if a resolved `DatasetRef` was passed as an input, but it 

868 differs from the one found in the registry. 

869 TypeError 

870 Raised if no collections were provided. 

871 """ 

872 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

873 if isinstance(datasetRefOrType, DatasetRef): 

874 idNumber = datasetRefOrType.id 

875 else: 

876 idNumber = None 

877 timespan: Optional[Timespan] = None 

878 

879 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

880 

881 if datasetType.isCalibration(): 

882 # Because this is a calibration dataset, first try to make a 

883 # standardize the data ID without restricting the dimensions to 

884 # those of the dataset type requested, because there may be extra 

885 # dimensions that provide temporal information for a validity-range 

886 # lookup. 

887 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions, 

888 defaults=self.registry.defaults.dataId, **kwargs) 

889 if dataId.graph.temporal: 

890 dataId = self.registry.expandDataId(dataId) 

891 timespan = dataId.timespan 

892 else: 

893 # Standardize the data ID to just the dimensions of the dataset 

894 # type instead of letting registry.findDataset do it, so we get the 

895 # result even if no dataset is found. 

896 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions, 

897 defaults=self.registry.defaults.dataId, **kwargs) 

898 # Always lookup the DatasetRef, even if one is given, to ensure it is 

899 # present in the current collection. 

900 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

901 if ref is None: 

902 if allowUnresolved: 

903 return DatasetRef(datasetType, dataId) 

904 else: 

905 if collections is None: 

906 collections = self.registry.defaults.collections 

907 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} " 

908 f"could not be found in collections {collections}.") 

909 if idNumber is not None and idNumber != ref.id: 

910 if collections is None: 

911 collections = self.registry.defaults.collections 

912 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match " 

913 f"id ({ref.id}) in registry in collections {collections}.") 

914 return ref 

915 

916 @transactional 

917 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

918 dataId: Optional[DataId] = None, *, 

919 run: Optional[str] = None, 

920 **kwargs: Any) -> DatasetRef: 

921 """Store and register a dataset. 

922 

923 Parameters 

924 ---------- 

925 obj : `object` 

926 The dataset. 

927 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

928 When `DatasetRef` is provided, ``dataId`` should be `None`. 

929 Otherwise the `DatasetType` or name thereof. 

930 dataId : `dict` or `DataCoordinate` 

931 A `dict` of `Dimension` link name, value pairs that label the 

932 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

933 should be provided as the second argument. 

934 run : `str`, optional 

935 The name of the run the dataset should be added to, overriding 

936 ``self.run``. 

937 **kwargs 

938 Additional keyword arguments used to augment or construct a 

939 `DataCoordinate`. See `DataCoordinate.standardize` 

940 parameters. 

941 

942 Returns 

943 ------- 

944 ref : `DatasetRef` 

945 A reference to the stored dataset, updated with the correct id if 

946 given. 

947 

948 Raises 

949 ------ 

950 TypeError 

951 Raised if the butler is read-only or if no run has been provided. 

952 """ 

953 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

954 if not self.isWriteable(): 

955 raise TypeError("Butler is read-only.") 

956 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

957 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

958 raise ValueError("DatasetRef must not be in registry, must have None id") 

959 

960 # Handle dimension records in dataId 

961 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

962 

963 # Add Registry Dataset entry. 

964 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

965 

966 # For an execution butler the datasets will be pre-defined. 

967 # If the butler is configured that way datasets should only be inserted 

968 # if they do not already exist in registry. Trying and catching 

969 # ConflictingDefinitionError will not work because the transaction 

970 # will be corrupted. Instead, in this mode always check first. 

971 ref = None 

972 ref_is_predefined = False 

973 if self._allow_put_of_predefined_dataset: 

974 # Get the matching ref for this run. 

975 ref = self.registry.findDataset(datasetType, collections=run, 

976 dataId=dataId) 

977 

978 if ref: 

979 # Must be expanded form for datastore templating 

980 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

981 ref = ref.expanded(dataId) 

982 ref_is_predefined = True 

983 

984 if not ref: 

985 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

986 

987 # If the ref is predefined it is possible that the datastore also 

988 # has the record. Asking datastore to put it again will result in 

989 # the artifact being recreated, overwriting previous, then will cause 

990 # a failure in writing the record which will cause the artifact 

991 # to be removed. Much safer to ask first before attempting to 

992 # overwrite. Race conditions should not be an issue for the 

993 # execution butler environment. 

994 if ref_is_predefined: 

995 if self.datastore.knows(ref): 

996 raise ConflictingDefinitionError(f"Dataset associated {ref} already exists.") 

997 

998 self.datastore.put(obj, ref) 

999 

1000 return ref 

1001 

1002 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any: 

1003 """Retrieve a stored dataset. 

1004 

1005 Unlike `Butler.get`, this method allows datasets outside the Butler's 

1006 collection to be read as long as the `DatasetRef` that identifies them 

1007 can be obtained separately. 

1008 

1009 Parameters 

1010 ---------- 

1011 ref : `DatasetRef` 

1012 Resolved reference to an already stored dataset. 

1013 parameters : `dict` 

1014 Additional StorageClass-defined options to control reading, 

1015 typically used to efficiently read only a subset of the dataset. 

1016 

1017 Returns 

1018 ------- 

1019 obj : `object` 

1020 The dataset. 

1021 """ 

1022 return self.datastore.get(ref, parameters=parameters) 

1023 

1024 def getDirectDeferred(self, ref: DatasetRef, *, 

1025 parameters: Union[dict, None] = None) -> DeferredDatasetHandle: 

1026 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1027 from a resolved `DatasetRef`. 

1028 

1029 Parameters 

1030 ---------- 

1031 ref : `DatasetRef` 

1032 Resolved reference to an already stored dataset. 

1033 parameters : `dict` 

1034 Additional StorageClass-defined options to control reading, 

1035 typically used to efficiently read only a subset of the dataset. 

1036 

1037 Returns 

1038 ------- 

1039 obj : `DeferredDatasetHandle` 

1040 A handle which can be used to retrieve a dataset at a later time. 

1041 

1042 Raises 

1043 ------ 

1044 AmbiguousDatasetError 

1045 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

1046 """ 

1047 if ref.id is None: 

1048 raise AmbiguousDatasetError( 

1049 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1050 ) 

1051 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1052 

1053 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1054 dataId: Optional[DataId] = None, *, 

1055 parameters: Union[dict, None] = None, 

1056 collections: Any = None, 

1057 **kwargs: Any) -> DeferredDatasetHandle: 

1058 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1059 after an immediate registry lookup. 

1060 

1061 Parameters 

1062 ---------- 

1063 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1064 When `DatasetRef` the `dataId` should be `None`. 

1065 Otherwise the `DatasetType` or name thereof. 

1066 dataId : `dict` or `DataCoordinate`, optional 

1067 A `dict` of `Dimension` link name, value pairs that label the 

1068 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1069 should be provided as the first argument. 

1070 parameters : `dict` 

1071 Additional StorageClass-defined options to control reading, 

1072 typically used to efficiently read only a subset of the dataset. 

1073 collections : Any, optional 

1074 Collections to be searched, overriding ``self.collections``. 

1075 Can be any of the types supported by the ``collections`` argument 

1076 to butler construction. 

1077 **kwargs 

1078 Additional keyword arguments used to augment or construct a 

1079 `DataId`. See `DataId` parameters. 

1080 

1081 Returns 

1082 ------- 

1083 obj : `DeferredDatasetHandle` 

1084 A handle which can be used to retrieve a dataset at a later time. 

1085 

1086 Raises 

1087 ------ 

1088 LookupError 

1089 Raised if no matching dataset exists in the `Registry` (and 

1090 ``allowUnresolved is False``). 

1091 ValueError 

1092 Raised if a resolved `DatasetRef` was passed as an input, but it 

1093 differs from the one found in the registry. 

1094 TypeError 

1095 Raised if no collections were provided. 

1096 """ 

1097 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1098 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1099 

1100 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1101 dataId: Optional[DataId] = None, *, 

1102 parameters: Optional[Dict[str, Any]] = None, 

1103 collections: Any = None, 

1104 **kwargs: Any) -> Any: 

1105 """Retrieve a stored dataset. 

1106 

1107 Parameters 

1108 ---------- 

1109 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1110 When `DatasetRef` the `dataId` should be `None`. 

1111 Otherwise the `DatasetType` or name thereof. 

1112 dataId : `dict` or `DataCoordinate` 

1113 A `dict` of `Dimension` link name, value pairs that label the 

1114 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1115 should be provided as the first argument. 

1116 parameters : `dict` 

1117 Additional StorageClass-defined options to control reading, 

1118 typically used to efficiently read only a subset of the dataset. 

1119 collections : Any, optional 

1120 Collections to be searched, overriding ``self.collections``. 

1121 Can be any of the types supported by the ``collections`` argument 

1122 to butler construction. 

1123 **kwargs 

1124 Additional keyword arguments used to augment or construct a 

1125 `DataCoordinate`. See `DataCoordinate.standardize` 

1126 parameters. 

1127 

1128 Returns 

1129 ------- 

1130 obj : `object` 

1131 The dataset. 

1132 

1133 Raises 

1134 ------ 

1135 ValueError 

1136 Raised if a resolved `DatasetRef` was passed as an input, but it 

1137 differs from the one found in the registry. 

1138 LookupError 

1139 Raised if no matching dataset exists in the `Registry`. 

1140 TypeError 

1141 Raised if no collections were provided. 

1142 

1143 Notes 

1144 ----- 

1145 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1146 this method requires that the given data ID include temporal dimensions 

1147 beyond the dimensions of the dataset type itself, in order to find the 

1148 dataset with the appropriate validity range. For example, a "bias" 

1149 dataset with native dimensions ``{instrument, detector}`` could be 

1150 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1151 ``exposure`` is a temporal dimension. 

1152 """ 

1153 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1154 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1155 return self.getDirect(ref, parameters=parameters) 

1156 

1157 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1158 dataId: Optional[DataId] = None, *, 

1159 predict: bool = False, 

1160 collections: Any = None, 

1161 run: Optional[str] = None, 

1162 **kwargs: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1163 """Returns the URIs associated with the dataset. 

1164 

1165 Parameters 

1166 ---------- 

1167 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1168 When `DatasetRef` the `dataId` should be `None`. 

1169 Otherwise the `DatasetType` or name thereof. 

1170 dataId : `dict` or `DataCoordinate` 

1171 A `dict` of `Dimension` link name, value pairs that label the 

1172 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1173 should be provided as the first argument. 

1174 predict : `bool` 

1175 If `True`, allow URIs to be returned of datasets that have not 

1176 been written. 

1177 collections : Any, optional 

1178 Collections to be searched, overriding ``self.collections``. 

1179 Can be any of the types supported by the ``collections`` argument 

1180 to butler construction. 

1181 run : `str`, optional 

1182 Run to use for predictions, overriding ``self.run``. 

1183 **kwargs 

1184 Additional keyword arguments used to augment or construct a 

1185 `DataCoordinate`. See `DataCoordinate.standardize` 

1186 parameters. 

1187 

1188 Returns 

1189 ------- 

1190 primary : `ButlerURI` 

1191 The URI to the primary artifact associated with this dataset. 

1192 If the dataset was disassembled within the datastore this 

1193 may be `None`. 

1194 components : `dict` 

1195 URIs to any components associated with the dataset artifact. 

1196 Can be empty if there are no components. 

1197 """ 

1198 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict, 

1199 collections=collections, **kwargs) 

1200 if ref.id is None: # only possible if predict is True 

1201 if run is None: 

1202 run = self.run 

1203 if run is None: 

1204 raise TypeError("Cannot predict location with run=None.") 

1205 # Lie about ID, because we can't guess it, and only 

1206 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1207 ref = ref.resolved(id=0, run=run) 

1208 return self.datastore.getURIs(ref, predict) 

1209 

1210 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1211 dataId: Optional[DataId] = None, *, 

1212 predict: bool = False, 

1213 collections: Any = None, 

1214 run: Optional[str] = None, 

1215 **kwargs: Any) -> ButlerURI: 

1216 """Return the URI to the Dataset. 

1217 

1218 Parameters 

1219 ---------- 

1220 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1221 When `DatasetRef` the `dataId` should be `None`. 

1222 Otherwise the `DatasetType` or name thereof. 

1223 dataId : `dict` or `DataCoordinate` 

1224 A `dict` of `Dimension` link name, value pairs that label the 

1225 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1226 should be provided as the first argument. 

1227 predict : `bool` 

1228 If `True`, allow URIs to be returned of datasets that have not 

1229 been written. 

1230 collections : Any, optional 

1231 Collections to be searched, overriding ``self.collections``. 

1232 Can be any of the types supported by the ``collections`` argument 

1233 to butler construction. 

1234 run : `str`, optional 

1235 Run to use for predictions, overriding ``self.run``. 

1236 **kwargs 

1237 Additional keyword arguments used to augment or construct a 

1238 `DataCoordinate`. See `DataCoordinate.standardize` 

1239 parameters. 

1240 

1241 Returns 

1242 ------- 

1243 uri : `ButlerURI` 

1244 URI pointing to the Dataset within the datastore. If the 

1245 Dataset does not exist in the datastore, and if ``predict`` is 

1246 `True`, the URI will be a prediction and will include a URI 

1247 fragment "#predicted". 

1248 If the datastore does not have entities that relate well 

1249 to the concept of a URI the returned URI string will be 

1250 descriptive. The returned URI is not guaranteed to be obtainable. 

1251 

1252 Raises 

1253 ------ 

1254 LookupError 

1255 A URI has been requested for a dataset that does not exist and 

1256 guessing is not allowed. 

1257 ValueError 

1258 Raised if a resolved `DatasetRef` was passed as an input, but it 

1259 differs from the one found in the registry. 

1260 TypeError 

1261 Raised if no collections were provided. 

1262 RuntimeError 

1263 Raised if a URI is requested for a dataset that consists of 

1264 multiple artifacts. 

1265 """ 

1266 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict, 

1267 collections=collections, run=run, **kwargs) 

1268 

1269 if primary is None or components: 

1270 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1271 "Use Butler.getURIs() instead.") 

1272 return primary 

1273 

1274 def retrieveArtifacts(self, refs: Iterable[DatasetRef], 

1275 destination: Union[str, ButlerURI], transfer: str = "auto", 

1276 preserve_path: bool = True, 

1277 overwrite: bool = False) -> List[ButlerURI]: 

1278 """Retrieve the artifacts associated with the supplied refs. 

1279 

1280 Parameters 

1281 ---------- 

1282 refs : iterable of `DatasetRef` 

1283 The datasets for which artifacts are to be retrieved. 

1284 A single ref can result in multiple artifacts. The refs must 

1285 be resolved. 

1286 destination : `ButlerURI` or `str` 

1287 Location to write the artifacts. 

1288 transfer : `str`, optional 

1289 Method to use to transfer the artifacts. Must be one of the options 

1290 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

1291 preserve_path : `bool`, optional 

1292 If `True` the full path of the artifact within the datastore 

1293 is preserved. If `False` the final file component of the path 

1294 is used. 

1295 overwrite : `bool`, optional 

1296 If `True` allow transfers to overwrite existing files at the 

1297 destination. 

1298 

1299 Returns 

1300 ------- 

1301 targets : `list` of `ButlerURI` 

1302 URIs of file artifacts in destination location. Order is not 

1303 preserved. 

1304 

1305 Notes 

1306 ----- 

1307 For non-file datastores the artifacts written to the destination 

1308 may not match the representation inside the datastore. For example 

1309 a hierarchical data structure in a NoSQL database may well be stored 

1310 as a JSON file. 

1311 """ 

1312 return self.datastore.retrieveArtifacts(refs, ButlerURI(destination), transfer=transfer, 

1313 preserve_path=preserve_path, overwrite=overwrite) 

1314 

1315 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1316 dataId: Optional[DataId] = None, *, 

1317 collections: Any = None, 

1318 **kwargs: Any) -> bool: 

1319 """Return True if the Dataset is actually present in the Datastore. 

1320 

1321 Parameters 

1322 ---------- 

1323 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1324 When `DatasetRef` the `dataId` should be `None`. 

1325 Otherwise the `DatasetType` or name thereof. 

1326 dataId : `dict` or `DataCoordinate` 

1327 A `dict` of `Dimension` link name, value pairs that label the 

1328 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1329 should be provided as the first argument. 

1330 collections : Any, optional 

1331 Collections to be searched, overriding ``self.collections``. 

1332 Can be any of the types supported by the ``collections`` argument 

1333 to butler construction. 

1334 **kwargs 

1335 Additional keyword arguments used to augment or construct a 

1336 `DataCoordinate`. See `DataCoordinate.standardize` 

1337 parameters. 

1338 

1339 Raises 

1340 ------ 

1341 LookupError 

1342 Raised if the dataset is not even present in the Registry. 

1343 ValueError 

1344 Raised if a resolved `DatasetRef` was passed as an input, but it 

1345 differs from the one found in the registry. 

1346 TypeError 

1347 Raised if no collections were provided. 

1348 """ 

1349 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1350 return self.datastore.exists(ref) 

1351 

1352 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1353 """Remove one or more `~CollectionType.RUN` collections and the 

1354 datasets within them. 

1355 

1356 Parameters 

1357 ---------- 

1358 names : `Iterable` [ `str` ] 

1359 The names of the collections to remove. 

1360 unstore : `bool`, optional 

1361 If `True` (default), delete datasets from all datastores in which 

1362 they are present, and attempt to rollback the registry deletions if 

1363 datastore deletions fail (which may not always be possible). If 

1364 `False`, datastore records for these datasets are still removed, 

1365 but any artifacts (e.g. files) will not be. 

1366 

1367 Raises 

1368 ------ 

1369 TypeError 

1370 Raised if one or more collections are not of type 

1371 `~CollectionType.RUN`. 

1372 """ 

1373 if not self.isWriteable(): 

1374 raise TypeError("Butler is read-only.") 

1375 names = list(names) 

1376 refs: List[DatasetRef] = [] 

1377 for name in names: 

1378 collectionType = self.registry.getCollectionType(name) 

1379 if collectionType is not CollectionType.RUN: 

1380 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1381 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1382 with self.registry.transaction(): 

1383 if unstore: 

1384 self.datastore.trash(refs) 

1385 else: 

1386 self.datastore.forget(refs) 

1387 for name in names: 

1388 self.registry.removeCollection(name) 

1389 if unstore: 

1390 # Point of no return for removing artifacts 

1391 self.datastore.emptyTrash() 

1392 

1393 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False, 

1394 unlink: Optional[List[str]] = None) -> None: 

1395 """Remove a collection and possibly prune datasets within it. 

1396 

1397 Parameters 

1398 ---------- 

1399 name : `str` 

1400 Name of the collection to remove. If this is a 

1401 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1402 datasets within the collection are not modified unless ``unstore`` 

1403 is `True`. If this is a `~CollectionType.RUN` collection, 

1404 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1405 are fully removed from the data repository. 

1406 purge : `bool`, optional 

1407 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1408 fully removing datasets within them. Requires ``unstore=True`` as 

1409 well as an added precaution against accidental deletion. Must be 

1410 `False` (default) if the collection is not a ``RUN``. 

1411 unstore: `bool`, optional 

1412 If `True`, remove all datasets in the collection from all 

1413 datastores in which they appear. 

1414 unlink: `list` [`str`], optional 

1415 Before removing the given `collection` unlink it from from these 

1416 parent collections. 

1417 

1418 Raises 

1419 ------ 

1420 TypeError 

1421 Raised if the butler is read-only or arguments are mutually 

1422 inconsistent. 

1423 """ 

1424 # See pruneDatasets comments for more information about the logic here; 

1425 # the cases are almost the same, but here we can rely on Registry to 

1426 # take care everything but Datastore deletion when we remove the 

1427 # collection. 

1428 if not self.isWriteable(): 

1429 raise TypeError("Butler is read-only.") 

1430 collectionType = self.registry.getCollectionType(name) 

1431 if purge and not unstore: 

1432 raise PurgeWithoutUnstorePruneCollectionsError() 

1433 if collectionType is CollectionType.RUN and not purge: 

1434 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1435 if collectionType is not CollectionType.RUN and purge: 

1436 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1437 

1438 def remove(child: str, parent: str) -> None: 

1439 """Remove a child collection from a parent collection.""" 

1440 # Remove child from parent. 

1441 chain = list(self.registry.getCollectionChain(parent)) 

1442 try: 

1443 chain.remove(name) 

1444 except ValueError as e: 

1445 raise RuntimeError(f"{name} is not a child of {parent}") from e 

1446 self.registry.setCollectionChain(parent, chain) 

1447 

1448 with self.registry.transaction(): 

1449 if (unlink): 

1450 for parent in unlink: 

1451 remove(name, parent) 

1452 if unstore: 

1453 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1454 self.datastore.trash(refs) 

1455 self.registry.removeCollection(name) 

1456 

1457 if unstore: 

1458 # Point of no return for removing artifacts 

1459 self.datastore.emptyTrash() 

1460 

1461 def pruneDatasets(self, refs: Iterable[DatasetRef], *, 

1462 disassociate: bool = True, 

1463 unstore: bool = False, 

1464 tags: Iterable[str] = (), 

1465 purge: bool = False, 

1466 run: Optional[str] = None) -> None: 

1467 """Remove one or more datasets from a collection and/or storage. 

1468 

1469 Parameters 

1470 ---------- 

1471 refs : `~collections.abc.Iterable` of `DatasetRef` 

1472 Datasets to prune. These must be "resolved" references (not just 

1473 a `DatasetType` and data ID). 

1474 disassociate : `bool`, optional 

1475 Disassociate pruned datasets from ``tags``, or from all collections 

1476 if ``purge=True``. 

1477 unstore : `bool`, optional 

1478 If `True` (`False` is default) remove these datasets from all 

1479 datastores known to this butler. Note that this will make it 

1480 impossible to retrieve these datasets even via other collections. 

1481 Datasets that are already not stored are ignored by this option. 

1482 tags : `Iterable` [ `str` ], optional 

1483 `~CollectionType.TAGGED` collections to disassociate the datasets 

1484 from. Ignored if ``disassociate`` is `False` or ``purge`` is 

1485 `True`. 

1486 purge : `bool`, optional 

1487 If `True` (`False` is default), completely remove the dataset from 

1488 the `Registry`. To prevent accidental deletions, ``purge`` may 

1489 only be `True` if all of the following conditions are met: 

1490 

1491 - All given datasets are in the given run. 

1492 - ``disassociate`` is `True`; 

1493 - ``unstore`` is `True`. 

1494 

1495 This mode may remove provenance information from datasets other 

1496 than those provided, and should be used with extreme care. 

1497 

1498 Raises 

1499 ------ 

1500 TypeError 

1501 Raised if the butler is read-only, if no collection was provided, 

1502 or the conditions for ``purge=True`` were not met. 

1503 """ 

1504 if not self.isWriteable(): 

1505 raise TypeError("Butler is read-only.") 

1506 if purge: 

1507 if not disassociate: 

1508 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1509 if not unstore: 

1510 raise TypeError("Cannot pass purge=True without unstore=True.") 

1511 elif disassociate: 

1512 tags = tuple(tags) 

1513 if not tags: 

1514 raise TypeError("No tags provided but disassociate=True.") 

1515 for tag in tags: 

1516 collectionType = self.registry.getCollectionType(tag) 

1517 if collectionType is not CollectionType.TAGGED: 

1518 raise TypeError(f"Cannot disassociate from collection '{tag}' " 

1519 f"of non-TAGGED type {collectionType.name}.") 

1520 # Transform possibly-single-pass iterable into something we can iterate 

1521 # over multiple times. 

1522 refs = list(refs) 

1523 # Pruning a component of a DatasetRef makes no sense since registry 

1524 # doesn't know about components and datastore might not store 

1525 # components in a separate file 

1526 for ref in refs: 

1527 if ref.datasetType.component(): 

1528 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1529 # We don't need an unreliable Datastore transaction for this, because 

1530 # we've been extra careful to ensure that Datastore.trash only involves 

1531 # mutating the Registry (it can _look_ at Datastore-specific things, 

1532 # but shouldn't change them), and hence all operations here are 

1533 # Registry operations. 

1534 with self.registry.transaction(): 

1535 if unstore: 

1536 self.datastore.trash(refs) 

1537 if purge: 

1538 self.registry.removeDatasets(refs) 

1539 elif disassociate: 

1540 assert tags, "Guaranteed by earlier logic in this function." 

1541 for tag in tags: 

1542 self.registry.disassociate(tag, refs) 

1543 # We've exited the Registry transaction, and apparently committed. 

1544 # (if there was an exception, everything rolled back, and it's as if 

1545 # nothing happened - and we never get here). 

1546 # Datastore artifacts are not yet gone, but they're clearly marked 

1547 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1548 # problems we can try again later, and if manual administrative 

1549 # intervention is required, it's pretty clear what that should entail: 

1550 # deleting everything on disk and in private Datastore tables that is 

1551 # in the dataset_location_trash table. 

1552 if unstore: 

1553 # Point of no return for removing artifacts 

1554 self.datastore.emptyTrash() 

1555 

1556 @transactional 

1557 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None, 

1558 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1559 ) -> None: 

1560 """Store and register one or more datasets that already exist on disk. 

1561 

1562 Parameters 

1563 ---------- 

1564 datasets : `FileDataset` 

1565 Each positional argument is a struct containing information about 

1566 a file to be ingested, including its URI (either absolute or 

1567 relative to the datastore root, if applicable), a `DatasetRef`, 

1568 and optionally a formatter class or its fully-qualified string 

1569 name. If a formatter is not provided, the formatter that would be 

1570 used for `put` is assumed. On successful return, all 

1571 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1572 attribute populated and all `FileDataset.formatter` attributes will 

1573 be set to the formatter class used. `FileDataset.path` attributes 

1574 may be modified to put paths in whatever the datastore considers a 

1575 standardized form. 

1576 transfer : `str`, optional 

1577 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1578 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1579 transfer the file. 

1580 run : `str`, optional 

1581 The name of the run ingested datasets should be added to, 

1582 overriding ``self.run``. 

1583 idGenerationMode : `DatasetIdGenEnum`, optional 

1584 Specifies option for generating dataset IDs. By default unique IDs 

1585 are generated for each inserted dataset. 

1586 

1587 Raises 

1588 ------ 

1589 TypeError 

1590 Raised if the butler is read-only or if no run was provided. 

1591 NotImplementedError 

1592 Raised if the `Datastore` does not support the given transfer mode. 

1593 DatasetTypeNotSupportedError 

1594 Raised if one or more files to be ingested have a dataset type that 

1595 is not supported by the `Datastore`.. 

1596 FileNotFoundError 

1597 Raised if one of the given files does not exist. 

1598 FileExistsError 

1599 Raised if transfer is not `None` but the (internal) location the 

1600 file would be moved to is already occupied. 

1601 

1602 Notes 

1603 ----- 

1604 This operation is not fully exception safe: if a database operation 

1605 fails, the given `FileDataset` instances may be only partially updated. 

1606 

1607 It is atomic in terms of database operations (they will either all 

1608 succeed or all fail) providing the database engine implements 

1609 transactions correctly. It will attempt to be atomic in terms of 

1610 filesystem operations as well, but this cannot be implemented 

1611 rigorously for most datastores. 

1612 """ 

1613 if not self.isWriteable(): 

1614 raise TypeError("Butler is read-only.") 

1615 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1616 # Reorganize the inputs so they're grouped by DatasetType and then 

1617 # data ID. We also include a list of DatasetRefs for each FileDataset 

1618 # to hold the resolved DatasetRefs returned by the Registry, before 

1619 # it's safe to swap them into FileDataset.refs. 

1620 # Some type annotation aliases to make that clearer: 

1621 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1622 GroupedData = MutableMapping[DatasetType, GroupForType] 

1623 # The actual data structure: 

1624 groupedData: GroupedData = defaultdict(dict) 

1625 # And the nested loop that populates it: 

1626 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1627 # This list intentionally shared across the inner loop, since it's 

1628 # associated with `dataset`. 

1629 resolvedRefs: List[DatasetRef] = [] 

1630 

1631 # Somewhere to store pre-existing refs if we have an 

1632 # execution butler. 

1633 existingRefs: List[DatasetRef] = [] 

1634 

1635 for ref in dataset.refs: 

1636 if ref.dataId in groupedData[ref.datasetType]: 

1637 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has same" 

1638 " DataId as other ingest dataset" 

1639 f" {groupedData[ref.datasetType][ref.dataId][0].path} " 

1640 f" ({ref.dataId})") 

1641 if self._allow_put_of_predefined_dataset: 

1642 existing_ref = self.registry.findDataset(ref.datasetType, 

1643 dataId=ref.dataId, 

1644 collections=run) 

1645 if existing_ref: 

1646 if self.datastore.knows(existing_ref): 

1647 raise ConflictingDefinitionError(f"Dataset associated with path {dataset.path}" 

1648 f" already exists as {existing_ref}.") 

1649 # Store this ref elsewhere since it already exists 

1650 # and we do not want to remake it but we do want 

1651 # to store it in the datastore. 

1652 existingRefs.append(existing_ref) 

1653 

1654 # Nothing else to do until we have finished 

1655 # iterating. 

1656 continue 

1657 

1658 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1659 

1660 if existingRefs: 

1661 

1662 if len(dataset.refs) != len(existingRefs): 

1663 # Keeping track of partially pre-existing datasets is hard 

1664 # and should generally never happen. For now don't allow 

1665 # it. 

1666 raise ConflictingDefinitionError(f"For dataset {dataset.path} some dataIds already exist" 

1667 " in registry but others do not. This is not supported.") 

1668 

1669 # Attach the resolved refs if we found them. 

1670 dataset.refs = existingRefs 

1671 

1672 # Now we can bulk-insert into Registry for each DatasetType. 

1673 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(), 

1674 desc="Bulk-inserting datasets by type"): 

1675 refs = self.registry.insertDatasets( 

1676 datasetType, 

1677 dataIds=groupForType.keys(), 

1678 run=run, 

1679 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1680 idGenerationMode=idGenerationMode, 

1681 ) 

1682 # Append those resolved DatasetRefs to the new lists we set up for 

1683 # them. 

1684 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1685 resolvedRefs.append(ref) 

1686 

1687 # Go back to the original FileDatasets to replace their refs with the 

1688 # new resolved ones. 

1689 for groupForType in progress.iter_chunks(groupedData.values(), 

1690 desc="Reassociating resolved dataset refs with files"): 

1691 for dataset, resolvedRefs in groupForType.values(): 

1692 dataset.refs = resolvedRefs 

1693 

1694 # Bulk-insert everything into Datastore. 

1695 self.datastore.ingest(*datasets, transfer=transfer) 

1696 

1697 @contextlib.contextmanager 

1698 def export(self, *, directory: Optional[str] = None, 

1699 filename: Optional[str] = None, 

1700 format: Optional[str] = None, 

1701 transfer: Optional[str] = None) -> Iterator[RepoExportContext]: 

1702 """Export datasets from the repository represented by this `Butler`. 

1703 

1704 This method is a context manager that returns a helper object 

1705 (`RepoExportContext`) that is used to indicate what information from 

1706 the repository should be exported. 

1707 

1708 Parameters 

1709 ---------- 

1710 directory : `str`, optional 

1711 Directory dataset files should be written to if ``transfer`` is not 

1712 `None`. 

1713 filename : `str`, optional 

1714 Name for the file that will include database information associated 

1715 with the exported datasets. If this is not an absolute path and 

1716 ``directory`` is not `None`, it will be written to ``directory`` 

1717 instead of the current working directory. Defaults to 

1718 "export.{format}". 

1719 format : `str`, optional 

1720 File format for the database information file. If `None`, the 

1721 extension of ``filename`` will be used. 

1722 transfer : `str`, optional 

1723 Transfer mode passed to `Datastore.export`. 

1724 

1725 Raises 

1726 ------ 

1727 TypeError 

1728 Raised if the set of arguments passed is inconsistent. 

1729 

1730 Examples 

1731 -------- 

1732 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1733 methods are used to provide the iterables over data IDs and/or datasets 

1734 to be exported:: 

1735 

1736 with butler.export("exports.yaml") as export: 

1737 # Export all flats, but none of the dimension element rows 

1738 # (i.e. data ID information) associated with them. 

1739 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1740 elements=()) 

1741 # Export all datasets that start with "deepCoadd_" and all of 

1742 # their associated data ID information. 

1743 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

1744 """ 

1745 if directory is None and transfer is not None: 

1746 raise TypeError("Cannot transfer without providing a directory.") 

1747 if transfer == "move": 

1748 raise TypeError("Transfer may not be 'move': export is read-only") 

1749 if format is None: 

1750 if filename is None: 

1751 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1752 else: 

1753 _, format = os.path.splitext(filename) 

1754 elif filename is None: 

1755 filename = f"export.{format}" 

1756 if directory is not None: 

1757 filename = os.path.join(directory, filename) 

1758 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"]) 

1759 with open(filename, 'w') as stream: 

1760 backend = BackendClass(stream) 

1761 try: 

1762 helper = RepoExportContext(self.registry, self.datastore, backend=backend, 

1763 directory=directory, transfer=transfer) 

1764 yield helper 

1765 except BaseException: 

1766 raise 

1767 else: 

1768 helper._finish() 

1769 

1770 def import_(self, *, directory: Optional[str] = None, 

1771 filename: Union[str, TextIO, None] = None, 

1772 format: Optional[str] = None, 

1773 transfer: Optional[str] = None, 

1774 skip_dimensions: Optional[Set] = None, 

1775 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1776 reuseIds: bool = False) -> None: 

1777 """Import datasets into this repository that were exported from a 

1778 different butler repository via `~lsst.daf.butler.Butler.export`. 

1779 

1780 Parameters 

1781 ---------- 

1782 directory : `str`, optional 

1783 Directory containing dataset files to import from. If `None`, 

1784 ``filename`` and all dataset file paths specified therein must 

1785 be absolute. 

1786 filename : `str` or `TextIO`, optional 

1787 A stream or name of file that contains database information 

1788 associated with the exported datasets, typically generated by 

1789 `~lsst.daf.butler.Butler.export`. If this a string (name) and 

1790 is not an absolute path, does not exist in the current working 

1791 directory, and ``directory`` is not `None`, it is assumed to be in 

1792 ``directory``. Defaults to "export.{format}". 

1793 format : `str`, optional 

1794 File format for ``filename``. If `None`, the extension of 

1795 ``filename`` will be used. 

1796 transfer : `str`, optional 

1797 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

1798 skip_dimensions : `set`, optional 

1799 Names of dimensions that should be skipped and not imported. 

1800 idGenerationMode : `DatasetIdGenEnum`, optional 

1801 Specifies option for generating dataset IDs when IDs are not 

1802 provided or their type does not match backend type. By default 

1803 unique IDs are generated for each inserted dataset. 

1804 reuseIds : `bool`, optional 

1805 If `True` then forces re-use of imported dataset IDs for integer 

1806 IDs which are normally generated as auto-incremented; exception 

1807 will be raised if imported IDs clash with existing ones. This 

1808 option has no effect on the use of globally-unique IDs which are 

1809 always re-used (or generated if integer IDs are being imported). 

1810 

1811 Raises 

1812 ------ 

1813 TypeError 

1814 Raised if the set of arguments passed is inconsistent, or if the 

1815 butler is read-only. 

1816 """ 

1817 if not self.isWriteable(): 

1818 raise TypeError("Butler is read-only.") 

1819 if format is None: 

1820 if filename is None: 

1821 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1822 else: 

1823 _, format = os.path.splitext(filename) # type: ignore 

1824 elif filename is None: 

1825 filename = f"export.{format}" 

1826 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

1827 filename = os.path.join(directory, filename) 

1828 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

1829 

1830 def doImport(importStream: TextIO) -> None: 

1831 backend = BackendClass(importStream, self.registry) 

1832 backend.register() 

1833 with self.transaction(): 

1834 backend.load(self.datastore, directory=directory, transfer=transfer, 

1835 skip_dimensions=skip_dimensions, idGenerationMode=idGenerationMode, 

1836 reuseIds=reuseIds) 

1837 

1838 if isinstance(filename, str): 

1839 with open(filename, "r") as stream: 

1840 doImport(stream) 

1841 else: 

1842 doImport(filename) 

1843 

1844 def transfer_from(self, source_butler: Butler, source_refs: Iterable[DatasetRef], 

1845 transfer: str = "auto", 

1846 id_gen_map: Dict[str, DatasetIdGenEnum] = None, 

1847 skip_missing: bool = True, 

1848 register_dataset_types: bool = False) -> List[DatasetRef]: 

1849 """Transfer datasets to this Butler from a run in another Butler. 

1850 

1851 Parameters 

1852 ---------- 

1853 source_butler : `Butler` 

1854 Butler from which the datasets are to be transferred. 

1855 source_refs : iterable of `DatasetRef` 

1856 Datasets defined in the source butler that should be transferred to 

1857 this butler. 

1858 transfer : `str`, optional 

1859 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

1860 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional 

1861 A mapping of dataset type to ID generation mode. Only used if 

1862 the source butler is using integer IDs. Should not be used 

1863 if this receiving butler uses integer IDs. Without this dataset 

1864 import always uses unique. 

1865 skip_missing : `bool` 

1866 If `True`, datasets with no datastore artifact associated with 

1867 them are not transferred. If `False` a registry entry will be 

1868 created even if no datastore record is created (and so will 

1869 look equivalent to the dataset being unstored). 

1870 register_dataset_types : `bool` 

1871 If `True` any missing dataset types are registered. Otherwise 

1872 an exception is raised. 

1873 

1874 Returns 

1875 ------- 

1876 refs : `list` of `DatasetRef` 

1877 The refs added to this Butler. 

1878 

1879 Notes 

1880 ----- 

1881 Requires that any dimension definitions are already present in the 

1882 receiving Butler. The datastore artifact has to exist for a transfer 

1883 to be made but non-existence is not an error. 

1884 

1885 Datasets that already exist in this run will be skipped. 

1886 

1887 The datasets are imported as part of a transaction, although 

1888 dataset types are registered before the transaction is started. 

1889 This means that it is possible for a dataset type to be registered 

1890 even though transfer has failed. 

1891 """ 

1892 if not self.isWriteable(): 

1893 raise TypeError("Butler is read-only.") 

1894 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

1895 

1896 # Will iterate through the refs multiple times so need to convert 

1897 # to a list if this isn't a collection. 

1898 if not isinstance(source_refs, collections.abc.Collection): 

1899 source_refs = list(source_refs) 

1900 

1901 original_count = len(source_refs) 

1902 log.info("Transferring %d datasets into %s", original_count, str(self)) 

1903 

1904 if id_gen_map is None: 

1905 id_gen_map = {} 

1906 

1907 # In some situations the datastore artifact may be missing 

1908 # and we do not want that registry entry to be imported. 

1909 # Asking datastore is not sufficient, the records may have been 

1910 # purged, we have to ask for the (predicted) URI and check 

1911 # existence explicitly. Execution butler is set up exactly like 

1912 # this with no datastore records. 

1913 artifact_existence: Dict[ButlerURI, bool] = {} 

1914 if skip_missing: 

1915 dataset_existence = source_butler.datastore.mexists(source_refs, 

1916 artifact_existence=artifact_existence) 

1917 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

1918 filtered_count = len(source_refs) 

1919 log.verbose("%d datasets removed because the artifact does not exist. Now have %d.", 

1920 original_count - filtered_count, filtered_count) 

1921 

1922 # Importing requires that we group the refs by dataset type and run 

1923 # before doing the import. 

1924 source_dataset_types = set() 

1925 grouped_refs = defaultdict(list) 

1926 grouped_indices = defaultdict(list) 

1927 for i, ref in enumerate(source_refs): 

1928 grouped_refs[ref.datasetType, ref.run].append(ref) 

1929 grouped_indices[ref.datasetType, ref.run].append(i) 

1930 source_dataset_types.add(ref.datasetType) 

1931 

1932 # Check to see if the dataset type in the source butler has 

1933 # the same definition in the target butler and register missing 

1934 # ones if requested. Registration must happen outside a transaction. 

1935 newly_registered_dataset_types = set() 

1936 for datasetType in source_dataset_types: 

1937 if register_dataset_types: 

1938 # Let this raise immediately if inconsistent. Continuing 

1939 # on to find additional inconsistent dataset types 

1940 # might result in additional unwanted dataset types being 

1941 # registered. 

1942 if self.registry.registerDatasetType(datasetType): 

1943 newly_registered_dataset_types.add(datasetType) 

1944 else: 

1945 # If the dataset type is missing, let it fail immediately. 

1946 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

1947 if target_dataset_type != datasetType: 

1948 raise ConflictingDefinitionError("Source butler dataset type differs from definition" 

1949 f" in target butler: {datasetType} !=" 

1950 f" {target_dataset_type}") 

1951 if newly_registered_dataset_types: 

1952 # We may have registered some even if there were inconsistencies 

1953 # but should let people know (or else remove them again). 

1954 log.log(VERBOSE, "Registered the following dataset types in the target Butler: %s", 

1955 ", ".join(d.name for d in newly_registered_dataset_types)) 

1956 else: 

1957 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

1958 

1959 # The returned refs should be identical for UUIDs. 

1960 # For now must also support integers and so need to retain the 

1961 # newly-created refs from this registry. 

1962 # Pre-size it so we can assign refs into the correct slots 

1963 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs) 

1964 default_id_gen = DatasetIdGenEnum.UNIQUE 

1965 

1966 handled_collections: Set[str] = set() 

1967 

1968 # Do all the importing in a single transaction. 

1969 with self.transaction(): 

1970 for (datasetType, run), refs_to_import in progress.iter_item_chunks(grouped_refs.items(), 

1971 desc="Importing to registry" 

1972 " by run and dataset type"): 

1973 if run not in handled_collections: 

1974 run_doc = source_butler.registry.getCollectionDocumentation(run) 

1975 registered = self.registry.registerRun(run, doc=run_doc) 

1976 handled_collections.add(run) 

1977 if registered: 

1978 log.log(VERBOSE, "Creating output run %s", run) 

1979 

1980 id_generation_mode = default_id_gen 

1981 if isinstance(refs_to_import[0].id, int): 

1982 # ID generation mode might need to be overridden when 

1983 # targetting UUID 

1984 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen) 

1985 

1986 n_refs = len(refs_to_import) 

1987 log.verbose("Importing %d ref%s of dataset type %s into run %s", 

1988 n_refs, "" if n_refs == 1 else "s", datasetType.name, run) 

1989 

1990 # No way to know if this butler's registry uses UUID. 

1991 # We have to trust the caller on this. If it fails they will 

1992 # have to change their approach. We can't catch the exception 

1993 # and retry with unique because that will mess up the 

1994 # transaction handling. We aren't allowed to ask the registry 

1995 # manager what type of ID it is using. 

1996 imported_refs = self.registry._importDatasets(refs_to_import, 

1997 idGenerationMode=id_generation_mode, 

1998 expand=False) 

1999 

2000 # Map them into the correct slots to match the initial order 

2001 for i, ref in zip(grouped_indices[datasetType, run], imported_refs): 

2002 transferred_refs_tmp[i] = ref 

2003 

2004 # Mypy insists that we might have None in here so we have to make 

2005 # that explicit by assigning to a new variable and filtering out 

2006 # something that won't be there. 

2007 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None] 

2008 

2009 # Check consistency 

2010 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given" 

2011 

2012 log.verbose("Imported %d datasets into destination butler", len(transferred_refs)) 

2013 

2014 # The transferred refs need to be reordered to match the original 

2015 # ordering given by the caller. Without this the datastore transfer 

2016 # will be broken. 

2017 

2018 # Ask the datastore to transfer. The datastore has to check that 

2019 # the source datastore is compatible with the target datastore. 

2020 self.datastore.transfer_from(source_butler.datastore, source_refs, 

2021 local_refs=transferred_refs, transfer=transfer, 

2022 artifact_existence=artifact_existence) 

2023 

2024 return transferred_refs 

2025 

2026 def validateConfiguration(self, logFailures: bool = False, 

2027 datasetTypeNames: Optional[Iterable[str]] = None, 

2028 ignore: Iterable[str] = None) -> None: 

2029 """Validate butler configuration. 

2030 

2031 Checks that each `DatasetType` can be stored in the `Datastore`. 

2032 

2033 Parameters 

2034 ---------- 

2035 logFailures : `bool`, optional 

2036 If `True`, output a log message for every validation error 

2037 detected. 

2038 datasetTypeNames : iterable of `str`, optional 

2039 The `DatasetType` names that should be checked. This allows 

2040 only a subset to be selected. 

2041 ignore : iterable of `str`, optional 

2042 Names of DatasetTypes to skip over. This can be used to skip 

2043 known problems. If a named `DatasetType` corresponds to a 

2044 composite, all components of that `DatasetType` will also be 

2045 ignored. 

2046 

2047 Raises 

2048 ------ 

2049 ButlerValidationError 

2050 Raised if there is some inconsistency with how this Butler 

2051 is configured. 

2052 """ 

2053 if datasetTypeNames: 

2054 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2055 else: 

2056 datasetTypes = list(self.registry.queryDatasetTypes()) 

2057 

2058 # filter out anything from the ignore list 

2059 if ignore: 

2060 ignore = set(ignore) 

2061 datasetTypes = [e for e in datasetTypes 

2062 if e.name not in ignore and e.nameAndComponent()[0] not in ignore] 

2063 else: 

2064 ignore = set() 

2065 

2066 # Find all the registered instruments 

2067 instruments = set( 

2068 record.name for record in self.registry.queryDimensionRecords("instrument") 

2069 ) 

2070 

2071 # For each datasetType that has an instrument dimension, create 

2072 # a DatasetRef for each defined instrument 

2073 datasetRefs = [] 

2074 

2075 for datasetType in datasetTypes: 

2076 if "instrument" in datasetType.dimensions: 

2077 for instrument in instruments: 

2078 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore 

2079 conform=False) 

2080 datasetRefs.append(datasetRef) 

2081 

2082 entities: List[Union[DatasetType, DatasetRef]] = [] 

2083 entities.extend(datasetTypes) 

2084 entities.extend(datasetRefs) 

2085 

2086 datastoreErrorStr = None 

2087 try: 

2088 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2089 except ValidationError as e: 

2090 datastoreErrorStr = str(e) 

2091 

2092 # Also check that the LookupKeys used by the datastores match 

2093 # registry and storage class definitions 

2094 keys = self.datastore.getLookupKeys() 

2095 

2096 failedNames = set() 

2097 failedDataId = set() 

2098 for key in keys: 

2099 if key.name is not None: 

2100 if key.name in ignore: 

2101 continue 

2102 

2103 # skip if specific datasetType names were requested and this 

2104 # name does not match 

2105 if datasetTypeNames and key.name not in datasetTypeNames: 

2106 continue 

2107 

2108 # See if it is a StorageClass or a DatasetType 

2109 if key.name in self.storageClasses: 

2110 pass 

2111 else: 

2112 try: 

2113 self.registry.getDatasetType(key.name) 

2114 except KeyError: 

2115 if logFailures: 

2116 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2117 failedNames.add(key) 

2118 else: 

2119 # Dimensions are checked for consistency when the Butler 

2120 # is created and rendezvoused with a universe. 

2121 pass 

2122 

2123 # Check that the instrument is a valid instrument 

2124 # Currently only support instrument so check for that 

2125 if key.dataId: 

2126 dataIdKeys = set(key.dataId) 

2127 if set(["instrument"]) != dataIdKeys: 

2128 if logFailures: 

2129 log.critical("Key '%s' has unsupported DataId override", key) 

2130 failedDataId.add(key) 

2131 elif key.dataId["instrument"] not in instruments: 

2132 if logFailures: 

2133 log.critical("Key '%s' has unknown instrument", key) 

2134 failedDataId.add(key) 

2135 

2136 messages = [] 

2137 

2138 if datastoreErrorStr: 

2139 messages.append(datastoreErrorStr) 

2140 

2141 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2142 (failedDataId, "Keys with bad DataId entries: ")): 

2143 if failed: 

2144 msg += ", ".join(str(k) for k in failed) 

2145 messages.append(msg) 

2146 

2147 if messages: 

2148 raise ValidationError(";\n".join(messages)) 

2149 

2150 @property 

2151 def collections(self) -> CollectionSearch: 

2152 """The collections to search by default, in order (`CollectionSearch`). 

2153 

2154 This is an alias for ``self.registry.defaults.collections``. It cannot 

2155 be set directly in isolation, but all defaults may be changed together 

2156 by assigning a new `RegistryDefaults` instance to 

2157 ``self.registry.defaults``. 

2158 """ 

2159 return self.registry.defaults.collections 

2160 

2161 @property 

2162 def run(self) -> Optional[str]: 

2163 """Name of the run this butler writes outputs to by default (`str` or 

2164 `None`). 

2165 

2166 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2167 directly in isolation, but all defaults may be changed together by 

2168 assigning a new `RegistryDefaults` instance to 

2169 ``self.registry.defaults``. 

2170 """ 

2171 return self.registry.defaults.run 

2172 

2173 registry: Registry 

2174 """The object that manages dataset metadata and relationships (`Registry`). 

2175 

2176 Most operations that don't involve reading or writing butler datasets are 

2177 accessible only via `Registry` methods. 

2178 """ 

2179 

2180 datastore: Datastore 

2181 """The object that manages actual dataset storage (`Datastore`). 

2182 

2183 Direct user access to the datastore should rarely be necessary; the primary 

2184 exception is the case where a `Datastore` implementation provides extra 

2185 functionality beyond what the base class defines. 

2186 """ 

2187 

2188 storageClasses: StorageClassFactory 

2189 """An object that maps known storage class names to objects that fully 

2190 describe them (`StorageClassFactory`). 

2191 """ 

2192 

2193 _allow_put_of_predefined_dataset: bool 

2194 """Allow a put to succeed even if there is already a registry entry for it 

2195 but not a datastore record. (`bool`)."""