
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36import collections.abc 

37from collections import defaultdict 

38import contextlib 

39import logging 

40import numbers 

41import os 

42from typing import ( 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59try: 

60 import boto3 

61except ImportError: 

62 boto3 = None 

63 

64from lsst.utils import doImportType 

65from lsst.utils.introspection import get_class_of 

66from lsst.utils.logging import getLogger, VERBOSE 

67from .core import ( 

68 AmbiguousDatasetError, 

69 ButlerURI, 

70 Config, 

71 ConfigSubset, 

72 DataCoordinate, 

73 DataId, 

74 DataIdValue, 

75 DatasetRef, 

76 DatasetType, 

77 Datastore, 

78 Dimension, 

79 DimensionConfig, 

80 FileDataset, 

81 Progress, 

82 StorageClassFactory, 

83 Timespan, 

84 ValidationError, 

85) 

86from .core.repoRelocation import BUTLER_ROOT_TAG 

87from .core.utils import transactional 

88from ._deferredDatasetHandle import DeferredDatasetHandle 

89from ._butlerConfig import ButlerConfig 

90from .registry import ( 

91 Registry, 

92 RegistryConfig, 

93 RegistryDefaults, 

94 CollectionSearch, 

95 CollectionType, 

96 ConflictingDefinitionError, 

97 DatasetIdGenEnum, 

98) 

99from .transfers import RepoExportContext 

100 

101log = getLogger(__name__) 

102 

103 

104class ButlerValidationError(ValidationError): 

105 """There is a problem with the Butler configuration.""" 

106 pass 

107 

108 

109class PruneCollectionsArgsError(TypeError): 

110 """Base class for errors relating to Butler.pruneCollections input 

111 arguments. 

112 """ 

113 pass 

114 

115 

116class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

117 """Raised when purge and unstore are both required to be True, and 

118 purge is True but unstore is False. 

119 """ 

120 

121 def __init__(self) -> None: 

122 super().__init__("Cannot pass purge=True without unstore=True.") 

123 

124 

125class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

126 """Raised when pruning a RUN collection but purge is False.""" 

127 

128 def __init__(self, collectionType: CollectionType): 

129 self.collectionType = collectionType 

130 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

131 

132 

133class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

134 """Raised when purge is True but is not supported for the given 

135 collection.""" 

136 

137 def __init__(self, collectionType: CollectionType): 

138 self.collectionType = collectionType 

139 super().__init__( 

140 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.") 

141 

142 

143class Butler: 

144 """Main entry point for the data access system. 

145 

146 Parameters 

147 ---------- 

148 config : `ButlerConfig`, `Config` or `str`, optional. 

149 Configuration. Anything acceptable to the 

150 `ButlerConfig` constructor. If a directory path 

151 is given the configuration will be read from a ``butler.yaml`` file in 

152 that location. If `None` is given default values will be used. 

153 butler : `Butler`, optional. 

154 If provided, construct a new Butler that uses the same registry and 

155 datastore as the given one, but with the given collection and run. 

156 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

157 arguments. 

158 collections : `str` or `Iterable` [ `str` ], optional 

159 An expression specifying the collections to be searched (in order) when 

160 reading datasets. 

161 This may be a `str` collection name or an iterable thereof. 

162 See :ref:`daf_butler_collection_expressions` for more information. 

163 These collections are not registered automatically and must be 

164 manually registered before they are used by any method, but they may be 

165 manually registered after the `Butler` is initialized. 

166 run : `str`, optional 

167 Name of the `~CollectionType.RUN` collection new datasets should be 

168 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

169 ``collections`` will be set to ``[run]``. If not `None`, this 

170 collection will automatically be registered. If this is not set (and 

171 ``writeable`` is not set either), a read-only butler will be created. 

172 searchPaths : `list` of `str`, optional 

173 Directory paths to search when calculating the full Butler 

174 configuration. Not used if the supplied config is already a 

175 `ButlerConfig`. 

176 writeable : `bool`, optional 

177 Explicitly sets whether the butler supports write operations. If not 

178 provided, a read-write butler is created if any of ``run``, ``tags``, 

179 or ``chains`` is non-empty. 

180 inferDefaults : `bool`, optional 

181 If `True` (default) infer default data ID values from the values 

182 present in the datasets in ``collections``: if all collections have the 

183 same value (or no value) for a governor dimension, that value will be 

184 the default for that dimension. Nonexistent collections are ignored. 

185 If a default value is provided explicitly for a governor dimension via 

186 ``**kwargs``, no default will be inferred for that dimension. 

187 **kwargs : `str` 

188 Default data ID key-value pairs. These may only identify "governor" 

189 dimensions like ``instrument`` and ``skymap``. 

190 

191 Examples 

192 -------- 

193 While there are many ways to control exactly how a `Butler` interacts with 

194 the collections in its `Registry`, the most common cases are still simple. 

195 

196 For a read-only `Butler` that searches one collection, do:: 

197 

198 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

199 

200 For a read-write `Butler` that writes to and reads from a 

201 `~CollectionType.RUN` collection:: 

202 

203 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

204 

205 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

206 because we want to write to one `~CollectionType.RUN` collection but read 

207 from several others (as well):: 

208 

209 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

210 collections=["u/alice/DM-50000/a", 

211 "u/bob/DM-49998", 

212 "HSC/defaults"]) 

213 

214 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

215 Datasets will be read first from that run (since it appears first in the 

216 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

217 

218 Finally, one can always create a `Butler` with no collections:: 

219 

220 butler = Butler("/path/to/repo", writeable=True) 

221 

222 This can be extremely useful when you just want to use ``butler.registry``, 

223 e.g. for inserting dimension data or managing collections, or when the 

224 collections you want to use with the butler are not consistent. 

225 Passing ``writeable`` explicitly here is only necessary if you want to be 

226 able to make changes to the repo - usually the value for ``writeable`` can 

227 be guessed from the collection arguments provided, but it defaults to 

228 `False` when there are no collection arguments. 

229 """ 

230 def __init__(self, config: Union[Config, str, None] = None, *, 

231 butler: Optional[Butler] = None, 

232 collections: Any = None, 

233 run: Optional[str] = None, 

234 searchPaths: Optional[List[str]] = None, 

235 writeable: Optional[bool] = None, 

236 inferDefaults: bool = True, 

237 **kwargs: str, 

238 ): 

239 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

240 # Load registry, datastore, etc. from config or existing butler. 

241 if butler is not None: 

242 if config is not None or searchPaths is not None or writeable is not None: 

243 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' " 

244 "arguments with 'butler' argument.") 

245 self.registry = butler.registry.copy(defaults) 

246 self.datastore = butler.datastore 

247 self.storageClasses = butler.storageClasses 

248 self._config: ButlerConfig = butler._config 

249 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

250 else: 

251 self._config = ButlerConfig(config, searchPaths=searchPaths) 

252 try: 

253 if "root" in self._config: 

254 butlerRoot = self._config["root"] 

255 else: 

256 butlerRoot = self._config.configDir 

257 if writeable is None: 

258 writeable = run is not None 

259 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable, 

260 defaults=defaults) 

261 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(), 

262 butlerRoot=butlerRoot) 

263 self.storageClasses = StorageClassFactory() 

264 self.storageClasses.addFromConfig(self._config) 

265 self._allow_put_of_predefined_dataset = self._config.get("allow_put_of_predefined_dataset", 

266 False) 

267 except Exception: 

268 # Failures here usually mean that configuration is incomplete, 

269 # just issue an error message which includes config file URI. 

270 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

271 raise 

272 

273 if "run" in self._config or "collection" in self._config: 

274 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

275 
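# A minimal construction sketch (illustrative; the repository path and
# collection/run names are hypothetical):
#
#     butler = Butler("/path/to/repo", collections=["refcats"])
#     # Reuse the same registry and datastore, but write to a new run:
#     writer = Butler(butler=butler, run="u/alice/scratch")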

276 GENERATION: ClassVar[int] = 3 

277 """This is a Generation 3 Butler. 

278 

279 This attribute may be removed in the future, once the Generation 2 Butler 

280 interface has been fully retired; it should only be used in transitional 

281 code. 

282 """ 

283 

284 @staticmethod 

285 def makeRepo(root: str, config: Union[Config, str, None] = None, 

286 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False, 

287 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True, 

288 outfile: Optional[str] = None, overwrite: bool = False) -> Config: 

289 """Create an empty data repository by adding a butler.yaml config 

290 to a repository root directory. 

291 

292 Parameters 

293 ---------- 

294 root : `str` or `ButlerURI` 

295 Path or URI to the root location of the new repository. Will be 

296 created if it does not exist. 

297 config : `Config` or `str`, optional 

298 Configuration to write to the repository, after setting any 

299 root-dependent Registry or Datastore config options. Can not 

300 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

301 configuration will be used. Root-dependent config options 

302 specified in this config are overwritten if ``forceConfigRoot`` 

303 is `True`. 

304 dimensionConfig : `Config` or `str`, optional 

305 Configuration for dimensions, will be used to initialize registry 

306 database. 

307 standalone : `bool` 

308 If True, write all expanded defaults, not just customized or 

309 repository-specific settings. 

310 This (mostly) decouples the repository from the default 

311 configuration, insulating it from changes to the defaults (which 

312 may be good or bad, depending on the nature of the changes). 

313 Future *additions* to the defaults will still be picked up when 

314 initializing `Butlers` to repos created with ``standalone=True``. 

315 searchPaths : `list` of `str`, optional 

316 Directory paths to search when calculating the full butler 

317 configuration. 

318 forceConfigRoot : `bool`, optional 

319 If `False`, any values present in the supplied ``config`` that 

320 would normally be reset are not overridden and will appear 

321 directly in the output config. This allows non-standard overrides 

322 of the root directory for a datastore or registry to be given. 

323 If this parameter is `True` the values for ``root`` will be 

324 forced into the resulting config if appropriate. 

325 outfile : `str`, optional 

326 If not-`None`, the output configuration will be written to this 

327 location rather than into the repository itself. Can be a URI 

328 string. Can refer to a directory that will be used to write 

329 ``butler.yaml``. 

330 overwrite : `bool`, optional 

331 Create a new configuration file even if one already exists 

332 in the specified output location. Default is to raise 

333 an exception. 

334 

335 Returns 

336 ------- 

337 config : `Config` 

338 The updated `Config` instance written to the repo. 

339 

340 Raises 

341 ------ 

342 ValueError 

343 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

344 regular Config (as these subclasses would make it impossible to 

345 support ``standalone=False``). 

346 FileExistsError 

347 Raised if the output config file already exists. 

348 os.error 

349 Raised if the directory does not exist, exists but is not a 

350 directory, or cannot be created. 

351 

352 Notes 

353 ----- 

354 Note that when ``standalone=False`` (the default), the configuration 

355 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

356 construct the repository should also be used to construct any Butlers 

357 to avoid configuration inconsistencies. 

358 """ 

359 if isinstance(config, (ButlerConfig, ConfigSubset)): 

360 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

361 

362 # Ensure that the root of the repository exists or can be made 

363 uri = ButlerURI(root, forceDirectory=True) 

364 uri.mkdir() 

365 

366 config = Config(config) 

367 

368 # If we are creating a new repo from scratch with relative roots, 

369 # do not propagate an explicit root from the config file 

370 if "root" in config: 

371 del config["root"] 

372 

373 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

374 imported_class = doImportType(full["datastore", "cls"]) 

375 if not issubclass(imported_class, Datastore): 

376 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

377 datastoreClass: Type[Datastore] = imported_class 

378 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

379 

380 # if key exists in given config, parse it, otherwise parse the defaults 

381 # in the expanded config 

382 if config.get(("registry", "db")): 

383 registryConfig = RegistryConfig(config) 

384 else: 

385 registryConfig = RegistryConfig(full) 

386 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

387 if defaultDatabaseUri is not None: 

388 Config.updateParameters(RegistryConfig, config, full, 

389 toUpdate={"db": defaultDatabaseUri}, 

390 overwrite=forceConfigRoot) 

391 else: 

392 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), 

393 overwrite=forceConfigRoot) 

394 

395 if standalone: 

396 config.merge(full) 

397 else: 

398 # Always expand the registry.managers section into the per-repo 

399 # config, because after the database schema is created, it's not 

400 # allowed to change anymore. Note that in the standalone=True 

401 # branch, _everything_ in the config is expanded, so there's no 

402 # need to special case this. 

403 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

404 configURI: Union[str, ButlerURI] 

405 if outfile is not None: 

406 # When writing to a separate location we must include 

407 # the root of the butler repo in the config else it won't know 

408 # where to look. 

409 config["root"] = uri.geturl() 

410 configURI = outfile 

411 else: 

412 configURI = uri 

413 config.dumpToUri(configURI, overwrite=overwrite) 

414 

415 # Create Registry and populate tables 

416 registryConfig = RegistryConfig(config.get("registry")) 

417 dimensionConfig = DimensionConfig(dimensionConfig) 

418 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root) 

419 

420 log.verbose("Wrote new Butler configuration file to %s", configURI) 

421 

422 return config 

423 
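# Usage sketch for makeRepo (the path is hypothetical):
#
#     Butler.makeRepo("/path/to/new/repo")
#     butler = Butler("/path/to/new/repo", writeable=True)
#
# makeRepo only writes the configuration and creates the registry schema;
# it returns the written Config rather than a Butler, so the repository is
# opened with a separate Butler() call.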

424 @classmethod 

425 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str], 

426 defaultDataId: Dict[str, str], writeable: bool) -> Butler: 

427 """Callable used to unpickle a Butler. 

428 

429 We prefer not to use ``Butler.__init__`` directly so we can force some 

430 of its many arguments to be keyword-only (note that ``__reduce__`` 

431 can only invoke callables with positional arguments). 

432 

433 Parameters 

434 ---------- 

435 config : `ButlerConfig` 

436 Butler configuration, already coerced into a true `ButlerConfig` 

437 instance (and hence after any search paths for overrides have been 

438 utilized). 

439 collections : `CollectionSearch` 

440 Names of the default collections to read from. 

441 run : `str`, optional 

442 Name of the default `~CollectionType.RUN` collection to write to. 

443 defaultDataId : `dict` [ `str`, `str` ] 

444 Default data ID values. 

445 writeable : `bool` 

446 Whether the Butler should support write operations. 

447 

448 Returns 

449 ------- 

450 butler : `Butler` 

451 A new `Butler` instance. 

452 """ 

453 # MyPy doesn't recognize that the kwargs below are totally valid; it 

454 # seems to think ``**defaultDataId`` is a _positional_ argument! 

455 return cls(config=config, collections=collections, run=run, writeable=writeable, 

456 **defaultDataId) # type: ignore 

457 

458 def __reduce__(self) -> tuple: 

459 """Support pickling. 

460 """ 

461 return (Butler._unpickle, (self._config, self.collections, self.run, 

462 self.registry.defaults.dataId.byName(), 

463 self.registry.isWriteable())) 

464 
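# Pickling round-trip sketch enabled by __reduce__/_unpickle (assumes
# ``butler`` is an existing Butler instance):
#
#     import pickle
#     clone = pickle.loads(pickle.dumps(butler))
#
# The clone is reconstructed from the captured ButlerConfig, default
# collections, run, default data ID, and writeability flag.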

465 def __str__(self) -> str: 

466 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

467 self.collections, self.run, self.datastore, self.registry) 

468 

469 def isWriteable(self) -> bool: 

470 """Return `True` if this `Butler` supports write operations. 

471 """ 

472 return self.registry.isWriteable() 

473 

474 @contextlib.contextmanager 

475 def transaction(self) -> Iterator[None]: 

476 """Context manager supporting `Butler` transactions. 

477 

478 Transactions can be nested. 

479 """ 

480 with self.registry.transaction(): 

481 with self.datastore.transaction(): 

482 yield 

483 
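# Transaction sketch (``butler``, the objects, and the data IDs are
# illustrative):
#
#     with butler.transaction():
#         butler.put(obj_a, "typeA", dataId_a)
#         butler.put(obj_b, "typeB", dataId_b)
#
# If the second put() raises, the registry and datastore changes from both
# calls are rolled back together, since the two transactions are nested.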

484 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

485 dataId: Optional[DataId] = None, **kwargs: Any 

486 ) -> Tuple[DatasetType, Optional[DataId]]: 

487 """Standardize the arguments passed to several Butler APIs. 

488 

489 Parameters 

490 ---------- 

491 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

492 When `DatasetRef` the `dataId` should be `None`. 

493 Otherwise the `DatasetType` or name thereof. 

494 dataId : `dict` or `DataCoordinate` 

495 A `dict` of `Dimension` link name, value pairs that label the 

496 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

497 should be provided as the second argument. 

498 **kwargs 

499 Additional keyword arguments used to augment or construct a 

500 `DataCoordinate`. See `DataCoordinate.standardize` 

501 parameters. 

502 

503 Returns 

504 ------- 

505 datasetType : `DatasetType` 

506 A `DatasetType` instance extracted from ``datasetRefOrType``. 

507 dataId : `dict` or `DataId`, optional 

508 Argument that can be used (along with ``kwargs``) to construct a 

509 `DataId`. 

510 

511 Notes 

512 ----- 

513 Butler APIs that conceptually need a DatasetRef also allow passing a 

514 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

515 keyword arguments that can be used to construct one) separately. This 

516 method accepts those arguments and always returns a true `DatasetType` 

517 and a `DataId` or `dict`. 

518 

519 Standardization of `dict` vs `DataId` is best handled by passing the 

520 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

521 generally similarly flexible. 

522 """ 

523 externalDatasetType: Optional[DatasetType] = None 

524 internalDatasetType: Optional[DatasetType] = None 

525 if isinstance(datasetRefOrType, DatasetRef): 

526 if dataId is not None or kwargs: 

527 raise ValueError("DatasetRef given, cannot use dataId as well") 

528 externalDatasetType = datasetRefOrType.datasetType 

529 dataId = datasetRefOrType.dataId 

530 else: 

531 # Don't check whether DataId is provided, because Registry APIs 

532 # can usually construct a better error message when it wasn't. 

533 if isinstance(datasetRefOrType, DatasetType): 

534 externalDatasetType = datasetRefOrType 

535 else: 

536 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

537 

538 # Check that they are self-consistent 

539 if externalDatasetType is not None: 

540 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

541 if externalDatasetType != internalDatasetType: 

542 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

543 f"registry definition ({internalDatasetType})") 

544 

545 assert internalDatasetType is not None 

546 return internalDatasetType, dataId 

547 
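# Because of the standardization above, these call forms are equivalent
# (dataset type name and data ID values are hypothetical):
#
#     butler.get("calexp", {"instrument": "HSC", "visit": 903334, "detector": 20})
#     butler.get("calexp", instrument="HSC", visit=903334, detector=20)
#     butler.get(ref)   # a resolved DatasetRef; no separate dataId allowed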

548 def _rewrite_data_id(self, dataId: Optional[DataId], datasetType: DatasetType, 

549 **kwargs: Any) -> Tuple[Optional[DataId], Dict[str, Any]]: 

550 """Rewrite a data ID taking into account dimension records. 

551 

552 Take a Data ID and keyword args and rewrite it if necessary to 

553 allow the user to specify dimension records rather than dimension 

554 primary values. 

555 

556 This allows a user to include a dataId dict with keys of 

557 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

558 the integer exposure ID. It also allows a string to be given 

559 for a dimension value rather than the integer ID if that is more 

560 convenient. For example, rather than having to specify the 

561 detector with ``detector.full_name``, a string given for ``detector`` 

562 will be interpreted as the full name and converted to the integer 

563 value. 

564 

565 Keyword arguments can also use strings for dimensions like detector 

566 and exposure but python does not allow them to include ``.`` and 

567 so the ``exposure.day_obs`` syntax can not be used in a keyword 

568 argument. 

569 

570 Parameters 

571 ---------- 

572 dataId : `dict` or `DataCoordinate` 

573 A `dict` of `Dimension` link name, value pairs that will label the 

574 `DatasetRef` within a Collection. 

575 datasetType : `DatasetType` 

576 The dataset type associated with this dataId. Required to 

577 determine the relevant dimensions. 

578 **kwargs 

579 Additional keyword arguments used to augment or construct a 

580 `DataId`. See `DataId` parameters. 

581 

582 Returns 

583 ------- 

584 dataId : `dict` or `DataCoordinate` 

585 The possibly rewritten dataId. If given a `DataCoordinate` and 

586 no keyword arguments, the original dataId will be returned 

587 unchanged. 

588 **kwargs : `dict` 

589 Any unused keyword arguments. 

590 """ 

591 # Do nothing if we have a standalone DataCoordinate. 

592 if isinstance(dataId, DataCoordinate) and not kwargs: 

593 return dataId, kwargs 

594 

595 # Process dimension records that are using record information 

596 # rather than ids 

597 newDataId: Dict[str, DataIdValue] = {} 

598 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

599 

600 # if all the dataId comes from keyword parameters we do not need 

601 # to do anything here because they can't be of the form 

602 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

603 if dataId: 

604 for k, v in dataId.items(): 

605 # If we have a Dimension we do not need to do anything 

606 # because it cannot be a compound key. 

607 if isinstance(k, str) and "." in k: 

608 # Someone is using a more human-readable dataId 

609 dimensionName, record = k.split(".", 1) 

610 byRecord[dimensionName][record] = v 

611 elif isinstance(k, Dimension): 

612 newDataId[k.name] = v 

613 else: 

614 newDataId[k] = v 

615 

616 # Go through the updated dataId and check the type in case someone is 

617 # using an alternate key. We have already filtered out the compound 

618 # keys in the dimension.record format. 

619 not_dimensions = {} 

620 

621 # Will need to look in the dataId and the keyword arguments 

622 # and will remove them if they need to be fixed or are unrecognized. 

623 for dataIdDict in (newDataId, kwargs): 

624 # Use a list so we can adjust the dict safely in the loop 

625 for dimensionName in list(dataIdDict): 

626 value = dataIdDict[dimensionName] 

627 try: 

628 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

629 except KeyError: 

630 # This is not a real dimension 

631 not_dimensions[dimensionName] = value 

632 del dataIdDict[dimensionName] 

633 continue 

634 

635 # Convert an integral type to an explicit int to simplify 

636 # comparisons here 

637 if isinstance(value, numbers.Integral): 

638 value = int(value) 

639 

640 if not isinstance(value, dimension.primaryKey.getPythonType()): 

641 for alternate in dimension.alternateKeys: 

642 if isinstance(value, alternate.getPythonType()): 

643 byRecord[dimensionName][alternate.name] = value 

644 del dataIdDict[dimensionName] 

645 log.debug("Converting dimension %s to %s.%s=%s", 

646 dimensionName, dimensionName, alternate.name, value) 

647 break 

648 else: 

649 log.warning("Type mismatch found for value '%r' provided for dimension %s. " 

650 "Could not find matching alternative (primary key has type %s) " 

651 "so attempting to use as-is.", 

652 value, dimensionName, dimension.primaryKey.getPythonType()) 

653 

654 # If we have some unrecognized dimensions we have to try to connect 

655 # them to records in other dimensions. This is made more complicated 

656 # by some dimensions having records with clashing names. A mitigation 

657 # is that we can tell by this point which dimensions are missing 

658 # for the DatasetType but this does not work for calibrations 

659 # where additional dimensions can be used to constrain the temporal 

660 # axis. 

661 if not_dimensions: 

662 # Calculate missing dimensions 

663 provided = set(newDataId) | set(kwargs) | set(byRecord) 

664 missingDimensions = datasetType.dimensions.names - provided 

665 

666 # For calibrations we may well be needing temporal dimensions 

667 # so rather than always including all dimensions in the scan 

668 # restrict things a little. It is still possible for there 

669 # to be confusion over day_obs in visit vs exposure for example. 

670 # If we are not searching calibration collections things may 

671 # fail but they are going to fail anyway because of the 

672 # ambiguity of the dataId... 

673 candidateDimensions: Set[str] = set() 

674 candidateDimensions.update(missingDimensions) 

675 if datasetType.isCalibration(): 

676 for dim in self.registry.dimensions.getStaticDimensions(): 

677 if dim.temporal: 

678 candidateDimensions.add(str(dim)) 

679 

680 # Look up table for the first association with a dimension 

681 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

682 

683 # Keep track of whether an item is associated with multiple 

684 # dimensions. 

685 counter: Counter[str] = Counter() 

686 assigned: Dict[str, Set[str]] = defaultdict(set) 

687 

688 # Go through the missing dimensions and associate the 

689 # given names with records within those dimensions 

690 for dimensionName in candidateDimensions: 

691 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

692 fields = dimension.metadata.names | dimension.uniqueKeys.names 

693 for field in not_dimensions: 

694 if field in fields: 

695 guessedAssociation[dimensionName][field] = not_dimensions[field] 

696 counter[dimensionName] += 1 

697 assigned[field].add(dimensionName) 

698 

699 # There is a chance we have allocated a single dataId item 

700 # to multiple dimensions. Need to decide which should be retained. 

701 # For now assume that the most popular alternative wins. 

702 # This means that day_obs with seq_num will result in 

703 # exposure.day_obs and not visit.day_obs 

704 # Also prefer an explicitly missing dimension over an inferred 

705 # temporal dimension. 

706 for fieldName, assignedDimensions in assigned.items(): 

707 if len(assignedDimensions) > 1: 

708 # Pick the most popular (preferring mandatory dimensions) 

709 requiredButMissing = assignedDimensions.intersection(missingDimensions) 

710 if requiredButMissing: 

711 candidateDimensions = requiredButMissing 

712 else: 

713 candidateDimensions = assignedDimensions 

714 

715 # Select the relevant items and get a new restricted 

716 # counter. 

717 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

718 duplicatesCounter: Counter[str] = Counter() 

719 duplicatesCounter.update(theseCounts) 

720 

721 # Choose the most common. If they are equally common 

722 # we will pick the one that was found first. 

723 # Returns a list of tuples 

724 selected = duplicatesCounter.most_common(1)[0][0] 

725 

726 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

727 " Removed ambiguity by choosing dimension %s.", 

728 fieldName, ", ".join(assignedDimensions), selected) 

729 

730 for candidateDimension in assignedDimensions: 

731 if candidateDimension != selected: 

732 del guessedAssociation[candidateDimension][fieldName] 

733 

734 # Update the record look up dict with the new associations 

735 for dimensionName, values in guessedAssociation.items(): 

736 if values: # A dict might now be empty 

737 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", 

738 dimensionName, values) 

739 byRecord[dimensionName].update(values) 

740 

741 if byRecord: 

742 # Some record specifiers were found so we need to convert 

743 # them to the Id form 

744 for dimensionName, values in byRecord.items(): 

745 if dimensionName in newDataId: 

746 log.warning("DataId specified explicit %s dimension value of %s in addition to" 

747 " general record specifiers for it of %s. Ignoring record information.", 

748 dimensionName, newDataId[dimensionName], str(values)) 

749 continue 

750 

751 # Build up a WHERE expression 

752 bind = {k: v for k, v in values.items()} 

753 where = " AND ".join(f"{dimensionName}.{k} = {k}" 

754 for k in bind) 

755 

756 # Hopefully we get a single record that matches 

757 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId, 

758 where=where, bind=bind, **kwargs)) 

759 

760 if len(records) != 1: 

761 if len(records) > 1: 

762 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

763 for r in records: 

764 log.debug("- %s", str(r)) 

765 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not" 

766 f" uniquely constrained to a single dataset by {values}." 

767 f" Got {len(records)} results.") 

768 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no" 

769 f" records when constrained by {values}") 

770 

771 # Get the primary key from the real dimension object 

772 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

773 if not isinstance(dimension, Dimension): 

774 raise RuntimeError( 

775 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

776 ) 

777 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

778 

779 # We have modified the dataId so need to switch to it 

780 dataId = newDataId 

781 

782 return dataId, kwargs 

783 
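# Data ID rewriting sketch: assuming the corresponding dimension records
# exist, _rewrite_data_id lets these hypothetical forms resolve to the
# same integer-keyed data ID:
#
#     {"instrument": "HSC", "exposure": 903334, "detector": 20}
#     {"instrument": "HSC", "exposure.obs_id": "HSCA90333400", "detector": 20}
#     {"instrument": "HSC", "exposure": 903334, "detector": "1_53"}  # full_name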

784 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

785 dataId: Optional[DataId] = None, *, 

786 collections: Any = None, 

787 allowUnresolved: bool = False, 

788 **kwargs: Any) -> DatasetRef: 

789 """Shared logic for methods that start with a search for a dataset in 

790 the registry. 

791 

792 Parameters 

793 ---------- 

794 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

795 When `DatasetRef` the `dataId` should be `None`. 

796 Otherwise the `DatasetType` or name thereof. 

797 dataId : `dict` or `DataCoordinate`, optional 

798 A `dict` of `Dimension` link name, value pairs that label the 

799 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

800 should be provided as the first argument. 

801 collections : Any, optional 

802 Collections to be searched, overriding ``self.collections``. 

803 Can be any of the types supported by the ``collections`` argument 

804 to butler construction. 

805 allowUnresolved : `bool`, optional 

806 If `True`, return an unresolved `DatasetRef` if finding a resolved 

807 one in the `Registry` fails. Defaults to `False`. 

808 **kwargs 

809 Additional keyword arguments used to augment or construct a 

810 `DataId`. See `DataId` parameters. 

811 

812 Returns 

813 ------- 

814 ref : `DatasetRef` 

815 A reference to the dataset identified by the given arguments. 

816 

817 Raises 

818 ------ 

819 LookupError 

820 Raised if no matching dataset exists in the `Registry` (and 

821 ``allowUnresolved is False``). 

822 ValueError 

823 Raised if a resolved `DatasetRef` was passed as an input, but it 

824 differs from the one found in the registry. 

825 TypeError 

826 Raised if no collections were provided. 

827 """ 

828 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

829 if isinstance(datasetRefOrType, DatasetRef): 

830 idNumber = datasetRefOrType.id 

831 else: 

832 idNumber = None 

833 timespan: Optional[Timespan] = None 

834 

835 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

836 

837 if datasetType.isCalibration(): 

838 # Because this is a calibration dataset, first try to 

839 # standardize the data ID without restricting the dimensions to 

840 # those of the dataset type requested, because there may be extra 

841 # dimensions that provide temporal information for a validity-range 

842 # lookup. 

843 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions, 

844 defaults=self.registry.defaults.dataId, **kwargs) 

845 if dataId.graph.temporal: 

846 dataId = self.registry.expandDataId(dataId) 

847 timespan = dataId.timespan 

848 else: 

849 # Standardize the data ID to just the dimensions of the dataset 

850 # type instead of letting registry.findDataset do it, so we get the 

851 # result even if no dataset is found. 

852 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions, 

853 defaults=self.registry.defaults.dataId, **kwargs) 

854 # Always lookup the DatasetRef, even if one is given, to ensure it is 

855 # present in the current collection. 

856 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

857 if ref is None: 

858 if allowUnresolved: 

859 return DatasetRef(datasetType, dataId) 

860 else: 

861 if collections is None: 

862 collections = self.registry.defaults.collections 

863 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} " 

864 f"could not be found in collections {collections}.") 

865 if idNumber is not None and idNumber != ref.id: 

866 if collections is None: 

867 collections = self.registry.defaults.collections 

868 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match " 

869 f"id ({ref.id}) in registry in collections {collections}.") 

870 return ref 

871 

872 @transactional 

873 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

874 dataId: Optional[DataId] = None, *, 

875 run: Optional[str] = None, 

876 **kwargs: Any) -> DatasetRef: 

877 """Store and register a dataset. 

878 

879 Parameters 

880 ---------- 

881 obj : `object` 

882 The dataset. 

883 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

884 When `DatasetRef` is provided, ``dataId`` should be `None`. 

885 Otherwise the `DatasetType` or name thereof. 

886 dataId : `dict` or `DataCoordinate` 

887 A `dict` of `Dimension` link name, value pairs that label the 

888 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

889 should be provided as the second argument. 

890 run : `str`, optional 

891 The name of the run the dataset should be added to, overriding 

892 ``self.run``. 

893 **kwargs 

894 Additional keyword arguments used to augment or construct a 

895 `DataCoordinate`. See `DataCoordinate.standardize` 

896 parameters. 

897 

898 Returns 

899 ------- 

900 ref : `DatasetRef` 

901 A reference to the stored dataset, updated with the correct id if 

902 given. 

903 

904 Raises 

905 ------ 

906 TypeError 

907 Raised if the butler is read-only or if no run has been provided. 

908 """ 

909 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

910 if not self.isWriteable(): 

911 raise TypeError("Butler is read-only.") 

912 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

913 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

914 raise ValueError("DatasetRef must not be in registry, must have None id") 

915 

916 # Handle dimension records in dataId 

917 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

918 

919 # Add Registry Dataset entry. 

920 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

921 

922 # For an execution butler the datasets will be pre-defined. 

923 # If the butler is configured that way, datasets should only be inserted 

924 # if they do not already exist in registry. Trying and catching 

925 # ConflictingDefinitionError will not work because the transaction 

926 # will be corrupted. Instead, in this mode always check first. 

927 ref = None 

928 ref_is_predefined = False 

929 if self._allow_put_of_predefined_dataset: 

930 # Get the matching ref for this run. 

931 ref = self.registry.findDataset(datasetType, collections=run, 

932 dataId=dataId) 

933 

934 if ref: 

935 # Must be expanded form for datastore templating 

936 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

937 ref = ref.expanded(dataId) 

938 ref_is_predefined = True 

939 

940 if not ref: 

941 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

942 

943 # If the ref is predefined it is possible that the datastore also 

944 # has the record. Asking datastore to put it again will result in 

945 # the artifact being recreated, overwriting the previous one; the 

946 # subsequent failure to write the record will then cause the artifact 

947 # to be removed. Much safer to ask first before attempting to 

948 # overwrite. Race conditions should not be an issue for the 

949 # execution butler environment. 

950 if ref_is_predefined: 

951 if self.datastore.knows(ref): 

952 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.") 

953 

954 self.datastore.put(obj, ref) 

955 

956 return ref 

957 
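# put() usage sketch (dataset type name, data ID values, and run are
# hypothetical):
#
#     ref = butler.put(catalog, "sourceCatalog",
#                      instrument="HSC", visit=903334, detector=20,
#                      run="u/alice/DM-50000/a")
#
# The returned DatasetRef is resolved, carrying the dataset ID assigned
# (or found, for a predefined dataset) by the registry.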

958 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any: 

959 """Retrieve a stored dataset. 

960 

961 Unlike `Butler.get`, this method allows datasets outside the Butler's 

962 collection to be read as long as the `DatasetRef` that identifies them 

963 can be obtained separately. 

964 

965 Parameters 

966 ---------- 

967 ref : `DatasetRef` 

968 Resolved reference to an already stored dataset. 

969 parameters : `dict` 

970 Additional StorageClass-defined options to control reading, 

971 typically used to efficiently read only a subset of the dataset. 

972 

973 Returns 

974 ------- 

975 obj : `object` 

976 The dataset. 

977 """ 

978 return self.datastore.get(ref, parameters=parameters) 

979 

980 def getDirectDeferred(self, ref: DatasetRef, *, 

981 parameters: Union[dict, None] = None) -> DeferredDatasetHandle: 

982 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

983 from a resolved `DatasetRef`. 

984 

985 Parameters 

986 ---------- 

987 ref : `DatasetRef` 

988 Resolved reference to an already stored dataset. 

989 parameters : `dict` 

990 Additional StorageClass-defined options to control reading, 

991 typically used to efficiently read only a subset of the dataset. 

992 

993 Returns 

994 ------- 

995 obj : `DeferredDatasetHandle` 

996 A handle which can be used to retrieve a dataset at a later time. 

997 

998 Raises 

999 ------ 

1000 AmbiguousDatasetError 

1001 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

1002 """ 

1003 if ref.id is None: 

1004 raise AmbiguousDatasetError( 

1005 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1006 ) 

1007 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1008 

1009 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1010 dataId: Optional[DataId] = None, *, 

1011 parameters: Union[dict, None] = None, 

1012 collections: Any = None, 

1013 **kwargs: Any) -> DeferredDatasetHandle: 

1014 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1015 after an immediate registry lookup. 

1016 

1017 Parameters 

1018 ---------- 

1019 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1020 When `DatasetRef` the `dataId` should be `None`. 

1021 Otherwise the `DatasetType` or name thereof. 

1022 dataId : `dict` or `DataCoordinate`, optional 

1023 A `dict` of `Dimension` link name, value pairs that label the 

1024 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1025 should be provided as the first argument. 

1026 parameters : `dict` 

1027 Additional StorageClass-defined options to control reading, 

1028 typically used to efficiently read only a subset of the dataset. 

1029 collections : Any, optional 

1030 Collections to be searched, overriding ``self.collections``. 

1031 Can be any of the types supported by the ``collections`` argument 

1032 to butler construction. 

1033 **kwargs 

1034 Additional keyword arguments used to augment or construct a 

1035 `DataId`. See `DataId` parameters. 

1036 

1037 Returns 

1038 ------- 

1039 obj : `DeferredDatasetHandle` 

1040 A handle which can be used to retrieve a dataset at a later time. 

1041 

1042 Raises 

1043 ------ 

1044 LookupError 

1045 Raised if no matching dataset exists in the `Registry` (and 

1046 ``allowUnresolved is False``). 

1047 ValueError 

1048 Raised if a resolved `DatasetRef` was passed as an input, but it 

1049 differs from the one found in the registry. 

1050 TypeError 

1051 Raised if no collections were provided. 

1052 """ 

1053 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1054 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1055 

1056 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1057 dataId: Optional[DataId] = None, *, 

1058 parameters: Optional[Dict[str, Any]] = None, 

1059 collections: Any = None, 

1060 **kwargs: Any) -> Any: 

1061 """Retrieve a stored dataset. 

1062 

1063 Parameters 

1064 ---------- 

1065 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1066 When `DatasetRef` the `dataId` should be `None`. 

1067 Otherwise the `DatasetType` or name thereof. 

1068 dataId : `dict` or `DataCoordinate` 

1069 A `dict` of `Dimension` link name, value pairs that label the 

1070 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1071 should be provided as the first argument. 

1072 parameters : `dict` 

1073 Additional StorageClass-defined options to control reading, 

1074 typically used to efficiently read only a subset of the dataset. 

1075 collections : Any, optional 

1076 Collections to be searched, overriding ``self.collections``. 

1077 Can be any of the types supported by the ``collections`` argument 

1078 to butler construction. 

1079 **kwargs 

1080 Additional keyword arguments used to augment or construct a 

1081 `DataCoordinate`. See `DataCoordinate.standardize` 

1082 parameters. 

1083 

1084 Returns 

1085 ------- 

1086 obj : `object` 

1087 The dataset. 

1088 

1089 Raises 

1090 ------ 

1091 ValueError 

1092 Raised if a resolved `DatasetRef` was passed as an input, but it 

1093 differs from the one found in the registry. 

1094 LookupError 

1095 Raised if no matching dataset exists in the `Registry`. 

1096 TypeError 

1097 Raised if no collections were provided. 

1098 

1099 Notes 

1100 ----- 

1101 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1102 this method requires that the given data ID include temporal dimensions 

1103 beyond the dimensions of the dataset type itself, in order to find the 

1104 dataset with the appropriate validity range. For example, a "bias" 

1105 dataset with native dimensions ``{instrument, detector}`` could be 

1106 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1107 ``exposure`` is a temporal dimension. 

1108 """ 

1109 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1110 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1111 return self.getDirect(ref, parameters=parameters) 

1112 
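# get() usage sketches (names and values are hypothetical):
#
#     calexp = butler.get("calexp", instrument="HSC", visit=903334, detector=20)
#
# For a lookup in a CALIBRATION collection, include a temporal dimension
# such as ``exposure`` so the validity range can be resolved:
#
#     bias = butler.get("bias", instrument="HSC", detector=20,
#                       exposure=903334, collections="HSC/calib")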

1113 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1114 dataId: Optional[DataId] = None, *, 

1115 predict: bool = False, 

1116 collections: Any = None, 

1117 run: Optional[str] = None, 

1118 **kwargs: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1119 """Returns the URIs associated with the dataset. 

1120 

1121 Parameters 

1122 ---------- 

1123 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1124 When `DatasetRef` the `dataId` should be `None`. 

1125 Otherwise the `DatasetType` or name thereof. 

1126 dataId : `dict` or `DataCoordinate` 

1127 A `dict` of `Dimension` link name, value pairs that label the 

1128 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1129 should be provided as the first argument. 

1130 predict : `bool` 

1131 If `True`, allow URIs to be returned of datasets that have not 

1132 been written. 

1133 collections : Any, optional 

1134 Collections to be searched, overriding ``self.collections``. 

1135 Can be any of the types supported by the ``collections`` argument 

1136 to butler construction. 

1137 run : `str`, optional 

1138 Run to use for predictions, overriding ``self.run``. 

1139 **kwargs 

1140 Additional keyword arguments used to augment or construct a 

1141 `DataCoordinate`. See `DataCoordinate.standardize` 

1142 parameters. 

1143 

1144 Returns 

1145 ------- 

1146 primary : `ButlerURI` 

1147 The URI to the primary artifact associated with this dataset. 

1148 If the dataset was disassembled within the datastore this 

1149 may be `None`. 

1150 components : `dict` 

1151 URIs to any components associated with the dataset artifact. 

1152 Can be empty if there are no components. 

1153 """ 

1154 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict, 

1155 collections=collections, **kwargs) 

1156 if ref.id is None: # only possible if predict is True 

1157 if run is None: 

1158 run = self.run 

1159 if run is None: 

1160 raise TypeError("Cannot predict location with run=None.") 

1161 # Lie about ID, because we can't guess it, and only 

1162 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1163 ref = ref.resolved(id=0, run=run) 

1164 return self.datastore.getURIs(ref, predict) 

1165 

1166 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1167 dataId: Optional[DataId] = None, *, 

1168 predict: bool = False, 

1169 collections: Any = None, 

1170 run: Optional[str] = None, 

1171 **kwargs: Any) -> ButlerURI: 

1172 """Return the URI to the Dataset. 

1173 

1174 Parameters 

1175 ---------- 

1176 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1177 When `DatasetRef` the `dataId` should be `None`. 

1178 Otherwise the `DatasetType` or name thereof. 

1179 dataId : `dict` or `DataCoordinate` 

1180 A `dict` of `Dimension` link name, value pairs that label the 

1181 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1182 should be provided as the first argument. 

1183 predict : `bool` 

1184 If `True`, allow URIs to be returned of datasets that have not 

1185 been written. 

1186 collections : Any, optional 

1187 Collections to be searched, overriding ``self.collections``. 

1188 Can be any of the types supported by the ``collections`` argument 

1189 to butler construction. 

1190 run : `str`, optional 

1191 Run to use for predictions, overriding ``self.run``. 

1192 **kwargs 

1193 Additional keyword arguments used to augment or construct a 

1194 `DataCoordinate`. See `DataCoordinate.standardize` 

1195 parameters. 

1196 

1197 Returns 

1198 ------- 

1199 uri : `ButlerURI` 

1200 URI pointing to the Dataset within the datastore. If the 

1201 Dataset does not exist in the datastore, and if ``predict`` is 

1202 `True`, the URI will be a prediction and will include a URI 

1203 fragment "#predicted". 

1204 If the datastore does not have entities that relate well 

1205 to the concept of a URI, the returned URI string will be 

1206 descriptive. The returned URI is not guaranteed to be obtainable. 

1207 

1208 Raises 

1209 ------ 

1210 LookupError 

1211 Raised if a URI is requested for a dataset that does not exist and 

1212 guessing is not allowed. 

1213 ValueError 

1214 Raised if a resolved `DatasetRef` was passed as an input, but it 

1215 differs from the one found in the registry. 

1216 TypeError 

1217 Raised if no collections were provided. 

1218 RuntimeError 

1219 Raised if a URI is requested for a dataset that consists of 

1220 multiple artifacts. 

1221 """ 

1222 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict, 

1223 collections=collections, run=run, **kwargs) 

1224 

1225 if primary is None or components: 

1226 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1227 "Use Butler.getURIs() instead.") 

1228 return primary 

1229 
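# getURI() sketch: predicting the location of a dataset that has not been
# written yet (names are hypothetical; prediction requires a run):
#
#     uri = butler.getURI("calexp", instrument="HSC", visit=903334,
#                         detector=20, predict=True, run="u/alice/test")
#
# As noted above, a predicted URI carries a "#predicted" fragment.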

1230 def retrieveArtifacts(self, refs: Iterable[DatasetRef], 

1231 destination: Union[str, ButlerURI], transfer: str = "auto", 

1232 preserve_path: bool = True, 

1233 overwrite: bool = False) -> List[ButlerURI]: 

1234 """Retrieve the artifacts associated with the supplied refs. 

1235 

1236 Parameters 

1237 ---------- 

1238 refs : iterable of `DatasetRef` 

1239 The datasets for which artifacts are to be retrieved. 

1240 A single ref can result in multiple artifacts. The refs must 

1241 be resolved. 

1242 destination : `ButlerURI` or `str` 

1243 Location to write the artifacts. 

1244 transfer : `str`, optional 

1245 Method to use to transfer the artifacts. Must be one of the options 

1246 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

1247 preserve_path : `bool`, optional 

1248 If `True` the full path of the artifact within the datastore 

1249 is preserved. If `False` the final file component of the path 

1250 is used. 

1251 overwrite : `bool`, optional 

1252 If `True` allow transfers to overwrite existing files at the 

1253 destination. 

1254 

1255 Returns 

1256 ------- 

1257 targets : `list` of `ButlerURI` 

1258 URIs of file artifacts in destination location. Order is not 

1259 preserved. 

1260 

1261 Notes 

1262 ----- 

1263 For non-file datastores the artifacts written to the destination 

1264 may not match the representation inside the datastore. For example 

1265 a hierarchical data structure in a NoSQL database may well be stored 

1266 as a JSON file. 

1267 """ 

1268 return self.datastore.retrieveArtifacts(refs, ButlerURI(destination), transfer=transfer, 

1269 preserve_path=preserve_path, overwrite=overwrite) 

1270 
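# retrieveArtifacts() sketch (collection and destination are hypothetical;
# the refs must be resolved, as queryDatasets results are):
#
#     refs = butler.registry.queryDatasets("calexp", collections="HSC/runs/test")
#     targets = butler.retrieveArtifacts(refs, "/tmp/export",
#                                        transfer="copy", preserve_path=True)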

1271 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1272 dataId: Optional[DataId] = None, *, 

1273 collections: Any = None, 

1274 **kwargs: Any) -> bool: 

1275 """Return True if the Dataset is actually present in the Datastore. 

1276 

1277 Parameters 

1278 ---------- 

1279 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1280 When `DatasetRef` the `dataId` should be `None`. 

1281 Otherwise the `DatasetType` or name thereof. 

1282 dataId : `dict` or `DataCoordinate` 

1283 A `dict` of `Dimension` link name, value pairs that label the 

1284 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1285 should be provided as the first argument. 

1286 collections : Any, optional 

1287 Collections to be searched, overriding ``self.collections``. 

1288 Can be any of the types supported by the ``collections`` argument 

1289 to butler construction. 

1290 **kwargs 

1291 Additional keyword arguments used to augment or construct a 

1292 `DataCoordinate`. See `DataCoordinate.standardize` 

1293 parameters. 

1294 

1295 Raises 

1296 ------ 

1297 LookupError 

1298 Raised if the dataset is not even present in the Registry. 

1299 ValueError 

1300 Raised if a resolved `DatasetRef` was passed as an input, but it 

1301 differs from the one found in the registry. 

1302 TypeError 

1303 Raised if no collections were provided. 

1304 """ 

1305 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1306 return self.datastore.exists(ref) 

1307 
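# datasetExists() sketch (names and values are hypothetical). Note that a
# dataset missing from the registry raises LookupError rather than
# returning False:
#
#     if butler.datasetExists("calexp", instrument="HSC", visit=903334,
#                             detector=20):
#         calexp = butler.get("calexp", instrument="HSC",
#                             visit=903334, detector=20)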

1308 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1309 """Remove one or more `~CollectionType.RUN` collections and the 

1310 datasets within them. 

1311 

1312 Parameters 

1313 ---------- 

1314 names : `Iterable` [ `str` ] 

1315 The names of the collections to remove. 

1316 unstore : `bool`, optional 

1317 If `True` (default), delete datasets from all datastores in which 

1318 they are present, and attempt to roll back the registry deletions if 

1319 datastore deletions fail (which may not always be possible). If 

1320 `False`, datastore records for these datasets are still removed, 

1321 but any artifacts (e.g. files) will not be. 

1322 

1323 Raises 

1324 ------ 

1325 TypeError 

1326 Raised if one or more collections are not of type 

1327 `~CollectionType.RUN`. 

1328 """ 

1329 if not self.isWriteable(): 

1330 raise TypeError("Butler is read-only.") 

1331 names = list(names) 

1332 refs: List[DatasetRef] = [] 

1333 for name in names: 

1334 collectionType = self.registry.getCollectionType(name) 

1335 if collectionType is not CollectionType.RUN: 

1336 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1337 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1338 with self.registry.transaction(): 

1339 if unstore: 

1340 self.datastore.trash(refs) 

1341 else: 

1342 self.datastore.forget(refs) 

1343 for name in names: 

1344 self.registry.removeCollection(name) 

1345 if unstore: 

1346 # Point of no return for removing artifacts 

1347 self.datastore.emptyTrash() 

1348 
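# removeRuns() sketch (run names are hypothetical):
#
#     butler.removeRuns(["u/alice/scratch1", "u/alice/scratch2"])
#
# With the default unstore=True the datastore artifacts are deleted along
# with the registry entries; unstore=False forgets the datastore records
# but leaves the artifacts in place.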

1349 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False, 

1350 unlink: Optional[List[str]] = None) -> None: 

1351 """Remove a collection and possibly prune datasets within it. 

1352 

1353 Parameters 

1354 ---------- 

1355 name : `str` 

1356 Name of the collection to remove. If this is a 

1357 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1358 datasets within the collection are not modified unless ``unstore`` 

1359 is `True`. If this is a `~CollectionType.RUN` collection, 

1360 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1361 are fully removed from the data repository. 

1362 purge : `bool`, optional 

1363 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1364 fully removing datasets within them. Requires ``unstore=True`` as 

1365 well as an added precaution against accidental deletion. Must be 

1366 `False` (default) if the collection is not a ``RUN``. 

1367 unstore : `bool`, optional 

1368 If `True`, remove all datasets in the collection from all 

1369 datastores in which they appear. 

1370 unlink : `list` [`str`], optional 

1371 Before removing the given collection (``name``), unlink it from 

1372 these parent collections. 

1373 

1374 Raises 

1375 ------ 

1376 TypeError 

1377 Raised if the butler is read-only or arguments are mutually 

1378 inconsistent. 

1379 """ 

1380 # See pruneDatasets comments for more information about the logic here; 

1381 # the cases are almost the same, but here we can rely on Registry to 

1382 # take care of everything but Datastore deletion when we remove the 

1383 # collection. 

1384 if not self.isWriteable(): 

1385 raise TypeError("Butler is read-only.") 

1386 collectionType = self.registry.getCollectionType(name) 

1387 if purge and not unstore: 

1388 raise PurgeWithoutUnstorePruneCollectionsError() 

1389 if collectionType is CollectionType.RUN and not purge: 

1390 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1391 if collectionType is not CollectionType.RUN and purge: 

1392 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1393 

1394 def remove(child: str, parent: str) -> None: 

1395 """Remove a child collection from a parent collection.""" 

1396 # Remove child from parent. 

1397 chain = list(self.registry.getCollectionChain(parent)) 

1398 try: 

1399 chain.remove(child) 

1400 except ValueError as e: 

1401 raise RuntimeError(f"{child} is not a child of {parent}") from e 

1402 self.registry.setCollectionChain(parent, chain) 

1403 

1404 with self.registry.transaction(): 

1405 if unlink: 

1406 for parent in unlink: 

1407 remove(name, parent) 

1408 if unstore: 

1409 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1410 self.datastore.trash(refs) 

1411 self.registry.removeCollection(name) 

1412 

1413 if unstore: 

1414 # Point of no return for removing artifacts 

1415 self.datastore.emptyTrash() 

1416 
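# A minimal usage sketch covering the supported ``pruneCollection`` cases
# (collection names are illustrative assumptions):
#
#     butler = Butler("repo", writeable=True)
#     # Drop a TAGGED collection, leaving its datasets untouched.
#     butler.pruneCollection("my-tag")
#     # Unlink a collection from a parent chain before removing it.
#     butler.pruneCollection("nightly-coadds", unlink=["parent-chain"])
#     # Fully remove a RUN collection and the datasets inside it.
#     butler.pruneCollection("u/someone/scratch", purge=True, unstore=True)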

1417 def pruneDatasets(self, refs: Iterable[DatasetRef], *, 

1418 disassociate: bool = True, 

1419 unstore: bool = False, 

1420 tags: Iterable[str] = (), 

1421 purge: bool = False, 

1422 run: Optional[str] = None) -> None: 

1423 """Remove one or more datasets from a collection and/or storage. 

1424 

1425 Parameters 

1426 ---------- 

1427 refs : `~collections.abc.Iterable` of `DatasetRef` 

1428 Datasets to prune. These must be "resolved" references (not just 

1429 a `DatasetType` and data ID). 

1430 disassociate : `bool`, optional 

1431 Disassociate pruned datasets from ``tags``, or from all collections 

1432 if ``purge=True``. 

1433 unstore : `bool`, optional 

1434 If `True` (`False` is default) remove these datasets from all 

1435 datastores known to this butler. Note that this will make it 

1436 impossible to retrieve these datasets even via other collections. 

1437 Datasets that are already not stored are ignored by this option. 

1438 tags : `Iterable` [ `str` ], optional 

1439 `~CollectionType.TAGGED` collections to disassociate the datasets 

1440 from. Ignored if ``disassociate`` is `False` or ``purge`` is 

1441 `True`. 

1442 purge : `bool`, optional 

1443 If `True` (`False` is default), completely remove the dataset from 

1444 the `Registry`. To prevent accidental deletions, ``purge`` may 

1445 only be `True` if all of the following conditions are met: 

1446 

1447 - All given datasets are in the given run; 

1448 - ``disassociate`` is `True`; 

1449 - ``unstore`` is `True`. 

1450 

1451 This mode may remove provenance information from datasets other 

1452 than those provided, and should be used with extreme care. 

1453 

1454 Raises 

1455 ------ 

1456 TypeError 

1457 Raised if the butler is read-only, if no collection was provided, 

1458 or the conditions for ``purge=True`` were not met. 

1459 """ 

1460 if not self.isWriteable(): 

1461 raise TypeError("Butler is read-only.") 

1462 if purge: 

1463 if not disassociate: 

1464 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1465 if not unstore: 

1466 raise TypeError("Cannot pass purge=True without unstore=True.") 

1467 elif disassociate: 

1468 tags = tuple(tags) 

1469 if not tags: 

1470 raise TypeError("No tags provided but disassociate=True.") 

1471 for tag in tags: 

1472 collectionType = self.registry.getCollectionType(tag) 

1473 if collectionType is not CollectionType.TAGGED: 

1474 raise TypeError(f"Cannot disassociate from collection '{tag}' " 

1475 f"of non-TAGGED type {collectionType.name}.") 

1476 # Transform possibly-single-pass iterable into something we can iterate 

1477 # over multiple times. 

1478 refs = list(refs) 

1479 # Pruning a component of a DatasetRef makes no sense since registry 

1480 # doesn't know about components and datastore might not store 

1481 # components in a separate file 

1482 for ref in refs: 

1483 if ref.datasetType.component(): 

1484 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1485 # We don't need an unreliable Datastore transaction for this, because 

1486 # we've been extra careful to ensure that Datastore.trash only involves 

1487 # mutating the Registry (it can _look_ at Datastore-specific things, 

1488 # but shouldn't change them), and hence all operations here are 

1489 # Registry operations. 

1490 with self.registry.transaction(): 

1491 if unstore: 

1492 self.datastore.trash(refs) 

1493 if purge: 

1494 self.registry.removeDatasets(refs) 

1495 elif disassociate: 

1496 assert tags, "Guaranteed by earlier logic in this function." 

1497 for tag in tags: 

1498 self.registry.disassociate(tag, refs) 

1499 # We've exited the Registry transaction, and apparently committed. 

1500 # (if there was an exception, everything rolled back, and it's as if 

1501 # nothing happened - and we never get here). 

1502 # Datastore artifacts are not yet gone, but they're clearly marked 

1503 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1504 # problems we can try again later, and if manual administrative 

1505 # intervention is required, it's pretty clear what that should entail: 

1506 # deleting everything on disk and in private Datastore tables that is 

1507 # in the dataset_location_trash table. 

1508 if unstore: 

1509 # Point of no return for removing artifacts 

1510 self.datastore.emptyTrash() 

1511 
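# A minimal usage sketch for ``pruneDatasets`` (the repository path, dataset
# type name, and collection are illustrative assumptions):
#
#     butler = Butler("repo", writeable=True)
#     refs = list(butler.registry.queryDatasets("calexp",
#                                               collections="u/someone/run"))
#     # Delete the artifacts but keep the registry entries:
#     butler.pruneDatasets(refs, disassociate=False, unstore=True)
#     # Or remove the datasets entirely (requires unstore=True and the
#     # default disassociate=True):
#     butler.pruneDatasets(refs, purge=True, unstore=True)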

1512 @transactional 

1513 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None, 

1514 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1515 ) -> None: 

1516 """Store and register one or more datasets that already exist on disk. 

1517 

1518 Parameters 

1519 ---------- 

1520 datasets : `FileDataset` 

1521 Each positional argument is a struct containing information about 

1522 a file to be ingested, including its URI (either absolute or 

1523 relative to the datastore root, if applicable), a `DatasetRef`, 

1524 and optionally a formatter class or its fully-qualified string 

1525 name. If a formatter is not provided, the formatter that would be 

1526 used for `put` is assumed. On successful return, all 

1527 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1528 attribute populated and all `FileDataset.formatter` attributes will 

1529 be set to the formatter class used. `FileDataset.path` attributes 

1530 may be modified to put paths in whatever the datastore considers a 

1531 standardized form. 

1532 transfer : `str`, optional 

1533 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1534 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1535 transfer the file. 

1536 run : `str`, optional 

1537 The name of the run ingested datasets should be added to, 

1538 overriding ``self.run``. 

1539 idGenerationMode : `DatasetIdGenEnum`, optional 

1540 Specifies option for generating dataset IDs. By default unique IDs 

1541 are generated for each inserted dataset. 

1542 

1543 Raises 

1544 ------ 

1545 TypeError 

1546 Raised if the butler is read-only or if no run was provided. 

1547 NotImplementedError 

1548 Raised if the `Datastore` does not support the given transfer mode. 

1549 DatasetTypeNotSupportedError 

1550 Raised if one or more files to be ingested have a dataset type that 

1551 is not supported by the `Datastore`. 

1552 FileNotFoundError 

1553 Raised if one of the given files does not exist. 

1554 FileExistsError 

1555 Raised if transfer is not `None` but the (internal) location the 

1556 file would be moved to is already occupied. 

1557 

1558 Notes 

1559 ----- 

1560 This operation is not fully exception safe: if a database operation 

1561 fails, the given `FileDataset` instances may be only partially updated. 

1562 

1563 It is atomic in terms of database operations (they will either all 

1564 succeed or all fail), provided the database engine implements 

1565 transactions correctly. It will attempt to be atomic in terms of 

1566 filesystem operations as well, but this cannot be implemented 

1567 rigorously for most datastores. 

1568 """ 

1569 if not self.isWriteable(): 

1570 raise TypeError("Butler is read-only.") 

1571 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1572 # Reorganize the inputs so they're grouped by DatasetType and then 

1573 # data ID. We also include a list of DatasetRefs for each FileDataset 

1574 # to hold the resolved DatasetRefs returned by the Registry, before 

1575 # it's safe to swap them into FileDataset.refs. 

1576 # Some type annotation aliases to make that clearer: 

1577 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1578 GroupedData = MutableMapping[DatasetType, GroupForType] 

1579 # The actual data structure: 

1580 groupedData: GroupedData = defaultdict(dict) 

1581 # And the nested loop that populates it: 

1582 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1583 # This list is intentionally shared across the inner loop, since it's 

1584 # associated with `dataset`. 

1585 resolvedRefs: List[DatasetRef] = [] 

1586 

1587 # Somewhere to store pre-existing refs if we have an 

1588 # execution butler. 

1589 existingRefs: List[DatasetRef] = [] 

1590 

1591 for ref in dataset.refs: 

1592 if ref.dataId in groupedData[ref.datasetType]: 

1593 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has same" 

1594 " DataId as other ingest dataset" 

1595 f" {groupedData[ref.datasetType][ref.dataId][0].path} " 

1596 f" ({ref.dataId})") 

1597 if self._allow_put_of_predefined_dataset: 

1598 existing_ref = self.registry.findDataset(ref.datasetType, 

1599 dataId=ref.dataId, 

1600 collections=run) 

1601 if existing_ref: 

1602 if self.datastore.knows(existing_ref): 

1603 raise ConflictingDefinitionError(f"Dataset associated with path {dataset.path}" 

1604 f" already exists as {existing_ref}.") 

1605 # Store this ref elsewhere since it already exists 

1606 # and we do not want to remake it but we do want 

1607 # to store it in the datastore. 

1608 existingRefs.append(existing_ref) 

1609 

1610 # Nothing else to do until we have finished 

1611 # iterating. 

1612 continue 

1613 

1614 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1615 

1616 if existingRefs: 

1617 

1618 if len(dataset.refs) != len(existingRefs): 

1619 # Keeping track of partially pre-existing datasets is hard 

1620 # and should generally never happen. For now don't allow 

1621 # it. 

1622 raise ConflictingDefinitionError(f"For dataset {dataset.path} some dataIds already exist" 

1623 " in registry but others do not. This is not supported.") 

1624 

1625 # Attach the resolved refs if we found them. 

1626 dataset.refs = existingRefs 

1627 

1628 # Now we can bulk-insert into Registry for each DatasetType. 

1629 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(), 

1630 desc="Bulk-inserting datasets by type"): 

1631 refs = self.registry.insertDatasets( 

1632 datasetType, 

1633 dataIds=groupForType.keys(), 

1634 run=run, 

1635 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1636 idGenerationMode=idGenerationMode, 

1637 ) 

1638 # Append those resolved DatasetRefs to the new lists we set up for 

1639 # them. 

1640 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1641 resolvedRefs.append(ref) 

1642 

1643 # Go back to the original FileDatasets to replace their refs with the 

1644 # new resolved ones. 

1645 for groupForType in progress.iter_chunks(groupedData.values(), 

1646 desc="Reassociating resolved dataset refs with files"): 

1647 for dataset, resolvedRefs in groupForType.values(): 

1648 dataset.refs = resolvedRefs 

1649 

1650 # Bulk-insert everything into Datastore. 

1651 self.datastore.ingest(*datasets, transfer=transfer) 

1652 
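# A minimal usage sketch for ``ingest`` (the repository path, file path,
# dataset type "raw", data ID, and run name are illustrative assumptions;
# the `DatasetRef` and `FileDataset` constructor arguments shown here are
# also assumptions, so check their signatures before relying on this):
#
#     butler = Butler("repo", writeable=True)
#     datasetType = butler.registry.getDatasetType("raw")
#     ref = DatasetRef(datasetType,
#                      {"instrument": "HSC", "detector": 0, "exposure": 903334})
#     butler.ingest(FileDataset(path="/data/raw/HSC-903334-000.fits", refs=[ref]),
#                   transfer="copy", run="HSC/raw/all")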

1653 @contextlib.contextmanager 

1654 def export(self, *, directory: Optional[str] = None, 

1655 filename: Optional[str] = None, 

1656 format: Optional[str] = None, 

1657 transfer: Optional[str] = None) -> Iterator[RepoExportContext]: 

1658 """Export datasets from the repository represented by this `Butler`. 

1659 

1660 This method is a context manager that returns a helper object 

1661 (`RepoExportContext`) that is used to indicate what information from 

1662 the repository should be exported. 

1663 

1664 Parameters 

1665 ---------- 

1666 directory : `str`, optional 

1667 Directory dataset files should be written to if ``transfer`` is not 

1668 `None`. 

1669 filename : `str`, optional 

1670 Name for the file that will include database information associated 

1671 with the exported datasets. If this is not an absolute path and 

1672 ``directory`` is not `None`, it will be written to ``directory`` 

1673 instead of the current working directory. Defaults to 

1674 "export.{format}". 

1675 format : `str`, optional 

1676 File format for the database information file. If `None`, the 

1677 extension of ``filename`` will be used. 

1678 transfer : `str`, optional 

1679 Transfer mode passed to `Datastore.export`. 

1680 

1681 Raises 

1682 ------ 

1683 TypeError 

1684 Raised if the set of arguments passed is inconsistent. 

1685 

1686 Examples 

1687 -------- 

1688 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1689 methods are used to provide the iterables over data IDs and/or datasets 

1690 to be exported:: 

1691 

1692 with butler.export(filename="exports.yaml") as export: 

1693 # Export all flats, but none of the dimension element rows 

1694 # (i.e. data ID information) associated with them. 

1695 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1696 elements=()) 

1697 # Export all datasets that start with "deepCoadd_" and all of 

1698 # their associated data ID information. 

1699 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

1700 """ 

1701 if directory is None and transfer is not None: 

1702 raise TypeError("Cannot transfer without providing a directory.") 

1703 if transfer == "move": 

1704 raise TypeError("Transfer may not be 'move': export is read-only") 

1705 if format is None: 

1706 if filename is None: 

1707 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1708 else: 

1709 _, format = os.path.splitext(filename) 

1710 elif filename is None: 

1711 filename = f"export.{format}" 

1712 if directory is not None: 

1713 filename = os.path.join(directory, filename) 

1714 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"]) 

1715 with open(filename, 'w') as stream: 

1716 backend = BackendClass(stream) 

1717 try: 

1718 helper = RepoExportContext(self.registry, self.datastore, backend=backend, 

1719 directory=directory, transfer=transfer) 

1720 yield helper 

1721 except BaseException: 

1722 raise 

1723 else: 

1724 helper._finish() 

1725 

1726 def import_(self, *, directory: Optional[str] = None, 

1727 filename: Union[str, TextIO, None] = None, 

1728 format: Optional[str] = None, 

1729 transfer: Optional[str] = None, 

1730 skip_dimensions: Optional[Set[str]] = None, 

1731 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1732 reuseIds: bool = False) -> None: 

1733 """Import datasets into this repository that were exported from a 

1734 different butler repository via `~lsst.daf.butler.Butler.export`. 

1735 

1736 Parameters 

1737 ---------- 

1738 directory : `str`, optional 

1739 Directory containing dataset files to import from. If `None`, 

1740 ``filename`` and all dataset file paths specified therein must 

1741 be absolute. 

1742 filename : `str` or `TextIO`, optional 

1743 A stream or name of file that contains database information 

1744 associated with the exported datasets, typically generated by 

1745 `~lsst.daf.butler.Butler.export`. If this is a string (name) and 

1746 is not an absolute path, does not exist in the current working 

1747 directory, and ``directory`` is not `None`, it is assumed to be in 

1748 ``directory``. Defaults to "export.{format}". 

1749 format : `str`, optional 

1750 File format for ``filename``. If `None`, the extension of 

1751 ``filename`` will be used. 

1752 transfer : `str`, optional 

1753 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

1754 skip_dimensions : `set`, optional 

1755 Names of dimensions that should be skipped and not imported. 

1756 idGenerationMode : `DatasetIdGenEnum`, optional 

1757 Specifies option for generating dataset IDs when IDs are not 

1758 provided or their type does not match backend type. By default 

1759 unique IDs are generated for each inserted dataset. 

1760 reuseIds : `bool`, optional 

1761 If `True` then forces re-use of imported dataset IDs for integer 

1762 IDs which are normally generated as auto-incremented; exception 

1763 will be raised if imported IDs clash with existing ones. This 

1764 option has no effect on the use of globally-unique IDs which are 

1765 always re-used (or generated if integer IDs are being imported). 

1766 

1767 Raises 

1768 ------ 

1769 TypeError 

1770 Raised if the set of arguments passed is inconsistent, or if the 

1771 butler is read-only. 

1772 """ 

1773 if not self.isWriteable(): 

1774 raise TypeError("Butler is read-only.") 

1775 if format is None: 

1776 if filename is None: 

1777 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1778 else: 

1779 _, format = os.path.splitext(filename) # type: ignore 

1780 elif filename is None: 

1781 filename = f"export.{format}" 

1782 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

1783 filename = os.path.join(directory, filename) 

1784 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

1785 

1786 def doImport(importStream: TextIO) -> None: 

1787 backend = BackendClass(importStream, self.registry) 

1788 backend.register() 

1789 with self.transaction(): 

1790 backend.load(self.datastore, directory=directory, transfer=transfer, 

1791 skip_dimensions=skip_dimensions, idGenerationMode=idGenerationMode, 

1792 reuseIds=reuseIds) 

1793 

1794 if isinstance(filename, str): 

1795 with open(filename, "r") as stream: 

1796 doImport(stream) 

1797 else: 

1798 doImport(filename) 

1799 
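# A minimal round-trip sketch pairing `~lsst.daf.butler.Butler.export` with
# ``import_`` (repository paths, the staging directory, dataset type, and
# collection are illustrative assumptions):
#
#     src = Butler("source_repo")
#     with src.export(directory="staging", filename="export.yaml",
#                     transfer="copy") as export:
#         export.saveDatasets(src.registry.queryDatasets("flat",
#                                                        collections="HSC/calib"))
#     dest = Butler("dest_repo", writeable=True)
#     dest.import_(directory="staging", filename="export.yaml",
#                  transfer="symlink")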

1800 def transfer_from(self, source_butler: Butler, source_refs: Iterable[DatasetRef], 

1801 transfer: str = "auto", 

1802 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None, 

1803 skip_missing: bool = True, 

1804 register_dataset_types: bool = False) -> List[DatasetRef]: 

1805 """Transfer datasets to this Butler from a run in another Butler. 

1806 

1807 Parameters 

1808 ---------- 

1809 source_butler : `Butler` 

1810 Butler from which the datasets are to be transferred. 

1811 source_refs : iterable of `DatasetRef` 

1812 Datasets defined in the source butler that should be transferred to 

1813 this butler. 

1814 transfer : `str`, optional 

1815 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

1816 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional 

1817 A mapping of dataset type to ID generation mode. Only used if 

1818 the source butler is using integer IDs. Should not be used 

1819 if this receiving butler uses integer IDs. If not given, dataset 

1820 import always uses `DatasetIdGenEnum.UNIQUE`. 

1821 skip_missing : `bool` 

1822 If `True`, datasets with no datastore artifact associated with 

1823 them are not transferred. If `False` a registry entry will be 

1824 created even if no datastore record is created (and so will 

1825 look equivalent to the dataset being unstored). 

1826 register_dataset_types : `bool` 

1827 If `True` any missing dataset types are registered. Otherwise 

1828 an exception is raised. 

1829 

1830 Returns 

1831 ------- 

1832 refs : `list` of `DatasetRef` 

1833 The refs added to this Butler. 

1834 

1835 Notes 

1836 ----- 

1837 Requires that any dimension definitions are already present in the 

1838 receiving Butler. The datastore artifact has to exist for a transfer 

1839 to be made but non-existence is not an error. 

1840 

1841 Datasets that already exist in this run will be skipped. 

1842 

1843 The datasets are imported as part of a transaction, although 

1844 dataset types are registered before the transaction is started. 

1845 This means that it is possible for a dataset type to be registered 

1846 even though transfer has failed. 

1847 """ 

1848 if not self.isWriteable(): 

1849 raise TypeError("Butler is read-only.") 

1850 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

1851 

1852 # Will iterate through the refs multiple times so need to convert 

1853 # to a list if this isn't a collection. 

1854 if not isinstance(source_refs, collections.abc.Collection): 

1855 source_refs = list(source_refs) 

1856 

1857 original_count = len(source_refs) 

1858 log.info("Transferring %d datasets into %s", original_count, str(self)) 

1859 

1860 if id_gen_map is None: 

1861 id_gen_map = {} 

1862 

1863 # In some situations the datastore artifact may be missing 

1864 # and we do not want that registry entry to be imported. 

1865 # Asking the datastore is not sufficient; the records may have been 

1866 # purged, so we have to ask for the (predicted) URI and check 

1867 # existence explicitly. Execution butler is set up exactly like 

1868 # this with no datastore records. 

1869 artifact_existence: Dict[ButlerURI, bool] = {} 

1870 if skip_missing: 

1871 dataset_existence = source_butler.datastore.mexists(source_refs, 

1872 artifact_existence=artifact_existence) 

1873 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

1874 filtered_count = len(source_refs) 

1875 log.verbose("%d datasets removed because the artifact does not exist. Now have %d.", 

1876 original_count - filtered_count, filtered_count) 

1877 

1878 # Importing requires that we group the refs by dataset type and run 

1879 # before doing the import. 

1880 source_dataset_types = set() 

1881 grouped_refs = defaultdict(list) 

1882 grouped_indices = defaultdict(list) 

1883 for i, ref in enumerate(source_refs): 

1884 grouped_refs[ref.datasetType, ref.run].append(ref) 

1885 grouped_indices[ref.datasetType, ref.run].append(i) 

1886 source_dataset_types.add(ref.datasetType) 

1887 

1888 # Check to see if the dataset type in the source butler has 

1889 # the same definition in the target butler and register missing 

1890 # ones if requested. Registration must happen outside a transaction. 

1891 newly_registered_dataset_types = set() 

1892 for datasetType in source_dataset_types: 

1893 if register_dataset_types: 

1894 # Let this raise immediately if inconsistent. Continuing 

1895 # on to find additional inconsistent dataset types 

1896 # might result in additional unwanted dataset types being 

1897 # registered. 

1898 if self.registry.registerDatasetType(datasetType): 

1899 newly_registered_dataset_types.add(datasetType) 

1900 else: 

1901 # If the dataset type is missing, let it fail immediately. 

1902 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

1903 if target_dataset_type != datasetType: 

1904 raise ConflictingDefinitionError("Source butler dataset type differs from definition" 

1905 f" in target butler: {datasetType} !=" 

1906 f" {target_dataset_type}") 

1907 if newly_registered_dataset_types: 

1908 # We may have registered some even if there were inconsistencies 

1909 # but should let people know (or else remove them again). 

1910 log.log(VERBOSE, "Registered the following dataset types in the target Butler: %s", 

1911 ", ".join(d.name for d in newly_registered_dataset_types)) 

1912 else: 

1913 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

1914 

1915 # The returned refs should be identical for UUIDs. 

1916 # For now we must also support integers and so need to retain the 

1917 # newly-created refs from this registry. 

1918 # Pre-size it so we can assign refs into the correct slots 

1919 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs) 

1920 default_id_gen = DatasetIdGenEnum.UNIQUE 

1921 

1922 handled_collections: Set[str] = set() 

1923 

1924 # Do all the importing in a single transaction. 

1925 with self.transaction(): 

1926 for (datasetType, run), refs_to_import in progress.iter_item_chunks(grouped_refs.items(), 

1927 desc="Importing to registry" 

1928 " by run and dataset type"): 

1929 if run not in handled_collections: 

1930 run_doc = source_butler.registry.getCollectionDocumentation(run) 

1931 registered = self.registry.registerRun(run, doc=run_doc) 

1932 handled_collections.add(run) 

1933 if registered: 

1934 log.log(VERBOSE, "Creating output run %s", run) 

1935 

1936 id_generation_mode = default_id_gen 

1937 if isinstance(refs_to_import[0].id, int): 

1938 # ID generation mode might need to be overridden when 

1939 # targeting UUID 

1940 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen) 

1941 

1942 n_refs = len(refs_to_import) 

1943 log.verbose("Importing %d ref%s of dataset type %s into run %s", 

1944 n_refs, "" if n_refs == 1 else "s", datasetType.name, run) 

1945 

1946 # No way to know if this butler's registry uses UUID. 

1947 # We have to trust the caller on this. If it fails they will 

1948 # have to change their approach. We can't catch the exception 

1949 # and retry with unique because that will mess up the 

1950 # transaction handling. We aren't allowed to ask the registry 

1951 # manager what type of ID it is using. 

1952 imported_refs = self.registry._importDatasets(refs_to_import, 

1953 idGenerationMode=id_generation_mode, 

1954 expand=False) 

1955 

1956 # Map them into the correct slots to match the initial order 

1957 for i, ref in zip(grouped_indices[datasetType, run], imported_refs): 

1958 transferred_refs_tmp[i] = ref 

1959 

1960 # Mypy insists that we might have None in here so we have to make 

1961 # that explicit by assigning to a new variable and filtering out 

1962 # something that won't be there. 

1963 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None] 

1964 

1965 # Check consistency 

1966 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given" 

1967 

1968 log.verbose("Imported %d datasets into destination butler", len(transferred_refs)) 

1969 

1970 # The transferred refs were reordered above to match the original 

1971 # ordering given by the caller; without this the datastore transfer 

1972 # would be broken. 

1973 

1974 # Ask the datastore to transfer. The datastore has to check that 

1975 # the source datastore is compatible with the target datastore. 

1976 self.datastore.transfer_from(source_butler.datastore, source_refs, 

1977 local_refs=transferred_refs, transfer=transfer, 

1978 artifact_existence=artifact_existence) 

1979 

1980 return transferred_refs 

1981 
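# A minimal usage sketch for ``transfer_from`` (repository paths, the dataset
# type name, and collection are illustrative assumptions; both butlers are
# assumed to use UUID dataset IDs, so ``id_gen_map`` is not needed):
#
#     source = Butler("source_repo")
#     dest = Butler("dest_repo", writeable=True)
#     refs = source.registry.queryDatasets("calexp",
#                                          collections="HSC/runs/test")
#     dest.transfer_from(source, refs, transfer="copy",
#                        register_dataset_types=True)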

1982 def validateConfiguration(self, logFailures: bool = False, 

1983 datasetTypeNames: Optional[Iterable[str]] = None, 

1984 ignore: Optional[Iterable[str]] = None) -> None: 

1985 """Validate butler configuration. 

1986 

1987 Checks that each `DatasetType` can be stored in the `Datastore`. 

1988 

1989 Parameters 

1990 ---------- 

1991 logFailures : `bool`, optional 

1992 If `True`, output a log message for every validation error 

1993 detected. 

1994 datasetTypeNames : iterable of `str`, optional 

1995 The `DatasetType` names that should be checked. This allows 

1996 only a subset to be selected. 

1997 ignore : iterable of `str`, optional 

1998 Names of DatasetTypes to skip over. This can be used to skip 

1999 known problems. If a named `DatasetType` corresponds to a 

2000 composite, all components of that `DatasetType` will also be 

2001 ignored. 

2002 

2003 Raises 

2004 ------ 

2005 ButlerValidationError 

2006 Raised if there is some inconsistency with how this Butler 

2007 is configured. 

2008 """ 

2009 if datasetTypeNames: 

2010 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2011 else: 

2012 datasetTypes = list(self.registry.queryDatasetTypes()) 

2013 

2014 # filter out anything from the ignore list 

2015 if ignore: 

2016 ignore = set(ignore) 

2017 datasetTypes = [e for e in datasetTypes 

2018 if e.name not in ignore and e.nameAndComponent()[0] not in ignore] 

2019 else: 

2020 ignore = set() 

2021 

2022 # Find all the registered instruments 

2023 instruments = set( 

2024 record.name for record in self.registry.queryDimensionRecords("instrument") 

2025 ) 

2026 

2027 # For each datasetType that has an instrument dimension, create 

2028 # a DatasetRef for each defined instrument 

2029 datasetRefs = [] 

2030 

2031 for datasetType in datasetTypes: 

2032 if "instrument" in datasetType.dimensions: 

2033 for instrument in instruments: 

2034 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore 

2035 conform=False) 

2036 datasetRefs.append(datasetRef) 

2037 

2038 entities: List[Union[DatasetType, DatasetRef]] = [] 

2039 entities.extend(datasetTypes) 

2040 entities.extend(datasetRefs) 

2041 

2042 datastoreErrorStr = None 

2043 try: 

2044 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2045 except ValidationError as e: 

2046 datastoreErrorStr = str(e) 

2047 

2048 # Also check that the LookupKeys used by the datastores match 

2049 # registry and storage class definitions 

2050 keys = self.datastore.getLookupKeys() 

2051 

2052 failedNames = set() 

2053 failedDataId = set() 

2054 for key in keys: 

2055 if key.name is not None: 

2056 if key.name in ignore: 

2057 continue 

2058 

2059 # skip if specific datasetType names were requested and this 

2060 # name does not match 

2061 if datasetTypeNames and key.name not in datasetTypeNames: 

2062 continue 

2063 

2064 # See if it is a StorageClass or a DatasetType 

2065 if key.name in self.storageClasses: 

2066 pass 

2067 else: 

2068 try: 

2069 self.registry.getDatasetType(key.name) 

2070 except KeyError: 

2071 if logFailures: 

2072 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2073 failedNames.add(key) 

2074 else: 

2075 # Dimensions are checked for consistency when the Butler 

2076 # is created and rendezvoused with a universe. 

2077 pass 

2078 

2079 # Check that the instrument is a valid instrument 

2080 # Currently only support instrument so check for that 

2081 if key.dataId: 

2082 dataIdKeys = set(key.dataId) 

2083 if dataIdKeys != {"instrument"}: 

2084 if logFailures: 

2085 log.critical("Key '%s' has unsupported DataId override", key) 

2086 failedDataId.add(key) 

2087 elif key.dataId["instrument"] not in instruments: 

2088 if logFailures: 

2089 log.critical("Key '%s' has unknown instrument", key) 

2090 failedDataId.add(key) 

2091 

2092 messages = [] 

2093 

2094 if datastoreErrorStr: 

2095 messages.append(datastoreErrorStr) 

2096 

2097 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2098 (failedDataId, "Keys with bad DataId entries: ")): 

2099 if failed: 

2100 msg += ", ".join(str(k) for k in failed) 

2101 messages.append(msg) 

2102 

2103 if messages: 

2104 raise ValidationError(";\n".join(messages)) 

2105 
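# A minimal usage sketch for ``validateConfiguration`` (the dataset type
# names passed to ``ignore`` are illustrative assumptions):
#
#     butler = Butler("repo")
#     try:
#         butler.validateConfiguration(logFailures=True,
#                                      ignore=["raw", "camera"])
#     except ValidationError as err:
#         print(f"Repository configuration problem: {err}")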

2106 @property 

2107 def collections(self) -> CollectionSearch: 

2108 """The collections to search by default, in order (`CollectionSearch`). 

2109 

2110 This is an alias for ``self.registry.defaults.collections``. It cannot 

2111 be set directly in isolation, but all defaults may be changed together 

2112 by assigning a new `RegistryDefaults` instance to 

2113 ``self.registry.defaults``. 

2114 """ 

2115 return self.registry.defaults.collections 

2116 

2117 @property 

2118 def run(self) -> Optional[str]: 

2119 """Name of the run this butler writes outputs to by default (`str` or 

2120 `None`). 

2121 

2122 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2123 directly in isolation, but all defaults may be changed together by 

2124 assigning a new `RegistryDefaults` instance to 

2125 ``self.registry.defaults``. 

2126 """ 

2127 return self.registry.defaults.run 

2128 
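# A minimal sketch of replacing the defaults described above (the collection
# and run names are illustrative assumptions, and the ``collections``/``run``
# keywords of `RegistryDefaults` are assumed from its constructor):
#
#     butler.registry.defaults = RegistryDefaults(collections=["HSC/defaults"],
#                                                 run="u/someone/new-run")
#     assert butler.run == "u/someone/new-run"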

2129 registry: Registry 

2130 """The object that manages dataset metadata and relationships (`Registry`). 

2131 

2132 Most operations that don't involve reading or writing butler datasets are 

2133 accessible only via `Registry` methods. 

2134 """ 

2135 

2136 datastore: Datastore 

2137 """The object that manages actual dataset storage (`Datastore`). 

2138 

2139 Direct user access to the datastore should rarely be necessary; the primary 

2140 exception is the case where a `Datastore` implementation provides extra 

2141 functionality beyond what the base class defines. 

2142 """ 

2143 

2144 storageClasses: StorageClassFactory 

2145 """An object that maps known storage class names to objects that fully 

2146 describe them (`StorageClassFactory`). 

2147 """ 

2148 

2149 _allow_put_of_predefined_dataset: bool 

2150 """Allow a put to succeed even if there is already a registry entry for it 

2151 but not a datastore record. (`bool`)."""