
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36import collections.abc 

37from collections import defaultdict 

38import contextlib 

39import logging 

40import numbers 

41import os 

42from typing import ( 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59try: 

60 import boto3 

61except ImportError: 

62 boto3 = None 

63 

64from lsst.utils import doImport 

65from .core import ( 

66 AmbiguousDatasetError, 

67 ButlerURI, 

68 Config, 

69 ConfigSubset, 

70 DataCoordinate, 

71 DataId, 

72 DataIdValue, 

73 DatasetRef, 

74 DatasetType, 

75 Datastore, 

76 Dimension, 

77 DimensionConfig, 

78 FileDataset, 

79 Progress, 

80 StorageClassFactory, 

81 Timespan, 

82 ValidationError, 

83 VERBOSE, 

84) 

85from .core.repoRelocation import BUTLER_ROOT_TAG 

86from .core.utils import transactional, getClassOf 

87from ._deferredDatasetHandle import DeferredDatasetHandle 

88from ._butlerConfig import ButlerConfig 

89from .registry import ( 

90 Registry, 

91 RegistryConfig, 

92 RegistryDefaults, 

93 CollectionSearch, 

94 CollectionType, 

95 ConflictingDefinitionError, 

96 DatasetIdGenEnum, 

97) 

98from .transfers import RepoExportContext 

99 

100log = logging.getLogger(__name__) 

101 

102 

103class ButlerValidationError(ValidationError): 

104 """There is a problem with the Butler configuration.""" 

105 pass 

106 

107 

108class PruneCollectionsArgsError(TypeError): 

109 """Base class for errors relating to Butler.pruneCollections input 

110 arguments. 

111 """ 

112 pass 

113 

114 

115class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

116 """Raised when purge and unstore are both required to be True, and 

117 purge is True but unstore is False. 

118 """ 

119 

120 def __init__(self) -> None: 

121 super().__init__("Cannot pass purge=True without unstore=True.") 

122 

123 

124class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

125 """Raised when pruning a RUN collection but purge is False.""" 

126 

127 def __init__(self, collectionType: CollectionType): 

128 self.collectionType = collectionType 

129 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

130 

131 

132class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

133 """Raised when purge is True but is not supported for the given 

134 collection.""" 

135 

136 def __init__(self, collectionType: CollectionType): 

137 self.collectionType = collectionType 

138 super().__init__( 

139 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.") 

140 

141 

142class Butler: 

143 """Main entry point for the data access system. 

144 

145 Parameters 

146 ---------- 

147 config : `ButlerConfig`, `Config` or `str`, optional. 

148 Configuration. Anything acceptable to the 

149 `ButlerConfig` constructor. If a directory path 

150 is given the configuration will be read from a ``butler.yaml`` file in 

151 that location. If `None` is given default values will be used. 

152 butler : `Butler`, optional. 

153 If provided, construct a new Butler that uses the same registry and 

154 datastore as the given one, but with the given collection and run. 

155 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

156 arguments. 

157 collections : `str` or `Iterable` [ `str` ], optional 

158 An expression specifying the collections to be searched (in order) when 

159 reading datasets. 

160 This may be a `str` collection name or an iterable thereof. 

161 See :ref:`daf_butler_collection_expressions` for more information. 

162 These collections are not registered automatically and must be 

163 manually registered before they are used by any method, but they may be 

164 manually registered after the `Butler` is initialized. 

165 run : `str`, optional 

166 Name of the `~CollectionType.RUN` collection new datasets should be 

167 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

168 ``collections`` will be set to ``[run]``. If not `None`, this 

169 collection will automatically be registered. If this is not set (and 

170 ``writeable`` is not set either), a read-only butler will be created. 

171 searchPaths : `list` of `str`, optional 

172 Directory paths to search when calculating the full Butler 

173 configuration. Not used if the supplied config is already a 

174 `ButlerConfig`. 

175 writeable : `bool`, optional 

176 Explicitly sets whether the butler supports write operations. If not 

177 provided, a read-write butler is created if any of ``run``, ``tags``, 

178 or ``chains`` is non-empty. 

179 inferDefaults : `bool`, optional 

180 If `True` (default) infer default data ID values from the values 

181 present in the datasets in ``collections``: if all collections have the 

182 same value (or no value) for a governor dimension, that value will be 

183 the default for that dimension. Nonexistent collections are ignored. 

184 If a default value is provided explicitly for a governor dimension via 

185 ``**kwargs``, no default will be inferred for that dimension. 

186 **kwargs : `str` 

187 Default data ID key-value pairs. These may only identify "governor" 

188 dimensions like ``instrument`` and ``skymap``. 

189 

190 Examples 

191 -------- 

192 While there are many ways to control exactly how a `Butler` interacts with 

193 the collections in its `Registry`, the most common cases are still simple. 

194 

195 For a read-only `Butler` that searches one collection, do:: 

196 

197 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

198 

199 For a read-write `Butler` that writes to and reads from a 

200 `~CollectionType.RUN` collection:: 

201 

202 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

203 

204 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

205 because we want to write to one `~CollectionType.RUN` collection but read 

206 from several others (as well):: 

207 

208 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

209 collections=["u/alice/DM-50000/a", 

210 "u/bob/DM-49998", 

211 "HSC/defaults"]) 

212 

213 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

214 Datasets will be read first from that run (since it appears first in the 

215 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

216 

217 Finally, one can always create a `Butler` with no collections:: 

218 

219 butler = Butler("/path/to/repo", writeable=True) 

220 

221 This can be extremely useful when you just want to use ``butler.registry``, 

222 e.g. for inserting dimension data or managing collections, or when the 

223 collections you want to use with the butler are not consistent. 

224 Passing ``writeable`` explicitly here is only necessary if you want to be 

225 able to make changes to the repo - usually the value for ``writeable`` can 

226 be guessed from the collection arguments provided, but it defaults to 

227 `False` when there are no collection arguments.

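    A default data ID for "governor" dimensions can also be supplied via
    keyword arguments, so it need not be repeated in every call (the
    instrument name here is illustrative)::

        butler = Butler("/path/to/repo", collections=["HSC/defaults"],
                        instrument="HSC")
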
228 """ 

229 def __init__(self, config: Union[Config, str, None] = None, *, 

230 butler: Optional[Butler] = None, 

231 collections: Any = None, 

232 run: Optional[str] = None, 

233 searchPaths: Optional[List[str]] = None, 

234 writeable: Optional[bool] = None, 

235 inferDefaults: bool = True, 

236 **kwargs: str, 

237 ): 

238 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

239 # Load registry, datastore, etc. from config or existing butler. 

240 if butler is not None: 

241 if config is not None or searchPaths is not None or writeable is not None: 

242 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' " 

243 "arguments with 'butler' argument.") 

244 self.registry = butler.registry.copy(defaults) 

245 self.datastore = butler.datastore 

246 self.storageClasses = butler.storageClasses 

247 self._config: ButlerConfig = butler._config 

248 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

249 else: 

250 self._config = ButlerConfig(config, searchPaths=searchPaths) 

251 if "root" in self._config: 

252 butlerRoot = self._config["root"] 

253 else: 

254 butlerRoot = self._config.configDir 

255 if writeable is None: 

256 writeable = run is not None 

257 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable, 

258 defaults=defaults) 

259 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(), 

260 butlerRoot=butlerRoot) 

261 self.storageClasses = StorageClassFactory() 

262 self.storageClasses.addFromConfig(self._config) 

263 self._allow_put_of_predefined_dataset = self._config.get("allow_put_of_predefined_dataset", False) 

264 if "run" in self._config or "collection" in self._config: 

265 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

266 

267 GENERATION: ClassVar[int] = 3 

268 """This is a Generation 3 Butler. 

269 

270 This attribute may be removed in the future, once the Generation 2 Butler 

271 interface has been fully retired; it should only be used in transitional 

272 code. 

273 """ 

274 

275 @staticmethod 

276 def makeRepo(root: str, config: Union[Config, str, None] = None, 

277 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False, 

278 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True, 

279 outfile: Optional[str] = None, overwrite: bool = False) -> Config: 

280 """Create an empty data repository by adding a butler.yaml config 

281 to a repository root directory. 

282 

283 Parameters 

284 ---------- 

285 root : `str` or `ButlerURI` 

286 Path or URI to the root location of the new repository. Will be 

287 created if it does not exist. 

288 config : `Config` or `str`, optional 

289 Configuration to write to the repository, after setting any 

290 root-dependent Registry or Datastore config options. Can not 

291 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

292 configuration will be used. Root-dependent config options 

293 specified in this config are overwritten if ``forceConfigRoot`` 

294 is `True`. 

295 dimensionConfig : `Config` or `str`, optional 

296 Configuration for dimensions, will be used to initialize registry 

297 database. 

298 standalone : `bool` 

299 If True, write all expanded defaults, not just customized or 

300 repository-specific settings. 

301 This (mostly) decouples the repository from the default 

302 configuration, insulating it from changes to the defaults (which 

303 may be good or bad, depending on the nature of the changes). 

304 Future *additions* to the defaults will still be picked up when 

305 initializing a `Butler` for repos created with ``standalone=True``.

306 searchPaths : `list` of `str`, optional 

307 Directory paths to search when calculating the full butler 

308 configuration. 

309 forceConfigRoot : `bool`, optional 

310 If `False`, any values present in the supplied ``config`` that 

311 would normally be reset are not overridden and will appear 

312 directly in the output config. This allows non-standard overrides 

313 of the root directory for a datastore or registry to be given. 

314 If this parameter is `True` the values for ``root`` will be 

315 forced into the resulting config if appropriate. 

316 outfile : `str`, optional 

317 If not-`None`, the output configuration will be written to this 

318 location rather than into the repository itself. Can be a URI 

319 string. Can refer to a directory that will be used to write 

320 ``butler.yaml``. 

321 overwrite : `bool`, optional 

322 Create a new configuration file even if one already exists 

323 in the specified output location. Default is to raise 

324 an exception. 

325 

326 Returns 

327 ------- 

328 config : `Config` 

329 The updated `Config` instance written to the repo. 

330 

331 Raises 

332 ------ 

333 ValueError 

334 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

335 regular Config (as these subclasses would make it impossible to 

336 support ``standalone=False``). 

337 FileExistsError 

338 Raised if the output config file already exists. 

339 os.error 

340 Raised if the directory does not exist, exists but is not a 

341 directory, or cannot be created. 

342 

343 Notes 

344 ----- 

345 Note that when ``standalone=False`` (the default), the configuration 

346 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

347 construct the repository should also be used to construct any Butlers 

348 to avoid configuration inconsistencies. 

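    Examples
    --------
    A minimal sketch of creating a new repository and then constructing a
    `Butler` against it (the path is illustrative)::

        config = Butler.makeRepo("/path/to/new/repo")
        butler = Butler("/path/to/new/repo", writeable=True)
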
349 """ 

350 if isinstance(config, (ButlerConfig, ConfigSubset)): 

351 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

352 

353 # Ensure that the root of the repository exists or can be made 

354 uri = ButlerURI(root, forceDirectory=True) 

355 uri.mkdir() 

356 

357 config = Config(config) 

358 

359 # If we are creating a new repo from scratch with relative roots, 

360 # do not propagate an explicit root from the config file 

361 if "root" in config: 

362 del config["root"] 

363 

364 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

365 datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"]) 

366 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

367 

368 # if key exists in given config, parse it, otherwise parse the defaults 

369 # in the expanded config 

370 if config.get(("registry", "db")): 

371 registryConfig = RegistryConfig(config) 

372 else: 

373 registryConfig = RegistryConfig(full) 

374 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

375 if defaultDatabaseUri is not None: 

376 Config.updateParameters(RegistryConfig, config, full, 

377 toUpdate={"db": defaultDatabaseUri}, 

378 overwrite=forceConfigRoot) 

379 else: 

380 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), 

381 overwrite=forceConfigRoot) 

382 

383 if standalone: 

384 config.merge(full) 

385 else: 

386 # Always expand the registry.managers section into the per-repo 

387 # config, because after the database schema is created, it's not 

388 # allowed to change anymore. Note that in the standalone=True 

389 # branch, _everything_ in the config is expanded, so there's no 

390 # need to special case this. 

391 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

392 configURI: Union[str, ButlerURI] 

393 if outfile is not None: 

394 # When writing to a separate location we must include 

395 # the root of the butler repo in the config else it won't know 

396 # where to look. 

397 config["root"] = uri.geturl() 

398 configURI = outfile 

399 else: 

400 configURI = uri 

401 config.dumpToUri(configURI, overwrite=overwrite) 

402 

403 # Create Registry and populate tables 

404 registryConfig = RegistryConfig(config.get("registry")) 

405 dimensionConfig = DimensionConfig(dimensionConfig) 

406 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root) 

407 

408 log.log(VERBOSE, "Wrote new Butler configuration file to %s", configURI) 

409 

410 return config 

411 

412 @classmethod 

413 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str], 

414 defaultDataId: Dict[str, str], writeable: bool) -> Butler: 

415 """Callable used to unpickle a Butler. 

416 

417 We prefer not to use ``Butler.__init__`` directly so we can force some 

418 of its many arguments to be keyword-only (note that ``__reduce__`` 

419 can only invoke callables with positional arguments). 

420 

421 Parameters 

422 ---------- 

423 config : `ButlerConfig` 

424 Butler configuration, already coerced into a true `ButlerConfig` 

425 instance (and hence after any search paths for overrides have been 

426 utilized). 

427 collections : `CollectionSearch` 

428 Names of the default collections to read from. 

429 run : `str`, optional 

430 Name of the default `~CollectionType.RUN` collection to write to. 

431 defaultDataId : `dict` [ `str`, `str` ] 

432 Default data ID values. 

433 writeable : `bool` 

434 Whether the Butler should support write operations. 

435 

436 Returns 

437 ------- 

438 butler : `Butler` 

439 A new `Butler` instance. 

440 """ 

441 # MyPy doesn't recognize that the kwargs below are totally valid; it 

442 # seems to think ``**defaultDataId`` is a _positional_ argument!

443 return cls(config=config, collections=collections, run=run, writeable=writeable, 

444 **defaultDataId) # type: ignore 

445 

446 def __reduce__(self) -> tuple: 

447 """Support pickling. 

448 """ 

449 return (Butler._unpickle, (self._config, self.collections, self.run, 

450 self.registry.defaults.dataId.byName(), 

451 self.registry.isWriteable())) 

452 

453 def __str__(self) -> str: 

454 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

455 self.collections, self.run, self.datastore, self.registry) 

456 

457 def isWriteable(self) -> bool: 

458 """Return `True` if this `Butler` supports write operations. 

459 """ 

460 return self.registry.isWriteable() 

461 

462 @contextlib.contextmanager 

463 def transaction(self) -> Iterator[None]: 

464 """Context manager supporting `Butler` transactions. 

465 

466 Transactions can be nested. 

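    Examples
    --------
    A sketch of grouping two writes so that the registry and datastore
    changes are rolled back together if either fails (the dataset type and
    data IDs are illustrative)::

        with butler.transaction():
            butler.put(calexp1, "calexp", dataId1)
            butler.put(calexp2, "calexp", dataId2)
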
467 """ 

468 with self.registry.transaction(): 

469 with self.datastore.transaction(): 

470 yield 

471 

472 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

473 dataId: Optional[DataId] = None, **kwargs: Any 

474 ) -> Tuple[DatasetType, Optional[DataId]]: 

475 """Standardize the arguments passed to several Butler APIs. 

476 

477 Parameters 

478 ---------- 

479 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

480 When `DatasetRef` the `dataId` should be `None`. 

481 Otherwise the `DatasetType` or name thereof. 

482 dataId : `dict` or `DataCoordinate` 

483 A `dict` of `Dimension` link name, value pairs that label the 

484 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

485 should be provided as the second argument. 

486 **kwargs 

487 Additional keyword arguments used to augment or construct a 

488 `DataCoordinate`. See `DataCoordinate.standardize` 

489 parameters. 

490 

491 Returns 

492 ------- 

493 datasetType : `DatasetType` 

494 A `DatasetType` instance extracted from ``datasetRefOrType``. 

495 dataId : `dict` or `DataId`, optional 

496 Argument that can be used (along with ``kwargs``) to construct a 

497 `DataId`. 

498 

499 Notes 

500 ----- 

501 Butler APIs that conceptually need a DatasetRef also allow passing a 

502 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

503 keyword arguments that can be used to construct one) separately. This 

504 method accepts those arguments and always returns a true `DatasetType` 

505 and a `DataId` or `dict`. 

506 

507 Standardization of `dict` vs `DataId` is best handled by passing the 

508 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

509 generally similarly flexible. 

510 """ 

511 externalDatasetType: Optional[DatasetType] = None 

512 internalDatasetType: Optional[DatasetType] = None 

513 if isinstance(datasetRefOrType, DatasetRef): 

514 if dataId is not None or kwargs: 

515 raise ValueError("DatasetRef given, cannot use dataId as well") 

516 externalDatasetType = datasetRefOrType.datasetType 

517 dataId = datasetRefOrType.dataId 

518 else: 

519 # Don't check whether DataId is provided, because Registry APIs 

520 # can usually construct a better error message when it wasn't. 

521 if isinstance(datasetRefOrType, DatasetType): 

522 externalDatasetType = datasetRefOrType 

523 else: 

524 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

525 

526 # Check that they are self-consistent 

527 if externalDatasetType is not None: 

528 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

529 if externalDatasetType != internalDatasetType: 

530 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

531 f"registry definition ({internalDatasetType})") 

532 

533 assert internalDatasetType is not None 

534 return internalDatasetType, dataId 

535 

536 def _rewrite_data_id(self, dataId: Optional[DataId], datasetType: DatasetType, 

537 **kwargs: Any) -> Tuple[Optional[DataId], Dict[str, Any]]: 

538 """Rewrite a data ID taking into account dimension records. 

539 

540 Take a Data ID and keyword args and rewrite it if necessary to 

541 allow the user to specify dimension records rather than dimension 

542 primary values. 

543 

544 This allows a user to include a dataId dict with keys of 

545 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

546 the integer exposure ID. It also allows a string to be given 

547 for a dimension value rather than the integer ID if that is more 

548 convenient. For example, rather than having to specify the

549 detector with ``detector.full_name``, a string given for ``detector`` 

550 will be interpreted as the full name and converted to the integer 

551 value. 

552 

553 Keyword arguments can also use strings for dimensions like detector 

554 and exposure, but Python does not allow them to include ``.``, and

555 so the ``exposure.day_obs`` syntax cannot be used in a keyword

556 argument. 

557 

558 Parameters 

559 ---------- 

560 dataId : `dict` or `DataCoordinate` 

561 A `dict` of `Dimension` link name, value pairs that will label the 

562 `DatasetRef` within a Collection. 

563 datasetType : `DatasetType` 

564 The dataset type associated with this dataId. Required to 

565 determine the relevant dimensions. 

566 **kwargs 

567 Additional keyword arguments used to augment or construct a 

568 `DataId`. See `DataId` parameters. 

569 

570 Returns 

571 ------- 

572 dataId : `dict` or `DataCoordinate` 

573 The dataId, possibly rewritten. If given a `DataCoordinate` and

574 no keyword arguments, the original dataId will be returned

575 unchanged. 

576 **kwargs : `dict` 

577 Any unused keyword arguments. 

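    Examples
    --------
    A sketch of the kind of caller-facing data ID this rewriting enables
    (the record keys follow the patterns above; the values are
    illustrative)::

        raw = butler.get("raw", {"instrument": "HSC",
                                 "detector": "1_53",
                                 "exposure.day_obs": 20210405,
                                 "exposure.seq_num": 42})
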
578 """ 

579 # Do nothing if we have a standalone DataCoordinate. 

580 if isinstance(dataId, DataCoordinate) and not kwargs: 

581 return dataId, kwargs 

582 

583 # Process dimension records that are using record information 

584 # rather than ids 

585 newDataId: Dict[str, DataIdValue] = {} 

586 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

587 

588 # if all the dataId comes from keyword parameters we do not need 

589 # to do anything here because they can't be of the form 

590 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

591 if dataId: 

592 for k, v in dataId.items(): 

593 # If we have a Dimension we do not need to do anything 

594 # because it cannot be a compound key. 

595 if isinstance(k, str) and "." in k: 

596 # Someone is using a more human-readable dataId 

597 dimensionName, record = k.split(".", 1) 

598 byRecord[dimensionName][record] = v 

599 elif isinstance(k, Dimension): 

600 newDataId[k.name] = v 

601 else: 

602 newDataId[k] = v 

603 

604 # Go through the updated dataId and check the type in case someone is 

605 # using an alternate key. We have already filtered out the compound 

606 # keys dimensions.record format. 

607 not_dimensions = {} 

608 

609 # Will need to look in the dataId and the keyword arguments 

610 # and will remove them if they need to be fixed or are unrecognized. 

611 for dataIdDict in (newDataId, kwargs): 

612 # Use a list so we can adjust the dict safely in the loop 

613 for dimensionName in list(dataIdDict): 

614 value = dataIdDict[dimensionName] 

615 try: 

616 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

617 except KeyError: 

618 # This is not a real dimension 

619 not_dimensions[dimensionName] = value 

620 del dataIdDict[dimensionName] 

621 continue 

622 

623 # Convert an integral type to an explicit int to simplify 

624 # comparisons here 

625 if isinstance(value, numbers.Integral): 

626 value = int(value) 

627 

628 if not isinstance(value, dimension.primaryKey.getPythonType()): 

629 for alternate in dimension.alternateKeys: 

630 if isinstance(value, alternate.getPythonType()): 

631 byRecord[dimensionName][alternate.name] = value 

632 del dataIdDict[dimensionName] 

633 log.debug("Converting dimension %s to %s.%s=%s", 

634 dimensionName, dimensionName, alternate.name, value) 

635 break 

636 else: 

637 log.warning("Type mismatch found for value '%r' provided for dimension %s. " 

638 "Could not find matching alternative (primary key has type %s) " 

639 "so attempting to use as-is.", 

640 value, dimensionName, dimension.primaryKey.getPythonType()) 

641 

642 # If we have some unrecognized dimensions we have to try to connect 

643 # them to records in other dimensions. This is made more complicated 

644 # by some dimensions having records with clashing names. A mitigation 

645 # is that we can tell by this point which dimensions are missing 

646 # for the DatasetType but this does not work for calibrations 

647 # where additional dimensions can be used to constrain the temporal 

648 # axis. 

649 if not_dimensions: 

650 # Calculate missing dimensions 

651 provided = set(newDataId) | set(kwargs) | set(byRecord) 

652 missingDimensions = datasetType.dimensions.names - provided 

653 

654 # For calibrations we may well be needing temporal dimensions 

655 # so rather than always including all dimensions in the scan 

656 # restrict things a little. It is still possible for there 

657 # to be confusion over day_obs in visit vs exposure for example. 

658 # If we are not searching calibration collections things may 

659 # fail but they are going to fail anyway because of the 

660 # ambiguity of the dataId...

661 candidateDimensions: Set[str] = set() 

662 candidateDimensions.update(missingDimensions) 

663 if datasetType.isCalibration(): 

664 for dim in self.registry.dimensions.getStaticDimensions(): 

665 if dim.temporal: 

666 candidateDimensions.add(str(dim)) 

667 

668 # Lookup table for the first association with a dimension

669 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

670 

671 # Keep track of whether an item is associated with multiple 

672 # dimensions. 

673 counter: Counter[str] = Counter() 

674 assigned: Dict[str, Set[str]] = defaultdict(set) 

675 

676 # Go through the missing dimensions and associate the 

677 # given names with records within those dimensions 

678 for dimensionName in candidateDimensions: 

679 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

680 fields = dimension.metadata.names | dimension.uniqueKeys.names 

681 for field in not_dimensions: 

682 if field in fields: 

683 guessedAssociation[dimensionName][field] = not_dimensions[field] 

684 counter[dimensionName] += 1 

685 assigned[field].add(dimensionName) 

686 

687 # There is a chance we have allocated a single dataId item 

688 # to multiple dimensions. Need to decide which should be retained. 

689 # For now assume that the most popular alternative wins. 

690 # This means that day_obs with seq_num will result in 

691 # exposure.day_obs and not visit.day_obs 

692 # Also prefer an explicitly missing dimension over an inferred 

693 # temporal dimension. 

694 for fieldName, assignedDimensions in assigned.items(): 

695 if len(assignedDimensions) > 1: 

696 # Pick the most popular (preferring mandatory dimensions) 

697 requiredButMissing = assignedDimensions.intersection(missingDimensions) 

698 if requiredButMissing: 

699 candidateDimensions = requiredButMissing 

700 else: 

701 candidateDimensions = assignedDimensions 

702 

703 # Select the relevant items and get a new restricted 

704 # counter. 

705 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

706 duplicatesCounter: Counter[str] = Counter() 

707 duplicatesCounter.update(theseCounts) 

708 

709 # Choose the most common. If they are equally common 

710 # we will pick the one that was found first. 

711 # (most_common returns a list of (value, count) tuples.)

712 selected = duplicatesCounter.most_common(1)[0][0] 

713 

714 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

715 " Removed ambiguity by choosing dimension %s.", 

716 fieldName, ", ".join(assignedDimensions), selected) 

717 

718 for candidateDimension in assignedDimensions: 

719 if candidateDimension != selected: 

720 del guessedAssociation[candidateDimension][fieldName] 

721 

722 # Update the record look up dict with the new associations 

723 for dimensionName, values in guessedAssociation.items(): 

724 if values: # A dict might now be empty 

725 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", 

726 dimensionName, values) 

727 byRecord[dimensionName].update(values) 

728 

729 if byRecord: 

730 # Some record specifiers were found so we need to convert 

731 # them to the Id form 

732 for dimensionName, values in byRecord.items(): 

733 if dimensionName in newDataId: 

734 log.warning("DataId specified explicit %s dimension value of %s in addition to" 

735 " general record specifiers for it of %s. Ignoring record information.", 

736 dimensionName, newDataId[dimensionName], str(values)) 

737 continue 

738 

739 # Build up a WHERE expression 

740 bind = {k: v for k, v in values.items()} 

741 where = " AND ".join(f"{dimensionName}.{k} = {k}" 

742 for k in bind) 

743 

744 # Hopefully we get a single record that matches 

745 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId, 

746 where=where, bind=bind, **kwargs)) 

747 

748 if len(records) != 1: 

749 if len(records) > 1: 

750 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

751 for r in records: 

752 log.debug("- %s", str(r)) 

753 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not" 

754 f" uniquely constrained to a single dataset by {values}." 

755 f" Got {len(records)} results.") 

756 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no" 

757 f" records when constrained by {values}") 

758 

759 # Get the primary key from the real dimension object 

760 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

761 if not isinstance(dimension, Dimension): 

762 raise RuntimeError( 

763 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

764 ) 

765 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

766 

767 # We have modified the dataId so need to switch to it 

768 dataId = newDataId 

769 

770 return dataId, kwargs 

771 

772 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

773 dataId: Optional[DataId] = None, *, 

774 collections: Any = None, 

775 allowUnresolved: bool = False, 

776 **kwargs: Any) -> DatasetRef: 

777 """Shared logic for methods that start with a search for a dataset in 

778 the registry. 

779 

780 Parameters 

781 ---------- 

782 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

783 When `DatasetRef` the `dataId` should be `None`. 

784 Otherwise the `DatasetType` or name thereof. 

785 dataId : `dict` or `DataCoordinate`, optional 

786 A `dict` of `Dimension` link name, value pairs that label the 

787 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

788 should be provided as the first argument. 

789 collections : Any, optional 

790 Collections to be searched, overriding ``self.collections``. 

791 Can be any of the types supported by the ``collections`` argument 

792 to butler construction. 

793 allowUnresolved : `bool`, optional 

794 If `True`, return an unresolved `DatasetRef` if finding a resolved 

795 one in the `Registry` fails. Defaults to `False`. 

796 **kwargs 

797 Additional keyword arguments used to augment or construct a 

798 `DataId`. See `DataId` parameters. 

799 

800 Returns 

801 ------- 

802 ref : `DatasetRef` 

803 A reference to the dataset identified by the given arguments. 

804 

805 Raises 

806 ------ 

807 LookupError 

808 Raised if no matching dataset exists in the `Registry` (and 

809 ``allowUnresolved is False``). 

810 ValueError 

811 Raised if a resolved `DatasetRef` was passed as an input, but it 

812 differs from the one found in the registry. 

813 TypeError 

814 Raised if no collections were provided. 

815 """ 

816 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

817 if isinstance(datasetRefOrType, DatasetRef): 

818 idNumber = datasetRefOrType.id 

819 else: 

820 idNumber = None 

821 timespan: Optional[Timespan] = None 

822 

823 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

824 

825 if datasetType.isCalibration(): 

826 # Because this is a calibration dataset, first try to

827 # standardize the data ID without restricting the dimensions to

828 # those of the dataset type requested, because there may be extra 

829 # dimensions that provide temporal information for a validity-range 

830 # lookup. 

831 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions, 

832 defaults=self.registry.defaults.dataId, **kwargs) 

833 if dataId.graph.temporal: 

834 dataId = self.registry.expandDataId(dataId) 

835 timespan = dataId.timespan 

836 else: 

837 # Standardize the data ID to just the dimensions of the dataset 

838 # type instead of letting registry.findDataset do it, so we get the 

839 # result even if no dataset is found. 

840 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions, 

841 defaults=self.registry.defaults.dataId, **kwargs) 

842 # Always lookup the DatasetRef, even if one is given, to ensure it is 

843 # present in the current collection. 

844 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

845 if ref is None: 

846 if allowUnresolved: 

847 return DatasetRef(datasetType, dataId) 

848 else: 

849 if collections is None: 

850 collections = self.registry.defaults.collections 

851 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} " 

852 f"could not be found in collections {collections}.") 

853 if idNumber is not None and idNumber != ref.id: 

854 if collections is None: 

855 collections = self.registry.defaults.collections 

856 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match " 

857 f"id ({ref.id}) in registry in collections {collections}.") 

858 return ref 

859 

860 @transactional 

861 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

862 dataId: Optional[DataId] = None, *, 

863 run: Optional[str] = None, 

864 **kwargs: Any) -> DatasetRef: 

865 """Store and register a dataset. 

866 

867 Parameters 

868 ---------- 

869 obj : `object` 

870 The dataset. 

871 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

872 When `DatasetRef` is provided, ``dataId`` should be `None`. 

873 Otherwise the `DatasetType` or name thereof. 

874 dataId : `dict` or `DataCoordinate` 

875 A `dict` of `Dimension` link name, value pairs that label the 

876 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

877 should be provided as the second argument. 

878 run : `str`, optional 

879 The name of the run the dataset should be added to, overriding 

880 ``self.run``. 

881 **kwargs 

882 Additional keyword arguments used to augment or construct a 

883 `DataCoordinate`. See `DataCoordinate.standardize` 

884 parameters. 

885 

886 Returns 

887 ------- 

888 ref : `DatasetRef` 

889 A reference to the stored dataset, updated with the correct id if 

890 given. 

891 

892 Raises 

893 ------ 

894 TypeError 

895 Raised if the butler is read-only or if no run has been provided. 

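        Examples
        --------
        A minimal sketch, assuming a writeable butler with a default run
        and a registered "calexp" dataset type (names and data ID values
        are illustrative)::

            butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
            ref = butler.put(calexp, "calexp", instrument="HSC",
                             visit=903334, detector=22)
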
896 """ 

897 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

898 if not self.isWriteable(): 

899 raise TypeError("Butler is read-only.") 

900 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

901 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

902 raise ValueError("DatasetRef must not be in registry, must have None id") 

903 

904 # Handle dimension records in dataId 

905 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

906 

907 # Add Registry Dataset entry. 

908 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

909 

910 # For an execution butler the datasets will be pre-defined. 

911 # If the butler is configured that way datasets should only be inserted 

912 # if they do not already exist in registry. Trying and catching 

913 # ConflictingDefinitionError will not work because the transaction 

914 # will be corrupted. Instead, in this mode always check first. 

915 ref = None 

916 ref_is_predefined = False 

917 if self._allow_put_of_predefined_dataset: 

918 # Get the matching ref for this run. 

919 ref = self.registry.findDataset(datasetType, collections=run, 

920 dataId=dataId) 

921 

922 if ref: 

923 # Must be expanded form for datastore templating 

924 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

925 ref = ref.expanded(dataId) 

926 ref_is_predefined = True 

927 

928 if not ref: 

929 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

930 

931 # If the ref is predefined it is possible that the datastore also 

932 # has the record. Asking datastore to put it again will result in 

933 # the artifact being recreated, overwriting the previous one; the

934 # record write will then fail, which will cause the artifact

935 # to be removed. Much safer to ask first before attempting to 

936 # overwrite. Race conditions should not be an issue for the 

937 # execution butler environment. 

938 if ref_is_predefined: 

939 if self.datastore.knows(ref): 

940 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")

941 

942 self.datastore.put(obj, ref) 

943 

944 return ref 

945 

946 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any: 

947 """Retrieve a stored dataset. 

948 

949 Unlike `Butler.get`, this method allows datasets outside the Butler's 

950 collection to be read as long as the `DatasetRef` that identifies them 

951 can be obtained separately. 

952 

953 Parameters 

954 ---------- 

955 ref : `DatasetRef` 

956 Resolved reference to an already stored dataset. 

957 parameters : `dict` 

958 Additional StorageClass-defined options to control reading, 

959 typically used to efficiently read only a subset of the dataset. 

960 

961 Returns 

962 ------- 

963 obj : `object` 

964 The dataset. 

965 """ 

966 return self.datastore.get(ref, parameters=parameters) 

967 

968 def getDirectDeferred(self, ref: DatasetRef, *, 

969 parameters: Union[dict, None] = None) -> DeferredDatasetHandle: 

970 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

971 from a resolved `DatasetRef`. 

972 

973 Parameters 

974 ---------- 

975 ref : `DatasetRef` 

976 Resolved reference to an already stored dataset. 

977 parameters : `dict` 

978 Additional StorageClass-defined options to control reading, 

979 typically used to efficiently read only a subset of the dataset. 

980 

981 Returns 

982 ------- 

983 obj : `DeferredDatasetHandle` 

984 A handle which can be used to retrieve a dataset at a later time. 

985 

986 Raises 

987 ------ 

988 AmbiguousDatasetError 

989 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

990 """ 

991 if ref.id is None: 

992 raise AmbiguousDatasetError( 

993 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

994 ) 

995 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

996 

997 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

998 dataId: Optional[DataId] = None, *, 

999 parameters: Union[dict, None] = None, 

1000 collections: Any = None, 

1001 **kwargs: Any) -> DeferredDatasetHandle: 

1002 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1003 after an immediate registry lookup. 

1004 

1005 Parameters 

1006 ---------- 

1007 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1008 When `DatasetRef` the `dataId` should be `None`. 

1009 Otherwise the `DatasetType` or name thereof. 

1010 dataId : `dict` or `DataCoordinate`, optional 

1011 A `dict` of `Dimension` link name, value pairs that label the 

1012 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1013 should be provided as the first argument. 

1014 parameters : `dict` 

1015 Additional StorageClass-defined options to control reading, 

1016 typically used to efficiently read only a subset of the dataset. 

1017 collections : Any, optional 

1018 Collections to be searched, overriding ``self.collections``. 

1019 Can be any of the types supported by the ``collections`` argument 

1020 to butler construction. 

1021 **kwargs 

1022 Additional keyword arguments used to augment or construct a 

1023 `DataId`. See `DataId` parameters. 

1024 

1025 Returns 

1026 ------- 

1027 obj : `DeferredDatasetHandle` 

1028 A handle which can be used to retrieve a dataset at a later time. 

1029 

1030 Raises 

1031 ------ 

1032 LookupError 

1033 Raised if no matching dataset exists in the `Registry`.

1035 ValueError 

1036 Raised if a resolved `DatasetRef` was passed as an input, but it 

1037 differs from the one found in the registry. 

1038 TypeError 

1039 Raised if no collections were provided. 

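        Examples
        --------
        A sketch of deferring the actual read until the handle is used
        (dataset type and data ID values are illustrative)::

            handle = butler.getDeferred("calexp", instrument="HSC",
                                        visit=903334, detector=22)
            calexp = handle.get()  # the dataset is read here
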
1040 """ 

1041 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1042 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1043 

1044 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1045 dataId: Optional[DataId] = None, *, 

1046 parameters: Optional[Dict[str, Any]] = None, 

1047 collections: Any = None, 

1048 **kwargs: Any) -> Any: 

1049 """Retrieve a stored dataset. 

1050 

1051 Parameters 

1052 ---------- 

1053 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1054 When `DatasetRef` the `dataId` should be `None`. 

1055 Otherwise the `DatasetType` or name thereof. 

1056 dataId : `dict` or `DataCoordinate` 

1057 A `dict` of `Dimension` link name, value pairs that label the 

1058 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1059 should be provided as the first argument. 

1060 parameters : `dict` 

1061 Additional StorageClass-defined options to control reading, 

1062 typically used to efficiently read only a subset of the dataset. 

1063 collections : Any, optional 

1064 Collections to be searched, overriding ``self.collections``. 

1065 Can be any of the types supported by the ``collections`` argument 

1066 to butler construction. 

1067 **kwargs 

1068 Additional keyword arguments used to augment or construct a 

1069 `DataCoordinate`. See `DataCoordinate.standardize` 

1070 parameters. 

1071 

1072 Returns 

1073 ------- 

1074 obj : `object` 

1075 The dataset. 

1076 

1077 Raises 

1078 ------ 

1079 ValueError 

1080 Raised if a resolved `DatasetRef` was passed as an input, but it 

1081 differs from the one found in the registry. 

1082 LookupError 

1083 Raised if no matching dataset exists in the `Registry`. 

1084 TypeError 

1085 Raised if no collections were provided. 

1086 

1087 Notes 

1088 ----- 

1089 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1090 this method requires that the given data ID include temporal dimensions 

1091 beyond the dimensions of the dataset type itself, in order to find the 

1092 dataset with the appropriate validity range. For example, a "bias" 

1093 dataset with native dimensions ``{instrument, detector}`` could be 

1094 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1095 ``exposure`` is a temporal dimension. 

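        Examples
        --------
        A sketch of the calibration lookup described above: a "bias" with
        native dimensions ``{instrument, detector}`` fetched with a data ID
        that adds the temporal ``exposure`` dimension (values are
        illustrative)::

            bias = butler.get("bias", instrument="HSC", detector=22,
                              exposure=903334, collections="HSC/calib")
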
1096 """ 

1097 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1098 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1099 return self.getDirect(ref, parameters=parameters) 

1100 

1101 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1102 dataId: Optional[DataId] = None, *, 

1103 predict: bool = False, 

1104 collections: Any = None, 

1105 run: Optional[str] = None, 

1106 **kwargs: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1107 """Returns the URIs associated with the dataset. 

1108 

1109 Parameters 

1110 ---------- 

1111 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1112 When `DatasetRef` the `dataId` should be `None`. 

1113 Otherwise the `DatasetType` or name thereof. 

1114 dataId : `dict` or `DataCoordinate` 

1115 A `dict` of `Dimension` link name, value pairs that label the 

1116 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1117 should be provided as the first argument. 

1118 predict : `bool` 

1119 If `True`, allow URIs to be returned of datasets that have not 

1120 been written. 

1121 collections : Any, optional 

1122 Collections to be searched, overriding ``self.collections``. 

1123 Can be any of the types supported by the ``collections`` argument 

1124 to butler construction. 

1125 run : `str`, optional 

1126 Run to use for predictions, overriding ``self.run``. 

1127 **kwargs 

1128 Additional keyword arguments used to augment or construct a 

1129 `DataCoordinate`. See `DataCoordinate.standardize` 

1130 parameters. 

1131 

1132 Returns 

1133 ------- 

1134 primary : `ButlerURI` 

1135 The URI to the primary artifact associated with this dataset. 

1136 If the dataset was disassembled within the datastore this 

1137 may be `None`. 

1138 components : `dict` 

1139 URIs to any components associated with the dataset artifact. 

1140 Can be empty if there are no components. 

1141 """ 

1142 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict, 

1143 collections=collections, **kwargs) 

1144 if ref.id is None: # only possible if predict is True 

1145 if run is None: 

1146 run = self.run 

1147 if run is None: 

1148 raise TypeError("Cannot predict location with run=None.") 

1149 # Lie about ID, because we can't guess it, and only 

1150 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1151 ref = ref.resolved(id=0, run=run) 

1152 return self.datastore.getURIs(ref, predict) 

1153 

1154 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1155 dataId: Optional[DataId] = None, *, 

1156 predict: bool = False, 

1157 collections: Any = None, 

1158 run: Optional[str] = None, 

1159 **kwargs: Any) -> ButlerURI: 

1160 """Return the URI to the Dataset. 

1161 

1162 Parameters 

1163 ---------- 

1164 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1165 When `DatasetRef` the `dataId` should be `None`. 

1166 Otherwise the `DatasetType` or name thereof. 

1167 dataId : `dict` or `DataCoordinate` 

1168 A `dict` of `Dimension` link name, value pairs that label the 

1169 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1170 should be provided as the first argument. 

1171 predict : `bool` 

1172 If `True`, allow URIs to be returned of datasets that have not 

1173 been written. 

1174 collections : Any, optional 

1175 Collections to be searched, overriding ``self.collections``. 

1176 Can be any of the types supported by the ``collections`` argument 

1177 to butler construction. 

1178 run : `str`, optional 

1179 Run to use for predictions, overriding ``self.run``. 

1180 **kwargs 

1181 Additional keyword arguments used to augment or construct a 

1182 `DataCoordinate`. See `DataCoordinate.standardize` 

1183 parameters. 

1184 

1185 Returns 

1186 ------- 

1187 uri : `ButlerURI` 

1188 URI pointing to the Dataset within the datastore. If the 

1189 Dataset does not exist in the datastore, and if ``predict`` is 

1190 `True`, the URI will be a prediction and will include a URI 

1191 fragment "#predicted". 

1192 If the datastore does not have entities that relate well 

1193 to the concept of a URI the returned URI string will be 

1194 descriptive. The returned URI is not guaranteed to be obtainable. 

1195 

1196 Raises 

1197 ------ 

1198 LookupError 

1199 A URI has been requested for a dataset that does not exist and 

1200 guessing is not allowed. 

1201 ValueError 

1202 Raised if a resolved `DatasetRef` was passed as an input, but it 

1203 differs from the one found in the registry. 

1204 TypeError 

1205 Raised if no collections were provided. 

1206 RuntimeError 

1207 Raised if a URI is requested for a dataset that consists of 

1208 multiple artifacts. 

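        Examples
        --------
        A sketch of looking up the location of a stored dataset (dataset
        type and data ID values are illustrative)::

            uri = butler.getURI("calexp", instrument="HSC", visit=903334,
                                detector=22)
            print(uri.geturl())
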
1209 """ 

1210 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict, 

1211 collections=collections, run=run, **kwargs) 

1212 

1213 if primary is None or components: 

1214 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1215 "Use Butler.getURIs() instead.") 

1216 return primary 

1217 

1218 def retrieveArtifacts(self, refs: Iterable[DatasetRef], 

1219 destination: Union[str, ButlerURI], transfer: str = "auto", 

1220 preserve_path: bool = True, 

1221 overwrite: bool = False) -> List[ButlerURI]: 

1222 """Retrieve the artifacts associated with the supplied refs. 

1223 

1224 Parameters 

1225 ---------- 

1226 refs : iterable of `DatasetRef` 

1227 The datasets for which artifacts are to be retrieved. 

1228 A single ref can result in multiple artifacts. The refs must 

1229 be resolved. 

1230 destination : `ButlerURI` or `str` 

1231 Location to write the artifacts. 

1232 transfer : `str`, optional 

1233 Method to use to transfer the artifacts. Must be one of the options 

1234 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

1235 preserve_path : `bool`, optional 

1236 If `True` the full path of the artifact within the datastore 

1237 is preserved. If `False` the final file component of the path 

1238 is used. 

1239 overwrite : `bool`, optional 

1240 If `True` allow transfers to overwrite existing files at the 

1241 destination. 

1242 

1243 Returns 

1244 ------- 

1245 targets : `list` of `ButlerURI` 

1246 URIs of file artifacts in destination location. Order is not 

1247 preserved. 

1248 

1249 Notes 

1250 ----- 

1251 For non-file datastores the artifacts written to the destination 

1252 may not match the representation inside the datastore. For example 

1253 a hierarchical data structure in a NoSQL database may well be stored 

1254 as a JSON file. 

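        Examples
        --------
        A sketch of copying the file artifacts for a query result to a
        local directory (the collection name and destination are
        illustrative)::

            refs = butler.registry.queryDatasets(
                "calexp", collections="u/alice/DM-50000/a")
            targets = butler.retrieveArtifacts(refs, "/tmp/export",
                                               transfer="copy")
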
1255 """ 

1256 return self.datastore.retrieveArtifacts(refs, ButlerURI(destination), transfer=transfer, 

1257 preserve_path=preserve_path, overwrite=overwrite) 

1258 

1259 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1260 dataId: Optional[DataId] = None, *, 

1261 collections: Any = None, 

1262 **kwargs: Any) -> bool: 

1263 """Return True if the Dataset is actually present in the Datastore. 

1264 

1265 Parameters 

1266 ---------- 

1267 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1268 When `DatasetRef` the `dataId` should be `None`. 

1269 Otherwise the `DatasetType` or name thereof. 

1270 dataId : `dict` or `DataCoordinate` 

1271 A `dict` of `Dimension` link name, value pairs that label the 

1272 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1273 should be provided as the first argument. 

1274 collections : Any, optional 

1275 Collections to be searched, overriding ``self.collections``. 

1276 Can be any of the types supported by the ``collections`` argument 

1277 to butler construction. 

1278 **kwargs 

1279 Additional keyword arguments used to augment or construct a 

1280 `DataCoordinate`. See `DataCoordinate.standardize` 

1281 parameters. 

1282 

1283 Raises 

1284 ------ 

1285 LookupError 

1286 Raised if the dataset is not even present in the Registry. 

1287 ValueError 

1288 Raised if a resolved `DatasetRef` was passed as an input, but it 

1289 differs from the one found in the registry. 

1290 TypeError 

1291 Raised if no collections were provided. 

1292 """ 

1293 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1294 return self.datastore.exists(ref) 

1295 

1296 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1297 """Remove one or more `~CollectionType.RUN` collections and the 

1298 datasets within them. 

1299 

1300 Parameters 

1301 ---------- 

1302 names : `Iterable` [ `str` ] 

1303 The names of the collections to remove. 

1304 unstore : `bool`, optional 

1305 If `True` (default), delete datasets from all datastores in which 

1306 they are present, and attempt to roll back the registry deletions if

1307 datastore deletions fail (which may not always be possible). If 

1308 `False`, datastore records for these datasets are still removed, 

1309 but any artifacts (e.g. files) will not be. 

1310 

1311 Raises 

1312 ------ 

1313 TypeError 

1314 Raised if one or more collections are not of type 

1315 `~CollectionType.RUN`. 

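        Examples
        --------
        A sketch of deleting a scratch run and the artifacts stored for it
        (the collection name is illustrative)::

            butler.removeRuns(["u/alice/scratch"], unstore=True)
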
1316 """ 

1317 if not self.isWriteable(): 

1318 raise TypeError("Butler is read-only.") 

1319 names = list(names) 

1320 refs: List[DatasetRef] = [] 

1321 for name in names: 

1322 collectionType = self.registry.getCollectionType(name) 

1323 if collectionType is not CollectionType.RUN: 

1324 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1325 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1326 with self.registry.transaction(): 

1327 if unstore: 

1328 self.datastore.trash(refs) 

1329 else: 

1330 self.datastore.forget(refs) 

1331 for name in names: 

1332 self.registry.removeCollection(name) 

1333 if unstore: 

1334 # Point of no return for removing artifacts 

1335 self.datastore.emptyTrash() 

1336 

1337 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False, 

1338 unlink: Optional[List[str]] = None) -> None: 

1339 """Remove a collection and possibly prune datasets within it. 

1340 

1341 Parameters 

1342 ---------- 

1343 name : `str` 

1344 Name of the collection to remove. If this is a 

1345 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1346 datasets within the collection are not modified unless ``unstore`` 

1347 is `True`. If this is a `~CollectionType.RUN` collection, 

1348 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1349 are fully removed from the data repository. 

1350 purge : `bool`, optional 

1351 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1352 fully removing datasets within them. Requires ``unstore=True`` as 

1353 well, as an added precaution against accidental deletion. Must be 

1354 `False` (default) if the collection is not a ``RUN``. 

1355 unstore : `bool`, optional 

1356 If `True`, remove all datasets in the collection from all 

1357 datastores in which they appear. 

1358 unlink : `list` [`str`], optional 

1359 Before removing the named collection, unlink it from these 

1360 parent collections. 

1361 

1362 Raises 

1363 ------ 

1364 TypeError 

1365 Raised if the butler is read-only or arguments are mutually 

1366 inconsistent. 

1367 """ 

1368 # See pruneDatasets comments for more information about the logic here; 

1369 # the cases are almost the same, but here we can rely on Registry to 

1370 # take care of everything but Datastore deletion when we remove the 

1371 # collection. 

1372 if not self.isWriteable(): 

1373 raise TypeError("Butler is read-only.") 

1374 collectionType = self.registry.getCollectionType(name) 

1375 if purge and not unstore: 

1376 raise PurgeWithoutUnstorePruneCollectionsError() 

1377 if collectionType is CollectionType.RUN and not purge: 

1378 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1379 if collectionType is not CollectionType.RUN and purge: 

1380 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1381 

1382 def remove(child: str, parent: str) -> None: 

1383 """Remove a child collection from a parent collection.""" 

1384 # Remove child from parent. 

1385 chain = list(self.registry.getCollectionChain(parent)) 

1386 try: 

1387 chain.remove(child) 

1388 except ValueError as e: 

1389 raise RuntimeError(f"{child} is not a child of {parent}") from e 

1390 self.registry.setCollectionChain(parent, chain) 

1391 

1392 with self.registry.transaction(): 

1393 if unlink: 

1394 for parent in unlink: 

1395 remove(name, parent) 

1396 if unstore: 

1397 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1398 self.datastore.trash(refs) 

1399 self.registry.removeCollection(name) 

1400 

1401 if unstore: 

1402 # Point of no return for removing artifacts 

1403 self.datastore.emptyTrash() 

1404 
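# Usage sketches for pruneCollection; the collection names below are
# hypothetical.
#
#     # Fully remove a RUN collection and the datasets inside it.
#     butler.pruneCollection("u/alice/scratch", purge=True, unstore=True)
#
#     # Remove a TAGGED collection, first unlinking it from a parent chain.
#     butler.pruneCollection("my-tag", unlink=["my-chain"])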

1405 def pruneDatasets(self, refs: Iterable[DatasetRef], *, 

1406 disassociate: bool = True, 

1407 unstore: bool = False, 

1408 tags: Iterable[str] = (), 

1409 purge: bool = False, 

1410 run: Optional[str] = None) -> None: 

1411 """Remove one or more datasets from a collection and/or storage. 

1412 

1413 Parameters 

1414 ---------- 

1415 refs : `~collections.abc.Iterable` of `DatasetRef` 

1416 Datasets to prune. These must be "resolved" references (not just 

1417 a `DatasetType` and data ID). 

1418 disassociate : `bool`, optional 

1419 Disassociate pruned datasets from ``tags``, or from all collections 

1420 if ``purge=True``. 

1421 unstore : `bool`, optional 

1422 If `True` (`False` is default) remove these datasets from all 

1423 datastores known to this butler. Note that this will make it 

1424 impossible to retrieve these datasets even via other collections. 

1425 Datasets that are already not stored are ignored by this option. 

1426 tags : `Iterable` [ `str` ], optional 

1427 `~CollectionType.TAGGED` collections to disassociate the datasets 

1428 from. Ignored if ``disassociate`` is `False` or ``purge`` is 

1429 `True`. 

1430 purge : `bool`, optional 

1431 If `True` (`False` is default), completely remove the dataset from 

1432 the `Registry`. To prevent accidental deletions, ``purge`` may 

1433 only be `True` if all of the following conditions are met: 

1434 

1435 - All given datasets are in the given run. 

1436 - ``disassociate`` is `True`. 

1437 - ``unstore`` is `True`. 

1438 

1439 This mode may remove provenance information from datasets other 

1440 than those provided, and should be used with extreme care. 

1441 

1442 Raises 

1443 ------ 

1444 TypeError 

1445 Raised if the butler is read-only, if no tag collections were provided 

1446 while disassociating, or if the conditions for ``purge=True`` were not met. 

1447 """ 

1448 if not self.isWriteable(): 

1449 raise TypeError("Butler is read-only.") 

1450 if purge: 

1451 if not disassociate: 

1452 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1453 if not unstore: 

1454 raise TypeError("Cannot pass purge=True without unstore=True.") 

1455 elif disassociate: 

1456 tags = tuple(tags) 

1457 if not tags: 

1458 raise TypeError("No tags provided but disassociate=True.") 

1459 for tag in tags: 

1460 collectionType = self.registry.getCollectionType(tag) 

1461 if collectionType is not CollectionType.TAGGED: 

1462 raise TypeError(f"Cannot disassociate from collection '{tag}' " 

1463 f"of non-TAGGED type {collectionType.name}.") 

1464 # Transform possibly-single-pass iterable into something we can iterate 

1465 # over multiple times. 

1466 refs = list(refs) 

1467 # Pruning a component of a DatasetRef makes no sense since registry 

1468 # doesn't know about components and datastore might not store 

1469 # components in a separate file 

1470 for ref in refs: 

1471 if ref.datasetType.component(): 

1472 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})") 

1473 # We don't need an unreliable Datastore transaction for this, because 

1474 # we've been extra careful to ensure that Datastore.trash only involves 

1475 # mutating the Registry (it can _look_ at Datastore-specific things, 

1476 # but shouldn't change them), and hence all operations here are 

1477 # Registry operations. 

1478 with self.registry.transaction(): 

1479 if unstore: 

1480 self.datastore.trash(refs) 

1481 if purge: 

1482 self.registry.removeDatasets(refs) 

1483 elif disassociate: 

1484 assert tags, "Guaranteed by earlier logic in this function." 

1485 for tag in tags: 

1486 self.registry.disassociate(tag, refs) 

1487 # We've exited the Registry transaction, and apparently committed. 

1488 # (if there was an exception, everything rolled back, and it's as if 

1489 # nothing happened - and we never get here). 

1490 # Datastore artifacts are not yet gone, but they're clearly marked 

1491 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1492 # problems we can try again later, and if manual administrative 

1493 # intervention is required, it's pretty clear what that should entail: 

1494 # deleting everything on disk and in private Datastore tables that is 

1495 # in the dataset_location_trash table. 

1496 if unstore: 

1497 # Point of no return for removing artifacts 

1498 self.datastore.emptyTrash() 

1499 
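# Usage sketch for pruneDatasets: fully delete the datasets found in a RUN
# collection. The dataset type and collection name are illustrative only.
#
#     refs = list(butler.registry.queryDatasets("raw", collections="u/alice/run",
#                                               findFirst=True))
#     butler.pruneDatasets(refs, disassociate=True, unstore=True, purge=True)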

1500 @transactional 

1501 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None, 

1502 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1503 ) -> None: 

1504 """Store and register one or more datasets that already exist on disk. 

1505 

1506 Parameters 

1507 ---------- 

1508 datasets : `FileDataset` 

1509 Each positional argument is a struct containing information about 

1510 a file to be ingested, including its URI (either absolute or 

1511 relative to the datastore root, if applicable), a `DatasetRef`, 

1512 and optionally a formatter class or its fully-qualified string 

1513 name. If a formatter is not provided, the formatter that would be 

1514 used for `put` is assumed. On successful return, all 

1515 `FileDataset.refs` will have their `DatasetRef.id` 

1516 attribute populated and all `FileDataset.formatter` attributes will 

1517 be set to the formatter class used. `FileDataset.path` attributes 

1518 may be modified to put paths in whatever the datastore considers a 

1519 standardized form. 

1520 transfer : `str`, optional 

1521 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1522 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1523 transfer the file. 

1524 run : `str`, optional 

1525 The name of the run ingested datasets should be added to, 

1526 overriding ``self.run``. 

1527 idGenerationMode : `DatasetIdGenEnum`, optional 

1528 Specifies option for generating dataset IDs. By default unique IDs 

1529 are generated for each inserted dataset. 

1530 

1531 Raises 

1532 ------ 

1533 TypeError 

1534 Raised if the butler is read-only or if no run was provided. 

1535 NotImplementedError 

1536 Raised if the `Datastore` does not support the given transfer mode. 

1537 DatasetTypeNotSupportedError 

1538 Raised if one or more files to be ingested have a dataset type that 

1539 is not supported by the `Datastore`. 

1540 FileNotFoundError 

1541 Raised if one of the given files does not exist. 

1542 FileExistsError 

1543 Raised if transfer is not `None` but the (internal) location the 

1544 file would be moved to is already occupied. 

1545 

1546 Notes 

1547 ----- 

1548 This operation is not fully exception safe: if a database operation 

1549 fails, the given `FileDataset` instances may be only partially updated. 

1550 

1551 It is atomic in terms of database operations (they will either all 

1552 succeed or all fail), provided the database engine implements 

1553 transactions correctly. It will attempt to be atomic in terms of 

1554 filesystem operations as well, but this cannot be implemented 

1555 rigorously for most datastores. 

1556 """ 

1557 if not self.isWriteable(): 

1558 raise TypeError("Butler is read-only.") 

1559 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1560 # Reorganize the inputs so they're grouped by DatasetType and then 

1561 # data ID. We also include a list of DatasetRefs for each FileDataset 

1562 # to hold the resolved DatasetRefs returned by the Registry, before 

1563 # it's safe to swap them into FileDataset.refs. 

1564 # Some type annotation aliases to make that clearer: 

1565 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1566 GroupedData = MutableMapping[DatasetType, GroupForType] 

1567 # The actual data structure: 

1568 groupedData: GroupedData = defaultdict(dict) 

1569 # And the nested loop that populates it: 

1570 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1571 # This list intentionally shared across the inner loop, since it's 

1572 # associated with `dataset`. 

1573 resolvedRefs: List[DatasetRef] = [] 

1574 

1575 # Somewhere to store pre-existing refs if we have an 

1576 # execution butler. 

1577 existingRefs: List[DatasetRef] = [] 

1578 

1579 for ref in dataset.refs: 

1580 if ref.dataId in groupedData[ref.datasetType]: 

1581 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has the same" 

1582 " DataId as another ingest dataset" 

1583 f" {groupedData[ref.datasetType][ref.dataId][0].path}" 

1584 f" ({ref.dataId})") 

1585 if self._allow_put_of_predefined_dataset: 

1586 existing_ref = self.registry.findDataset(ref.datasetType, 

1587 dataId=ref.dataId, 

1588 collections=run) 

1589 if existing_ref: 

1590 if self.datastore.knows(existing_ref): 

1591 raise ConflictingDefinitionError(f"Dataset associated with path {dataset.path}" 

1592 f" already exists as {existing_ref}.") 

1593 # Store this ref elsewhere since it already exists 

1594 # and we do not want to remake it but we do want 

1595 # to store it in the datastore. 

1596 existingRefs.append(existing_ref) 

1597 

1598 # Nothing else to do until we have finished 

1599 # iterating. 

1600 continue 

1601 

1602 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1603 

1604 if existingRefs: 

1605 

1606 if len(dataset.refs) != len(existingRefs): 

1607 # Keeping track of partially pre-existing datasets is hard 

1608 # and should generally never happen. For now don't allow 

1609 # it. 

1610 raise ConflictingDefinitionError(f"For dataset {dataset.path} some dataIds already exist" 

1611 " in registry but others do not. This is not supported.") 

1612 

1613 # Attach the resolved refs if we found them. 

1614 dataset.refs = existingRefs 

1615 

1616 # Now we can bulk-insert into Registry for each DatasetType. 

1617 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(), 

1618 desc="Bulk-inserting datasets by type"): 

1619 refs = self.registry.insertDatasets( 

1620 datasetType, 

1621 dataIds=groupForType.keys(), 

1622 run=run, 

1623 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1624 idGenerationMode=idGenerationMode, 

1625 ) 

1626 # Append those resolved DatasetRefs to the new lists we set up for 

1627 # them. 

1628 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1629 resolvedRefs.append(ref) 

1630 

1631 # Go back to the original FileDatasets to replace their refs with the 

1632 # new resolved ones. 

1633 for groupForType in progress.iter_chunks(groupedData.values(), 

1634 desc="Reassociating resolved dataset refs with files"): 

1635 for dataset, resolvedRefs in groupForType.values(): 

1636 dataset.refs = resolvedRefs 

1637 

1638 # Bulk-insert everything into Datastore. 

1639 self.datastore.ingest(*datasets, transfer=transfer) 

1640 
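# Usage sketch for ingest: register one externally produced file and copy it
# into the datastore. The path and run name are hypothetical, and ``ref`` is
# assumed to be an already-constructed, unresolved DatasetRef for this file.
#
#     butler.ingest(FileDataset(path="/data/external/image.fits", refs=[ref]),
#                   transfer="copy", run="u/alice/ingest")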

1641 @contextlib.contextmanager 

1642 def export(self, *, directory: Optional[str] = None, 

1643 filename: Optional[str] = None, 

1644 format: Optional[str] = None, 

1645 transfer: Optional[str] = None) -> Iterator[RepoExportContext]: 

1646 """Export datasets from the repository represented by this `Butler`. 

1647 

1648 This method is a context manager that returns a helper object 

1649 (`RepoExportContext`) that is used to indicate what information from 

1650 the repository should be exported. 

1651 

1652 Parameters 

1653 ---------- 

1654 directory : `str`, optional 

1655 Directory dataset files should be written to if ``transfer`` is not 

1656 `None`. 

1657 filename : `str`, optional 

1658 Name for the file that will include database information associated 

1659 with the exported datasets. If this is not an absolute path and 

1660 ``directory`` is not `None`, it will be written to ``directory`` 

1661 instead of the current working directory. Defaults to 

1662 "export.{format}". 

1663 format : `str`, optional 

1664 File format for the database information file. If `None`, the 

1665 extension of ``filename`` will be used. 

1666 transfer : `str`, optional 

1667 Transfer mode passed to `Datastore.export`. 

1668 

1669 Raises 

1670 ------ 

1671 TypeError 

1672 Raised if the set of arguments passed is inconsistent. 

1673 

1674 Examples 

1675 -------- 

1676 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1677 methods are used to provide the iterables over data IDs and/or datasets 

1678 to be exported:: 

1679 

1680 with butler.export(filename="exports.yaml") as export: 

1681 # Export all flats, but none of the dimension element rows 

1682 # (i.e. data ID information) associated with them. 

1683 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1684 elements=()) 

1685 # Export all datasets that start with "deepCoadd_" and all of 

1686 # their associated data ID information. 

1687 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

1688 """ 

1689 if directory is None and transfer is not None: 

1690 raise TypeError("Cannot transfer without providing a directory.") 

1691 if transfer == "move": 

1692 raise TypeError("Transfer may not be 'move': export is read-only") 

1693 if format is None: 

1694 if filename is None: 

1695 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1696 else: 

1697 _, format = os.path.splitext(filename) 

1698 elif filename is None: 

1699 filename = f"export.{format}" 

1700 if directory is not None: 

1701 filename = os.path.join(directory, filename) 

1702 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"]) 

1703 with open(filename, 'w') as stream: 

1704 backend = BackendClass(stream) 

1705 try: 

1706 helper = RepoExportContext(self.registry, self.datastore, backend=backend, 

1707 directory=directory, transfer=transfer) 

1708 yield helper 

1709 except BaseException: 

1710 raise 

1711 else: 

1712 helper._finish() 

1713 
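# Usage sketch for export: write dataset metadata plus copies of the file
# artifacts to a directory. The paths and dataset type are hypothetical.
#
#     with butler.export(directory="/tmp/export", filename="export.yaml",
#                        transfer="copy") as export:
#         export.saveDatasets(butler.registry.queryDatasets("flat"))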

1714 def import_(self, *, directory: Optional[str] = None, 

1715 filename: Union[str, TextIO, None] = None, 

1716 format: Optional[str] = None, 

1717 transfer: Optional[str] = None, 

1718 skip_dimensions: Optional[Set] = None, 

1719 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1720 reuseIds: bool = False) -> None: 

1721 """Import datasets into this repository that were exported from a 

1722 different butler repository via `~lsst.daf.butler.Butler.export`. 

1723 

1724 Parameters 

1725 ---------- 

1726 directory : `str`, optional 

1727 Directory containing dataset files to import from. If `None`, 

1728 ``filename`` and all dataset file paths specified therein must 

1729 be absolute. 

1730 filename : `str` or `TextIO`, optional 

1731 A stream or name of file that contains database information 

1732 associated with the exported datasets, typically generated by 

1733 `~lsst.daf.butler.Butler.export`. If this is a string (name) and 

1734 is not an absolute path, does not exist in the current working 

1735 directory, and ``directory`` is not `None`, it is assumed to be in 

1736 ``directory``. Defaults to "export.{format}". 

1737 format : `str`, optional 

1738 File format for ``filename``. If `None`, the extension of 

1739 ``filename`` will be used. 

1740 transfer : `str`, optional 

1741 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

1742 skip_dimensions : `set`, optional 

1743 Names of dimensions that should be skipped and not imported. 

1744 idGenerationMode : `DatasetIdGenEnum`, optional 

1745 Specifies option for generating dataset IDs when IDs are not 

1746 provided or their type does not match backend type. By default 

1747 unique IDs are generated for each inserted dataset. 

1748 reuseIds : `bool`, optional 

1749 If `True`, force re-use of imported dataset IDs for integer 

1750 IDs, which are normally generated as auto-incremented; an exception 

1751 will be raised if imported IDs clash with existing ones. This 

1752 option has no effect on globally unique IDs, which are 

1753 always re-used (or generated if integer IDs are being imported). 

1754 

1755 Raises 

1756 ------ 

1757 TypeError 

1758 Raised if the set of arguments passed is inconsistent, or if the 

1759 butler is read-only. 

1760 """ 

1761 if not self.isWriteable(): 

1762 raise TypeError("Butler is read-only.") 

1763 if format is None: 

1764 if filename is None: 

1765 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1766 else: 

1767 _, format = os.path.splitext(filename) # type: ignore 

1768 elif filename is None: 

1769 filename = f"export.{format}" 

1770 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

1771 filename = os.path.join(directory, filename) 

1772 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"]) 

1773 

1774 def doImport(importStream: TextIO) -> None: 

1775 backend = BackendClass(importStream, self.registry) 

1776 backend.register() 

1777 with self.transaction(): 

1778 backend.load(self.datastore, directory=directory, transfer=transfer, 

1779 skip_dimensions=skip_dimensions, idGenerationMode=idGenerationMode, 

1780 reuseIds=reuseIds) 

1781 

1782 if isinstance(filename, str): 

1783 with open(filename, "r") as stream: 

1784 doImport(stream) 

1785 else: 

1786 doImport(filename) 

1787 
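# Usage sketch for import_: load a previously exported subset, copying the
# file artifacts into this repository's datastore. Paths are hypothetical.
#
#     butler.import_(directory="/tmp/export", filename="export.yaml",
#                    transfer="copy")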

1788 def transfer_from(self, source_butler: Butler, source_refs: Iterable[DatasetRef], 

1789 transfer: str = "auto", 

1790 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None, 

1791 skip_missing: bool = True) -> List[DatasetRef]: 

1792 """Transfer datasets to this Butler from a run in another Butler. 

1793 

1794 Parameters 

1795 ---------- 

1796 source_butler : `Butler` 

1797 Butler from which the datasets are to be transferred. 

1798 source_refs : iterable of `DatasetRef` 

1799 Datasets defined in the source butler that should be transferred to 

1800 this butler. 

1801 transfer : `str`, optional 

1802 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

1803 id_gen_map : `dict` [`str`, `DatasetIdGenEnum`], optional 

1804 A mapping from dataset type name to ID generation mode. Only used 

1805 if the source butler is using integer IDs. Should not be used 

1806 if this receiving butler uses integer IDs. If not given, the import 

1807 always uses `DatasetIdGenEnum.UNIQUE`. 

1808 skip_missing : `bool`, optional 

1809 If `True`, datasets with no datastore artifact associated with 

1810 them are not transferred. 

1811 

1812 Returns 

1813 ------- 

1814 refs : `list` of `DatasetRef` 

1815 The refs added to this Butler. 

1816 

1817 Notes 

1818 ----- 

1819 Requires that any dimension definitions are already present in the 

1820 receiving Butler. The datastore artifact has to exist for a transfer 

1821 to be made but non-existence is not an error. 

1822 

1823 Datasets that already exist in this run will be skipped. 

1824 

1825 The datasets are imported as part of a transaction, although 

1826 dataset types are registered before the transaction is started. 

1827 This means that it is possible for a dataset type to be registered 

1828 even though transfer has failed. 

1829 """ 

1830 if not self.isWriteable(): 

1831 raise TypeError("Butler is read-only.") 

1832 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

1833 

1834 # Will iterate through the refs multiple times so need to convert 

1835 # to a list if this isn't a collection. 

1836 if not isinstance(source_refs, collections.abc.Collection): 

1837 source_refs = list(source_refs) 

1838 

1839 log.info("Transferring %d datasets into %s", len(source_refs), str(self)) 

1840 

1841 if id_gen_map is None: 

1842 id_gen_map = {} 

1843 

1844 # In some situations the datastore artifact may be missing 

1845 # and we do not want that registry entry to be imported. 

1846 # Asking the datastore is not sufficient because the records may have been 

1847 # purged; we have to ask for the (predicted) URI and check 

1848 # existence explicitly. Execution butler is set up exactly like 

1849 # this with no datastore records. 

1850 if skip_missing: 

1851 source_refs = [ref for ref in source_refs if source_butler.datastore.exists(ref)] 

1852 

1853 # Importing requires that we group the refs by dataset type and run 

1854 # before doing the import. 

1855 grouped_refs = defaultdict(list) 

1856 grouped_indices = defaultdict(list) 

1857 for i, ref in enumerate(source_refs): 

1858 grouped_refs[ref.datasetType, ref.run].append(ref) 

1859 grouped_indices[ref.datasetType, ref.run].append(i) 

1860 

1861 # Register any dataset types we need. This has to be done outside 

1862 # of a transaction and so will not be rolled back on failure. 

1863 for datasetType, _ in grouped_refs: 

1864 self.registry.registerDatasetType(datasetType) 

1865 

1866 # The returned refs should be identical for UUIDs. 

1867 # For now must also support integers and so need to retain the 

1868 # newly-created refs from this registry. 

1869 # Pre-size it so we can assign refs into the correct slots 

1870 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs) 

1871 default_id_gen = DatasetIdGenEnum.UNIQUE 

1872 

1873 # Do all the importing in a single transaction. 

1874 with self.transaction(): 

1875 for (datasetType, run), refs_to_import in progress.iter_item_chunks(grouped_refs.items(), 

1876 desc="Importing to registry" 

1877 " by run and dataset type"): 

1878 run_doc = source_butler.registry.getCollectionDocumentation(run) 

1879 self.registry.registerCollection(run, CollectionType.RUN, doc=run_doc) 

1880 

1881 id_generation_mode = default_id_gen 

1882 if isinstance(refs_to_import[0].id, int): 

1883 # ID generation mode might need to be overridden when 

1884 # targeting UUID 

1885 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen) 

1886 

1887 n_refs = len(refs_to_import) 

1888 log.log(VERBOSE, "Importing %d ref%s of dataset type %s into run %s", 

1889 n_refs, "" if n_refs == 1 else "s", datasetType.name, run) 

1890 

1891 # No way to know if this butler's registry uses UUID. 

1892 # We have to trust the caller on this. If it fails they will 

1893 # have to change their approach. We can't catch the exception 

1894 # and retry with unique because that will mess up the 

1895 # transaction handling. We aren't allowed to ask the registry 

1896 # manager what type of ID it is using. 

1897 imported_refs = self.registry._importDatasets(refs_to_import, 

1898 idGenerationMode=id_generation_mode, 

1899 expand=False) 

1900 

1901 # Map them into the correct slots to match the initial order 

1902 for i, ref in zip(grouped_indices[datasetType, run], imported_refs): 

1903 transferred_refs_tmp[i] = ref 

1904 

1905 # Mypy insists that we might have None in here so we have to make 

1906 # that explicit by assigning to a new variable and filtering out 

1907 # something that won't be there. 

1908 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None] 

1909 

1910 # Check consistency 

1911 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given" 

1912 

1913 log.log(VERBOSE, "Imported %d datasets into destination butler", len(transferred_refs)) 

1914 

1915 # The transferred refs were mapped back above into the caller's original 

1916 # ordering; without that, the datastore transfer below 

1917 # would be broken. 

1918 

1919 # Ask the datastore to transfer. The datastore has to check that 

1920 # the source datastore is compatible with the target datastore. 

1921 self.datastore.transfer_from(source_butler.datastore, source_refs, 

1922 local_refs=transferred_refs, transfer=transfer) 

1923 

1924 return transferred_refs 

1925 
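# Usage sketch for transfer_from; the source repository path, dataset type,
# and collection name are hypothetical.
#
#     source = Butler("/path/to/source/repo")
#     to_copy = source.registry.queryDatasets("calexp",
#                                             collections="HSC/runs/RC2",
#                                             findFirst=True)
#     butler.transfer_from(source, to_copy, transfer="copy")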

1926 def validateConfiguration(self, logFailures: bool = False, 

1927 datasetTypeNames: Optional[Iterable[str]] = None, 

1928 ignore: Optional[Iterable[str]] = None) -> None: 

1929 """Validate butler configuration. 

1930 

1931 Checks that each `DatasetType` can be stored in the `Datastore`. 

1932 

1933 Parameters 

1934 ---------- 

1935 logFailures : `bool`, optional 

1936 If `True`, output a log message for every validation error 

1937 detected. 

1938 datasetTypeNames : iterable of `str`, optional 

1939 The `DatasetType` names that should be checked. This allows 

1940 only a subset to be selected. 

1941 ignore : iterable of `str`, optional 

1942 Names of DatasetTypes to skip over. This can be used to skip 

1943 known problems. If a named `DatasetType` corresponds to a 

1944 composite, all components of that `DatasetType` will also be 

1945 ignored. 

1946 

1947 Raises 

1948 ------ 

1949 ButlerValidationError 

1950 Raised if there is some inconsistency with how this Butler 

1951 is configured. 

1952 """ 

1953 if datasetTypeNames: 

1954 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

1955 else: 

1956 datasetTypes = list(self.registry.queryDatasetTypes()) 

1957 

1958 # filter out anything from the ignore list 

1959 if ignore: 

1960 ignore = set(ignore) 

1961 datasetTypes = [e for e in datasetTypes 

1962 if e.name not in ignore and e.nameAndComponent()[0] not in ignore] 

1963 else: 

1964 ignore = set() 

1965 

1966 # Find all the registered instruments 

1967 instruments = set( 

1968 record.name for record in self.registry.queryDimensionRecords("instrument") 

1969 ) 

1970 

1971 # For each datasetType that has an instrument dimension, create 

1972 # a DatasetRef for each defined instrument 

1973 datasetRefs = [] 

1974 

1975 for datasetType in datasetTypes: 

1976 if "instrument" in datasetType.dimensions: 

1977 for instrument in instruments: 

1978 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore 

1979 conform=False) 

1980 datasetRefs.append(datasetRef) 

1981 

1982 entities: List[Union[DatasetType, DatasetRef]] = [] 

1983 entities.extend(datasetTypes) 

1984 entities.extend(datasetRefs) 

1985 

1986 datastoreErrorStr = None 

1987 try: 

1988 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

1989 except ValidationError as e: 

1990 datastoreErrorStr = str(e) 

1991 

1992 # Also check that the LookupKeys used by the datastores match 

1993 # registry and storage class definitions 

1994 keys = self.datastore.getLookupKeys() 

1995 

1996 failedNames = set() 

1997 failedDataId = set() 

1998 for key in keys: 

1999 if key.name is not None: 

2000 if key.name in ignore: 

2001 continue 

2002 

2003 # skip if specific datasetType names were requested and this 

2004 # name does not match 

2005 if datasetTypeNames and key.name not in datasetTypeNames: 

2006 continue 

2007 

2008 # See if it is a StorageClass or a DatasetType 

2009 if key.name in self.storageClasses: 

2010 pass 

2011 else: 

2012 try: 

2013 self.registry.getDatasetType(key.name) 

2014 except KeyError: 

2015 if logFailures: 

2016 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2017 failedNames.add(key) 

2018 else: 

2019 # Dimensions are checked for consistency when the Butler 

2020 # is created and rendezvoused with a universe. 

2021 pass 

2022 

2023 # Check that the instrument is a valid instrument 

2024 # Currently only support instrument so check for that 

2025 if key.dataId: 

2026 dataIdKeys = set(key.dataId) 

2027 if {"instrument"} != dataIdKeys: 

2028 if logFailures: 

2029 log.critical("Key '%s' has unsupported DataId override", key) 

2030 failedDataId.add(key) 

2031 elif key.dataId["instrument"] not in instruments: 

2032 if logFailures: 

2033 log.critical("Key '%s' has unknown instrument", key) 

2034 failedDataId.add(key) 

2035 

2036 messages = [] 

2037 

2038 if datastoreErrorStr: 

2039 messages.append(datastoreErrorStr) 

2040 

2041 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2042 (failedDataId, "Keys with bad DataId entries: ")): 

2043 if failed: 

2044 msg += ", ".join(str(k) for k in failed) 

2045 messages.append(msg) 

2046 

2047 if messages: 

2048 raise ValidationError(";\n".join(messages)) 

2049 
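# Usage sketch for validateConfiguration: check a subset of dataset types and
# skip a known-problematic one. The dataset type names are hypothetical.
#
#     butler.validateConfiguration(logFailures=True,
#                                  datasetTypeNames=["calexp", "src"],
#                                  ignore=["raw"])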

2050 @property 

2051 def collections(self) -> CollectionSearch: 

2052 """The collections to search by default, in order (`CollectionSearch`). 

2053 

2054 This is an alias for ``self.registry.defaults.collections``. It cannot 

2055 be set directly in isolation, but all defaults may be changed together 

2056 by assigning a new `RegistryDefaults` instance to 

2057 ``self.registry.defaults``. 

2058 """ 

2059 return self.registry.defaults.collections 

2060 

2061 @property 

2062 def run(self) -> Optional[str]: 

2063 """Name of the run this butler writes outputs to by default (`str` or 

2064 `None`). 

2065 

2066 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2067 directly in isolation, but all defaults may be changed together by 

2068 assigning a new `RegistryDefaults` instance to 

2069 ``self.registry.defaults``. 

2070 """ 

2071 return self.registry.defaults.run 

2072 
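# Usage sketch: ``collections`` and ``run`` above are read-only aliases; to
# change the defaults, assign a new RegistryDefaults instance as described in
# their docstrings. The collection and run names here are hypothetical.
#
#     butler.registry.defaults = RegistryDefaults(collections=["HSC/defaults"],
#                                                 run="u/alice/processing")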

2073 registry: Registry 

2074 """The object that manages dataset metadata and relationships (`Registry`). 

2075 

2076 Most operations that don't involve reading or writing butler datasets are 

2077 accessible only via `Registry` methods. 

2078 """ 

2079 

2080 datastore: Datastore 

2081 """The object that manages actual dataset storage (`Datastore`). 

2082 

2083 Direct user access to the datastore should rarely be necessary; the primary 

2084 exception is the case where a `Datastore` implementation provides extra 

2085 functionality beyond what the base class defines. 

2086 """ 

2087 

2088 storageClasses: StorageClassFactory 

2089 """An object that maps known storage class names to objects that fully 

2090 describe them (`StorageClassFactory`). 

2091 """ 

2092 

2093 _allow_put_of_predefined_dataset: bool 

2094 """Allow a put to succeed even if there is already a registry entry for it 

2095 but not a datastore record. (`bool`)."""