
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36 

37from collections import defaultdict 

38import contextlib 

39import logging 

40import numbers 

41import os 

42from typing import ( 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59try: 

60 import boto3 

61except ImportError: 

62 boto3 = None 

63 

64from lsst.utils import doImport 

65from .core import ( 

66 AmbiguousDatasetError, 

67 ButlerURI, 

68 Config, 

69 ConfigSubset, 

70 DataCoordinate, 

71 DataId, 

72 DataIdValue, 

73 DatasetRef, 

74 DatasetType, 

75 Datastore, 

76 Dimension, 

77 DimensionConfig, 

78 FileDataset, 

79 StorageClassFactory, 

80 Timespan, 

81 ValidationError, 

82) 

83from .core.repoRelocation import BUTLER_ROOT_TAG 

84from .core.utils import transactional, getClassOf 

85from ._deferredDatasetHandle import DeferredDatasetHandle 

86from ._butlerConfig import ButlerConfig 

87from .registry import Registry, RegistryConfig, RegistryDefaults, CollectionType 

88from .registry.wildcards import CollectionSearch 

89from .transfers import RepoExportContext 

90 

91log = logging.getLogger(__name__) 

92 

93 

94class ButlerValidationError(ValidationError): 

95 """There is a problem with the Butler configuration.""" 

96 pass 

97 

98 

99class PruneCollectionsArgsError(TypeError): 

100 """Base class for errors relating to Butler.pruneCollections input 

101 arguments. 

102 """ 

103 pass 

104 

105 

106class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

107 """Raised when ``purge=True`` is passed without ``unstore=True``; 

108 purging a collection requires both flags to be `True`. 

109 """ 

110 

111 def __init__(self) -> None: 

112 super().__init__("Cannot pass purge=True without unstore=True.") 

113 

114 

115class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

116 """Raised when pruning a RUN collection but purge is False.""" 

117 

118 def __init__(self, collectionType: CollectionType): 

119 self.collectionType = collectionType 

120 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

121 

122 

123class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

124 """Raised when purge is True but is not supported for the given 

125 collection.""" 

126 

127 def __init__(self, collectionType: CollectionType): 

128 self.collectionType = collectionType 

129 super().__init__( 

130 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.") 

131 

132 
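# A minimal usage sketch of how calling code might handle these argument
# errors; the repository path and collection name are hypothetical
# placeholders.
#
#     butler = Butler("/path/to/repo", writeable=True)
#     try:
#         butler.pruneCollection("u/alice/DM-50000/a", purge=True)
#     except PurgeWithoutUnstorePruneCollectionsError:
#         # purge=True is only accepted together with unstore=True.
#         butler.pruneCollection("u/alice/DM-50000/a", purge=True, unstore=True)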

133class Butler: 

134 """Main entry point for the data access system. 

135 

136 Parameters 

137 ---------- 

138 config : `ButlerConfig`, `Config` or `str`, optional 

139 Configuration. Anything acceptable to the 

140 `ButlerConfig` constructor. If a directory path 

141 is given the configuration will be read from a ``butler.yaml`` file in 

142 that location. If `None` is given default values will be used. 

143 butler : `Butler`, optional 

144 If provided, construct a new Butler that uses the same registry and 

145 datastore as the given one, but with the given collection and run. 

146 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

147 arguments. 

148 collections : `str` or `Iterable` [ `str` ], optional 

149 An expression specifying the collections to be searched (in order) when 

150 reading datasets. 

151 This may be a `str` collection name or an iterable thereof. 

152 See :ref:`daf_butler_collection_expressions` for more information. 

153 These collections are not registered automatically and must be 

154 registered manually before any method uses them; that registration 

155 may happen after the `Butler` is initialized. 

156 run : `str`, optional 

157 Name of the `~CollectionType.RUN` collection new datasets should be 

158 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

159 ``collections`` will be set to ``[run]``. If not `None`, this 

160 collection will automatically be registered. If this is not set (and 

161 ``writeable`` is not set either), a read-only butler will be created. 

162 searchPaths : `list` of `str`, optional 

163 Directory paths to search when calculating the full Butler 

164 configuration. Not used if the supplied config is already a 

165 `ButlerConfig`. 

166 writeable : `bool`, optional 

167 Explicitly sets whether the butler supports write operations. If not 

168 provided, a read-write butler is created if ``run`` is not `None`; 

169 otherwise a read-only butler is created. 

170 inferDefaults : `bool`, optional 

171 If `True` (default) infer default data ID values from the values 

172 present in the datasets in ``collections``: if all collections have the 

173 same value (or no value) for a governor dimension, that value will be 

174 the default for that dimension. Nonexistent collections are ignored. 

175 If a default value is provided explicitly for a governor dimension via 

176 ``**kwargs``, no default will be inferred for that dimension. 

177 **kwargs : `str` 

178 Default data ID key-value pairs. These may only identify "governor" 

179 dimensions like ``instrument`` and ``skymap``. 

180 

181 Examples 

182 -------- 

183 While there are many ways to control exactly how a `Butler` interacts with 

184 the collections in its `Registry`, the most common cases are still simple. 

185 

186 For a read-only `Butler` that searches one collection, do:: 

187 

188 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

189 

190 For a read-write `Butler` that writes to and reads from a 

191 `~CollectionType.RUN` collection:: 

192 

193 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

194 

195 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

196 because we want to write to one `~CollectionType.RUN` collection but read 

197 from several others (as well):: 

198 

199 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

200 collections=["u/alice/DM-50000/a", 

201 "u/bob/DM-49998", 

202 "HSC/defaults"]) 

203 

204 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

205 Datasets will be read first from that run (since it appears first in the 

206 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

207 

208 Finally, one can always create a `Butler` with no collections:: 

209 

210 butler = Butler("/path/to/repo", writeable=True) 

211 

212 This can be extremely useful when you just want to use ``butler.registry``, 

213 e.g. for inserting dimension data or managing collections, or when the 

214 collections you want to use with the butler are not consistent. 

215 Passing ``writeable`` explicitly here is only necessary if you want to be 

216 able to make changes to the repo; usually the value for ``writeable`` can 

217 be guessed from the collection arguments provided, but it defaults to 

218 `False` when no collection arguments are given. 

219 """ 

220 def __init__(self, config: Union[Config, str, None] = None, *, 

221 butler: Optional[Butler] = None, 

222 collections: Any = None, 

223 run: Optional[str] = None, 

224 searchPaths: Optional[List[str]] = None, 

225 writeable: Optional[bool] = None, 

226 inferDefaults: bool = True, 

227 **kwargs: str, 

228 ): 

229 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

230 # Load registry, datastore, etc. from config or existing butler. 

231 if butler is not None: 

232 if config is not None or searchPaths is not None or writeable is not None: 

233 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' " 

234 "arguments with 'butler' argument.") 

235 self.registry = butler.registry.copy(defaults) 

236 self.datastore = butler.datastore 

237 self.storageClasses = butler.storageClasses 

238 self._config: ButlerConfig = butler._config 

239 else: 

240 self._config = ButlerConfig(config, searchPaths=searchPaths) 

241 if "root" in self._config: 

242 butlerRoot = self._config["root"] 

243 else: 

244 butlerRoot = self._config.configDir 

245 if writeable is None: 

246 writeable = run is not None 

247 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable, 

248 defaults=defaults) 

249 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(), 

250 butlerRoot=butlerRoot) 

251 self.storageClasses = StorageClassFactory() 

252 self.storageClasses.addFromConfig(self._config) 

253 if "run" in self._config or "collection" in self._config: 

254 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

255 

256 GENERATION: ClassVar[int] = 3 

257 """This is a Generation 3 Butler. 

258 

259 This attribute may be removed in the future, once the Generation 2 Butler 

260 interface has been fully retired; it should only be used in transitional 

261 code. 

262 """ 

263 

264 @staticmethod 

265 def makeRepo(root: str, config: Union[Config, str, None] = None, 

266 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False, 

267 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True, 

268 outfile: Optional[str] = None, overwrite: bool = False) -> Config: 

269 """Create an empty data repository by adding a butler.yaml config 

270 to a repository root directory. 

271 

272 Parameters 

273 ---------- 

274 root : `str` or `ButlerURI` 

275 Path or URI to the root location of the new repository. Will be 

276 created if it does not exist. 

277 config : `Config` or `str`, optional 

278 Configuration to write to the repository, after setting any 

279 root-dependent Registry or Datastore config options. Can not 

280 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

281 configuration will be used. Root-dependent config options 

282 specified in this config are overwritten if ``forceConfigRoot`` 

283 is `True`. 

284 dimensionConfig : `Config` or `str`, optional 

285 Configuration for dimensions, will be used to initialize registry 

286 database. 

287 standalone : `bool` 

288 If `True`, write all expanded defaults, not just customized or 

289 repository-specific settings. 

290 This (mostly) decouples the repository from the default 

291 configuration, insulating it from changes to the defaults (which 

292 may be good or bad, depending on the nature of the changes). 

293 Future *additions* to the defaults will still be picked up when 

294 initializing `Butlers` to repos created with ``standalone=True``. 

295 searchPaths : `list` of `str`, optional 

296 Directory paths to search when calculating the full butler 

297 configuration. 

298 forceConfigRoot : `bool`, optional 

299 If `False`, any values present in the supplied ``config`` that 

300 would normally be reset are not overridden and will appear 

301 directly in the output config. This allows non-standard overrides 

302 of the root directory for a datastore or registry to be given. 

303 If this parameter is `True` the values for ``root`` will be 

304 forced into the resulting config if appropriate. 

305 outfile : `str`, optional 

306 If not `None`, the output configuration will be written to this 

307 location rather than into the repository itself. Can be a URI 

308 string. Can refer to a directory that will be used to write 

309 ``butler.yaml``. 

310 overwrite : `bool`, optional 

311 Create a new configuration file even if one already exists 

312 in the specified output location. Default is to raise 

313 an exception. 

314 

315 Returns 

316 ------- 

317 config : `Config` 

318 The updated `Config` instance written to the repo. 

319 

320 Raises 

321 ------ 

322 ValueError 

323 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

324 regular Config (as these subclasses would make it impossible to 

325 support ``standalone=False``). 

326 FileExistsError 

327 Raised if the output config file already exists. 

328 os.error 

329 Raised if the directory does not exist, exists but is not a 

330 directory, or cannot be created. 

331 

332 Notes 

333 ----- 

334 Note that when ``standalone=False`` (the default), the configuration 

335 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

336 construct the repository should also be used to construct any Butlers 

337 to avoid configuration inconsistencies. 

338 """ 

339 if isinstance(config, (ButlerConfig, ConfigSubset)): 

340 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

341 

342 # Ensure that the root of the repository exists or can be made 

343 uri = ButlerURI(root, forceDirectory=True) 

344 uri.mkdir() 

345 

346 config = Config(config) 

347 

348 # If we are creating a new repo from scratch with relative roots, 

349 # do not propagate an explicit root from the config file 

350 if "root" in config: 

351 del config["root"] 

352 

353 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

354 datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"]) 

355 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

356 

357 # if key exists in given config, parse it, otherwise parse the defaults 

358 # in the expanded config 

359 if config.get(("registry", "db")): 

360 registryConfig = RegistryConfig(config) 

361 else: 

362 registryConfig = RegistryConfig(full) 

363 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

364 if defaultDatabaseUri is not None: 

365 Config.updateParameters(RegistryConfig, config, full, 

366 toUpdate={"db": defaultDatabaseUri}, 

367 overwrite=forceConfigRoot) 

368 else: 

369 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), 

370 overwrite=forceConfigRoot) 

371 

372 if standalone: 

373 config.merge(full) 

374 else: 

375 # Always expand the registry.managers section into the per-repo 

376 # config, because after the database schema is created, it's not 

377 # allowed to change anymore. Note that in the standalone=True 

378 # branch, _everything_ in the config is expanded, so there's no 

379 # need to special case this. 

380 Config.updateParameters(RegistryConfig, config, full, toCopy=("managers",), overwrite=False) 

381 configURI: Union[str, ButlerURI] 

382 if outfile is not None: 

383 # When writing to a separate location we must include 

384 # the root of the butler repo in the config else it won't know 

385 # where to look. 

386 config["root"] = uri.geturl() 

387 configURI = outfile 

388 else: 

389 configURI = uri 

390 config.dumpToUri(configURI, overwrite=overwrite) 

391 

392 # Create Registry and populate tables 

393 registryConfig = RegistryConfig(config.get("registry")) 

394 dimensionConfig = DimensionConfig(dimensionConfig) 

395 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root) 

396 

397 return config 

398 
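# A minimal sketch of creating a repository and opening it; the path is a
# hypothetical placeholder. ``makeRepo`` writes a butler.yaml into the root,
# after which a Butler can be constructed against that same location.
#
#     config = Butler.makeRepo("/path/to/new/repo")
#     butler = Butler("/path/to/new/repo", writeable=True)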

399 @classmethod 

400 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str], 

401 defaultDataId: Dict[str, str], writeable: bool) -> Butler: 

402 """Callable used to unpickle a Butler. 

403 

404 We prefer not to use ``Butler.__init__`` directly so we can force some 

405 of its many arguments to be keyword-only (note that ``__reduce__`` 

406 can only invoke callables with positional arguments). 

407 

408 Parameters 

409 ---------- 

410 config : `ButlerConfig` 

411 Butler configuration, already coerced into a true `ButlerConfig` 

412 instance (and hence after any search paths for overrides have been 

413 utilized). 

414 collections : `CollectionSearch` 

415 Names of the default collections to read from. 

416 run : `str`, optional 

417 Name of the default `~CollectionType.RUN` collection to write to. 

418 defaultDataId : `dict` [ `str`, `str` ] 

419 Default data ID values. 

420 writeable : `bool` 

421 Whether the Butler should support write operations. 

422 

423 Returns 

424 ------- 

425 butler : `Butler` 

426 A new `Butler` instance. 

427 """ 

428 # MyPy doesn't recognize that the kwargs below are totally valid; it 

429 # seems to think ``**defaultDataId`` is a _positional_ argument! 

430 return cls(config=config, collections=collections, run=run, writeable=writeable, 

431 **defaultDataId) # type: ignore 

432 

433 def __reduce__(self) -> tuple: 

434 """Support pickling. 

435 """ 

436 return (Butler._unpickle, (self._config, self.collections, self.run, 

437 self.registry.defaults.dataId.byName(), 

438 self.registry.isWriteable())) 

439 

440 def __str__(self) -> str: 

441 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

442 self.collections, self.run, self.datastore, self.registry) 

443 

444 def isWriteable(self) -> bool: 

445 """Return `True` if this `Butler` supports write operations. 

446 """ 

447 return self.registry.isWriteable() 

448 

449 @contextlib.contextmanager 

450 def transaction(self) -> Iterator[None]: 

451 """Context manager supporting `Butler` transactions. 

452 

453 Transactions can be nested. 

454 """ 

455 with self.registry.transaction(): 

456 with self.datastore.transaction(): 

457 yield 

458 
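# A minimal sketch of the transaction context manager: the registry and
# datastore changes made inside the block either all commit or all roll
# back. The objects, dataset type name, and data IDs are hypothetical
# placeholders.
#
#     with butler.transaction():
#         butler.put(obj1, "someDatasetType", dataId1)
#         butler.put(obj2, "someDatasetType", dataId2)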

459 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

460 dataId: Optional[DataId] = None, **kwds: Any 

461 ) -> Tuple[DatasetType, Optional[DataId]]: 

462 """Standardize the arguments passed to several Butler APIs. 

463 

464 Parameters 

465 ---------- 

466 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

467 When `DatasetRef` the `dataId` should be `None`. 

468 Otherwise the `DatasetType` or name thereof. 

469 dataId : `dict` or `DataCoordinate` 

470 A `dict` of `Dimension` link name, value pairs that label the 

471 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

472 should be provided as the first argument. 

473 kwds 

474 Additional keyword arguments used to augment or construct a 

475 `DataCoordinate`. See `DataCoordinate.standardize` 

476 parameters. 

477 

478 Returns 

479 ------- 

480 datasetType : `DatasetType` 

481 A `DatasetType` instance extracted from ``datasetRefOrType``. 

482 dataId : `dict` or `DataId`, optional 

483 Argument that can be used (along with ``kwds``) to construct a 

484 `DataId`. 

485 

486 Notes 

487 ----- 

488 Butler APIs that conceptually need a DatasetRef also allow passing a 

489 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

490 keyword arguments that can be used to construct one) separately. This 

491 method accepts those arguments and always returns a true `DatasetType` 

492 and a `DataId` or `dict`. 

493 

494 Standardization of `dict` vs `DataId` is best handled by passing the 

495 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are 

496 generally similarly flexible. 

497 """ 

498 externalDatasetType: Optional[DatasetType] = None 

499 internalDatasetType: Optional[DatasetType] = None 

500 if isinstance(datasetRefOrType, DatasetRef): 

501 if dataId is not None or kwds: 

502 raise ValueError("DatasetRef given, cannot use dataId as well") 

503 externalDatasetType = datasetRefOrType.datasetType 

504 dataId = datasetRefOrType.dataId 

505 else: 

506 # Don't check whether DataId is provided, because Registry APIs 

507 # can usually construct a better error message when it wasn't. 

508 if isinstance(datasetRefOrType, DatasetType): 

509 externalDatasetType = datasetRefOrType 

510 else: 

511 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

512 

513 # Check that they are self-consistent 

514 if externalDatasetType is not None: 

515 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

516 if externalDatasetType != internalDatasetType: 

517 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

518 f"registry definition ({internalDatasetType})") 

519 

520 assert internalDatasetType is not None 

521 return internalDatasetType, dataId 

522 
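# A minimal sketch of the equivalent argument forms this standardization
# supports; the dataset type name and dimension values are hypothetical
# placeholders. All three calls identify the same dataset.
#
#     butler.get("raw", {"instrument": "HSC", "exposure": 903334, "detector": 42})
#     butler.get("raw", instrument="HSC", exposure=903334, detector=42)
#     butler.get(rawDatasetType, instrument="HSC", exposure=903334, detector=42)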

523 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

524 dataId: Optional[DataId] = None, *, 

525 collections: Any = None, 

526 allowUnresolved: bool = False, 

527 **kwds: Any) -> DatasetRef: 

528 """Shared logic for methods that start with a search for a dataset in 

529 the registry. 

530 

531 Parameters 

532 ---------- 

533 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

534 When `DatasetRef` the `dataId` should be `None`. 

535 Otherwise the `DatasetType` or name thereof. 

536 dataId : `dict` or `DataCoordinate`, optional 

537 A `dict` of `Dimension` link name, value pairs that label the 

538 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

539 should be provided as the first argument. 

540 collections : Any, optional 

541 Collections to be searched, overriding ``self.collections``. 

542 Can be any of the types supported by the ``collections`` argument 

543 to butler construction. 

544 allowUnresolved : `bool`, optional 

545 If `True`, return an unresolved `DatasetRef` if finding a resolved 

546 one in the `Registry` fails. Defaults to `False`. 

547 kwds 

548 Additional keyword arguments used to augment or construct a 

549 `DataId`. See `DataId` parameters. 

550 

551 Returns 

552 ------- 

553 ref : `DatasetRef` 

554 A reference to the dataset identified by the given arguments. 

555 

556 Raises 

557 ------ 

558 LookupError 

559 Raised if no matching dataset exists in the `Registry` (and 

560 ``allowUnresolved is False``). 

561 ValueError 

562 Raised if a resolved `DatasetRef` was passed as an input, but it 

563 differs from the one found in the registry. 

564 TypeError 

565 Raised if no collections were provided. 

566 """ 

567 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds) 

568 if isinstance(datasetRefOrType, DatasetRef): 

569 idNumber = datasetRefOrType.id 

570 else: 

571 idNumber = None 

572 timespan: Optional[Timespan] = None 

573 

574 # Process dimension records that are using record information 

575 # rather than ids 

576 newDataId: Dict[str, DataIdValue] = {} 

577 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

578 

579 # If the dataId comes entirely from keyword parameters we do not need 

580 # to do anything here, because the keys cannot be of the form 

581 # exposure.obs_id: a "." is not allowed in a keyword parameter name. 

582 if dataId: 

583 for k, v in dataId.items(): 

584 # If we have a Dimension we do not need to do anything 

585 # because it cannot be a compound key. 

586 if isinstance(k, str) and "." in k: 

587 # Someone is using a more human-readable dataId 

588 dimensionName, record = k.split(".", 1) 

589 byRecord[dimensionName][record] = v 

590 elif isinstance(k, Dimension): 

591 newDataId[k.name] = v 

592 else: 

593 newDataId[k] = v 

594 

595 # Go through the updated dataId and check the type in case someone is 

596 # using an alternate key. We have already filtered out the compound 

597 # keys in dimension.record format. 

598 not_dimensions = {} 

599 

600 # Will need to look in the dataId and the keyword arguments 

601 # and will remove them if they need to be fixed or are unrecognized. 

602 for dataIdDict in (newDataId, kwds): 

603 # Use a list so we can adjust the dict safely in the loop 

604 for dimensionName in list(dataIdDict): 

605 value = dataIdDict[dimensionName] 

606 try: 

607 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

608 except KeyError: 

609 # This is not a real dimension 

610 not_dimensions[dimensionName] = value 

611 del dataIdDict[dimensionName] 

612 continue 

613 

614 # Convert an integral type to an explicit int to simplify 

615 # comparisons here 

616 if isinstance(value, numbers.Integral): 

617 value = int(value) 

618 

619 if not isinstance(value, dimension.primaryKey.getPythonType()): 

620 for alternate in dimension.alternateKeys: 

621 if isinstance(value, alternate.getPythonType()): 

622 byRecord[dimensionName][alternate.name] = value 

623 del dataIdDict[dimensionName] 

624 log.debug("Converting dimension %s to %s.%s=%s", 

625 dimensionName, dimensionName, alternate.name, value) 

626 break 

627 else: 

628 log.warning("Type mismatch found for value '%r' provided for dimension %s. " 

629 "Could not find matching alternative (primary key has type %s) " 

630 "so attempting to use as-is.", 

631 value, dimensionName, dimension.primaryKey.getPythonType()) 

632 

633 # If we have some unrecognized dimensions we have to try to connect 

634 # them to records in other dimensions. This is made more complicated 

635 # by some dimensions having records with clashing names. A mitigation 

636 # is that we can tell by this point which dimensions are missing 

637 # for the DatasetType but this does not work for calibrations 

638 # where additional dimensions can be used to constrain the temporal 

639 # axis. 

640 if not_dimensions: 

641 # Calculate missing dimensions 

642 provided = set(newDataId) | set(kwds) | set(byRecord) 

643 missingDimensions = datasetType.dimensions.names - provided 

644 

645 # For calibrations we may well be needing temporal dimensions 

646 # so rather than always including all dimensions in the scan 

647 # restrict things a little. It is still possible for there 

648 # to be confusion over day_obs in visit vs exposure for example. 

649 # If we are not searching calibration collections things may 

650 # fail but they are going to fail anyway because of the 

651 # ambiguity of the dataId... 

652 candidateDimensions: Set[str] = set() 

653 candidateDimensions.update(missingDimensions) 

654 if datasetType.isCalibration(): 

655 for dim in self.registry.dimensions.getStaticDimensions(): 

656 if dim.temporal: 

657 candidateDimensions.add(str(dim)) 

658 

659 # Look up table for the first association with a dimension 

660 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

661 

662 # Keep track of whether an item is associated with multiple 

663 # dimensions. 

664 counter: Counter[str] = Counter() 

665 assigned: Dict[str, Set[str]] = defaultdict(set) 

666 

667 # Go through the missing dimensions and associate the 

668 # given names with records within those dimensions 

669 for dimensionName in candidateDimensions: 

670 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

671 fields = dimension.metadata.names | dimension.uniqueKeys.names 

672 for field in not_dimensions: 

673 if field in fields: 

674 guessedAssociation[dimensionName][field] = not_dimensions[field] 

675 counter[dimensionName] += 1 

676 assigned[field].add(dimensionName) 

677 

678 # There is a chance we have allocated a single dataId item 

679 # to multiple dimensions. Need to decide which should be retained. 

680 # For now assume that the most popular alternative wins. 

681 # This means that day_obs with seq_num will result in 

682 # exposure.day_obs and not visit.day_obs 

683 # Also prefer an explicitly missing dimension over an inferred 

684 # temporal dimension. 

685 for fieldName, assignedDimensions in assigned.items(): 

686 if len(assignedDimensions) > 1: 

687 # Pick the most popular (preferring mandatory dimensions) 

688 requiredButMissing = assignedDimensions.intersection(missingDimensions) 

689 if requiredButMissing: 

690 candidateDimensions = requiredButMissing 

691 else: 

692 candidateDimensions = assignedDimensions 

693 

694 # Select the relevant items and get a new restricted 

695 # counter. 

696 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

697 duplicatesCounter: Counter[str] = Counter() 

698 duplicatesCounter.update(theseCounts) 

699 

700 # Choose the most common. If they are equally common 

701 # we will pick the one that was found first. 

702 # Returns a list of tuples 

703 selected = duplicatesCounter.most_common(1)[0][0] 

704 

705 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

706 " Removed ambiguity by choosing dimension %s.", 

707 fieldName, ", ".join(assignedDimensions), selected) 

708 

709 for candidateDimension in assignedDimensions: 

710 if candidateDimension != selected: 

711 del guessedAssociation[candidateDimension][fieldName] 

712 

713 # Update the record look up dict with the new associations 

714 for dimensionName, values in guessedAssociation.items(): 

715 if values: # A dict might now be empty 

716 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", 

717 dimensionName, values) 

718 byRecord[dimensionName].update(values) 

719 

720 if byRecord: 

721 # Some record specifiers were found so we need to convert 

722 # them to the Id form 

723 for dimensionName, values in byRecord.items(): 

724 if dimensionName in newDataId: 

725 log.warning("DataId specified explicit %s dimension value of %s in addition to" 

726 " general record specifiers for it of %s. Ignoring record information.", 

727 dimensionName, newDataId[dimensionName], str(values)) 

728 continue 

729 

730 # Build up a WHERE expression -- use single quotes 

731 def quote(s: Any) -> str: 

732 if isinstance(s, str): 

733 return f"'{s}'" 

734 else: 

735 return s 

736 

737 where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}" 

738 for k, v in values.items()) 

739 

740 # Hopefully we get a single record that matches 

741 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId, 

742 where=where, **kwds)) 

743 

744 if len(records) != 1: 

745 if len(records) > 1: 

746 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

747 for r in records: 

748 log.debug("- %s", str(r)) 

749 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not" 

750 f" uniquely constrained to a single dataset by {values}." 

751 f" Got {len(records)} results.") 

752 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no" 

753 f" records when constrained by {values}") 

754 

755 # Get the primary key from the real dimension object 

756 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

757 if not isinstance(dimension, Dimension): 

758 raise RuntimeError( 

759 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

760 ) 

761 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

762 

763 # We have modified the dataId so need to switch to it 

764 dataId = newDataId 

765 

766 if datasetType.isCalibration(): 

767 # Because this is a calibration dataset, first try to 

768 # standardize the data ID without restricting the dimensions to 

769 # those of the dataset type requested, because there may be extra 

770 # dimensions that provide temporal information for a validity-range 

771 # lookup. 

772 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions, 

773 defaults=self.registry.defaults.dataId, **kwds) 

774 if dataId.graph.temporal: 

775 dataId = self.registry.expandDataId(dataId) 

776 timespan = dataId.timespan 

777 else: 

778 # Standardize the data ID to just the dimensions of the dataset 

779 # type instead of letting registry.findDataset do it, so we get the 

780 # result even if no dataset is found. 

781 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions, 

782 defaults=self.registry.defaults.dataId, **kwds) 

783 # Always lookup the DatasetRef, even if one is given, to ensure it is 

784 # present in the current collection. 

785 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

786 if ref is None: 

787 if allowUnresolved: 

788 return DatasetRef(datasetType, dataId) 

789 else: 

790 if collections is None: 

791 collections = self.registry.defaults.collections 

792 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} " 

793 f"could not be found in collections {collections}.") 

794 if idNumber is not None and idNumber != ref.id: 

795 if collections is None: 

796 collections = self.registry.defaults.collections 

797 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match " 

798 f"id ({ref.id}) in registry in collections {collections}.") 

799 return ref 

800 
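# A minimal sketch of the record-based data ID forms resolved above; the
# values are hypothetical placeholders. A "dimension.record" key in the
# data ID dict, or a value matching an alternate key's type, is converted
# to the dimension's primary key via a registry record lookup.
#
#     butler.get("raw", {"exposure.obs_id": "HSCA90333400"},
#                instrument="HSC", detector=42)
#     butler.get("raw", instrument="HSC", detector=42, exposure="HSCA90333400")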

801 @transactional 

802 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

803 dataId: Optional[DataId] = None, *, 

804 run: Optional[str] = None, 

805 **kwds: Any) -> DatasetRef: 

806 """Store and register a dataset. 

807 

808 Parameters 

809 ---------- 

810 obj : `object` 

811 The dataset. 

812 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

813 When `DatasetRef` is provided, ``dataId`` should be `None`. 

814 Otherwise the `DatasetType` or name thereof. 

815 dataId : `dict` or `DataCoordinate` 

816 A `dict` of `Dimension` link name, value pairs that label the 

817 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

818 should be provided as the second argument. 

819 run : `str`, optional 

820 The name of the run the dataset should be added to, overriding 

821 ``self.run``. 

822 kwds 

823 Additional keyword arguments used to augment or construct a 

824 `DataCoordinate`. See `DataCoordinate.standardize` 

825 parameters. 

826 

827 Returns 

828 ------- 

829 ref : `DatasetRef` 

830 A reference to the stored dataset, updated with the correct id if 

831 given. 

832 

833 Raises 

834 ------ 

835 TypeError 

836 Raised if the butler is read-only or if no run has been provided. 

837 """ 

838 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

839 if not self.isWriteable(): 

840 raise TypeError("Butler is read-only.") 

841 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds) 

842 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

843 raise ValueError("DatasetRef must not be in registry, must have None id") 

844 

845 # Add Registry Dataset entry. 

846 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds) 

847 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

848 

849 # Add Datastore entry. 

850 self.datastore.put(obj, ref) 

851 

852 return ref 

853 
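# A minimal sketch of a put followed by a get of the same dataset; the run,
# dataset type, and data ID are hypothetical placeholders, and the dataset
# type is assumed to be registered already.
#
#     butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
#     butler.put(catalog, "sourceCatalog", instrument="HSC", visit=12345)
#     same = butler.get("sourceCatalog", instrument="HSC", visit=12345)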

854 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any: 

855 """Retrieve a stored dataset. 

856 

857 Unlike `Butler.get`, this method allows datasets outside the Butler's 

858 collection to be read as long as the `DatasetRef` that identifies them 

859 can be obtained separately. 

860 

861 Parameters 

862 ---------- 

863 ref : `DatasetRef` 

864 Resolved reference to an already stored dataset. 

865 parameters : `dict` 

866 Additional StorageClass-defined options to control reading, 

867 typically used to efficiently read only a subset of the dataset. 

868 

869 Returns 

870 ------- 

871 obj : `object` 

872 The dataset. 

873 """ 

874 return self.datastore.get(ref, parameters=parameters) 

875 

876 def getDirectDeferred(self, ref: DatasetRef, *, 

877 parameters: Union[dict, None] = None) -> DeferredDatasetHandle: 

878 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

879 from a resolved `DatasetRef`. 

880 

881 Parameters 

882 ---------- 

883 ref : `DatasetRef` 

884 Resolved reference to an already stored dataset. 

885 parameters : `dict` 

886 Additional StorageClass-defined options to control reading, 

887 typically used to efficiently read only a subset of the dataset. 

888 

889 Returns 

890 ------- 

891 obj : `DeferredDatasetHandle` 

892 A handle which can be used to retrieve a dataset at a later time. 

893 

894 Raises 

895 ------ 

896 AmbiguousDatasetError 

897 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

898 """ 

899 if ref.id is None: 

900 raise AmbiguousDatasetError( 

901 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

902 ) 

903 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

904 

905 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

906 dataId: Optional[DataId] = None, *, 

907 parameters: Union[dict, None] = None, 

908 collections: Any = None, 

909 **kwds: Any) -> DeferredDatasetHandle: 

910 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

911 after an immediate registry lookup. 

912 

913 Parameters 

914 ---------- 

915 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

916 When `DatasetRef` the `dataId` should be `None`. 

917 Otherwise the `DatasetType` or name thereof. 

918 dataId : `dict` or `DataCoordinate`, optional 

919 A `dict` of `Dimension` link name, value pairs that label the 

920 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

921 should be provided as the first argument. 

922 parameters : `dict` 

923 Additional StorageClass-defined options to control reading, 

924 typically used to efficiently read only a subset of the dataset. 

925 collections : Any, optional 

926 Collections to be searched, overriding ``self.collections``. 

927 Can be any of the types supported by the ``collections`` argument 

928 to butler construction. 

929 kwds 

930 Additional keyword arguments used to augment or construct a 

931 `DataId`. See `DataId` parameters. 

932 

933 Returns 

934 ------- 

935 obj : `DeferredDatasetHandle` 

936 A handle which can be used to retrieve a dataset at a later time. 

937 

938 Raises 

939 ------ 

940 LookupError 

941 Raised if no matching dataset exists in the `Registry` (and 

942 ``allowUnresolved is False``). 

943 ValueError 

944 Raised if a resolved `DatasetRef` was passed as an input, but it 

945 differs from the one found in the registry. 

946 TypeError 

947 Raised if no collections were provided. 

948 """ 

949 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds) 

950 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

951 
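# A minimal sketch of deferring the read; the dataset type and data ID are
# hypothetical placeholders. The registry lookup happens immediately, but
# the datastore read only happens when the handle's ``get`` is called.
#
#     handle = butler.getDeferred("deepCoadd", tract=0, patch=5, band="r")
#     coadd = handle.get()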

952 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

953 dataId: Optional[DataId] = None, *, 

954 parameters: Optional[Dict[str, Any]] = None, 

955 collections: Any = None, 

956 **kwds: Any) -> Any: 

957 """Retrieve a stored dataset. 

958 

959 Parameters 

960 ---------- 

961 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

962 When `DatasetRef` the `dataId` should be `None`. 

963 Otherwise the `DatasetType` or name thereof. 

964 dataId : `dict` or `DataCoordinate` 

965 A `dict` of `Dimension` link name, value pairs that label the 

966 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

967 should be provided as the first argument. 

968 parameters : `dict` 

969 Additional StorageClass-defined options to control reading, 

970 typically used to efficiently read only a subset of the dataset. 

971 collections : Any, optional 

972 Collections to be searched, overriding ``self.collections``. 

973 Can be any of the types supported by the ``collections`` argument 

974 to butler construction. 

975 kwds 

976 Additional keyword arguments used to augment or construct a 

977 `DataCoordinate`. See `DataCoordinate.standardize` 

978 parameters. 

979 

980 Returns 

981 ------- 

982 obj : `object` 

983 The dataset. 

984 

985 Raises 

986 ------ 

987 ValueError 

988 Raised if a resolved `DatasetRef` was passed as an input, but it 

989 differs from the one found in the registry. 

990 LookupError 

991 Raised if no matching dataset exists in the `Registry`. 

992 TypeError 

993 Raised if no collections were provided. 

994 

995 Notes 

996 ----- 

997 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

998 this method requires that the given data ID include temporal dimensions 

999 beyond the dimensions of the dataset type itself, in order to find the 

1000 dataset with the appropriate validity range. For example, a "bias" 

1001 dataset with native dimensions ``{instrument, detector}`` could be 

1002 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1003 ``exposure`` is a temporal dimension. 

1004 """ 

1005 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1006 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds) 

1007 return self.getDirect(ref, parameters=parameters) 

1008 
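# A minimal sketch of the calibration lookup described in the Notes above;
# the values and collection name are hypothetical placeholders. The extra
# ``exposure`` dimension supplies the timespan used to pick the bias whose
# validity range matches.
#
#     bias = butler.get("bias", instrument="HSC", detector=42, exposure=903334,
#                       collections="HSC/calib")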

1009 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1010 dataId: Optional[DataId] = None, *, 

1011 predict: bool = False, 

1012 collections: Any = None, 

1013 run: Optional[str] = None, 

1014 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1015 """Return the URIs associated with the dataset. 

1016 

1017 Parameters 

1018 ---------- 

1019 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1020 When `DatasetRef` the `dataId` should be `None`. 

1021 Otherwise the `DatasetType` or name thereof. 

1022 dataId : `dict` or `DataCoordinate` 

1023 A `dict` of `Dimension` link name, value pairs that label the 

1024 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1025 should be provided as the first argument. 

1026 predict : `bool` 

1027 If `True`, allow URIs to be returned of datasets that have not 

1028 been written. 

1029 collections : Any, optional 

1030 Collections to be searched, overriding ``self.collections``. 

1031 Can be any of the types supported by the ``collections`` argument 

1032 to butler construction. 

1033 run : `str`, optional 

1034 Run to use for predictions, overriding ``self.run``. 

1035 kwds 

1036 Additional keyword arguments used to augment or construct a 

1037 `DataCoordinate`. See `DataCoordinate.standardize` 

1038 parameters. 

1039 

1040 Returns 

1041 ------- 

1042 primary : `ButlerURI` 

1043 The URI to the primary artifact associated with this dataset. 

1044 If the dataset was disassembled within the datastore this 

1045 may be `None`. 

1046 components : `dict` 

1047 URIs to any components associated with the dataset artifact. 

1048 Can be empty if there are no components. 

1049 """ 

1050 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict, 

1051 collections=collections, **kwds) 

1052 if ref.id is None: # only possible if predict is True 

1053 if run is None: 

1054 run = self.run 

1055 if run is None: 

1056 raise TypeError("Cannot predict location with run=None.") 

1057 # Lie about ID, because we can't guess it, and only 

1058 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1059 ref = ref.resolved(id=0, run=run) 

1060 return self.datastore.getURIs(ref, predict) 

1061 

1062 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1063 dataId: Optional[DataId] = None, *, 

1064 predict: bool = False, 

1065 collections: Any = None, 

1066 run: Optional[str] = None, 

1067 **kwds: Any) -> ButlerURI: 

1068 """Return the URI to the Dataset. 

1069 

1070 Parameters 

1071 ---------- 

1072 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1073 When `DatasetRef` the `dataId` should be `None`. 

1074 Otherwise the `DatasetType` or name thereof. 

1075 dataId : `dict` or `DataCoordinate` 

1076 A `dict` of `Dimension` link name, value pairs that label the 

1077 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1078 should be provided as the first argument. 

1079 predict : `bool` 

1080 If `True`, allow URIs to be returned of datasets that have not 

1081 been written. 

1082 collections : Any, optional 

1083 Collections to be searched, overriding ``self.collections``. 

1084 Can be any of the types supported by the ``collections`` argument 

1085 to butler construction. 

1086 run : `str`, optional 

1087 Run to use for predictions, overriding ``self.run``. 

1088 kwds 

1089 Additional keyword arguments used to augment or construct a 

1090 `DataCoordinate`. See `DataCoordinate.standardize` 

1091 parameters. 

1092 

1093 Returns 

1094 ------- 

1095 uri : `ButlerURI` 

1096 URI pointing to the Dataset within the datastore. If the 

1097 Dataset does not exist in the datastore, and if ``predict`` is 

1098 `True`, the URI will be a prediction and will include a URI 

1099 fragment "#predicted". 

1100 If the datastore does not have entities that relate well 

1101 to the concept of a URI the returned URI string will be 

1102 descriptive. The returned URI is not guaranteed to be obtainable. 

1103 

1104 Raises 

1105 ------ 

1106 LookupError 

1107 A URI has been requested for a dataset that does not exist and 

1108 guessing is not allowed. 

1109 ValueError 

1110 Raised if a resolved `DatasetRef` was passed as an input, but it 

1111 differs from the one found in the registry. 

1112 TypeError 

1113 Raised if no collections were provided. 

1114 RuntimeError 

1115 Raised if a URI is requested for a dataset that consists of 

1116 multiple artifacts. 

1117 """ 

1118 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict, 

1119 collections=collections, run=run, **kwds) 

1120 

1121 if primary is None or components: 

1122 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1123 "Use Butler.getURIs() instead.") 

1124 return primary 

1125 
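# A minimal sketch of predicting the URI of a dataset that has not been
# written yet; the dataset type, data ID, and run are hypothetical
# placeholders. The returned ButlerURI carries a "#predicted" fragment.
#
#     uri = butler.getURI("calexp", instrument="HSC", visit=12345, detector=42,
#                         predict=True, run="u/alice/DM-50000/a")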

1126 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1127 dataId: Optional[DataId] = None, *, 

1128 collections: Any = None, 

1129 **kwds: Any) -> bool: 

1130 """Return True if the Dataset is actually present in the Datastore. 

1131 

1132 Parameters 

1133 ---------- 

1134 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1135 When `DatasetRef` the `dataId` should be `None`. 

1136 Otherwise the `DatasetType` or name thereof. 

1137 dataId : `dict` or `DataCoordinate` 

1138 A `dict` of `Dimension` link name, value pairs that label the 

1139 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1140 should be provided as the first argument. 

1141 collections : Any, optional 

1142 Collections to be searched, overriding ``self.collections``. 

1143 Can be any of the types supported by the ``collections`` argument 

1144 to butler construction. 

1145 kwds 

1146 Additional keyword arguments used to augment or construct a 

1147 `DataCoordinate`. See `DataCoordinate.standardize` 

1148 parameters. 

1149 

1150 Raises 

1151 ------ 

1152 LookupError 

1153 Raised if the dataset is not even present in the Registry. 

1154 ValueError 

1155 Raised if a resolved `DatasetRef` was passed as an input, but it 

1156 differs from the one found in the registry. 

1157 TypeError 

1158 Raised if no collections were provided. 

1159 """ 

1160 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds) 

1161 return self.datastore.exists(ref) 

1162 

1163 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1164 """Remove one or more `~CollectionType.RUN` collections and the 

1165 datasets within them. 

1166 

1167 Parameters 

1168 ---------- 

1169 names : `Iterable` [ `str` ] 

1170 The names of the collections to remove. 

1171 unstore : `bool`, optional 

1172 If `True` (default), delete datasets from all datastores in which 

1173 they are present, and attempt to rollback the registry deletions if 

1174 datastore deletions fail (which may not always be possible). If 

1175 `False`, datastore records for these datasets are still removed, 

1176 but any artifacts (e.g. files) will not be. 

1177 

1178 Raises 

1179 ------ 

1180 TypeError 

1181 Raised if one or more collections are not of type 

1182 `~CollectionType.RUN`. 

1183 """ 

1184 if not self.isWriteable(): 

1185 raise TypeError("Butler is read-only.") 

1186 names = list(names) 

1187 refs: List[DatasetRef] = [] 

1188 for name in names: 

1189 collectionType = self.registry.getCollectionType(name) 

1190 if collectionType is not CollectionType.RUN: 

1191 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1192 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1193 with self.registry.transaction(): 

1194 if unstore: 

1195 for ref in refs: 

1196 if self.datastore.exists(ref): 

1197 self.datastore.trash(ref) 

1198 else: 

1199 self.datastore.forget(refs) 

1200 for name in names: 

1201 self.registry.removeCollection(name) 

1202 if unstore: 

1203 # Point of no return for removing artifacts 

1204 self.datastore.emptyTrash() 

1205 

1206 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False) -> None: 

1207 """Remove a collection and possibly prune datasets within it. 

1208 

1209 Parameters 

1210 ---------- 

1211 name : `str` 

1212 Name of the collection to remove. If this is a 

1213 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1214 datasets within the collection are not modified unless ``unstore`` 

1215 is `True`. If this is a `~CollectionType.RUN` collection, 

1216 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1217 are fully removed from the data repository. 

1218 purge : `bool`, optional 

1219 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1220 fully removing datasets within them. Requires ``unstore=True`` as 

1221 well as an added precaution against accidental deletion. Must be 

1222 `False` (default) if the collection is not a ``RUN``. 

1223 unstore : `bool`, optional 

1224 If `True`, remove all datasets in the collection from all 

1225 datastores in which they appear. 

1226 

1227 Raises 

1228 ------ 

1229 TypeError 

1230 Raised if the butler is read-only or arguments are mutually 

1231 inconsistent. 

1232 """ 

1233 

1234 # See pruneDatasets comments for more information about the logic here; 

1235 # the cases are almost the same, but here we can rely on Registry to 

1236 # take care of everything but Datastore deletion when we remove the 

1237 # collection. 

1238 if not self.isWriteable(): 

1239 raise TypeError("Butler is read-only.") 

1240 collectionType = self.registry.getCollectionType(name) 

1241 if purge and not unstore: 

1242 raise PurgeWithoutUnstorePruneCollectionsError() 

1243 if collectionType is CollectionType.RUN and not purge: 

1244 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1245 if collectionType is not CollectionType.RUN and purge: 

1246 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1247 

1248 with self.registry.transaction(): 

1249 if unstore: 

1250 for ref in self.registry.queryDatasets(..., collections=name, findFirst=True): 

1251 if self.datastore.exists(ref): 

1252 self.datastore.trash(ref) 

1253 self.registry.removeCollection(name) 

1254 if unstore: 

1255 # Point of no return for removing artifacts 

1256 self.datastore.emptyTrash() 

1257 
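# A minimal sketch of the accepted argument combinations; the collection
# names are hypothetical placeholders.
#
#     # Remove a TAGGED or CHAINED collection, leaving its datasets alone:
#     butler.pruneCollection("u/alice/tagged")
#     # Fully remove a RUN collection and the artifacts of its datasets:
#     butler.pruneCollection("u/alice/DM-50000/a", purge=True, unstore=True)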

1258 def pruneDatasets(self, refs: Iterable[DatasetRef], *, 

1259 disassociate: bool = True, 

1260 unstore: bool = False, 

1261 tags: Iterable[str] = (), 

1262 purge: bool = False, 

1263 run: Optional[str] = None) -> None: 

1264 """Remove one or more datasets from a collection and/or storage. 

1265 

1266 Parameters 

1267 ---------- 

1268 refs : `~collections.abc.Iterable` of `DatasetRef` 

1269 Datasets to prune. These must be "resolved" references (not just 

1270 a `DatasetType` and data ID). 

1271 disassociate : `bool`, optional 

1272 Disassociate pruned datasets from ``tags``, or from all collections 

1273 if ``purge=True``. 

1274 unstore : `bool`, optional 

1275 If `True` (`False` is default) remove these datasets from all 

1276 datastores known to this butler. Note that this will make it 

1277 impossible to retrieve these datasets even via other collections. 

1278 Datasets that are already not stored are ignored by this option. 

1279 tags : `Iterable` [ `str` ], optional 

1280 `~CollectionType.TAGGED` collections to disassociate the datasets 

1281 from. Ignored if ``disassociate`` is `False` or ``purge`` is 

1282 `True`. 

1283 purge : `bool`, optional 

1284 If `True` (`False` is default), completely remove the dataset from 

1285 the `Registry`. To prevent accidental deletions, ``purge`` may 

1286 only be `True` if all of the following conditions are met: 

1287 

1288 - All given datasets are in the given run; 

1289 - ``disassociate`` is `True`; 

1290 - ``unstore`` is `True`. 

1291 

1292 This mode may remove provenance information from datasets other 

1293 than those provided, and should be used with extreme care. 

1294 

1295 Raises 

1296 ------ 

1297 TypeError 

1298 Raised if the butler is read-only, if no collection was provided, 

1299 or the conditions for ``purge=True`` were not met. 

1300 """ 

1301 if not self.isWriteable(): 

1302 raise TypeError("Butler is read-only.") 

1303 if purge: 

1304 if not disassociate: 

1305 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1306 if not unstore: 

1307 raise TypeError("Cannot pass purge=True without unstore=True.") 

1308 elif disassociate: 

1309 tags = tuple(tags) 

1310 if not tags: 

1311 raise TypeError("No tags provided but disassociate=True.") 

1312 for tag in tags: 

1313 collectionType = self.registry.getCollectionType(tag) 

1314 if collectionType is not CollectionType.TAGGED: 

1315 raise TypeError(f"Cannot disassociate from collection '{tag}' " 

1316 f"of non-TAGGED type {collectionType.name}.") 

1317 # Transform possibly-single-pass iterable into something we can iterate 

1318 # over multiple times. 

1319 refs = list(refs) 

1320 # Pruning a component of a DatasetRef makes no sense since registry 

1321 # doesn't know about components and datastore might not store 

1322 # components in a separate file 

1323 for ref in refs: 

1324 if ref.datasetType.component(): 

1325 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1326 # We don't need an unreliable Datastore transaction for this, because 

1327 # we've been extra careful to ensure that Datastore.trash only involves 

1328 # mutating the Registry (it can _look_ at Datastore-specific things, 

1329 # but shouldn't change them), and hence all operations here are 

1330 # Registry operations. 

1331 with self.registry.transaction(): 

1332 if unstore: 

1333 for ref in refs: 

1334 # There is a difference between a concrete composite 

1335 # and virtual composite. In a virtual composite the 

1336 # datastore is never given the top level DatasetRef. In 

1337 # the concrete composite the datastore knows all the 

1338 # refs and will clean up itself if asked to remove the 

1339 # parent ref. We can not check configuration for this 

1340 # since we can not trust that the configuration is the 

1341 # same. We therefore have to ask if the ref exists or 

1342 # not. This is consistent with the fact that we want 

1343 # to ignore already-removed-from-datastore datasets 

1344 # anyway. 

1345 if self.datastore.exists(ref): 

1346 self.datastore.trash(ref) 

1347 if purge: 

1348 self.registry.removeDatasets(refs) 

1349 elif disassociate: 

1350 assert tags, "Guaranteed by earlier logic in this function." 

1351 for tag in tags: 

1352 self.registry.disassociate(tag, refs) 

1353 # We've exited the Registry transaction, and apparently committed. 

1354 # (if there was an exception, everything rolled back, and it's as if 

1355 # nothing happened - and we never get here). 

1356 # Datastore artifacts are not yet gone, but they're clearly marked 

1357 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1358 # problems we can try again later, and if manual administrative 

1359 # intervention is required, it's pretty clear what that should entail: 

1360 # deleting everything on disk and in private Datastore tables that is 

1361 # in the dataset_location_trash table. 

1362 if unstore: 

1363 # Point of no return for removing artifacts 

1364 self.datastore.emptyTrash() 

1365 

1366 @transactional 

1367 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None, 

1368 ) -> None: 

1369 """Store and register one or more datasets that already exist on disk. 

1370 

1371 Parameters 

1372 ---------- 

1373 datasets : `FileDataset` 

1374 Each positional argument is a struct containing information about 

1375 a file to be ingested, including its path (either absolute or 

1376 relative to the datastore root, if applicable), a `DatasetRef`, 

1377 and optionally a formatter class or its fully-qualified string 

1378 name. If a formatter is not provided, the formatter that would be 

1379 used for `put` is assumed. On successful return, all 

1380 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1381 attribute populated and all `FileDataset.formatter` attributes will 

1382 be set to the formatter class used. `FileDataset.path` attributes 

1383 may be modified to put paths in whatever the datastore considers a 

1384 standardized form. 

1385 transfer : `str`, optional 

1386 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1387 'hardlink', 'relsymlink' or 'symlink', indicating how to transfer 

1388 the file. 

1389 run : `str`, optional 

1390 The name of the run ingested datasets should be added to, 

1391 overriding ``self.run``. 

1392 

1393 Raises 

1394 ------ 

1395 TypeError 

1396 Raised if the butler is read-only or if no run was provided. 

1397 NotImplementedError 

1398 Raised if the `Datastore` does not support the given transfer mode. 

1399 DatasetTypeNotSupportedError 

1400 Raised if one or more files to be ingested have a dataset type that 

1401 is not supported by the `Datastore`. 

1402 FileNotFoundError 

1403 Raised if one of the given files does not exist. 

1404 FileExistsError 

1405 Raised if transfer is not `None` but the (internal) location the 

1406 file would be moved to is already occupied. 

1407 

1408 Notes 

1409 ----- 

1410 This operation is not fully exception safe: if a database operation 

1411 fails, the given `FileDataset` instances may be only partially updated. 

1412 

1413 It is atomic in terms of database operations (they will either all 

1414 succeed or all fail), provided the database engine implements 

1415 transactions correctly. It will attempt to be atomic in terms of 

1416 filesystem operations as well, but this cannot be implemented 

1417 rigorously for most datastores. 
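
 Examples 
 -------- 
 A minimal sketch; the path, run name, and the ``datasetType`` and 
 ``dataId`` objects below are placeholders for values already defined 
 for the repository:: 

     # datasetType, dataId, the path, and "my_run" are placeholders. 
     ref = DatasetRef(datasetType, dataId) 
     butler.ingest(FileDataset(path="files/raw.fits", refs=[ref]), 
                   transfer="copy", run="my_run") 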

1418 """ 

1419 if not self.isWriteable(): 

1420 raise TypeError("Butler is read-only.") 

1421 # Reorganize the inputs so they're grouped by DatasetType and then 

1422 # data ID. We also include a list of DatasetRefs for each FileDataset 

1423 # to hold the resolved DatasetRefs returned by the Registry, before 

1424 # it's safe to swap them into FileDataset.refs. 

1425 # Some type annotation aliases to make that clearer: 

1426 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1427 GroupedData = MutableMapping[DatasetType, GroupForType] 

1428 # The actual data structure: 

1429 groupedData: GroupedData = defaultdict(dict) 

1430 # And the nested loop that populates it: 

1431 for dataset in datasets: 

1432 # This list intentionally shared across the inner loop, since it's 

1433 # associated with `dataset`. 

1434 resolvedRefs: List[DatasetRef] = [] 

1435 for ref in dataset.refs: 

1436 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1437 
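 # As an illustration (names here are hypothetical), after this loop the 
 # structure might look like: 
 #     {flatDatasetType: {dataId1: (fileDataset1, []), 
 #                        dataId2: (fileDataset2, [])}} 
 # where each (still empty) list is the per-FileDataset holder that the 
 # resolved refs are appended to below. 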

1438 # Now we can bulk-insert into Registry for each DatasetType. 

1439 allResolvedRefs: List[DatasetRef] = [] 

1440 for datasetType, groupForType in groupedData.items(): 

1441 refs = self.registry.insertDatasets(datasetType, 

1442 dataIds=groupForType.keys(), 

1443 run=run) 

1444 # Append those resolved DatasetRefs to the new lists we set up for 

1445 # them. 

1446 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1447 resolvedRefs.append(ref) 

1448 

1449 # Go back to the original FileDatasets to replace their refs with the 

1450 # new resolved ones, and also build a big list of all refs. 

1451 allResolvedRefs = [] 

1452 for groupForType in groupedData.values(): 

1453 for dataset, resolvedRefs in groupForType.values(): 

1454 dataset.refs = resolvedRefs 

1455 allResolvedRefs.extend(resolvedRefs) 

1456 

1457 # Bulk-insert everything into Datastore. 

1458 self.datastore.ingest(*datasets, transfer=transfer) 

1459 

1460 @contextlib.contextmanager 

1461 def export(self, *, directory: Optional[str] = None, 

1462 filename: Optional[str] = None, 

1463 format: Optional[str] = None, 

1464 transfer: Optional[str] = None) -> Iterator[RepoExportContext]: 

1465 """Export datasets from the repository represented by this `Butler`. 

1466 

1467 This method is a context manager that returns a helper object 

1468 (`RepoExportContext`) that is used to indicate what information from 

1469 the repository should be exported. 

1470 

1471 Parameters 

1472 ---------- 

1473 directory : `str`, optional 

1474 Directory dataset files should be written to if ``transfer`` is not 

1475 `None`. 

1476 filename : `str`, optional 

1477 Name for the file that will include database information associated 

1478 with the exported datasets. If this is not an absolute path and 

1479 ``directory`` is not `None`, it will be written to ``directory`` 

1480 instead of the current working directory. Defaults to 

1481 "export.{format}". 

1482 format : `str`, optional 

1483 File format for the database information file. If `None`, the 

1484 extension of ``filename`` will be used. 

1485 transfer : `str`, optional 

1486 Transfer mode passed to `Datastore.export`. 

1487 

1488 Raises 

1489 ------ 

1490 TypeError 

1491 Raised if the set of arguments passed is inconsistent. 

1492 

1493 Examples 

1494 -------- 

1495 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1496 methods are used to provide the iterables over data IDs and/or datasets 

1497 to be exported:: 

1498 

1499 with butler.export(filename="exports.yaml") as export: 

1500 # Export all flats, but none of the dimension element rows 

1501 # (i.e. data ID information) associated with them. 

1502 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1503 elements=()) 

1504 # Export all datasets that start with "deepCoadd_" and all of 

1505 # their associated data ID information. 

1506 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

1507 """ 

1508 if directory is None and transfer is not None: 

1509 raise TypeError("Cannot transfer without providing a directory.") 

1510 if transfer == "move": 

1511 raise TypeError("Transfer may not be 'move': export is read-only") 

1512 if format is None: 

1513 if filename is None: 

1514 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1515 else: 

1516 _, format = os.path.splitext(filename) 

1517 elif filename is None: 

1518 filename = f"export.{format}" 

1519 if directory is not None: 

1520 filename = os.path.join(directory, filename) 

1521 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"]) 

1522 with open(filename, 'w') as stream: 

1523 backend = BackendClass(stream) 

1524 try: 

1525 helper = RepoExportContext(self.registry, self.datastore, backend=backend, 

1526 directory=directory, transfer=transfer) 

1527 yield helper 

1528 except BaseException: 

1529 raise 

1530 else: 

1531 helper._finish() 

1532 

1533 def import_(self, *, directory: Optional[str] = None, 

1534 filename: Union[str, TextIO, None] = None, 

1535 format: Optional[str] = None, 

1536 transfer: Optional[str] = None, 

1537 skip_dimensions: Optional[Set] = None) -> None: 

1538 """Import datasets into this repository that were exported from a 

1539 different butler repository via `~lsst.daf.butler.Butler.export`. 

1540 

1541 Parameters 

1542 ---------- 

1543 directory : `str`, optional 

1544 Directory containing dataset files to import from. If `None`, 

1545 ``filename`` and all dataset file paths specified therein must 

1546 be absolute. 

1547 filename : `str` or `TextIO`, optional 

1548 A stream or name of file that contains database information 

1549 associated with the exported datasets, typically generated by 

1550 `~lsst.daf.butler.Butler.export`. If this is a string (name) and 

1551 is not an absolute path, does not exist in the current working 

1552 directory, and ``directory`` is not `None`, it is assumed to be in 

1553 ``directory``. Defaults to "export.{format}". 

1554 format : `str`, optional 

1555 File format for ``filename``. If `None`, the extension of 

1556 ``filename`` will be used. 

1557 transfer : `str`, optional 

1558 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

1559 skip_dimensions : `set`, optional 

1560 Names of dimensions that should be skipped and not imported. 

1561 

1562 Raises 

1563 ------ 

1564 TypeError 

1565 Raised if the set of arguments passed is inconsistent, or if the 

1566 butler is read-only. 
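
 Examples 
 -------- 
 A minimal sketch; the directory and file names are placeholders for an 
 export produced by `~lsst.daf.butler.Butler.export`:: 

     # Directory and file names are placeholders. 
     butler.import_(directory="exports", filename="exports.yaml", 
                    transfer="symlink") 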

1567 """ 

1568 if not self.isWriteable(): 

1569 raise TypeError("Butler is read-only.") 

1570 if format is None: 

1571 if filename is None: 

1572 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1573 else: 

1574 _, format = os.path.splitext(filename) # type: ignore 

1575 elif filename is None: 

1576 filename = f"export.{format}" 

1577 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

1578 filename = os.path.join(directory, filename) 

1579 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"]) 

1580 

1581 def doImport(importStream: TextIO) -> None: 

1582 backend = BackendClass(importStream, self.registry) 

1583 backend.register() 

1584 with self.transaction(): 

1585 backend.load(self.datastore, directory=directory, transfer=transfer, 

1586 skip_dimensions=skip_dimensions) 

1587 

1588 if isinstance(filename, str): 

1589 with open(filename, "r") as stream: 

1590 doImport(stream) 

1591 else: 

1592 doImport(filename) 

1593 

1594 def validateConfiguration(self, logFailures: bool = False, 

1595 datasetTypeNames: Optional[Iterable[str]] = None, 

1596 ignore: Optional[Iterable[str]] = None) -> None: 

1597 """Validate butler configuration. 

1598 

1599 Checks that each `DatasetType` can be stored in the `Datastore`. 

1600 

1601 Parameters 

1602 ---------- 

1603 logFailures : `bool`, optional 

1604 If `True`, output a log message for every validation error 

1605 detected. 

1606 datasetTypeNames : iterable of `str`, optional 

1607 The `DatasetType` names that should be checked. This allows 

1608 only a subset to be selected. 

1609 ignore : iterable of `str`, optional 

1610 Names of DatasetTypes to skip over. This can be used to skip 

1611 known problems. If a named `DatasetType` corresponds to a 

1612 composite, all components of that `DatasetType` will also be 

1613 ignored. 

1614 

1615 Raises 

1616 ------ 

1617 ButlerValidationError 

1618 Raised if there is some inconsistency with how this Butler 

1619 is configured. 
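
 Examples 
 -------- 
 A typical check, logging each failure and skipping a dataset type that 
 is known to be problematic (the name is a placeholder):: 

     # "problematic_type" is a placeholder dataset type name. 
     butler.validateConfiguration(logFailures=True, 
                                  ignore=["problematic_type"]) 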

1620 """ 

1621 if datasetTypeNames: 

1622 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

1623 else: 

1624 datasetTypes = list(self.registry.queryDatasetTypes()) 

1625 

1626 # filter out anything from the ignore list 

1627 if ignore: 

1628 ignore = set(ignore) 

1629 datasetTypes = [e for e in datasetTypes 

1630 if e.name not in ignore and e.nameAndComponent()[0] not in ignore] 

1631 else: 

1632 ignore = set() 

1633 

1634 # Find all the registered instruments 

1635 instruments = set( 

1636 record.name for record in self.registry.queryDimensionRecords("instrument") 

1637 ) 

1638 

1639 # For each datasetType that has an instrument dimension, create 

1640 # a DatasetRef for each defined instrument 

1641 datasetRefs = [] 

1642 

1643 for datasetType in datasetTypes: 

1644 if "instrument" in datasetType.dimensions: 

1645 for instrument in instruments: 

1646 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore 

1647 conform=False) 

1648 datasetRefs.append(datasetRef) 

1649 

1650 entities: List[Union[DatasetType, DatasetRef]] = [] 

1651 entities.extend(datasetTypes) 

1652 entities.extend(datasetRefs) 

1653 

1654 datastoreErrorStr = None 

1655 try: 

1656 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

1657 except ValidationError as e: 

1658 datastoreErrorStr = str(e) 

1659 

1660 # Also check that the LookupKeys used by the datastores match 

1661 # registry and storage class definitions 

1662 keys = self.datastore.getLookupKeys() 

1663 

1664 failedNames = set() 

1665 failedDataId = set() 

1666 for key in keys: 

1667 if key.name is not None: 

1668 if key.name in ignore: 

1669 continue 

1670 

1671 # skip if specific datasetType names were requested and this 

1672 # name does not match 

1673 if datasetTypeNames and key.name not in datasetTypeNames: 

1674 continue 

1675 

1676 # See if it is a StorageClass or a DatasetType 

1677 if key.name in self.storageClasses: 

1678 pass 

1679 else: 

1680 try: 

1681 self.registry.getDatasetType(key.name) 

1682 except KeyError: 

1683 if logFailures: 

1684 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

1685 failedNames.add(key) 

1686 else: 

1687 # Dimensions are checked for consistency when the Butler 

1688 # is created and rendezvoused with a universe. 

1689 pass 

1690 

1691 # Check that the instrument is a valid instrument 

1692 # Currently only support instrument so check for that 

1693 if key.dataId: 

1694 dataIdKeys = set(key.dataId) 

1695 if {"instrument"} != dataIdKeys: 

1696 if logFailures: 

1697 log.critical("Key '%s' has unsupported DataId override", key) 

1698 failedDataId.add(key) 

1699 elif key.dataId["instrument"] not in instruments: 

1700 if logFailures: 

1701 log.critical("Key '%s' has unknown instrument", key) 

1702 failedDataId.add(key) 

1703 

1704 messages = [] 

1705 

1706 if datastoreErrorStr: 

1707 messages.append(datastoreErrorStr) 

1708 

1709 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

1710 (failedDataId, "Keys with bad DataId entries: ")): 

1711 if failed: 

1712 msg += ", ".join(str(k) for k in failed) 

1713 messages.append(msg) 

1714 

1715 if messages: 

1716 raise ValidationError(";\n".join(messages)) 

1717 

1718 @property 

1719 def collections(self) -> CollectionSearch: 

1720 """The collections to search by default, in order (`CollectionSearch`). 

1721 

1722 This is an alias for ``self.registry.defaults.collections``. It cannot 

1723 be set directly in isolation, but all defaults may be changed together 

1724 by assigning a new `RegistryDefaults` instance to 

1725 ``self.registry.defaults``. 

1726 """ 

1727 return self.registry.defaults.collections 

1728 

1729 @property 

1730 def run(self) -> Optional[str]: 

1731 """Name of the run this butler writes outputs to by default (`str` or 

1732 `None`). 

1733 

1734 This is an alias for ``self.registry.defaults.run``. It cannot be set 

1735 directly in isolation, but all defaults may be changed together by 

1736 assigning a new `RegistryDefaults` instance to 

1737 ``self.registry.defaults``. 
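
 For example (assuming `RegistryDefaults` can be constructed with a 
 ``run`` argument), the default output run could be replaced with:: 

     # Assumes RegistryDefaults(run=...) is supported; the run name is 
     # a placeholder. 
     butler.registry.defaults = RegistryDefaults(run="my_new_run") 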

1738 """ 

1739 return self.registry.defaults.run 

1740 

1741 registry: Registry 

1742 """The object that manages dataset metadata and relationships (`Registry`). 

1743 

1744 Most operations that don't involve reading or writing butler datasets are 

1745 accessible only via `Registry` methods. 

1746 """ 

1747 

1748 datastore: Datastore 

1749 """The object that manages actual dataset storage (`Datastore`). 

1750 

1751 Direct user access to the datastore should rarely be necessary; the primary 

1752 exception is the case where a `Datastore` implementation provides extra 

1753 functionality beyond what the base class defines. 

1754 """ 

1755 

1756 storageClasses: StorageClassFactory 

1757 """An object that maps known storage class names to objects that fully 

1758 describe them (`StorageClassFactory`). 

1759 """