# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
Butler top level classes.
"""
from __future__ import annotations

__all__ = (
    "Butler",
    "ButlerValidationError",
    "PruneCollectionsArgsError",
    "PurgeWithoutUnstorePruneCollectionsError",
    "RunWithoutPurgePruneCollectionsError",
    "PurgeUnsupportedPruneCollectionsError",
)


from collections import defaultdict
import contextlib
import logging
import numbers
import os
from typing import (
    Any,
    ClassVar,
    Counter,
    Dict,
    Iterable,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    TextIO,
    Tuple,
    Type,
    Union,
)

try:
    import boto3
except ImportError:
    boto3 = None

from lsst.utils import doImport
from .core import (
    AmbiguousDatasetError,
    ButlerURI,
    Config,
    ConfigSubset,
    DataCoordinate,
    DataId,
    DataIdValue,
    DatasetRef,
    DatasetType,
    Datastore,
    Dimension,
    DimensionConfig,
    FileDataset,
    StorageClassFactory,
    Timespan,
    ValidationError,
)
from .core.repoRelocation import BUTLER_ROOT_TAG
from .core.utils import transactional, getClassOf
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._butlerConfig import ButlerConfig
from .registry import Registry, RegistryConfig, RegistryDefaults, CollectionType
from .registry.wildcards import CollectionSearch
from .transfers import RepoExportContext

log = logging.getLogger(__name__)

class ButlerValidationError(ValidationError):
    """There is a problem with the Butler configuration."""
    pass


class PruneCollectionsArgsError(TypeError):
    """Base class for errors relating to Butler.pruneCollections input
    arguments.
    """
    pass


class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
    """Raised when ``purge=True`` is passed without ``unstore=True``; the two
    must be `True` together.
    """

    def __init__(self) -> None:
        super().__init__("Cannot pass purge=True without unstore=True.")


class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
    """Raised when pruning a RUN collection but purge is False."""

    def __init__(self, collectionType: CollectionType):
        self.collectionType = collectionType
        super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")


class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
    """Raised when purge is True but is not supported for the given
    collection."""

    def __init__(self, collectionType: CollectionType):
        self.collectionType = collectionType
        super().__init__(
            f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")

class Butler:
    """Main entry point for the data access system.

    Parameters
    ----------
    config : `ButlerConfig`, `Config` or `str`, optional
        Configuration. Anything acceptable to the `ButlerConfig` constructor.
        If a directory path is given the configuration will be read from a
        ``butler.yaml`` file in that location. If `None` is given default
        values will be used.
    butler : `Butler`, optional
        If provided, construct a new Butler that uses the same registry and
        datastore as the given one, but with the given collection and run.
        Incompatible with the ``config``, ``searchPaths``, and ``writeable``
        arguments.
    collections : `str` or `Iterable` [ `str` ], optional
        An expression specifying the collections to be searched (in order)
        when reading datasets.
        This may be a `str` collection name or an iterable thereof.
        See :ref:`daf_butler_collection_expressions` for more information.
        These collections are not registered automatically and must be
        manually registered before they are used by any method, but they may
        be manually registered after the `Butler` is initialized.
    run : `str`, optional
        Name of the `~CollectionType.RUN` collection new datasets should be
        inserted into. If ``collections`` is `None` and ``run`` is not
        `None`, ``collections`` will be set to ``[run]``. If not `None`, this
        collection will automatically be registered. If this is not set (and
        ``writeable`` is not set either), a read-only butler will be created.
    searchPaths : `list` of `str`, optional
        Directory paths to search when calculating the full Butler
        configuration. Not used if the supplied config is already a
        `ButlerConfig`.
    writeable : `bool`, optional
        Explicitly sets whether the butler supports write operations. If not
        provided, a read-write butler is created if any of ``run``, ``tags``,
        or ``chains`` is non-empty.
    inferDefaults : `bool`, optional
        If `True` (default) infer default data ID values from the values
        present in the datasets in ``collections``: if all collections have
        the same value (or no value) for a governor dimension, that value
        will be the default for that dimension. Nonexistent collections are
        ignored. If a default value is provided explicitly for a governor
        dimension via ``**kwargs``, no default will be inferred for that
        dimension.
    **kwargs : `str`
        Default data ID key-value pairs. These may only identify "governor"
        dimensions like ``instrument`` and ``skymap``.

    Examples
    --------
    While there are many ways to control exactly how a `Butler` interacts
    with the collections in its `Registry`, the most common cases are still
    simple.

    For a read-only `Butler` that searches one collection, do::

        butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])

    For a read-write `Butler` that writes to and reads from a
    `~CollectionType.RUN` collection::

        butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")

    The `Butler` passed to a ``PipelineTask`` is often much more complex,
    because we want to write to one `~CollectionType.RUN` collection but read
    from several others (as well)::

        butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
                        collections=["u/alice/DM-50000/a",
                                     "u/bob/DM-49998",
                                     "HSC/defaults"])

    This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
    Datasets will be read first from that run (since it appears first in the
    chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.

    Finally, one can always create a `Butler` with no collections::

        butler = Butler("/path/to/repo", writeable=True)

    This can be extremely useful when you just want to use
    ``butler.registry``, e.g. for inserting dimension data or managing
    collections, or when the collections you want to use with the butler are
    not consistent. Passing ``writeable`` explicitly here is only necessary
    if you want to be able to make changes to the repo - usually the value
    for ``writeable`` can be guessed from the collection arguments provided,
    but it defaults to `False` when there are no collection arguments.
    """
    def __init__(self, config: Union[Config, str, None] = None, *,
                 butler: Optional[Butler] = None,
                 collections: Any = None,
                 run: Optional[str] = None,
                 searchPaths: Optional[List[str]] = None,
                 writeable: Optional[bool] = None,
                 inferDefaults: bool = True,
                 **kwargs: str,
                 ):
        defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
        # Load registry, datastore, etc. from config or existing butler.
        if butler is not None:
            if config is not None or searchPaths is not None or writeable is not None:
                raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
                                "arguments with 'butler' argument.")
            self.registry = butler.registry.copy(defaults)
            self.datastore = butler.datastore
            self.storageClasses = butler.storageClasses
            self._config: ButlerConfig = butler._config
        else:
            self._config = ButlerConfig(config, searchPaths=searchPaths)
            if "root" in self._config:
                butlerRoot = self._config["root"]
            else:
                butlerRoot = self._config.configDir
            if writeable is None:
                writeable = run is not None
            self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable,
                                                defaults=defaults)
            self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
                                                  butlerRoot=butlerRoot)
            self.storageClasses = StorageClassFactory()
            self.storageClasses.addFromConfig(self._config)
        if "run" in self._config or "collection" in self._config:
            raise ValueError("Passing a run or collection via configuration is no longer supported.")
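    # A minimal usage sketch for the ``butler`` argument (illustrative only;
    # the repository path and collection names are assumptions, not part of
    # this module):
    #
    #     base = Butler("/path/to/repo", writeable=True)
    #     # Reuse the same registry and datastore, but write to a new run.
    #     writer = Butler(butler=base, run="u/alice/DM-50000/a")
    #
    # Passing ``butler=`` avoids re-reading the configuration and
    # re-creating the registry and datastore connections.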

    GENERATION: ClassVar[int] = 3
    """This is a Generation 3 Butler.

    This attribute may be removed in the future, once the Generation 2 Butler
    interface has been fully retired; it should only be used in transitional
    code.
    """

    @staticmethod
    def makeRepo(root: str, config: Union[Config, str, None] = None,
                 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
                 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
                 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
        """Create an empty data repository by adding a butler.yaml config
        to a repository root directory.

        Parameters
        ----------
        root : `str` or `ButlerURI`
            Path or URI to the root location of the new repository. Will be
            created if it does not exist.
        config : `Config` or `str`, optional
            Configuration to write to the repository, after setting any
            root-dependent Registry or Datastore config options. Can not
            be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
            configuration will be used. Root-dependent config options
            specified in this config are overwritten if ``forceConfigRoot``
            is `True`.
        dimensionConfig : `Config` or `str`, optional
            Configuration for dimensions, will be used to initialize the
            registry database.
        standalone : `bool`
            If True, write all expanded defaults, not just customized or
            repository-specific settings.
            This (mostly) decouples the repository from the default
            configuration, insulating it from changes to the defaults (which
            may be good or bad, depending on the nature of the changes).
            Future *additions* to the defaults will still be picked up when
            initializing `Butlers` to repos created with ``standalone=True``.
        searchPaths : `list` of `str`, optional
            Directory paths to search when calculating the full butler
            configuration.
        forceConfigRoot : `bool`, optional
            If `False`, any values present in the supplied ``config`` that
            would normally be reset are not overridden and will appear
            directly in the output config. This allows non-standard overrides
            of the root directory for a datastore or registry to be given.
            If this parameter is `True` the values for ``root`` will be
            forced into the resulting config if appropriate.
        outfile : `str`, optional
            If not-`None`, the output configuration will be written to this
            location rather than into the repository itself. Can be a URI
            string. Can refer to a directory that will be used to write
            ``butler.yaml``.
        overwrite : `bool`, optional
            Create a new configuration file even if one already exists
            in the specified output location. Default is to raise
            an exception.

        Returns
        -------
        config : `Config`
            The updated `Config` instance written to the repo.

        Raises
        ------
        ValueError
            Raised if a ButlerConfig or ConfigSubset is passed instead of a
            regular Config (as these subclasses would make it impossible to
            support ``standalone=False``).
        FileExistsError
            Raised if the output config file already exists.
        os.error
            Raised if the directory does not exist, exists but is not a
            directory, or cannot be created.

        Notes
        -----
        Note that when ``standalone=False`` (the default), the configuration
        search path (see `ConfigSubset.defaultSearchPaths`) that was used to
        construct the repository should also be used to construct any Butlers
        to avoid configuration inconsistencies.
        """
        if isinstance(config, (ButlerConfig, ConfigSubset)):
            raise ValueError("makeRepo must be passed a regular Config without defaults applied.")

        # Ensure that the root of the repository exists or can be made
        uri = ButlerURI(root, forceDirectory=True)
        uri.mkdir()

        config = Config(config)

        # If we are creating a new repo from scratch with relative roots,
        # do not propagate an explicit root from the config file
        if "root" in config:
            del config["root"]

        full = ButlerConfig(config, searchPaths=searchPaths)  # this applies defaults
        datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"])
        datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)

        # if key exists in given config, parse it, otherwise parse the
        # defaults in the expanded config
        if config.get(("registry", "db")):
            registryConfig = RegistryConfig(config)
        else:
            registryConfig = RegistryConfig(full)
        defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
        if defaultDatabaseUri is not None:
            Config.updateParameters(RegistryConfig, config, full,
                                    toUpdate={"db": defaultDatabaseUri},
                                    overwrite=forceConfigRoot)
        else:
            Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
                                    overwrite=forceConfigRoot)

        if standalone:
            config.merge(full)
        else:
            # Always expand the registry.managers section into the per-repo
            # config, because after the database schema is created, it's not
            # allowed to change anymore. Note that in the standalone=True
            # branch, _everything_ in the config is expanded, so there's no
            # need to special case this.
            Config.updateParameters(RegistryConfig, config, full, toCopy=("managers",), overwrite=False)
        configURI: Union[str, ButlerURI]
        if outfile is not None:
            # When writing to a separate location we must include
            # the root of the butler repo in the config else it won't know
            # where to look.
            config["root"] = uri.geturl()
            configURI = outfile
        else:
            configURI = uri
        config.dumpToUri(configURI, overwrite=overwrite)

        # Create Registry and populate tables
        registryConfig = RegistryConfig(config.get("registry"))
        dimensionConfig = DimensionConfig(dimensionConfig)
        Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)

        return config
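    # Illustrative sketch of creating and then opening a new repository (the
    # path and the use of default configuration are assumptions for the
    # example, not requirements of this method):
    #
    #     config = Butler.makeRepo("/path/to/new/repo")
    #     butler = Butler("/path/to/new/repo", writeable=True)
    #
    # ``makeRepo`` only writes ``butler.yaml`` and creates the registry
    # schema; datasets are added afterwards with `put` or `ingest`.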

    @classmethod
    def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
                  defaultDataId: Dict[str, str], writeable: bool) -> Butler:
        """Callable used to unpickle a Butler.

        We prefer not to use ``Butler.__init__`` directly so we can force some
        of its many arguments to be keyword-only (note that ``__reduce__``
        can only invoke callables with positional arguments).

        Parameters
        ----------
        config : `ButlerConfig`
            Butler configuration, already coerced into a true `ButlerConfig`
            instance (and hence after any search paths for overrides have been
            utilized).
        collections : `CollectionSearch`
            Names of the default collections to read from.
        run : `str`, optional
            Name of the default `~CollectionType.RUN` collection to write to.
        defaultDataId : `dict` [ `str`, `str` ]
            Default data ID values.
        writeable : `bool`
            Whether the Butler should support write operations.

        Returns
        -------
        butler : `Butler`
            A new `Butler` instance.
        """
        # MyPy doesn't recognize that the kwargs below are totally valid; it
        # seems to think ``**defaultDataId`` is a _positional_ argument!
        return cls(config=config, collections=collections, run=run, writeable=writeable,
                   **defaultDataId)  # type: ignore

    def __reduce__(self) -> tuple:
        """Support pickling.
        """
        return (Butler._unpickle, (self._config, self.collections, self.run,
                                   self.registry.defaults.dataId.byName(),
                                   self.registry.isWriteable()))

    def __str__(self) -> str:
        return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
            self.collections, self.run, self.datastore, self.registry)

    def isWriteable(self) -> bool:
        """Return `True` if this `Butler` supports write operations.
        """
        return self.registry.isWriteable()

    @contextlib.contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager supporting `Butler` transactions.

        Transactions can be nested.
        """
        with self.registry.transaction():
            with self.datastore.transaction():
                yield
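    # Sketch of transactional use (the dataset type names and data ID keys
    # below are hypothetical):
    #
    #     with butler.transaction():
    #         butler.put(catalog, "src", visit=42, detector=5)
    #         butler.put(summary, "srcSummary", visit=42, detector=5)
    #
    # If either ``put`` raises, the registry entries and datastore writes
    # made inside the block are rolled back together.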

    def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                         dataId: Optional[DataId] = None, **kwds: Any
                         ) -> Tuple[DatasetType, Optional[DataId]]:
        """Standardize the arguments passed to several Butler APIs.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        datasetType : `DatasetType`
            A `DatasetType` instance extracted from ``datasetRefOrType``.
        dataId : `dict` or `DataId`, optional
            Argument that can be used (along with ``kwds``) to construct a
            `DataId`.

        Notes
        -----
        Butler APIs that conceptually need a DatasetRef also allow passing a
        `DatasetType` (or the name of one) and a `DataId` (or a dict and
        keyword arguments that can be used to construct one) separately. This
        method accepts those arguments and always returns a true `DatasetType`
        and a `DataId` or `dict`.

        Standardization of `dict` vs `DataId` is best handled by passing the
        returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
        generally similarly flexible.
        """
        externalDatasetType: Optional[DatasetType] = None
        internalDatasetType: Optional[DatasetType] = None
        if isinstance(datasetRefOrType, DatasetRef):
            if dataId is not None or kwds:
                raise ValueError("DatasetRef given, cannot use dataId as well")
            externalDatasetType = datasetRefOrType.datasetType
            dataId = datasetRefOrType.dataId
        else:
            # Don't check whether DataId is provided, because Registry APIs
            # can usually construct a better error message when it wasn't.
            if isinstance(datasetRefOrType, DatasetType):
                externalDatasetType = datasetRefOrType
            else:
                internalDatasetType = self.registry.getDatasetType(datasetRefOrType)

        # Check that they are self-consistent
        if externalDatasetType is not None:
            internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
            if externalDatasetType != internalDatasetType:
                raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
                                 f"registry definition ({internalDatasetType})")

        assert internalDatasetType is not None
        return internalDatasetType, dataId

    def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                        dataId: Optional[DataId] = None, *,
                        collections: Any = None,
                        allowUnresolved: bool = False,
                        **kwds: Any) -> DatasetRef:
        """Shared logic for methods that start with a search for a dataset in
        the registry.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        allowUnresolved : `bool`, optional
            If `True`, return an unresolved `DatasetRef` if finding a resolved
            one in the `Registry` fails. Defaults to `False`.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the dataset identified by the given arguments.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry` (and
            ``allowUnresolved is False``).
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
        if isinstance(datasetRefOrType, DatasetRef):
            idNumber = datasetRefOrType.id
        else:
            idNumber = None
        timespan: Optional[Timespan] = None

        # Process dimension records that are using record information
        # rather than ids
        newDataId: Dict[str, DataIdValue] = {}
        byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)

        # If all the dataId comes from keyword parameters we do not need to
        # do anything here because they can't be of the form exposure.obs_id,
        # since a "." is not allowed in a keyword parameter.
        if dataId:
            for k, v in dataId.items():
                # If we have a Dimension we do not need to do anything
                # because it cannot be a compound key.
                if isinstance(k, str) and "." in k:
                    # Someone is using a more human-readable dataId
                    dimensionName, record = k.split(".", 1)
                    byRecord[dimensionName][record] = v
                elif isinstance(k, Dimension):
                    newDataId[k.name] = v
                else:
                    newDataId[k] = v

        # Go through the updated dataId and check the type in case someone is
        # using an alternate key. We have already filtered out the compound
        # keys dimensions.record format.
        not_dimensions = {}

        # Will need to look in the dataId and the keyword arguments
        # and will remove them if they need to be fixed or are unrecognized.
        for dataIdDict in (newDataId, kwds):
            # Use a list so we can adjust the dict safely in the loop
            for dimensionName in list(dataIdDict):
                value = dataIdDict[dimensionName]
                try:
                    dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                except KeyError:
                    # This is not a real dimension
                    not_dimensions[dimensionName] = value
                    del dataIdDict[dimensionName]
                    continue

                # Convert an integral type to an explicit int to simplify
                # comparisons here
                if isinstance(value, numbers.Integral):
                    value = int(value)

                if not isinstance(value, dimension.primaryKey.getPythonType()):
                    for alternate in dimension.alternateKeys:
                        if isinstance(value, alternate.getPythonType()):
                            byRecord[dimensionName][alternate.name] = value
                            del dataIdDict[dimensionName]
                            log.debug("Converting dimension %s to %s.%s=%s",
                                      dimensionName, dimensionName, alternate.name, value)
                            break
                    else:
                        log.warning("Type mismatch found for value '%r' provided for dimension %s. "
                                    "Could not find matching alternative (primary key has type %s) "
                                    "so attempting to use as-is.",
                                    value, dimensionName, dimension.primaryKey.getPythonType())

        # If we have some unrecognized dimensions we have to try to connect
        # them to records in other dimensions. This is made more complicated
        # by some dimensions having records with clashing names. A mitigation
        # is that we can tell by this point which dimensions are missing
        # for the DatasetType but this does not work for calibrations
        # where additional dimensions can be used to constrain the temporal
        # axis.
        if not_dimensions:
            # Calculate missing dimensions
            provided = set(newDataId) | set(kwds) | set(byRecord)
            missingDimensions = datasetType.dimensions.names - provided

            # For calibrations we may well be needing temporal dimensions
            # so rather than always including all dimensions in the scan
            # restrict things a little. It is still possible for there
            # to be confusion over day_obs in visit vs exposure for example.
            # If we are not searching calibration collections things may
            # fail but they are going to fail anyway because of the
            # ambiguousness of the dataId...
            candidateDimensions: Set[str] = set()
            candidateDimensions.update(missingDimensions)
            if datasetType.isCalibration():
                for dim in self.registry.dimensions.getStaticDimensions():
                    if dim.temporal:
                        candidateDimensions.add(str(dim))

            # Look up table for the first association with a dimension
            guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)

            # Keep track of whether an item is associated with multiple
            # dimensions.
            counter: Counter[str] = Counter()
            assigned: Dict[str, Set[str]] = defaultdict(set)

            # Go through the missing dimensions and associate the
            # given names with records within those dimensions
            for dimensionName in candidateDimensions:
                dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                fields = dimension.metadata.names | dimension.uniqueKeys.names
                for field in not_dimensions:
                    if field in fields:
                        guessedAssociation[dimensionName][field] = not_dimensions[field]
                        counter[dimensionName] += 1
                        assigned[field].add(dimensionName)

            # There is a chance we have allocated a single dataId item
            # to multiple dimensions. Need to decide which should be retained.
            # For now assume that the most popular alternative wins.
            # This means that day_obs with seq_num will result in
            # exposure.day_obs and not visit.day_obs
            # Also prefer an explicitly missing dimension over an inferred
            # temporal dimension.
            for fieldName, assignedDimensions in assigned.items():
                if len(assignedDimensions) > 1:
                    # Pick the most popular (preferring mandatory dimensions)
                    requiredButMissing = assignedDimensions.intersection(missingDimensions)
                    if requiredButMissing:
                        candidateDimensions = requiredButMissing
                    else:
                        candidateDimensions = assignedDimensions

                    # Select the relevant items and get a new restricted
                    # counter.
                    theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
                    duplicatesCounter: Counter[str] = Counter()
                    duplicatesCounter.update(theseCounts)

                    # Choose the most common. If they are equally common
                    # we will pick the one that was found first.
                    # Returns a list of tuples
                    selected = duplicatesCounter.most_common(1)[0][0]

                    log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
                              " Removed ambiguity by choosing dimension %s.",
                              fieldName, ", ".join(assignedDimensions), selected)

                    for candidateDimension in assignedDimensions:
                        if candidateDimension != selected:
                            del guessedAssociation[candidateDimension][fieldName]

            # Update the record look up dict with the new associations
            for dimensionName, values in guessedAssociation.items():
                if values:  # A dict might now be empty
                    log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
                              dimensionName, values)
                    byRecord[dimensionName].update(values)

        if byRecord:
            # Some record specifiers were found so we need to convert
            # them to the Id form
            for dimensionName, values in byRecord.items():
                if dimensionName in newDataId:
                    log.warning("DataId specified explicit %s dimension value of %s in addition to"
                                " general record specifiers for it of %s. Ignoring record information.",
                                dimensionName, newDataId[dimensionName], str(values))
                    continue

                # Build up a WHERE expression -- use single quotes
                def quote(s: Any) -> str:
                    if isinstance(s, str):
                        return f"'{s}'"
                    else:
                        return s

                where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}"
                                     for k, v in values.items())

                # Hopefully we get a single record that matches
                records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
                                                                  where=where, **kwds))

                if len(records) != 1:
                    if len(records) > 1:
                        log.debug("Received %d records from constraints of %s", len(records), str(values))
                        for r in records:
                            log.debug("- %s", str(r))
                        raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
                                           f" uniquely constrained to a single dataset by {values}."
                                           f" Got {len(records)} results.")
                    raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
                                       f" records when constrained by {values}")

                # Get the primary key from the real dimension object
                dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                if not isinstance(dimension, Dimension):
                    raise RuntimeError(
                        f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
                    )
                newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)

            # We have modified the dataId so need to switch to it
            dataId = newDataId

        if datasetType.isCalibration():
            # Because this is a calibration dataset, first try to standardize
            # the data ID without restricting the dimensions to those of the
            # dataset type requested, because there may be extra dimensions
            # that provide temporal information for a validity-range lookup.
            dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions,
                                                defaults=self.registry.defaults.dataId, **kwds)
            if dataId.graph.temporal:
                dataId = self.registry.expandDataId(dataId)
                timespan = dataId.timespan
        else:
            # Standardize the data ID to just the dimensions of the dataset
            # type instead of letting registry.findDataset do it, so we get the
            # result even if no dataset is found.
            dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
                                                defaults=self.registry.defaults.dataId, **kwds)
        # Always lookup the DatasetRef, even if one is given, to ensure it is
        # present in the current collection.
        ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
        if ref is None:
            if allowUnresolved:
                return DatasetRef(datasetType, dataId)
            else:
                raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
                                  f"could not be found in collections {collections}.")
        if idNumber is not None and idNumber != ref.id:
            raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
                             f"id ({ref.id}) in registry in collections {collections}.")
        return ref

    @transactional
    def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
            dataId: Optional[DataId] = None, *,
            run: Optional[str] = None,
            **kwds: Any) -> DatasetRef:
        """Store and register a dataset.

        Parameters
        ----------
        obj : `object`
            The dataset.
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        run : `str`, optional
            The name of the run the dataset should be added to, overriding
            ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the stored dataset, updated with the correct id if
            given.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or if no run has been provided.
        """
        log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
        if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
            raise ValueError("DatasetRef must not be in registry, must have None id")

        # Add Registry Dataset entry.
        dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
        ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])

        # Add Datastore entry.
        self.datastore.put(obj, ref)

        return ref
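    # Sketch of a typical put (the dataset type name and data ID keys are
    # hypothetical and depend on the repository's registered dataset types):
    #
    #     butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
    #     ref = butler.put(exposure, "calexp", instrument="HSC",
    #                      visit=903334, detector=20)
    #
    # The returned DatasetRef is resolved (has an ``id``) and can be passed
    # straight to `getDirect` or stored for provenance.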

    def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
        """Retrieve a stored dataset.

        Unlike `Butler.get`, this method allows datasets outside the Butler's
        collection to be read as long as the `DatasetRef` that identifies them
        can be obtained separately.

        Parameters
        ----------
        ref : `DatasetRef`
            Resolved reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `object`
            The dataset.
        """
        return self.datastore.get(ref, parameters=parameters)

    def getDirectDeferred(self, ref: DatasetRef, *,
                          parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset
        from a resolved `DatasetRef`.

        Parameters
        ----------
        ref : `DatasetRef`
            Resolved reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id is None``, i.e. the reference is unresolved.
        """
        if ref.id is None:
            raise AmbiguousDatasetError(
                f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
            )
        return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)

    def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                    dataId: Optional[DataId] = None, *,
                    parameters: Union[dict, None] = None,
                    collections: Any = None,
                    **kwds: Any) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset
        after an immediate registry lookup.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry` (and
            ``allowUnresolved is False``).
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
        return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
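    # Sketch of deferred reading (the dataset type, data ID keys, and the
    # "bbox" read parameter are hypothetical):
    #
    #     handle = butler.getDeferred("calexp", instrument="HSC",
    #                                 visit=903334, detector=20)
    #     ...  # decide later whether and what to read
    #     cutout = handle.get(parameters={"bbox": bbox})
    #
    # The registry lookup happens immediately; the datastore read is delayed
    # until the handle's ``get`` is called.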

    def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
            dataId: Optional[DataId] = None, *,
            parameters: Optional[Dict[str, Any]] = None,
            collections: Any = None,
            **kwds: Any) -> Any:
        """Retrieve a stored dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        obj : `object`
            The dataset.

        Raises
        ------
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        LookupError
            Raised if no matching dataset exists in the `Registry`.
        TypeError
            Raised if no collections were provided.

        Notes
        -----
        When looking up datasets in a `~CollectionType.CALIBRATION`
        collection, this method requires that the given data ID include
        temporal dimensions beyond the dimensions of the dataset type itself,
        in order to find the dataset with the appropriate validity range.
        For example, a "bias" dataset with native dimensions
        ``{instrument, detector}`` could be fetched with a
        ``{instrument, detector, exposure}`` data ID, because ``exposure`` is
        a temporal dimension.
        """
        log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
        return self.getDirect(ref, parameters=parameters)
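    # Sketch of the flexible call forms accepted by get (names and values are
    # hypothetical; all three forms resolve to the same lookup):
    #
    #     calexp = butler.get("calexp", instrument="HSC", visit=903334,
    #                         detector=20)
    #     calexp = butler.get("calexp", {"instrument": "HSC",
    #                                    "visit": 903334, "detector": 20})
    #     calexp = butler.get(ref)  # a resolved DatasetRef, no dataId
    #
    # Per the Notes above, a calibration such as "bias" is looked up with an
    # extra temporal dimension, e.g.
    #     bias = butler.get("bias", instrument="HSC", exposure=903334,
    #                       detector=20)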

    def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                dataId: Optional[DataId] = None, *,
                predict: bool = False,
                collections: Any = None,
                run: Optional[str] = None,
                **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
        """Return the URIs associated with the dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        run : `str`, optional
            Run to use for predictions, overriding ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        primary : `ButlerURI`
            The URI to the primary artifact associated with this dataset.
            If the dataset was disassembled within the datastore this
            may be `None`.
        components : `dict`
            URIs to any components associated with the dataset artifact.
            Can be empty if there are no components.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
                                   collections=collections, **kwds)
        if ref.id is None:  # only possible if predict is True
            if run is None:
                run = self.run
                if run is None:
                    raise TypeError("Cannot predict location with run=None.")
            # Lie about ID, because we can't guess it, and only
            # Datastore.getURIs() will ever see it (and it doesn't use it).
            ref = ref.resolved(id=0, run=run)
        return self.datastore.getURIs(ref, predict)

    def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
               dataId: Optional[DataId] = None, *,
               predict: bool = False,
               collections: Any = None,
               run: Optional[str] = None,
               **kwds: Any) -> ButlerURI:
        """Return the URI to the Dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        run : `str`, optional
            Run to use for predictions, overriding ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        uri : `ButlerURI`
            URI pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        LookupError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        RuntimeError
            Raised if a URI is requested for a dataset that consists of
            multiple artifacts.
        """
        primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
                                           collections=collections, run=run, **kwds)

        if primary is None or components:
            raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
                               "Use Butler.getURIs() instead.")
        return primary
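    # Sketch contrasting getURI and getURIs (dataset type and data ID are
    # hypothetical):
    #
    #     uri = butler.getURI("calexp", instrument="HSC", visit=903334,
    #                         detector=20)
    #     # For a dataset the datastore has disassembled into components,
    #     # getURI raises RuntimeError; use getURIs instead:
    #     primary, components = butler.getURIs("calexp", instrument="HSC",
    #                                          visit=903334, detector=20)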

    def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                      dataId: Optional[DataId] = None, *,
                      collections: Any = None,
                      **kwds: Any) -> bool:
        """Return True if the Dataset is actually present in the Datastore.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Raises
        ------
        LookupError
            Raised if the dataset is not even present in the Registry.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
        return self.datastore.exists(ref)

    def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False) -> None:
        """Remove a collection and possibly prune datasets within it.

        Parameters
        ----------
        name : `str`
            Name of the collection to remove. If this is a
            `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
            datasets within the collection are not modified unless ``unstore``
            is `True`. If this is a `~CollectionType.RUN` collection,
            ``purge`` and ``unstore`` must be `True`, and all datasets in it
            are fully removed from the data repository.
        purge : `bool`, optional
            If `True`, permit `~CollectionType.RUN` collections to be removed,
            fully removing datasets within them. Requires ``unstore=True`` as
            well, as an added precaution against accidental deletion. Must be
            `False` (default) if the collection is not a ``RUN``.
        unstore : `bool`, optional
            If `True`, remove all datasets in the collection from all
            datastores in which they appear.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or arguments are mutually
            inconsistent.
        """

        # See pruneDatasets comments for more information about the logic
        # here; the cases are almost the same, but here we can rely on
        # Registry to take care of everything but Datastore deletion when we
        # remove the collection.
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        collectionType = self.registry.getCollectionType(name)
        if purge and not unstore:
            raise PurgeWithoutUnstorePruneCollectionsError()
        if collectionType is CollectionType.RUN and not purge:
            raise RunWithoutPurgePruneCollectionsError(collectionType)
        if collectionType is not CollectionType.RUN and purge:
            raise PurgeUnsupportedPruneCollectionsError(collectionType)

        with self.registry.transaction():
            if unstore:
                for ref in self.registry.queryDatasets(..., collections=name, findFirst=True):
                    if self.datastore.exists(ref):
                        self.datastore.trash(ref)
            self.registry.removeCollection(name)
        if unstore:
            # Point of no return for removing artifacts
            self.datastore.emptyTrash()
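    # Sketch of the supported argument combinations (collection names are
    # hypothetical):
    #
    #     # Remove a TAGGED or CHAINED collection, leaving datasets alone:
    #     butler.pruneCollection("u/alice/tagged")
    #     # Fully delete a RUN collection and its datasets:
    #     butler.pruneCollection("u/alice/DM-50000/a", purge=True,
    #                            unstore=True)
    #
    # Passing ``purge=True`` without ``unstore=True``, pruning a RUN without
    # ``purge=True``, or purging a non-RUN collection raises one of the
    # PruneCollectionsArgsError subclasses defined at the top of this module.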

    def pruneDatasets(self, refs: Iterable[DatasetRef], *,
                      disassociate: bool = True,
                      unstore: bool = False,
                      tags: Iterable[str] = (),
                      purge: bool = False,
                      run: Optional[str] = None) -> None:
        """Remove one or more datasets from a collection and/or storage.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to prune. These must be "resolved" references (not just
            a `DatasetType` and data ID).
        disassociate : `bool`, optional
            Disassociate pruned datasets from ``tags``, or from all
            collections if ``purge=True``.
        unstore : `bool`, optional
            If `True` (`False` is default) remove these datasets from all
            datastores known to this butler. Note that this will make it
            impossible to retrieve these datasets even via other collections.
            Datasets that are already not stored are ignored by this option.
        tags : `Iterable` [ `str` ], optional
            `~CollectionType.TAGGED` collections to disassociate the datasets
            from. Ignored if ``disassociate`` is `False` or ``purge`` is
            `True`.
        purge : `bool`, optional
            If `True` (`False` is default), completely remove the dataset from
            the `Registry`. To prevent accidental deletions, ``purge`` may
            only be `True` if all of the following conditions are met:

            - All given datasets are in the given run;
            - ``disassociate`` is `True`;
            - ``unstore`` is `True`.

            This mode may remove provenance information from datasets other
            than those provided, and should be used with extreme care.

        Raises
        ------
        TypeError
            Raised if the butler is read-only, if no collection was provided,
            or the conditions for ``purge=True`` were not met.
        """
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        if purge:
            if not disassociate:
                raise TypeError("Cannot pass purge=True without disassociate=True.")
            if not unstore:
                raise TypeError("Cannot pass purge=True without unstore=True.")
        elif disassociate:
            tags = tuple(tags)
            if not tags:
                raise TypeError("No tags provided but disassociate=True.")
            for tag in tags:
                collectionType = self.registry.getCollectionType(tag)
                if collectionType is not CollectionType.TAGGED:
                    raise TypeError(f"Cannot disassociate from collection '{tag}' "
                                    f"of non-TAGGED type {collectionType.name}.")
        # Transform possibly-single-pass iterable into something we can
        # iterate over multiple times.
        refs = list(refs)
        # Pruning a component of a DatasetRef makes no sense since registry
        # doesn't know about components and datastore might not store
        # components in a separate file
        for ref in refs:
            if ref.datasetType.component():
                raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
        # We don't need an unreliable Datastore transaction for this, because
        # we've been extra careful to ensure that Datastore.trash only involves
        # mutating the Registry (it can _look_ at Datastore-specific things,
        # but shouldn't change them), and hence all operations here are
        # Registry operations.
        with self.registry.transaction():
            if unstore:
                for ref in refs:
                    # There is a difference between a concrete composite
                    # and virtual composite. In a virtual composite the
                    # datastore is never given the top level DatasetRef. In
                    # the concrete composite the datastore knows all the
                    # refs and will clean up itself if asked to remove the
                    # parent ref. We can not check configuration for this
                    # since we can not trust that the configuration is the
                    # same. We therefore have to ask if the ref exists or
                    # not. This is consistent with the fact that we want
                    # to ignore already-removed-from-datastore datasets
                    # anyway.
                    if self.datastore.exists(ref):
                        self.datastore.trash(ref)
            if purge:
                self.registry.removeDatasets(refs)
            elif disassociate:
                assert tags, "Guaranteed by earlier logic in this function."
                for tag in tags:
                    self.registry.disassociate(tag, refs)
        # We've exited the Registry transaction, and apparently committed.
        # (if there was an exception, everything rolled back, and it's as if
        # nothing happened - and we never get here).
        # Datastore artifacts are not yet gone, but they're clearly marked
        # as trash, so if we fail to delete now because of (e.g.) filesystem
        # problems we can try again later, and if manual administrative
        # intervention is required, it's pretty clear what that should entail:
        # deleting everything on disk and in private Datastore tables that is
        # in the dataset_location_trash table.
        if unstore:
            # Point of no return for removing artifacts
            self.datastore.emptyTrash()
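    # Sketch of pruning the datasets matched by a registry query (the query
    # and collection names are hypothetical):
    #
    #     refs = butler.registry.queryDatasets(
    #         "calexp", collections="u/alice/DM-50000/a")
    #     butler.pruneDatasets(refs, disassociate=True, unstore=True,
    #                          purge=True, run="u/alice/DM-50000/a")
    #
    # With ``unstore=True`` the artifacts are first moved to the datastore
    # trash and only deleted after the registry transaction commits.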

1319 @transactional 

1320 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None, 

1321 ) -> None: 

1322 """Store and register one or more datasets that already exist on disk. 

1323 

1324 Parameters 

1325 ---------- 

1326 datasets : `FileDataset` 

1327 Each positional argument is a struct containing information about 

1328 a file to be ingested, including its path (either absolute or 

1329 relative to the datastore root, if applicable), a `DatasetRef`, 

1330 and optionally a formatter class or its fully-qualified string 

1331 name. If a formatter is not provided, the formatter that would be 

1332 used for `put` is assumed. On successful return, all 

1333 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1334 attribute populated and all `FileDataset.formatter` attributes will 

1335 be set to the formatter class used. `FileDataset.path` attributes 

1336 may be modified to put paths in whatever the datastore considers a 

1337 standardized form. 

1338 transfer : `str`, optional 

1339 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1340 'hardlink', 'relsymlink' or 'symlink', indicating how to transfer 

1341 the file. 

1342 run : `str`, optional 

1343 The name of the run ingested datasets should be added to, 

1344 overriding ``self.run``. 

1345 

1346 Raises 

1347 ------ 

1348 TypeError 

1349 Raised if the butler is read-only or if no run was provided. 

1350 NotImplementedError 

1351 Raised if the `Datastore` does not support the given transfer mode. 

1352 DatasetTypeNotSupportedError 

1353 Raised if one or more files to be ingested have a dataset type that 

1354 is not supported by the `Datastore`. 

1355 FileNotFoundError 

1356 Raised if one of the given files does not exist. 

1357 FileExistsError 

1358 Raised if transfer is not `None` but the (internal) location the 

1359 file would be moved to is already occupied. 

1360 

1361 Notes 

1362 ----- 

1363 This operation is not fully exception safe: if a database operation 

1364 fails, the given `FileDataset` instances may be only partially updated. 

1365 

1366 It is atomic in terms of database operations (they will either all 

1367 succeed or all fail) providing the database engine implements 

1368 transactions correctly. It will attempt to be atomic in terms of 

1369 filesystem operations as well, but this cannot be implemented 

1370 rigorously for most datastores. 
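
Examples 
-------- 
A minimal sketch, assuming ``ref`` is an unresolved `DatasetRef` whose 
dataset type and data ID describe the file; the path, transfer mode, and 
run name below are illustrative:: 

    butler.ingest(FileDataset(path="data/raw_42.fits", refs=[ref]), 
                  transfer="copy", run="ingest/run") 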

1371 """ 

1372 if not self.isWriteable(): 

1373 raise TypeError("Butler is read-only.") 

1374 # Reorganize the inputs so they're grouped by DatasetType and then 

1375 # data ID. We also include a list of DatasetRefs for each FileDataset 

1376 # to hold the resolved DatasetRefs returned by the Registry, before 

1377 # it's safe to swap them into FileDataset.refs. 

1378 # Some type annotation aliases to make that clearer: 

1379 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1380 GroupedData = MutableMapping[DatasetType, GroupForType] 

1381 # The actual data structure: 

1382 groupedData: GroupedData = defaultdict(dict) 
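# Illustrative shape of the populated structure: 
#     {datasetType: {dataId: (fileDataset, resolvedRefs)}} 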

1383 # And the nested loop that populates it: 

1384 for dataset in datasets: 

1385 # This list is intentionally shared across the inner loop, since it's 

1386 # associated with `dataset`. 

1387 resolvedRefs: List[DatasetRef] = [] 

1388 for ref in dataset.refs: 

1389 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1390 

1391 # Now we can bulk-insert into Registry for each DatasetType. 

1392 allResolvedRefs: List[DatasetRef] = [] 

1393 for datasetType, groupForType in groupedData.items(): 

1394 refs = self.registry.insertDatasets(datasetType, 

1395 dataIds=groupForType.keys(), 

1396 run=run) 

1397 # Append those resolved DatasetRefs to the new lists we set up for 

1398 # them. 

1399 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1400 resolvedRefs.append(ref) 

1401 

1402 # Go back to the original FileDatasets to replace their refs with the 

1403 # new resolved ones, and also build a big list of all refs. 


1405 for groupForType in groupedData.values(): 

1406 for dataset, resolvedRefs in groupForType.values(): 

1407 dataset.refs = resolvedRefs 

1408 allResolvedRefs.extend(resolvedRefs) 

1409 

1410 # Bulk-insert everything into Datastore. 

1411 self.datastore.ingest(*datasets, transfer=transfer) 

1412 

1413 @contextlib.contextmanager 

1414 def export(self, *, directory: Optional[str] = None, 

1415 filename: Optional[str] = None, 

1416 format: Optional[str] = None, 

1417 transfer: Optional[str] = None) -> Iterator[RepoExportContext]: 

1418 """Export datasets from the repository represented by this `Butler`. 

1419 

1420 This method is a context manager that returns a helper object 

1421 (`RepoExportContext`) that is used to indicate what information from 

1422 the repository should be exported. 

1423 

1424 Parameters 

1425 ---------- 

1426 directory : `str`, optional 

1427 Directory dataset files should be written to if ``transfer`` is not 

1428 `None`. 

1429 filename : `str`, optional 

1430 Name for the file that will include database information associated 

1431 with the exported datasets. If this is not an absolute path and 

1432 ``directory`` is not `None`, it will be written to ``directory`` 

1433 instead of the current working directory. Defaults to 

1434 "export.{format}". 

1435 format : `str`, optional 

1436 File format for the database information file. If `None`, the 

1437 extension of ``filename`` will be used. 

1438 transfer : `str`, optional 

1439 Transfer mode passed to `Datastore.export`. 

1440 

1441 Raises 

1442 ------ 

1443 TypeError 

1444 Raised if the set of arguments passed is inconsistent. 

1445 

1446 Examples 

1447 -------- 

1448 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1449 methods are used to provide the iterables over data IDs and/or datasets 

1450 to be exported:: 

1451 

1452 with butler.export(filename="exports.yaml") as export: 

1453 # Export all flats, but none of the dimension element rows 

1454 # (i.e. data ID information) associated with them. 

1455 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1456 elements=()) 

1457 # Export all datasets that start with "deepCoadd_" and all of 

1458 # their associated data ID information. 

1459 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

1460 """ 

1461 if directory is None and transfer is not None: 

1462 raise TypeError("Cannot transfer without providing a directory.") 

1463 if transfer == "move": 

1464 raise TypeError("Transfer may not be 'move': export is read-only") 

1465 if format is None: 

1466 if filename is None: 

1467 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1468 else: 

1469 _, format = os.path.splitext(filename) 

1470 elif filename is None: 

1471 filename = f"export.{format}" 

1472 if directory is not None: 

1473 filename = os.path.join(directory, filename) 

1474 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"]) 

1475 with open(filename, 'w') as stream: 

1476 backend = BackendClass(stream) 

1477 try: 

1478 helper = RepoExportContext(self.registry, self.datastore, backend=backend, 

1479 directory=directory, transfer=transfer) 

1480 yield helper 

1481 except BaseException: 

1482 raise 

1483 else: 

1484 helper._finish() 

1485 

1486 def import_(self, *, directory: Optional[str] = None, 

1487 filename: Union[str, TextIO, None] = None, 

1488 format: Optional[str] = None, 

1489 transfer: Optional[str] = None, 

1490 skip_dimensions: Optional[Set] = None) -> None: 

1491 """Import datasets into this repository that were exported from a 

1492 different butler repository via `~lsst.daf.butler.Butler.export`. 

1493 

1494 Parameters 

1495 ---------- 

1496 directory : `str`, optional 

1497 Directory containing dataset files to import from. If `None`, 

1498 ``filename`` and all dataset file paths specified therein must 

1499 be absolute. 

1500 filename : `str` or `TextIO`, optional 

1501 A stream or name of file that contains database information 

1502 associated with the exported datasets, typically generated by 

1503 `~lsst.daf.butler.Butler.export`. If this is a string (name) and 

1504 is not an absolute path, does not exist in the current working 

1505 directory, and ``directory`` is not `None`, it is assumed to be in 

1506 ``directory``. Defaults to "export.{format}". 

1507 format : `str`, optional 

1508 File format for ``filename``. If `None`, the extension of 

1509 ``filename`` will be used. 

1510 transfer : `str`, optional 

1511 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

1512 skip_dimensions : `set`, optional 

1513 Names of dimensions that should be skipped and not imported. 

1514 

1515 Raises 

1516 ------ 

1517 TypeError 

1518 Raised if the set of arguments passed is inconsistent, or if the 

1519 butler is read-only. 
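
Examples 
-------- 
A minimal sketch, assuming an export file previously written by 
`~lsst.daf.butler.Butler.export`; the directory and file names are 
illustrative:: 

    butler.import_(directory="/path/to/exported/files", 
                   filename="export.yaml", transfer="copy") 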

1520 """ 

1521 if not self.isWriteable(): 

1522 raise TypeError("Butler is read-only.") 

1523 if format is None: 

1524 if filename is None: 

1525 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1526 else: 

1527 _, format = os.path.splitext(filename) # type: ignore 

1528 elif filename is None: 

1529 filename = f"export.{format}" 

1530 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

1531 filename = os.path.join(directory, filename) 

1532 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"]) 

1533 

1534 def doImport(importStream: TextIO) -> None: 

1535 backend = BackendClass(importStream, self.registry) 

1536 backend.register() 

1537 with self.transaction(): 

1538 backend.load(self.datastore, directory=directory, transfer=transfer, 

1539 skip_dimensions=skip_dimensions) 

1540 

1541 if isinstance(filename, str): 

1542 with open(filename, "r") as stream: 

1543 doImport(stream) 

1544 else: 

1545 doImport(filename) 

1546 

1547 def validateConfiguration(self, logFailures: bool = False, 

1548 datasetTypeNames: Optional[Iterable[str]] = None, 

1549 ignore: Optional[Iterable[str]] = None) -> None: 

1550 """Validate butler configuration. 

1551 

1552 Checks that each `DatasetType` can be stored in the `Datastore`. 

1553 

1554 Parameters 

1555 ---------- 

1556 logFailures : `bool`, optional 

1557 If `True`, output a log message for every validation error 

1558 detected. 

1559 datasetTypeNames : iterable of `str`, optional 

1560 The `DatasetType` names that should be checked. This allows 

1561 only a subset to be selected. 

1562 ignore : iterable of `str`, optional 

1563 Names of DatasetTypes to skip over. This can be used to skip 

1564 known problems. If a named `DatasetType` corresponds to a 

1565 composite, all components of that `DatasetType` will also be 

1566 ignored. 

1567 

1568 Raises 

1569 ------ 

1570 ButlerValidationError 

1571 Raised if there is some inconsistency with how this Butler 

1572 is configured. 
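
Examples 
-------- 
A sketch of a typical check that logs every problem and skips a 
known-problematic dataset type (the name is an illustrative assumption):: 

    butler.validateConfiguration(logFailures=True, 
                                 ignore=["unvalidated_raw"]) 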

1573 """ 

1574 if datasetTypeNames: 

1575 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

1576 else: 

1577 datasetTypes = list(self.registry.queryDatasetTypes()) 

1578 

1579 # filter out anything from the ignore list 

1580 if ignore: 

1581 ignore = set(ignore) 

1582 datasetTypes = [e for e in datasetTypes 

1583 if e.name not in ignore and e.nameAndComponent()[0] not in ignore] 

1584 else: 

1585 ignore = set() 

1586 

1587 # Find all the registered instruments 

1588 instruments = set( 

1589 record.name for record in self.registry.queryDimensionRecords("instrument") 

1590 ) 

1591 

1592 # For each datasetType that has an instrument dimension, create 

1593 # a DatasetRef for each defined instrument 

1594 datasetRefs = [] 

1595 

1596 for datasetType in datasetTypes: 

1597 if "instrument" in datasetType.dimensions: 

1598 for instrument in instruments: 

1599 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore 

1600 conform=False) 

1601 datasetRefs.append(datasetRef) 

1602 

1603 entities: List[Union[DatasetType, DatasetRef]] = [] 

1604 entities.extend(datasetTypes) 

1605 entities.extend(datasetRefs) 

1606 

1607 datastoreErrorStr = None 

1608 try: 

1609 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

1610 except ValidationError as e: 

1611 datastoreErrorStr = str(e) 

1612 

1613 # Also check that the LookupKeys used by the datastores match 

1614 # registry and storage class definitions 

1615 keys = self.datastore.getLookupKeys() 

1616 

1617 failedNames = set() 

1618 failedDataId = set() 

1619 for key in keys: 

1620 if key.name is not None: 

1621 if key.name in ignore: 

1622 continue 

1623 

1624 # skip if specific datasetType names were requested and this 

1625 # name does not match 

1626 if datasetTypeNames and key.name not in datasetTypeNames: 

1627 continue 

1628 

1629 # See if it is a StorageClass or a DatasetType 

1630 if key.name in self.storageClasses: 

1631 pass 

1632 else: 

1633 try: 

1634 self.registry.getDatasetType(key.name) 

1635 except KeyError: 

1636 if logFailures: 

1637 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

1638 failedNames.add(key) 

1639 else: 

1640 # Dimensions are checked for consistency when the Butler 

1641 # is created and rendezvoused with a universe. 

1642 pass 

1643 

1644 # Check that any data ID override is valid. Currently only the 

1645 # "instrument" dimension is supported, so check for that. 

1646 if key.dataId: 

1647 dataIdKeys = set(key.dataId) 

1648 if set(["instrument"]) != dataIdKeys: 

1649 if logFailures: 

1650 log.critical("Key '%s' has unsupported DataId override", key) 

1651 failedDataId.add(key) 

1652 elif key.dataId["instrument"] not in instruments: 

1653 if logFailures: 

1654 log.critical("Key '%s' has unknown instrument", key) 

1655 failedDataId.add(key) 

1656 

1657 messages = [] 

1658 

1659 if datastoreErrorStr: 

1660 messages.append(datastoreErrorStr) 

1661 

1662 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

1663 (failedDataId, "Keys with bad DataId entries: ")): 

1664 if failed: 

1665 msg += ", ".join(str(k) for k in failed) 

1666 messages.append(msg) 

1667 

1668 if messages: 

1669 raise ValidationError(";\n".join(messages)) 

1670 

1671 @property 

1672 def collections(self) -> CollectionSearch: 

1673 """The collections to search by default, in order (`CollectionSearch`). 

1674 

1675 This is an alias for ``self.registry.defaults.collections``. It cannot 

1676 be set directly in isolation, but all defaults may be changed together 

1677 by assigning a new `RegistryDefaults` instance to 

1678 ``self.registry.defaults``. 
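
For example (a sketch; the collection and run names are illustrative and 
the `RegistryDefaults` constructor arguments are assumed):: 

    from lsst.daf.butler.registry import RegistryDefaults 

    butler.registry.defaults = RegistryDefaults(collections=["HSC/defaults"], 
                                                run="u/alice/processing") 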

1679 """ 

1680 return self.registry.defaults.collections 

1681 

1682 @property 

1683 def run(self) -> Optional[str]: 

1684 """Name of the run this butler writes outputs to by default (`str` or 

1685 `None`). 

1686 

1687 This is an alias for ``self.registry.defaults.run``. It cannot be set 

1688 directly in isolation, but all defaults may be changed together by 

1689 assigning a new `RegistryDefaults` instance to 

1690 ``self.registry.defaults``. 

1691 """ 

1692 return self.registry.defaults.run 

1693 

1694 registry: Registry 

1695 """The object that manages dataset metadata and relationships (`Registry`). 

1696 

1697 Most operations that don't involve reading or writing butler datasets are 

1698 accessible only via `Registry` methods. 

1699 """ 

1700 

1701 datastore: Datastore 

1702 """The object that manages actual dataset storage (`Datastore`). 

1703 

1704 Direct user access to the datastore should rarely be necessary; the primary 

1705 exception is the case where a `Datastore` implementation provides extra 

1706 functionality beyond what the base class defines. 

1707 """ 

1708 

1709 storageClasses: StorageClassFactory 

1710 """An object that maps known storage class names to objects that fully 

1711 describe them (`StorageClassFactory`). 

1712 """