
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36 

37from collections import defaultdict 

38import contextlib 

39import logging 

40import os 

41from typing import ( 

42 Any, 

43 ClassVar, 

44 ContextManager, 

45 Dict, 

46 Iterable, 

47 List, 

48 Mapping, 

49 MutableMapping, 

50 Optional, 

51 Set, 

52 TextIO, 

53 Tuple, 

54 Union, 

55) 

56 

57try: 

58 import boto3 

59except ImportError: 

60 boto3 = None 

61 

62from lsst.utils import doImport 

63from .core import ( 

64 AmbiguousDatasetError, 

65 ButlerURI, 

66 Config, 

67 ConfigSubset, 

68 DataCoordinate, 

69 DataId, 

70 DatasetRef, 

71 DatasetType, 

72 Datastore, 

73 DimensionConfig, 

74 FileDataset, 

75 StorageClassFactory, 

76 Timespan, 

77 ValidationError, 

78) 

79from .core.repoRelocation import BUTLER_ROOT_TAG 

80from .core.utils import transactional, getClassOf 

81from ._deferredDatasetHandle import DeferredDatasetHandle 

82from ._butlerConfig import ButlerConfig 

83from .registry import Registry, RegistryConfig, CollectionType 

84from .registry.wildcards import CollectionSearch 

85from .transfers import RepoExportContext 

86 

87log = logging.getLogger(__name__) 

88 

89 

90class ButlerValidationError(ValidationError): 

91 """There is a problem with the Butler configuration.""" 

92 pass 

93 

94 

95class PruneCollectionsArgsError(TypeError): 

96 """Base class for errors relating to Butler.pruneCollections input 

97 arguments. 

98 """ 

99 pass 

100 

101 

102class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

103 """Raised when purge and unstore are both required to be True, and 

104 purge is True but unstore is False. 

105 """ 

106 

107 def __init__(self): 

108 super().__init__("Cannot pass purge=True without unstore=True.") 

109 

110 

111class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

112 """Raised when pruning a RUN collection but purge is False.""" 

113 

114 def __init__(self, collectionType): 

115 self.collectionType = collectionType 

116 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

117 

118 

119class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

120 """Raised when purge is True but is not supported for the given 

121 collection.""" 

122 

123 def __init__(self, collectionType): 

124 self.collectionType = collectionType 

125 super().__init__( 

126 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.") 

127 

128 

129class Butler: 

130 """Main entry point for the data access system. 

131 

132 Parameters 

133 ---------- 

134 config : `ButlerConfig`, `Config` or `str`, optional

135 Configuration. Anything acceptable to the 

136 `ButlerConfig` constructor. If a directory path 

137 is given the configuration will be read from a ``butler.yaml`` file in 

138 that location. If `None` is given default values will be used. 

139 butler : `Butler`, optional

140 If provided, construct a new Butler that uses the same registry and 

141 datastore as the given one, but with the given collection and run. 

142 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

143 arguments. 

144 collections : `Any`, optional 

145 An expression specifying the collections to be searched (in order) when 

146 reading datasets, and optionally dataset type restrictions on them. 

147 This may be: 

148 - a `str` collection name; 

149 - a tuple of (collection name, *dataset type restriction*); 

150 - an iterable of either of the above; 

151 - a mapping from `str` to *dataset type restriction*. 

152 

153 See :ref:`daf_butler_collection_expressions` for more information, 

154 including the definition of a *dataset type restriction*. All 

155 collections must either already exist or be specified to be created 

156 by other arguments. 

157 run : `str`, optional 

158 Name of the run datasets should be output to. If the run 

159 does not exist, it will be created. If ``collections`` is `None`, it 

160 will be set to ``[run]``. If this is not set (and ``writeable`` is 

161 not set either), a read-only butler will be created. 

162 tags : `Iterable` [ `str` ], optional 

163 A list of `~CollectionType.TAGGED` collections that datasets should be 

164 associated with in `put` or `ingest` and disassociated from in 

165 `pruneDatasets`. If any of these collections does not exist, it will 

166 be created. 

167 chains : `Mapping` [ `str`, `Iterable` [ `str` ] ], optional 

168 A mapping from the names of new `~CollectionType.CHAINED` collections 

169 to an expression identifying their child collections (which takes the 

170 same form as the ``collections`` argument). Chains may be nested only

171 if children precede their parents in this mapping. 

172 searchPaths : `list` of `str`, optional 

173 Directory paths to search when calculating the full Butler 

174 configuration. Not used if the supplied config is already a 

175 `ButlerConfig`. 

176 writeable : `bool`, optional 

177 Explicitly sets whether the butler supports write operations. If not 

178 provided, a read-write butler is created if any of ``run``, ``tags``, 

179 or ``chains`` is non-empty. 

180 

181 Examples 

182 -------- 

183 While there are many ways to control exactly how a `Butler` interacts with 

184 the collections in its `Registry`, the most common cases are still simple. 

185 

186 For a read-only `Butler` that searches one collection, do:: 

187 

188 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

189 

190 For a read-write `Butler` that writes to and reads from a 

191 `~CollectionType.RUN` collection:: 

192 

193 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

194 

195 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

196 because we want to write to one `~CollectionType.RUN` collection but read 

197 from several others (as well), while defining a new 

198 `~CollectionType.CHAINED` collection that combines them all:: 

199 

200 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

201 collections=["u/alice/DM-50000"], 

202 chains={ 

203 "u/alice/DM-50000": ["u/alice/DM-50000/a", 

204 "u/bob/DM-49998", 

205 "raw/hsc"] 

206 }) 

207 

208 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``, but 

209 they'll also be available from the chained collection ``u/alice/DM-50000``. 

210 Datasets will be read first from that run (since it appears first in the 

211 chain), and then from ``u/bob/DM-49998`` and finally ``raw/hsc``. 

212 If ``u/alice/DM-50000`` had already been defined, the ``chains`` argument

213 would be unnecessary. We could also construct a butler that performs 

214 exactly the same `put` and `get` operations without actually creating a 

215 chained collection, just by passing multiple items in ``collections``::

216 

217 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

218 collections=["u/alice/DM-50000/a", 

219 "u/bob/DM-49998", 

220 "raw/hsc"]) 

221 

222 Finally, one can always create a `Butler` with no collections:: 

223 

224 butler = Butler("/path/to/repo", writeable=True) 

225 

226 This can be extremely useful when you just want to use ``butler.registry``, 

227 e.g. for inserting dimension data or managing collections, or when the 

228 collections you want to use with the butler are not consistent. 

229 Passing ``writeable`` explicitly here is only necessary if you want to be

230 able to make changes to the repo. Usually the value for ``writeable``

231 can be guessed from the collection arguments provided, but it defaults to

232 `False` when no collection arguments are given.

233 """ 

234 def __init__(self, config: Union[Config, str, None] = None, *, 

235 butler: Optional[Butler] = None, 

236 collections: Any = None, 

237 run: Optional[str] = None, 

238 tags: Iterable[str] = (), 

239 chains: Optional[Mapping[str, Any]] = None, 

240 searchPaths: Optional[List[str]] = None, 

241 writeable: Optional[bool] = None, 

242 ): 

243 # Transform any single-pass iterator into an actual sequence so we 

244 # can see if it's empty

245 self.tags = tuple(tags) 

246 # Load registry, datastore, etc. from config or existing butler. 

247 if butler is not None: 

248 if config is not None or searchPaths is not None or writeable is not None: 

249 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' " 

250 "arguments with 'butler' argument.") 

251 self.registry = butler.registry 

252 self.datastore = butler.datastore 

253 self.storageClasses = butler.storageClasses 

254 self._config = butler._config 

255 else: 

256 self._config = ButlerConfig(config, searchPaths=searchPaths) 

257 if "root" in self._config: 

258 butlerRoot = self._config["root"] 

259 else: 

260 butlerRoot = self._config.configDir 

261 if writeable is None: 

262 writeable = run is not None or chains is not None or self.tags 

263 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable) 

264 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(), 

265 butlerRoot=butlerRoot) 

266 self.storageClasses = StorageClassFactory() 

267 self.storageClasses.addFromConfig(self._config) 

268 # Check the many collection arguments for consistency and create any 

269 # needed collections that don't exist. 

270 if collections is None: 

271 if run is not None: 

272 collections = (run,) 

273 else: 

274 collections = () 

275 self.collections = CollectionSearch.fromExpression(collections) 

276 if chains is None: 

277 chains = {} 

278 self.run = run 

279 if "run" in self._config or "collection" in self._config: 

280 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

281 if self.run is not None: 

282 self.registry.registerCollection(self.run, type=CollectionType.RUN) 

283 for tag in self.tags: 

284 self.registry.registerCollection(tag, type=CollectionType.TAGGED) 

285 for parent, children in chains.items(): 

286 self.registry.registerCollection(parent, type=CollectionType.CHAINED) 

287 self.registry.setCollectionChain(parent, children) 

288 

289 GENERATION: ClassVar[int] = 3 

290 """This is a Generation 3 Butler. 

291 

292 This attribute may be removed in the future, once the Generation 2 Butler 

293 interface has been fully retired; it should only be used in transitional 

294 code. 

295 """ 

296 

297 @staticmethod 

298 def makeRepo(root: str, config: Union[Config, str, None] = None, 

299 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False, 

300 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True, 

301 outfile: Optional[str] = None, overwrite: bool = False) -> Config: 

302 """Create an empty data repository by adding a butler.yaml config 

303 to a repository root directory. 

304 

305 Parameters 

306 ---------- 

307 root : `str` or `ButlerURI` 

308 Path or URI to the root location of the new repository. Will be 

309 created if it does not exist. 

310 config : `Config` or `str`, optional 

311 Configuration to write to the repository, after setting any 

312 root-dependent Registry or Datastore config options. Can not 

313 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

314 configuration will be used. Root-dependent config options 

315 specified in this config are overwritten if ``forceConfigRoot`` 

316 is `True`. 

317 dimensionConfig : `Config` or `str`, optional 

318 Configuration for dimensions, will be used to initialize registry 

319 database. 

320 standalone : `bool` 

321 If True, write all expanded defaults, not just customized or 

322 repository-specific settings. 

323 This (mostly) decouples the repository from the default 

324 configuration, insulating it from changes to the defaults (which 

325 may be good or bad, depending on the nature of the changes). 

326 Future *additions* to the defaults will still be picked up when 

327 initializing a `Butler` for repos created with ``standalone=True``.

328 searchPaths : `list` of `str`, optional 

329 Directory paths to search when calculating the full butler 

330 configuration. 

331 forceConfigRoot : `bool`, optional 

332 If `False`, any values present in the supplied ``config`` that 

333 would normally be reset are not overridden and will appear 

334 directly in the output config. This allows non-standard overrides 

335 of the root directory for a datastore or registry to be given. 

336 If this parameter is `True` the values for ``root`` will be 

337 forced into the resulting config if appropriate. 

338 outfile : `str`, optional 

339 If not-`None`, the output configuration will be written to this 

340 location rather than into the repository itself. Can be a URI 

341 string. Can refer to a directory that will be used to write 

342 ``butler.yaml``. 

343 overwrite : `bool`, optional 

344 Create a new configuration file even if one already exists 

345 in the specified output location. Default is to raise 

346 an exception. 

347 

348 Returns 

349 ------- 

350 config : `Config` 

351 The updated `Config` instance written to the repo. 

352 

353 Raises 

354 ------ 

355 ValueError 

356 Raised if a `ButlerConfig` or `ConfigSubset` is passed instead of a

357 regular `Config` (as these subclasses would make it impossible to

358 support ``standalone=False``). 

359 FileExistsError 

360 Raised if the output config file already exists. 

361 os.error 

362 Raised if the directory does not exist, exists but is not a 

363 directory, or cannot be created. 

364 

365 Notes 

366 ----- 

367 Note that when ``standalone=False`` (the default), the configuration 

368 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

369 construct the repository should also be used to construct any Butlers 

370 to avoid configuration inconsistencies. 
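
        Examples
        --------
        A minimal sketch of creating a repository and then constructing a
        `Butler` against it (the path and run name below are hypothetical)::

            Butler.makeRepo("/path/to/repo")
            butler = Butler("/path/to/repo", run="u/alice/ingest")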

371 """ 

372 if isinstance(config, (ButlerConfig, ConfigSubset)): 

373 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

374 

375 # Ensure that the root of the repository exists or can be made 

376 uri = ButlerURI(root, forceDirectory=True) 

377 uri.mkdir() 

378 

379 config = Config(config) 

380 

381 # If we are creating a new repo from scratch with relative roots, 

382 # do not propagate an explicit root from the config file 

383 if "root" in config: 

384 del config["root"] 

385 

386 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

387 datastoreClass = doImport(full["datastore", "cls"]) 

388 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

389 

390 # if key exists in given config, parse it, otherwise parse the defaults 

391 # in the expanded config 

392 if config.get(("registry", "db")): 

393 registryConfig = RegistryConfig(config) 

394 else: 

395 registryConfig = RegistryConfig(full) 

396 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

397 if defaultDatabaseUri is not None: 

398 Config.updateParameters(RegistryConfig, config, full, 

399 toUpdate={"db": defaultDatabaseUri}, 

400 overwrite=forceConfigRoot) 

401 else: 

402 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), 

403 overwrite=forceConfigRoot) 

404 

405 if standalone: 

406 config.merge(full) 

407 else: 

408 # Always expand the registry.managers section into the per-repo 

409 # config, because after the database schema is created, it's not 

410 # allowed to change anymore. Note that in the standalone=True 

411 # branch, _everything_ in the config is expanded, so there's no 

412 # need to special case this. 

413 Config.updateParameters(RegistryConfig, config, full, toCopy=("managers",), overwrite=False) 

414 if outfile is not None: 

415 # When writing to a separate location we must include 

416 # the root of the butler repo in the config else it won't know 

417 # where to look. 

418 config["root"] = uri.geturl() 

419 configURI = outfile 

420 else: 

421 configURI = uri 

422 config.dumpToUri(configURI, overwrite=overwrite) 

423 

424 # Create Registry and populate tables 

425 registryConfig = RegistryConfig(config.get("registry")) 

426 dimensionConfig = DimensionConfig(dimensionConfig) 

427 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root) 

428 

429 return config 

430 

431 @classmethod 

432 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str], 

433 tags: Tuple[str, ...], writeable: bool) -> Butler: 

434 """Callable used to unpickle a Butler. 

435 

436 We prefer not to use ``Butler.__init__`` directly so we can force some 

437 of its many arguments to be keyword-only (note that ``__reduce__`` 

438 can only invoke callables with positional arguments). 

439 

440 Parameters 

441 ---------- 

442 config : `ButlerConfig` 

443 Butler configuration, already coerced into a true `ButlerConfig` 

444 instance (and hence after any search paths for overrides have been 

445 utilized). 

446 collections : `CollectionSearch` 

447 Names of collections to read from. 

448 run : `str`, optional 

449 Name of `~CollectionType.RUN` collection to write to. 

450 tags : `tuple` [`str`] 

451 Names of `~CollectionType.TAGGED` collections to associate with. 

452 writeable : `bool` 

453 Whether the Butler should support write operations. 

454 

455 Returns 

456 ------- 

457 butler : `Butler` 

458 A new `Butler` instance. 

459 """ 

460 return cls(config=config, collections=collections, run=run, tags=tags, writeable=writeable) 

461 

462 def __reduce__(self): 

463 """Support pickling. 

464 """ 

465 return (Butler._unpickle, (self._config, self.collections, self.run, self.tags, 

466 self.registry.isWriteable())) 

467 

468 def __str__(self): 

469 return "Butler(collections={}, run={}, tags={}, datastore='{}', registry='{}')".format( 

470 self.collections, self.run, self.tags, self.datastore, self.registry) 

471 

472 def isWriteable(self) -> bool: 

473 """Return `True` if this `Butler` supports write operations. 

474 """ 

475 return self.registry.isWriteable() 

476 

477 @contextlib.contextmanager 

478 def transaction(self): 

479 """Context manager supporting `Butler` transactions. 

480 

481 Transactions can be nested. 
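
        For example, to make several writes succeed or fail as a unit (the
        dataset type names and data ID keys shown are illustrative)::

            with butler.transaction():
                butler.put(catalog, "src", instrument="HSC", visit=42)
                butler.put(metadata, "src_metadata", instrument="HSC",
                           visit=42)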

482 """ 

483 with self.registry.transaction(): 

484 with self.datastore.transaction(): 

485 yield 

486 

487 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

488 dataId: Optional[DataId] = None, **kwds: Any) -> Tuple[DatasetType, DataId]: 

489 """Standardize the arguments passed to several Butler APIs. 

490 

491 Parameters 

492 ---------- 

493 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

494 When `DatasetRef` the `dataId` should be `None`. 

495 Otherwise the `DatasetType` or name thereof. 

496 dataId : `dict` or `DataCoordinate` 

497 A `dict` of `Dimension` link name, value pairs that label the 

498 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

499 should be provided as the first argument.

500 kwds 

501 Additional keyword arguments used to augment or construct a 

502 `DataCoordinate`. See `DataCoordinate.standardize` 

503 parameters. 

504 

505 Returns 

506 ------- 

507 datasetType : `DatasetType` 

508 A `DatasetType` instance extracted from ``datasetRefOrType``. 

509 dataId : `dict` or `DataId`, optional 

510 Argument that can be used (along with ``kwds``) to construct a 

511 `DataId`. 

512 

513 Notes 

514 ----- 

515 Butler APIs that conceptually need a DatasetRef also allow passing a 

516 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

517 keyword arguments that can be used to construct one) separately. This 

518 method accepts those arguments and always returns a true `DatasetType` 

519 and a `DataId` or `dict`. 

520 

521 Standardization of `dict` vs `DataId` is best handled by passing the 

522 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are 

523 generally similarly flexible. 

524 """ 

525 externalDatasetType = None 

526 internalDatasetType = None 

527 if isinstance(datasetRefOrType, DatasetRef): 

528 if dataId is not None or kwds: 

529 raise ValueError("DatasetRef given, cannot use dataId as well") 

530 externalDatasetType = datasetRefOrType.datasetType 

531 dataId = datasetRefOrType.dataId 

532 else: 

533 # Don't check whether DataId is provided, because Registry APIs 

534 # can usually construct a better error message when it wasn't. 

535 if isinstance(datasetRefOrType, DatasetType): 

536 externalDatasetType = datasetRefOrType 

537 else: 

538 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

539 

540 # Check that they are self-consistent 

541 if externalDatasetType is not None: 

542 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

543 if externalDatasetType != internalDatasetType: 

544 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

545 f"registry definition ({internalDatasetType})") 

546 

547 return internalDatasetType, dataId 

548 

549 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

550 dataId: Optional[DataId] = None, *, 

551 collections: Any = None, 

552 allowUnresolved: bool = False, 

553 **kwds: Any) -> DatasetRef: 

554 """Shared logic for methods that start with a search for a dataset in 

555 the registry. 

556 

557 Parameters 

558 ---------- 

559 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

560 When `DatasetRef` the `dataId` should be `None`. 

561 Otherwise the `DatasetType` or name thereof. 

562 dataId : `dict` or `DataCoordinate`, optional 

563 A `dict` of `Dimension` link name, value pairs that label the 

564 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

565 should be provided as the first argument. 

566 collections : Any, optional 

567 Collections to be searched, overriding ``self.collections``. 

568 Can be any of the types supported by the ``collections`` argument 

569 to butler construction. 

570 allowUnresolved : `bool`, optional 

571 If `True`, return an unresolved `DatasetRef` if finding a resolved 

572 one in the `Registry` fails. Defaults to `False`. 

573 kwds 

574 Additional keyword arguments used to augment or construct a 

575 `DataId`. See `DataId` parameters. 

576 

577 Returns 

578 ------- 

579 ref : `DatasetRef` 

580 A reference to the dataset identified by the given arguments. 

581 

582 Raises 

583 ------ 

584 LookupError 

585 Raised if no matching dataset exists in the `Registry` (and 

586 ``allowUnresolved is False``). 

587 ValueError 

588 Raised if a resolved `DatasetRef` was passed as an input, but it 

589 differs from the one found in the registry. 

590 TypeError 

591 Raised if no collections were provided. 

592 """ 

593 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds) 

594 if isinstance(datasetRefOrType, DatasetRef): 

595 idNumber = datasetRefOrType.id 

596 else: 

597 idNumber = None 

598 timespan: Optional[Timespan] = None 

599 

600 # Process dimension records that are using record information 

601 # rather than ids 

602 newDataId: dict[Any, Any] = {} 

603 byRecord: dict[Any, dict[str, Any]] = defaultdict(dict) 

604 

605 # if all the dataId comes from keyword parameters we do not need 

606 # to do anything here because they can't be of the form 

607 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

608 if dataId: 

609 for k, v in dataId.items(): 

610 # If we have a Dimension we do not need to do anything 

611 # because it cannot be a compound key. 

612 if isinstance(k, str) and "." in k: 

613 # Someone is using a more human-readable dataId 

614 dimension, record = k.split(".", 1) 

615 byRecord[dimension][record] = v 

616 else: 

617 newDataId[k] = v 

618 

619 if byRecord: 

620 # Some record specifiers were found so we need to convert 

621 # them to the Id form 

622 for dimensionName, values in byRecord.items(): 

623 if dimensionName in newDataId: 

624 log.warning("DataId specified explicit %s dimension value of %s in addition to" 

625 " general record specifiers for it of %s. Ignoring record information.", 

626 dimensionName, newDataId[dimensionName], str(values)) 

627 continue 

628 

629 # Build up a WHERE expression -- use single quotes 

630 def quote(s): 

631 if isinstance(s, str): 

632 return f"'{s}'" 

633 else: 

634 return s 

635 

636 where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}" 

637 for k, v in values.items()) 

638 

639 # Hopefully we get a single record that matches 

640 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId, 

641 where=where, **kwds)) 

642 

643 if len(records) != 1: 

644 if len(records) > 1: 

645 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

646 for r in records: 

647 log.debug("- %s", str(r)) 

648 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not" 

649 f" uniquely constrained to a single dataset by {values}." 

650 f" Got {len(records)} results.") 

651 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no" 

652 f" records when constrained by {values}") 

653 

654 # Get the primary key from the real dimension object 

655 dimension = self.registry.dimensions[dimensionName] 

656 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

657 

658 # We have modified the dataId so need to switch to it 

659 dataId = newDataId 

660 

661 if datasetType.isCalibration(): 

662 # Because this is a calibration dataset, first try to

663 # standardize the data ID without restricting the dimensions to 

664 # those of the dataset type requested, because there may be extra 

665 # dimensions that provide temporal information for a validity-range 

666 # lookup. 

667 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions, **kwds) 

668 if dataId.graph.temporal: 

669 dataId = self.registry.expandDataId(dataId) 

670 timespan = dataId.timespan 

671 else: 

672 # Standardize the data ID to just the dimensions of the dataset 

673 # type instead of letting registry.findDataset do it, so we get the 

674 # result even if no dataset is found. 

675 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions, **kwds) 

676 if collections is None: 

677 collections = self.collections 

678 if not collections: 

679 raise TypeError("No input collections provided.") 

680 else: 

681 collections = CollectionSearch.fromExpression(collections) 

682 # Always lookup the DatasetRef, even if one is given, to ensure it is 

683 # present in the current collection. 

684 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

685 if ref is None: 

686 if allowUnresolved: 

687 return DatasetRef(datasetType, dataId) 

688 else: 

689 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} " 

690 f"could not be found in collections {collections}.") 

691 if idNumber is not None and idNumber != ref.id: 

692 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match " 

693 f"id ({ref.id}) in registry in collections {collections}.") 

694 return ref 

695 

696 @transactional 

697 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

698 dataId: Optional[DataId] = None, *, 

699 run: Optional[str] = None, 

700 tags: Optional[Iterable[str]] = None, 

701 **kwds: Any) -> DatasetRef: 

702 """Store and register a dataset. 

703 

704 Parameters 

705 ---------- 

706 obj : `object` 

707 The dataset. 

708 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

709 When `DatasetRef` is provided, ``dataId`` should be `None`. 

710 Otherwise the `DatasetType` or name thereof. 

711 dataId : `dict` or `DataCoordinate` 

712 A `dict` of `Dimension` link name, value pairs that label the 

713 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

714 should be provided as the second argument. 

715 run : `str`, optional 

716 The name of the run the dataset should be added to, overriding 

717 ``self.run``. 

718 tags : `Iterable` [ `str` ], optional 

719 The names of `~CollectionType.TAGGED` collections to associate

720 the dataset with, overriding ``self.tags``. These collections 

721 must have already been added to the `Registry`. 

722 kwds 

723 Additional keyword arguments used to augment or construct a 

724 `DataCoordinate`. See `DataCoordinate.standardize` 

725 parameters. 

726 

727 Returns 

728 ------- 

729 ref : `DatasetRef` 

730 A reference to the stored dataset, updated with the correct id if 

731 given. 

732 

733 Raises 

734 ------ 

735 TypeError 

736 Raised if the butler is read-only or if no run has been provided. 
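
        Examples
        --------
        A minimal sketch; the dataset type name and data ID keys shown are
        illustrative and must match definitions already in the registry::

            ref = butler.put(catalog, "src", instrument="HSC", visit=42,
                             detector=50)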

737 """ 

738 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

739 if not self.isWriteable(): 

740 raise TypeError("Butler is read-only.") 

741 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds) 

742 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

743 raise ValueError("DatasetRef must not be in registry, must have None id") 

744 

745 if run is None: 

746 if self.run is None: 

747 raise TypeError("No run provided.") 

748 run = self.run 

749 # No need to check type for run; first thing we do is 

750 # insertDatasets, and that will check for us. 

751 

752 if tags is None: 

753 tags = self.tags 

754 else: 

755 tags = tuple(tags) 

756 for tag in tags: 

757 # Check that these are tagged collections up front, because we want 

758 # to avoid relying on Datastore transactionality to avoid modifying 

759 # the repo if there's an error later. 

760 collectionType = self.registry.getCollectionType(tag) 

761 if collectionType is not CollectionType.TAGGED: 

762 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type " 

763 f"{collectionType.name}.") 

764 

765 # Add Registry Dataset entry. 

766 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds) 

767 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

768 

769 # Add Datastore entry. 

770 self.datastore.put(obj, ref) 

771 

772 for tag in tags: 

773 self.registry.associate(tag, [ref]) 

774 

775 return ref 

776 

777 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None): 

778 """Retrieve a stored dataset. 

779 

780 Unlike `Butler.get`, this method allows datasets outside the Butler's 

781 collection to be read as long as the `DatasetRef` that identifies them 

782 can be obtained separately. 

783 

784 Parameters 

785 ---------- 

786 ref : `DatasetRef` 

787 Resolved reference to an already stored dataset. 

788 parameters : `dict` 

789 Additional StorageClass-defined options to control reading, 

790 typically used to efficiently read only a subset of the dataset. 

791 

792 Returns 

793 ------- 

794 obj : `object` 

795 The dataset. 

796 """ 

797 return self.datastore.get(ref, parameters=parameters) 

798 

799 def getDirectDeferred(self, ref: DatasetRef, *, 

800 parameters: Union[dict, None] = None) -> DeferredDatasetHandle: 

801 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

802 from a resolved `DatasetRef`. 

803 

804 Parameters 

805 ---------- 

806 ref : `DatasetRef` 

807 Resolved reference to an already stored dataset. 

808 parameters : `dict` 

809 Additional StorageClass-defined options to control reading, 

810 typically used to efficiently read only a subset of the dataset. 

811 

812 Returns 

813 ------- 

814 obj : `DeferredDatasetHandle` 

815 A handle which can be used to retrieve a dataset at a later time. 

816 

817 Raises 

818 ------ 

819 AmbiguousDatasetError 

820 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

821 """ 

822 if ref.id is None: 

823 raise AmbiguousDatasetError( 

824 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

825 ) 

826 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

827 

828 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

829 dataId: Optional[DataId] = None, *, 

830 parameters: Union[dict, None] = None, 

831 collections: Any = None, 

832 **kwds: Any) -> DeferredDatasetHandle: 

833 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

834 after an immediate registry lookup. 

835 

836 Parameters 

837 ---------- 

838 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

839 When `DatasetRef` the `dataId` should be `None`. 

840 Otherwise the `DatasetType` or name thereof. 

841 dataId : `dict` or `DataCoordinate`, optional 

842 A `dict` of `Dimension` link name, value pairs that label the 

843 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

844 should be provided as the first argument. 

845 parameters : `dict` 

846 Additional StorageClass-defined options to control reading, 

847 typically used to efficiently read only a subset of the dataset. 

848 collections : Any, optional 

849 Collections to be searched, overriding ``self.collections``. 

850 Can be any of the types supported by the ``collections`` argument 

851 to butler construction. 

852 kwds 

853 Additional keyword arguments used to augment or construct a 

854 `DataId`. See `DataId` parameters. 

855 

856 Returns 

857 ------- 

858 obj : `DeferredDatasetHandle` 

859 A handle which can be used to retrieve a dataset at a later time. 

860 

861 Raises 

862 ------ 

863 LookupError 

864 Raised if no matching dataset exists in the `Registry` in the

865 given ``collections``.

866 ValueError 

867 Raised if a resolved `DatasetRef` was passed as an input, but it 

868 differs from the one found in the registry. 

869 TypeError 

870 Raised if no collections were provided. 
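
        Examples
        --------
        A minimal sketch (the dataset type and data ID keys are
        illustrative); nothing is read from the datastore until ``get`` is
        called on the returned handle::

            handle = butler.getDeferred("calexp", instrument="HSC", visit=42,
                                        detector=50)
            exposure = handle.get()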

871 """ 

872 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds) 

873 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

874 

875 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

876 dataId: Optional[DataId] = None, *, 

877 parameters: Optional[Dict[str, Any]] = None, 

878 collections: Any = None, 

879 **kwds: Any) -> Any: 

880 """Retrieve a stored dataset. 

881 

882 Parameters 

883 ---------- 

884 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

885 When `DatasetRef` the `dataId` should be `None`. 

886 Otherwise the `DatasetType` or name thereof. 

887 dataId : `dict` or `DataCoordinate` 

888 A `dict` of `Dimension` link name, value pairs that label the 

889 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

890 should be provided as the first argument. 

891 parameters : `dict` 

892 Additional StorageClass-defined options to control reading, 

893 typically used to efficiently read only a subset of the dataset. 

894 collections : Any, optional 

895 Collections to be searched, overriding ``self.collections``. 

896 Can be any of the types supported by the ``collections`` argument 

897 to butler construction. 

898 kwds 

899 Additional keyword arguments used to augment or construct a 

900 `DataCoordinate`. See `DataCoordinate.standardize` 

901 parameters. 

902 

903 Returns 

904 ------- 

905 obj : `object` 

906 The dataset. 

907 

908 Raises 

909 ------ 

910 ValueError 

911 Raised if a resolved `DatasetRef` was passed as an input, but it 

912 differs from the one found in the registry. 

913 LookupError 

914 Raised if no matching dataset exists in the `Registry`. 

915 TypeError 

916 Raised if no collections were provided. 

917 

918 Notes 

919 ----- 

920 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

921 this method requires that the given data ID include temporal dimensions 

922 beyond the dimensions of the dataset type itself, in order to find the 

923 dataset with the appropriate validity range. For example, a "bias" 

924 dataset with native dimensions ``{instrument, detector}`` could be 

925 fetched with a ``{instrument, detector, exposure}`` data ID, because 

926 ``exposure`` is a temporal dimension. 
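
        For example, a "bias" lookup along the lines described above might
        look like this (the dataset type and data ID keys are illustrative)::

            bias = butler.get("bias", instrument="HSC", detector=50,
                              exposure=12345)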

927 """ 

928 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

929 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds) 

930 return self.getDirect(ref, parameters=parameters) 

931 

932 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

933 dataId: Optional[DataId] = None, *, 

934 predict: bool = False, 

935 collections: Any = None, 

936 run: Optional[str] = None, 

937 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

938 """Returns the URIs associated with the dataset. 

939 

940 Parameters 

941 ---------- 

942 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

943 When `DatasetRef` the `dataId` should be `None`. 

944 Otherwise the `DatasetType` or name thereof. 

945 dataId : `dict` or `DataCoordinate` 

946 A `dict` of `Dimension` link name, value pairs that label the 

947 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

948 should be provided as the first argument. 

949 predict : `bool` 

950 If `True`, allow URIs to be returned of datasets that have not 

951 been written. 

952 collections : Any, optional 

953 Collections to be searched, overriding ``self.collections``. 

954 Can be any of the types supported by the ``collections`` argument 

955 to butler construction. 

956 run : `str`, optional 

957 Run to use for predictions, overriding ``self.run``. 

958 kwds 

959 Additional keyword arguments used to augment or construct a 

960 `DataCoordinate`. See `DataCoordinate.standardize` 

961 parameters. 

962 

963 Returns 

964 ------- 

965 primary : `ButlerURI` 

966 The URI to the primary artifact associated with this dataset. 

967 If the dataset was disassembled within the datastore this 

968 may be `None`. 

969 components : `dict` 

970 URIs to any components associated with the dataset artifact. 

971 Can be empty if there are no components. 
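
        Examples
        --------
        A minimal sketch (names are illustrative); if the dataset was
        disassembled, ``primary`` is `None` and the per-component URIs are
        returned instead::

            primary, components = butler.getURIs("calexp", instrument="HSC",
                                                 visit=42, detector=50)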

972 """ 

973 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict, 

974 collections=collections, **kwds) 

975 if ref.id is None: # only possible if predict is True 

976 if run is None: 

977 run = self.run 

978 if run is None: 

979 raise TypeError("Cannot predict location with run=None.") 

980 # Lie about ID, because we can't guess it, and only 

981 # Datastore.getURIs() will ever see it (and it doesn't use it). 

982 ref = ref.resolved(id=0, run=run) 

983 return self.datastore.getURIs(ref, predict) 

984 

985 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

986 dataId: Optional[DataId] = None, *, 

987 predict: bool = False, 

988 collections: Any = None, 

989 run: Optional[str] = None, 

990 **kwds: Any) -> ButlerURI: 

991 """Return the URI to the Dataset. 

992 

993 Parameters 

994 ---------- 

995 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

996 When `DatasetRef` the `dataId` should be `None`. 

997 Otherwise the `DatasetType` or name thereof. 

998 dataId : `dict` or `DataCoordinate` 

999 A `dict` of `Dimension` link name, value pairs that label the 

1000 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1001 should be provided as the first argument. 

1002 predict : `bool` 

1003 If `True`, allow URIs to be returned of datasets that have not 

1004 been written. 

1005 collections : Any, optional 

1006 Collections to be searched, overriding ``self.collections``. 

1007 Can be any of the types supported by the ``collections`` argument 

1008 to butler construction. 

1009 run : `str`, optional 

1010 Run to use for predictions, overriding ``self.run``. 

1011 kwds 

1012 Additional keyword arguments used to augment or construct a 

1013 `DataCoordinate`. See `DataCoordinate.standardize` 

1014 parameters. 

1015 

1016 Returns 

1017 ------- 

1018 uri : `ButlerURI` 

1019 URI pointing to the Dataset within the datastore. If the 

1020 Dataset does not exist in the datastore, and if ``predict`` is 

1021 `True`, the URI will be a prediction and will include a URI 

1022 fragment "#predicted". 

1023 If the datastore does not have entities that relate well 

1024 to the concept of a URI, the returned URI string will be

1025 descriptive. The returned URI is not guaranteed to be obtainable. 

1026 

1027 Raises 

1028 ------ 

1029 LookupError 

1030 A URI has been requested for a dataset that does not exist and 

1031 guessing is not allowed. 

1032 ValueError 

1033 Raised if a resolved `DatasetRef` was passed as an input, but it 

1034 differs from the one found in the registry. 

1035 TypeError 

1036 Raised if no collections were provided. 

1037 RuntimeError 

1038 Raised if a URI is requested for a dataset that consists of 

1039 multiple artifacts. 

1040 """ 

1041 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict, 

1042 collections=collections, run=run, **kwds) 

1043 

1044 if primary is None or components: 

1045 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1046 "Use Butler.getURIs() instead.") 

1047 return primary 

1048 

1049 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1050 dataId: Optional[DataId] = None, *, 

1051 collections: Any = None, 

1052 **kwds: Any) -> bool: 

1053 """Return True if the Dataset is actually present in the Datastore. 

1054 

1055 Parameters 

1056 ---------- 

1057 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1058 When `DatasetRef` the `dataId` should be `None`. 

1059 Otherwise the `DatasetType` or name thereof. 

1060 dataId : `dict` or `DataCoordinate` 

1061 A `dict` of `Dimension` link name, value pairs that label the 

1062 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1063 should be provided as the first argument. 

1064 collections : Any, optional 

1065 Collections to be searched, overriding ``self.collections``. 

1066 Can be any of the types supported by the ``collections`` argument 

1067 to butler construction. 

1068 kwds 

1069 Additional keyword arguments used to augment or construct a 

1070 `DataCoordinate`. See `DataCoordinate.standardize` 

1071 parameters. 

1072 

1073 Raises 

1074 ------ 

1075 LookupError 

1076 Raised if the dataset is not even present in the Registry. 

1077 ValueError 

1078 Raised if a resolved `DatasetRef` was passed as an input, but it 

1079 differs from the one found in the registry. 

1080 TypeError 

1081 Raised if no collections were provided. 

1082 """ 

1083 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds) 

1084 return self.datastore.exists(ref) 

1085 

1086 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False): 

1087 """Remove a collection and possibly prune datasets within it. 

1088 

1089 Parameters 

1090 ---------- 

1091 name : `str` 

1092 Name of the collection to remove. If this is a 

1093 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1094 datasets within the collection are not modified unless ``unstore`` 

1095 is `True`. If this is a `~CollectionType.RUN` collection, 

1096 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1097 are fully removed from the data repository. 

1098 purge : `bool`, optional 

1099 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1100 fully removing datasets within them. Requires ``unstore=True`` as 

1101 well as an added precaution against accidental deletion. Must be 

1102 `False` (default) if the collection is not a ``RUN``. 

1103 unstore : `bool`, optional

1104 If `True`, remove all datasets in the collection from all 

1105 datastores in which they appear. 

1106 

1107 Raises 

1108 ------ 

1109 TypeError 

1110 Raised if the butler is read-only or arguments are mutually 

1111 inconsistent. 
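
        Examples
        --------
        A minimal sketch (collection names are hypothetical). Removing a
        `~CollectionType.RUN` collection requires both flags, while a
        `~CollectionType.TAGGED` collection can be removed without them::

            butler.pruneCollection("u/alice/scratch", purge=True,
                                   unstore=True)
            butler.pruneCollection("u/alice/tagged-selection")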

1112 """ 

1113 

1114 # See pruneDatasets comments for more information about the logic here; 

1115 # the cases are almost the same, but here we can rely on Registry to 

1116 # take care of everything but Datastore deletion when we remove the

1117 # collection. 

1118 if not self.isWriteable(): 

1119 raise TypeError("Butler is read-only.") 

1120 collectionType = self.registry.getCollectionType(name) 

1121 if purge and not unstore: 

1122 raise PurgeWithoutUnstorePruneCollectionsError() 

1123 if collectionType is CollectionType.RUN and not purge: 

1124 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1125 if collectionType is not CollectionType.RUN and purge: 

1126 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1127 

1128 with self.registry.transaction(): 

1129 if unstore: 

1130 for ref in self.registry.queryDatasets(..., collections=name, findFirst=True): 

1131 if self.datastore.exists(ref): 

1132 self.datastore.trash(ref) 

1133 self.registry.removeCollection(name) 

1134 if unstore: 

1135 # Point of no return for removing artifacts 

1136 self.datastore.emptyTrash() 

1137 

1138 def pruneDatasets(self, refs: Iterable[DatasetRef], *, 

1139 disassociate: bool = True, 

1140 unstore: bool = False, 

1141 tags: Optional[Iterable[str]] = None, 

1142 purge: bool = False, 

1143 run: Optional[str] = None): 

1144 """Remove one or more datasets from a collection and/or storage. 

1145 

1146 Parameters 

1147 ---------- 

1148 refs : `~collections.abc.Iterable` of `DatasetRef` 

1149 Datasets to prune. These must be "resolved" references (not just 

1150 a `DatasetType` and data ID). 

1151 disassociate : `bool`, optional 

1152 Disassociate pruned datasets from ``self.tags`` (or the collections 

1153 given via the ``tags`` argument). 

1154 unstore : `bool`, optional 

1155 If `True` (`False` is default) remove these datasets from all 

1156 datastores known to this butler. Note that this will make it 

1157 impossible to retrieve these datasets even via other collections. 

1158 Datasets that are already not stored are ignored by this option. 

1159 tags : `Iterable` [ `str` ], optional 

1160 `~CollectionType.TAGGED` collections to disassociate the datasets 

1161 from, overriding ``self.tags``. Ignored if ``disassociate`` is 

1162 `False` or ``purge`` is `True`. 

1163 purge : `bool`, optional 

1164 If `True` (`False` is default), completely remove the dataset from 

1165 the `Registry`. To prevent accidental deletions, ``purge`` may 

1166 only be `True` if all of the following conditions are met: 

1167 

1168 - All given datasets are in the given run;

1169 - ``disassociate`` is `True`; 

1170 - ``unstore`` is `True`. 

1171 

1172 This mode may remove provenance information from datasets other 

1173 than those provided, and should be used with extreme care. 

1174 run : `str`, optional 

1175 `~CollectionType.RUN` collection to purge from, overriding 

1176 ``self.run``. Ignored unless ``purge`` is `True`. 

1177 

1178 Raises 

1179 ------ 

1180 TypeError 

1181 Raised if the butler is read-only, if no collection was provided, 

1182 or the conditions for ``purge=True`` were not met. 
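
        Examples
        --------
        A minimal sketch (the dataset type and collection names are
        hypothetical): remove the datasets from all datastores while leaving
        their registry entries and associations intact::

            refs = butler.registry.queryDatasets("raw", collections="scratch")
            butler.pruneDatasets(refs, disassociate=False, unstore=True)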

1183 """ 

1184 if not self.isWriteable(): 

1185 raise TypeError("Butler is read-only.") 

1186 if purge: 

1187 if not disassociate: 

1188 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1189 if not unstore: 

1190 raise TypeError("Cannot pass purge=True without unstore=True.") 

1191 if run is None: 

1192 run = self.run 

1193 if run is None: 

1194 raise TypeError("No run provided but purge=True.") 

1195 collectionType = self.registry.getCollectionType(run) 

1196 if collectionType is not CollectionType.RUN: 

1197 raise TypeError(f"Cannot purge from collection '{run}' " 

1198 f"of non-RUN type {collectionType.name}.") 

1199 elif disassociate: 

1200 if tags is None: 

1201 tags = self.tags 

1202 else: 

1203 tags = tuple(tags) 

1204 if not tags: 

1205 raise TypeError("No tags provided but disassociate=True.") 

1206 for tag in tags: 

1207 collectionType = self.registry.getCollectionType(tag) 

1208 if collectionType is not CollectionType.TAGGED: 

1209 raise TypeError(f"Cannot disassociate from collection '{tag}' " 

1210 f"of non-TAGGED type {collectionType.name}.") 

1211 # Transform possibly-single-pass iterable into something we can iterate 

1212 # over multiple times. 

1213 refs = list(refs) 

1214 # Pruning a component of a DatasetRef makes no sense since registry 

1215 # doesn't know about components and datastore might not store 

1216 # components in a separate file 

1217 for ref in refs: 

1218 if ref.datasetType.component(): 

1219 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1220 # We don't need an unreliable Datastore transaction for this, because 

1221 # we've been extra careful to ensure that Datastore.trash only involves 

1222 # mutating the Registry (it can _look_ at Datastore-specific things, 

1223 # but shouldn't change them), and hence all operations here are 

1224 # Registry operations. 

1225 with self.registry.transaction(): 

1226 if unstore: 

1227 for ref in refs: 

1228 # There is a difference between a concrete composite 

1229 # and virtual composite. In a virtual composite the 

1230 # datastore is never given the top level DatasetRef. In 

1231 # the concrete composite the datastore knows all the 

1232 # refs and will clean up itself if asked to remove the 

1233 # parent ref. We can not check configuration for this 

1234 # since we can not trust that the configuration is the 

1235 # same. We therefore have to ask if the ref exists or 

1236 # not. This is consistent with the fact that we want 

1237 # to ignore already-removed-from-datastore datasets 

1238 # anyway. 

1239 if self.datastore.exists(ref): 

1240 self.datastore.trash(ref) 

1241 if purge: 

1242 self.registry.removeDatasets(refs) 

1243 elif disassociate: 

1244 for tag in tags: 

1245 self.registry.disassociate(tag, refs) 

1246 # We've exited the Registry transaction, and apparently committed. 

1247 # (if there was an exception, everything rolled back, and it's as if 

1248 # nothing happened - and we never get here). 

1249 # Datastore artifacts are not yet gone, but they're clearly marked 

1250 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1251 # problems we can try again later, and if manual administrative 

1252 # intervention is required, it's pretty clear what that should entail: 

1253 # deleting everything on disk and in private Datastore tables that is 

1254 # in the dataset_location_trash table. 

1255 if unstore: 

1256 # Point of no return for removing artifacts 

1257 self.datastore.emptyTrash() 

1258 

1259 @transactional 

1260 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None, 

1261 tags: Optional[Iterable[str]] = None):

1262 """Store and register one or more datasets that already exist on disk. 

1263 

1264 Parameters 

1265 ---------- 

1266 datasets : `FileDataset` 

1267 Each positional argument is a struct containing information about 

1268 a file to be ingested, including its path (either absolute or 

1269 relative to the datastore root, if applicable), a `DatasetRef`, 

1270 and optionally a formatter class or its fully-qualified string 

1271 name. If a formatter is not provided, the formatter that would be 

1272 used for `put` is assumed. On successful return, all 

1273 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1274 attribute populated and all `FileDataset.formatter` attributes will 

1275 be set to the formatter class used. `FileDataset.path` attributes 

1276 may be modified to put paths in whatever the datastore considers a 

1277 standardized form. 

1278 transfer : `str`, optional 

1279 If not `None`, must be one of 'auto', 'move', 'copy', 'hardlink', 

1280 'relsymlink' or 'symlink', indicating how to transfer the file. 

1281 run : `str`, optional 

1282 The name of the run ingested datasets should be added to, 

1283 overriding ``self.run``. 

1284 tags : `Iterable` [ `str` ], optional 

1285 The names of `~CollectionType.TAGGED` collections to associate

1286 the dataset with, overriding ``self.tags``. These collections 

1287 must have already been added to the `Registry`. 

1288 

1289 Raises 

1290 ------ 

1291 TypeError 

1292 Raised if the butler is read-only or if no run was provided. 

1293 NotImplementedError 

1294 Raised if the `Datastore` does not support the given transfer mode. 

1295 DatasetTypeNotSupportedError 

1296 Raised if one or more files to be ingested have a dataset type that 

1297 is not supported by the `Datastore`.

1298 FileNotFoundError 

1299 Raised if one of the given files does not exist. 

1300 FileExistsError 

1301 Raised if transfer is not `None` but the (internal) location the 

1302 file would be moved to is already occupied. 

1303 

1304 Notes 

1305 ----- 

1306 This operation is not fully exception safe: if a database operation 

1307 fails, the given `FileDataset` instances may be only partially updated. 

1308 

1309 It is atomic in terms of database operations (they will either all 

1310 succeed or all fail) providing the database engine implements 

1311 transactions correctly. It will attempt to be atomic in terms of 

1312 filesystem operations as well, but this cannot be implemented 

1313 rigorously for most datastores. 
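
        Examples
        --------
        A minimal sketch; the file path and the pre-constructed ``ref`` (a
        `DatasetRef` describing the file) are placeholders for real values::

            from lsst.daf.butler import FileDataset

            dataset = FileDataset(path="/data/raw_0001.fits", refs=[ref])
            butler.ingest(dataset, transfer="copy")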

1314 """ 

1315 if not self.isWriteable(): 

1316 raise TypeError("Butler is read-only.") 

1317 if run is None: 

1318 if self.run is None: 

1319 raise TypeError("No run provided.") 

1320 run = self.run 

1321 # No need to check run type, since insertDatasets will do that 

1322 # (safely) for us. 

1323 if tags is None: 

1324 tags = self.tags 

1325 else: 

1326 tags = tuple(tags) 

1327 for tag in tags: 

1328 # Check that these are tagged collections up front, because we want 

1329 # to avoid relying on Datastore transactionality to avoid modifying 

1330 # the repo if there's an error later. 

1331 collectionType = self.registry.getCollectionType(tag) 

1332 if collectionType is not CollectionType.TAGGED: 

1333 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type " 

1334 f"{collectionType.name}.") 

1335 # Reorganize the inputs so they're grouped by DatasetType and then 

1336 # data ID. We also include a list of DatasetRefs for each FileDataset 

1337 # to hold the resolved DatasetRefs returned by the Registry, before 

1338 # it's safe to swap them into FileDataset.refs. 

1339 # Some type annotation aliases to make that clearer: 

1340 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1341 GroupedData = MutableMapping[DatasetType, GroupForType] 

1342 # The actual data structure: 

1343 groupedData: GroupedData = defaultdict(dict) 

1344 # And the nested loop that populates it: 

1345 for dataset in datasets: 

1346 # This list is intentionally shared across the inner loop, since it's 

1347 # associated with `dataset`. 

1348 resolvedRefs = [] 

1349 for ref in dataset.refs: 

1350 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1351 

1352 # Now we can bulk-insert into Registry for each DatasetType. 

1353 allResolvedRefs = [] 

1354 for datasetType, groupForType in groupedData.items(): 

1355 refs = self.registry.insertDatasets(datasetType, 

1356 dataIds=groupForType.keys(), 

1357 run=run) 

1358 # Append those resolved DatasetRefs to the new lists we set up for 

1359 # them. 

1360 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1361 resolvedRefs.append(ref) 

1362 

1363 # Go back to the original FileDatasets to replace their refs with the 

1364 # new resolved ones, and also build a big list of all refs. 

1365 allResolvedRefs = [] 

1366 for groupForType in groupedData.values(): 

1367 for dataset, resolvedRefs in groupForType.values(): 

1368 dataset.refs = resolvedRefs 

1369 allResolvedRefs.extend(resolvedRefs) 

1370 

1371 # Bulk-associate everything with any tagged collections. 

1372 for tag in tags: 

1373 self.registry.associate(tag, allResolvedRefs) 

1374 

1375 # Bulk-insert everything into Datastore. 

1376 self.datastore.ingest(*datasets, transfer=transfer) 

1377 

1378 @contextlib.contextmanager 

1379 def export(self, *, directory: Optional[str] = None, 

1380 filename: Optional[str] = None, 

1381 format: Optional[str] = None, 

1382 transfer: Optional[str] = None) -> ContextManager[RepoExportContext]: 

1383 """Export datasets from the repository represented by this `Butler`. 

1384 

1385 This method is a context manager that returns a helper object 

1386 (`RepoExportContext`) that is used to indicate what information from 

1387 the repository should be exported. 

1388 

1389 Parameters 

1390 ---------- 

1391 directory : `str`, optional 

1392 Directory dataset files should be written to if ``transfer`` is not 

1393 `None`. 

1394 filename : `str`, optional 

1395 Name for the file that will include database information associated 

1396 with the exported datasets. If this is not an absolute path and 

1397 ``directory`` is not `None`, it will be written to ``directory`` 

1398 instead of the current working directory. Defaults to 

1399 "export.{format}". 

1400 format : `str`, optional 

1401 File format for the database information file. If `None`, the 

1402 extension of ``filename`` will be used. 

1403 transfer : `str`, optional 

1404 Transfer mode passed to `Datastore.export`. 

1405 

1406 Raises 

1407 ------ 

1408 TypeError 

1409 Raised if the set of arguments passed is inconsistent. 

1410 

1411 Examples 

1412 -------- 

1413 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1414 methods are used to provide the iterables over data IDs and/or datasets 

1415 to be exported:: 

1416 

1417 with butler.export(filename="exports.yaml") as export: 

1418 # Export all flats, but none of the dimension element rows 

1419 # (i.e. data ID information) associated with them. 

1420 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1421 elements=()) 

1422 # Export all datasets that start with "deepCoadd_" and all of 

1423 # their associated data ID information. 

1424 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 
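
For example, to also copy the dataset files alongside the database
export (the directory path here is purely illustrative)::

    with butler.export(directory="/tmp/flat_export", filename="export.yaml",
                       transfer="copy") as export:
        export.saveDatasets(butler.registry.queryDatasets("flat"))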

1425 """ 

1426 if directory is None and transfer is not None: 

1427 raise TypeError("Cannot transfer without providing a directory.") 

1428 if transfer == "move": 

1429 raise TypeError("Transfer may not be 'move': export is read-only") 

1430 if format is None: 

1431 if filename is None: 

1432 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1433 else: 

1434 _, format = os.path.splitext(filename) 

1435 elif filename is None: 

1436 filename = f"export.{format}" 

1437 if directory is not None: 

1438 filename = os.path.join(directory, filename) 

1439 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"]) 

1440 with open(filename, 'w') as stream: 

1441 backend = BackendClass(stream) 
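# The try/except/else below re-raises any exception from the caller's
# block and only finalizes the export file (helper._finish()) when that
# block completes successfully.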

1442 try: 

1443 helper = RepoExportContext(self.registry, self.datastore, backend=backend, 

1444 directory=directory, transfer=transfer) 

1445 yield helper 

1446 except BaseException: 

1447 raise 

1448 else: 

1449 helper._finish() 

1450 

1451 def import_(self, *, directory: Optional[str] = None, 

1452 filename: Union[str, TextIO, None] = None, 

1453 format: Optional[str] = None, 

1454 transfer: Optional[str] = None, 

1455 skip_dimensions: Optional[Set] = None): 

1456 """Import datasets exported from a different butler repository. 

1457 

1458 Parameters 

1459 ---------- 

1460 directory : `str`, optional 

1461 Directory containing dataset files. If `None`, all file paths 

1462 must be absolute. 

1463 filename : `str` or `TextIO`, optional 

1464 A stream or name of file that contains database information 

1465 associated with the exported datasets. If this is a string (name) and 

1466 is not an absolute path, does not exist in the current working 

1467 directory, and ``directory`` is not `None`, it is assumed to be in 

1468 ``directory``. Defaults to "export.{format}". 

1469 format : `str`, optional 

1470 File format for the database information file. If `None`, the 

1471 extension of ``filename`` will be used. 

1472 transfer : `str`, optional 

1473 Transfer mode passed to `Datastore.ingest`. 

1474 skip_dimensions : `set`, optional 

1475 Names of dimensions that should be skipped and not imported. 

1476 

1477 Raises 

1478 ------ 

1479 TypeError 

1480 Raised if the set of arguments passed is inconsistent, or if the 

1481 butler is read-only. 
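
Examples
--------
A minimal sketch, assuming an export file previously written by
`Butler.export`; the directory and file names here are hypothetical::

    butler.import_(directory="/tmp/flat_export", filename="export.yaml",
                   transfer="symlink")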

1482 """ 

1483 if not self.isWriteable(): 

1484 raise TypeError("Butler is read-only.") 

1485 if format is None: 

1486 if filename is None: 

1487 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1488 else: 

1489 _, format = os.path.splitext(filename) 

1490 elif filename is None: 

1491 filename = f"export.{format}" 

1492 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

1493 filename = os.path.join(directory, filename) 

1494 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"]) 

1495 

1496 def doImport(importStream): 

1497 backend = BackendClass(importStream, self.registry) 

1498 backend.register() 

1499 with self.transaction(): 

1500 backend.load(self.datastore, directory=directory, transfer=transfer, 

1501 skip_dimensions=skip_dimensions) 

1502 

1503 if isinstance(filename, str): 

1504 with open(filename, "r") as stream: 

1505 doImport(stream) 

1506 else: 

1507 doImport(filename) 

1508 

1509 def validateConfiguration(self, logFailures: bool = False, 

1510 datasetTypeNames: Optional[Iterable[str]] = None, 

1511 ignore: Optional[Iterable[str]] = None): 

1512 """Validate butler configuration. 

1513 

1514 Checks that each `DatasetType` can be stored in the `Datastore`. 

1515 

1516 Parameters 

1517 ---------- 

1518 logFailures : `bool`, optional 

1519 If `True`, output a log message for every validation error 

1520 detected. 

1521 datasetTypeNames : iterable of `str`, optional 

1522 The `DatasetType` names that should be checked. This allows 

1523 only a subset to be selected. 

1524 ignore : iterable of `str`, optional 

1525 Names of DatasetTypes to skip over. This can be used to skip 

1526 known problems. If a named `DatasetType` corresponds to a 

1527 composite, all components of that `DatasetType` will also be 

1528 ignored. 

1529 

1530 Raises 

1531 ------ 

1532 ButlerValidationError 

1533 Raised if there is some inconsistency with how this Butler 

1534 is configured. 
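
Examples
--------
A minimal sketch; the dataset type names here are hypothetical and must
already be registered::

    butler.validateConfiguration(logFailures=True,
                                 datasetTypeNames=["flat", "bias"],
                                 ignore=["raw"])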

1535 """ 

1536 if datasetTypeNames: 

1537 entities = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

1538 else: 

1539 entities = list(self.registry.queryDatasetTypes()) 

1540 

1541 # filter out anything from the ignore list 

1542 if ignore: 

1543 ignore = set(ignore) 

1544 entities = [e for e in entities if e.name not in ignore and e.nameAndComponent()[0] not in ignore] 

1545 else: 

1546 ignore = set() 

1547 

1548 # Find all the registered instruments 

1549 instruments = set( 

1550 record.name for record in self.registry.queryDimensionRecords("instrument") 

1551 ) 

1552 

1553 # For each datasetType that has an instrument dimension, create 

1554 # a DatasetRef for each defined instrument 

1555 datasetRefs = [] 

1556 

1557 for datasetType in entities: 

1558 if "instrument" in datasetType.dimensions: 

1559 for instrument in instruments: 

1560 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, conform=False) 

1561 datasetRefs.append(datasetRef) 

1562 

1563 entities.extend(datasetRefs) 

1564 

1565 datastoreErrorStr = None 

1566 try: 

1567 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

1568 except ValidationError as e: 

1569 datastoreErrorStr = str(e) 

1570 

1571 # Also check that the LookupKeys used by the datastores match 

1572 # registry and storage class definitions 

1573 keys = self.datastore.getLookupKeys() 

1574 

1575 failedNames = set() 

1576 failedDataId = set() 

1577 for key in keys: 

1578 datasetType = None 

1579 if key.name is not None: 

1580 if key.name in ignore: 

1581 continue 

1582 

1583 # skip if specific datasetType names were requested and this 

1584 # name does not match 

1585 if datasetTypeNames and key.name not in datasetTypeNames: 

1586 continue 

1587 

1588 # See if it is a StorageClass or a DatasetType 

1589 if key.name in self.storageClasses: 

1590 pass 

1591 else: 

1592 try: 

1593 self.registry.getDatasetType(key.name) 

1594 except KeyError: 

1595 if logFailures: 

1596 log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

1597 failedNames.add(key) 

1598 else: 

1599 # Dimensions are checked for consistency when the Butler 

1600 # is created and rendezvoused with a universe. 

1601 pass 

1602 

1603 # Check that any DataId override only uses the instrument dimension 

1604 # and that the named instrument is known to the registry. 

1605 if key.dataId: 

1606 dataIdKeys = set(key.dataId) 

1607 if dataIdKeys != {"instrument"}: 

1608 if logFailures: 

1609 log.fatal("Key '%s' has unsupported DataId override", key) 

1610 failedDataId.add(key) 

1611 elif key.dataId["instrument"] not in instruments: 

1612 if logFailures: 

1613 log.fatal("Key '%s' has unknown instrument", key) 

1614 failedDataId.add(key) 

1615 

1616 messages = [] 

1617 

1618 if datastoreErrorStr: 

1619 messages.append(datastoreErrorStr) 

1620 

1621 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

1622 (failedDataId, "Keys with bad DataId entries: ")): 

1623 if failed: 

1624 msg += ", ".join(str(k) for k in failed) 

1625 messages.append(msg) 

1626 

1627 if messages: 

1628 raise ValidationError(";\n".join(messages)) 

1629 

1630 registry: Registry 

1631 """The object that manages dataset metadata and relationships (`Registry`). 

1632 

1633 Most operations that don't involve reading or writing butler datasets are 

1634 accessible only via `Registry` methods. 

1635 """ 

1636 

1637 datastore: Datastore 

1638 """The object that manages actual dataset storage (`Datastore`). 

1639 

1640 Direct user access to the datastore should rarely be necessary; the primary 

1641 exception is the case where a `Datastore` implementation provides extra 

1642 functionality beyond what the base class defines. 

1643 """ 

1644 

1645 storageClasses: StorageClassFactory 

1646 """An object that maps known storage class names to objects that fully 

1647 describe them (`StorageClassFactory`). 

1648 """ 

1649 

1650 collections: Optional[CollectionSearch] 

1651 """The collections to search and any restrictions on the dataset types to 

1652 search for within them, in order (`CollectionSearch`). 

1653 """ 

1654 

1655 run: Optional[str] 

1656 """Name of the run this butler writes outputs to (`str` or `None`). 

1657 """ 

1658 

1659 tags: Tuple[str, ...] 

1660 """Names of `~CollectionType.TAGGED` collections this butler associates 

1661 with in `put` and `ingest`, and disassociates from in `pruneDatasets` 

1662 (`tuple` [ `str` ]). 

1663 """