Coverage for python/lsst/daf/butler/_butler.py: 8%

687 statements  

coverage.py v6.5.0, created at 2023-01-05 10:36 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36import collections.abc 

37import contextlib 

38import logging 

39import numbers 

40import os 

41from collections import defaultdict 

42from typing import ( 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Sequence, 

53 Set, 

54 TextIO, 

55 Tuple, 

56 Type, 

57 Union, 

58) 

59 

60from lsst.resources import ResourcePath, ResourcePathExpression 

61from lsst.utils import doImportType 

62from lsst.utils.introspection import get_class_of 

63from lsst.utils.logging import VERBOSE, getLogger 

64 

65from ._butlerConfig import ButlerConfig 

66from ._butlerRepoIndex import ButlerRepoIndex 

67from ._deferredDatasetHandle import DeferredDatasetHandle 

68from ._limited_butler import LimitedButler 

69from .core import ( 

70 AmbiguousDatasetError, 

71 Config, 

72 ConfigSubset, 

73 DataCoordinate, 

74 DataId, 

75 DataIdValue, 

76 DatasetRef, 

77 DatasetRefURIs, 

78 DatasetType, 

79 Datastore, 

80 Dimension, 

81 DimensionConfig, 

82 DimensionElement, 

83 DimensionRecord, 

84 DimensionUniverse, 

85 FileDataset, 

86 Progress, 

87 StorageClass, 

88 StorageClassFactory, 

89 Timespan, 

90 ValidationError, 

91) 

92from .core.repoRelocation import BUTLER_ROOT_TAG 

93from .core.utils import transactional 

94from .registry import ( 

95 CollectionType, 

96 ConflictingDefinitionError, 

97 DataIdError, 

98 DatasetIdGenEnum, 

99 Registry, 

100 RegistryConfig, 

101 RegistryDefaults, 

102) 

103from .transfers import RepoExportContext 

104 

105log = getLogger(__name__) 

106 

107 

108class ButlerValidationError(ValidationError): 

109 """There is a problem with the Butler configuration.""" 

110 

111 pass 

112 

113 

114class PruneCollectionsArgsError(TypeError): 

115 """Base class for errors relating to Butler.pruneCollections input 

116 arguments. 

117 """ 

118 

119 pass 

120 

121 

122class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

123 """Raised when purge and unstore are both required to be True, and 

124 purge is True but unstore is False. 

125 """ 

126 

127 def __init__(self) -> None: 

128 super().__init__("Cannot pass purge=True without unstore=True.") 

129 

130 

131class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

132 """Raised when pruning a RUN collection but purge is False.""" 

133 

134 def __init__(self, collectionType: CollectionType): 

135 self.collectionType = collectionType 

136 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

137 

138 

139class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

140 """Raised when purge is True but is not supported for the given 

141 collection.""" 

142 

143 def __init__(self, collectionType: CollectionType): 

144 self.collectionType = collectionType 

145 super().__init__( 

146 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True." 

147 ) 

148 

149 

150class Butler(LimitedButler): 

151 """Main entry point for the data access system. 

152 

153 Parameters 

154 ---------- 

155 config : `ButlerConfig`, `Config` or `str`, optional 

156 Configuration. Anything acceptable to the 

157 `ButlerConfig` constructor. If a directory path 

158 is given the configuration will be read from a ``butler.yaml`` file in 

159 that location. If `None` is given default values will be used. 

160 butler : `Butler`, optional 

161 If provided, construct a new Butler that uses the same registry and 

162 datastore as the given one, but with the given collection and run. 

163 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

164 arguments. 

165 collections : `str` or `Iterable` [ `str` ], optional 

166 An expression specifying the collections to be searched (in order) when 

167 reading datasets. 

168 This may be a `str` collection name or an iterable thereof. 

169 See :ref:`daf_butler_collection_expressions` for more information. 

170 These collections are not registered automatically and must be 

171 manually registered before they are used by any method, but they may be 

172 manually registered after the `Butler` is initialized. 

173 run : `str`, optional 

174 Name of the `~CollectionType.RUN` collection new datasets should be 

175 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

176 ``collections`` will be set to ``[run]``. If not `None`, this 

177 collection will automatically be registered. If this is not set (and 

178 ``writeable`` is not set either), a read-only butler will be created. 

179 searchPaths : `list` of `str`, optional 

180 Directory paths to search when calculating the full Butler 

181 configuration. Not used if the supplied config is already a 

182 `ButlerConfig`. 

183 writeable : `bool`, optional 

184 Explicitly sets whether the butler supports write operations. If not 

185 provided, a read-write butler is created if any of ``run``, ``tags``, 

186 or ``chains`` is non-empty. 

187 inferDefaults : `bool`, optional 

188 If `True` (default) infer default data ID values from the values 

189 present in the datasets in ``collections``: if all collections have the 

190 same value (or no value) for a governor dimension, that value will be 

191 the default for that dimension. Nonexistent collections are ignored. 

192 If a default value is provided explicitly for a governor dimension via 

193 ``**kwargs``, no default will be inferred for that dimension. 

194 **kwargs : `str` 

195 Default data ID key-value pairs. These may only identify "governor" 

196 dimensions like ``instrument`` and ``skymap``. 

197 

198 Examples 

199 -------- 

200 While there are many ways to control exactly how a `Butler` interacts with 

201 the collections in its `Registry`, the most common cases are still simple. 

202 

203 For a read-only `Butler` that searches one collection, do:: 

204 

205 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

206 

207 For a read-write `Butler` that writes to and reads from a 

208 `~CollectionType.RUN` collection:: 

209 

210 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

211 

212 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

213 because we want to write to one `~CollectionType.RUN` collection but read 

214 from several others (as well):: 

215 

216 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

217 collections=["u/alice/DM-50000/a", 

218 "u/bob/DM-49998", 

219 "HSC/defaults"]) 

220 

221 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

222 Datasets will be read first from that run (since it appears first in the 

223 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

224 

225 Finally, one can always create a `Butler` with no collections:: 

226 

227 butler = Butler("/path/to/repo", writeable=True) 

228 

229 This can be extremely useful when you just want to use ``butler.registry``, 

230 e.g. for inserting dimension data or managing collections, or when the 

231 collections you want to use with the butler are not consistent. 

232 Passing ``writeable`` explicitly here is only necessary if you want to be 

233 able to make changes to the repo - usually the value for ``writeable`` can 

234 be guessed from the collection arguments provided, but it defaults to 

235 `False` when there are no collection arguments. 

236 """ 

237 

238 def __init__( 

239 self, 

240 config: Union[Config, str, None] = None, 

241 *, 

242 butler: Optional[Butler] = None, 

243 collections: Any = None, 

244 run: Optional[str] = None, 

245 searchPaths: Optional[List[str]] = None, 

246 writeable: Optional[bool] = None, 

247 inferDefaults: bool = True, 

248 **kwargs: str, 

249 ): 

250 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

251 # Load registry, datastore, etc. from config or existing butler. 

252 if butler is not None: 

253 if config is not None or searchPaths is not None or writeable is not None: 

254 raise TypeError( 

255 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

256 ) 

257 self.registry = butler.registry.copy(defaults) 

258 self.datastore = butler.datastore 

259 self.storageClasses = butler.storageClasses 

260 self._config: ButlerConfig = butler._config 

261 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

262 else: 

263 # Can only look for strings in the known repos list. 

264 if isinstance(config, str) and config in self.get_known_repos(): 

265 config = str(self.get_repo_uri(config)) 

266 try: 

267 self._config = ButlerConfig(config, searchPaths=searchPaths) 

268 except FileNotFoundError as e: 

269 if known := self.get_known_repos(): 

270 aliases = f"(known aliases: {', '.join(known)})" 

271 else: 

272 aliases = "(no known aliases)" 

273 raise FileNotFoundError(f"{e} {aliases}") from e 

274 self._config = ButlerConfig(config, searchPaths=searchPaths) 

275 try: 

276 if "root" in self._config: 

277 butlerRoot = self._config["root"] 

278 else: 

279 butlerRoot = self._config.configDir 

280 if writeable is None: 

281 writeable = run is not None 

282 self.registry = Registry.fromConfig( 

283 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

284 ) 

285 self.datastore = Datastore.fromConfig( 

286 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

287 ) 

288 self.storageClasses = StorageClassFactory() 

289 self.storageClasses.addFromConfig(self._config) 

290 self._allow_put_of_predefined_dataset = self._config.get( 

291 "allow_put_of_predefined_dataset", False 

292 ) 

293 except Exception: 

294 # Failures here usually mean that the configuration is incomplete, 

295 # so just issue an error message that includes the config file URI. 

296 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

297 raise 

298 

299 if "run" in self._config or "collection" in self._config: 

300 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

301 

302 GENERATION: ClassVar[int] = 3 

303 """This is a Generation 3 Butler. 

304 

305 This attribute may be removed in the future, once the Generation 2 Butler 

306 interface has been fully retired; it should only be used in transitional 

307 code. 

308 """ 

309 

310 @classmethod 

311 def get_repo_uri(cls, label: str) -> ResourcePath: 

312 """Look up the label in a butler repository index. 

313 

314 Parameters 

315 ---------- 

316 label : `str` 

317 Label of the Butler repository to look up. 

318 

319 Returns 

320 ------- 

321 uri : `lsst.resources.ResourcePath` 

322 URI to the Butler repository associated with the given label. 

323 

324 Raises 

325 ------ 

326 KeyError 

327 Raised if the label is not found in the index, or if an index 

328 can not be found at all. 

329 

330 Notes 

331 ----- 

332 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

333 information is discovered. 
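
Examples
--------
A minimal, illustrative sketch; the ``"main"`` label is only an
assumption about what a site's repository index might define::

    uri = Butler.get_repo_uri("main")
    butler = Butler(str(uri), writeable=False)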

334 """ 

335 return ButlerRepoIndex.get_repo_uri(label) 

336 

337 @classmethod 

338 def get_known_repos(cls) -> Set[str]: 

339 """Retrieve the list of known repository labels. 

340 

341 Returns 

342 ------- 

343 repos : `set` of `str` 

344 All the known labels. Can be empty if no index can be found. 

345 

346 Notes 

347 ----- 

348 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

349 information is discovered. 
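
Examples
--------
A small sketch that lists every known alias alongside its URI; the
output depends entirely on the local repository index, if any::

    for label in sorted(Butler.get_known_repos()):
        print(label, Butler.get_repo_uri(label))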

350 """ 

351 return ButlerRepoIndex.get_known_repos() 

352 

353 @staticmethod 

354 def makeRepo( 

355 root: ResourcePathExpression, 

356 config: Union[Config, str, None] = None, 

357 dimensionConfig: Union[Config, str, None] = None, 

358 standalone: bool = False, 

359 searchPaths: Optional[List[str]] = None, 

360 forceConfigRoot: bool = True, 

361 outfile: Optional[ResourcePathExpression] = None, 

362 overwrite: bool = False, 

363 ) -> Config: 

364 """Create an empty data repository by adding a butler.yaml config 

365 to a repository root directory. 

366 

367 Parameters 

368 ---------- 

369 root : `lsst.resources.ResourcePathExpression` 

370 Path or URI to the root location of the new repository. Will be 

371 created if it does not exist. 

372 config : `Config` or `str`, optional 

373 Configuration to write to the repository, after setting any 

374 root-dependent Registry or Datastore config options. Can not 

375 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

376 configuration will be used. Root-dependent config options 

377 specified in this config are overwritten if ``forceConfigRoot`` 

378 is `True`. 

379 dimensionConfig : `Config` or `str`, optional 

380 Configuration for dimensions, will be used to initialize registry 

381 database. 

382 standalone : `bool` 

383 If True, write all expanded defaults, not just customized or 

384 repository-specific settings. 

385 This (mostly) decouples the repository from the default 

386 configuration, insulating it from changes to the defaults (which 

387 may be good or bad, depending on the nature of the changes). 

388 Future *additions* to the defaults will still be picked up when 

389 initializing `Butlers` to repos created with ``standalone=True``. 

390 searchPaths : `list` of `str`, optional 

391 Directory paths to search when calculating the full butler 

392 configuration. 

393 forceConfigRoot : `bool`, optional 

394 If `False`, any values present in the supplied ``config`` that 

395 would normally be reset are not overridden and will appear 

396 directly in the output config. This allows non-standard overrides 

397 of the root directory for a datastore or registry to be given. 

398 If this parameter is `True` the values for ``root`` will be 

399 forced into the resulting config if appropriate. 

400 outfile : `lsst.resources.ResourcePathExpression`, optional 

401 If not-`None`, the output configuration will be written to this 

402 location rather than into the repository itself. Can be a URI 

403 string. Can refer to a directory that will be used to write 

404 ``butler.yaml``. 

405 overwrite : `bool`, optional 

406 Create a new configuration file even if one already exists 

407 in the specified output location. Default is to raise 

408 an exception. 

409 

410 Returns 

411 ------- 

412 config : `Config` 

413 The updated `Config` instance written to the repo. 

414 

415 Raises 

416 ------ 

417 ValueError 

418 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

419 regular Config (as these subclasses would make it impossible to 

420 support ``standalone=False``). 

421 FileExistsError 

422 Raised if the output config file already exists. 

423 os.error 

424 Raised if the directory does not exist, exists but is not a 

425 directory, or cannot be created. 

426 

427 Notes 

428 ----- 

429 Note that when ``standalone=False`` (the default), the configuration 

430 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

431 construct the repository should also be used to construct any Butlers 

432 to avoid configuration inconsistencies. 
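
Examples
--------
A minimal sketch; the path is illustrative and the default
configuration is assumed to be acceptable::

    Butler.makeRepo("/tmp/demo_repo")
    butler = Butler("/tmp/demo_repo", writeable=True)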

433 """ 

434 if isinstance(config, (ButlerConfig, ConfigSubset)): 

435 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

436 

437 # Ensure that the root of the repository exists or can be made 

438 root_uri = ResourcePath(root, forceDirectory=True) 

439 root_uri.mkdir() 

440 

441 config = Config(config) 

442 

443 # If we are creating a new repo from scratch with relative roots, 

444 # do not propagate an explicit root from the config file 

445 if "root" in config: 

446 del config["root"] 

447 

448 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

449 imported_class = doImportType(full["datastore", "cls"]) 

450 if not issubclass(imported_class, Datastore): 

451 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

452 datastoreClass: Type[Datastore] = imported_class 

453 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

454 

455 # if key exists in given config, parse it, otherwise parse the defaults 

456 # in the expanded config 

457 if config.get(("registry", "db")): 

458 registryConfig = RegistryConfig(config) 

459 else: 

460 registryConfig = RegistryConfig(full) 

461 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

462 if defaultDatabaseUri is not None: 

463 Config.updateParameters( 

464 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

465 ) 

466 else: 

467 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

468 

469 if standalone: 

470 config.merge(full) 

471 else: 

472 # Always expand the registry.managers section into the per-repo 

473 # config, because after the database schema is created, it's not 

474 # allowed to change anymore. Note that in the standalone=True 

475 # branch, _everything_ in the config is expanded, so there's no 

476 # need to special case this. 

477 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

478 configURI: ResourcePathExpression 

479 if outfile is not None: 

480 # When writing to a separate location we must include 

481 # the root of the butler repo in the config else it won't know 

482 # where to look. 

483 config["root"] = root_uri.geturl() 

484 configURI = outfile 

485 else: 

486 configURI = root_uri 

487 # Strip obscore configuration, if it is present, before writing the 

488 # config to a file; the obscore config will be stored in the registry. 

489 config_to_write = config 

490 if ("registry", "managers", "obscore") in config: 

491 config_to_write = config.copy() 

492 del config_to_write["registry", "managers", "obscore", "config"] 

493 config_to_write.dumpToUri(configURI, overwrite=overwrite) 

494 

495 # Create Registry and populate tables 

496 registryConfig = RegistryConfig(config.get("registry")) 

497 dimensionConfig = DimensionConfig(dimensionConfig) 

498 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

499 

500 log.verbose("Wrote new Butler configuration file to %s", configURI) 

501 

502 return config 

503 

504 @classmethod 

505 def _unpickle( 

506 cls, 

507 config: ButlerConfig, 

508 collections: Optional[tuple[str, ...]], 

509 run: Optional[str], 

510 defaultDataId: Dict[str, str], 

511 writeable: bool, 

512 ) -> Butler: 

513 """Callable used to unpickle a Butler. 

514 

515 We prefer not to use ``Butler.__init__`` directly so we can force some 

516 of its many arguments to be keyword-only (note that ``__reduce__`` 

517 can only invoke callables with positional arguments). 

518 

519 Parameters 

520 ---------- 

521 config : `ButlerConfig` 

522 Butler configuration, already coerced into a true `ButlerConfig` 

523 instance (and hence after any search paths for overrides have been 

524 utilized). 

525 collections : `tuple` [ `str` ] 

526 Names of the default collections to read from. 

527 run : `str`, optional 

528 Name of the default `~CollectionType.RUN` collection to write to. 

529 defaultDataId : `dict` [ `str`, `str` ] 

530 Default data ID values. 

531 writeable : `bool` 

532 Whether the Butler should support write operations. 

533 

534 Returns 

535 ------- 

536 butler : `Butler` 

537 A new `Butler` instance. 

538 """ 

539 # MyPy doesn't recognize that the kwargs below are totally valid; it 

540 # seems to think ``**defaultDataId`` is a _positional_ argument! 

541 return cls( 

542 config=config, 

543 collections=collections, 

544 run=run, 

545 writeable=writeable, 

546 **defaultDataId, # type: ignore 

547 ) 

548 

549 def __reduce__(self) -> tuple: 

550 """Support pickling.""" 

551 return ( 

552 Butler._unpickle, 

553 ( 

554 self._config, 

555 self.collections, 

556 self.run, 

557 self.registry.defaults.dataId.byName(), 

558 self.registry.isWriteable(), 

559 ), 

560 ) 

561 

562 def __str__(self) -> str: 

563 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

564 self.collections, self.run, self.datastore, self.registry 

565 ) 

566 

567 def isWriteable(self) -> bool: 

568 """Return `True` if this `Butler` supports write operations.""" 

569 return self.registry.isWriteable() 

570 

571 @contextlib.contextmanager 

572 def transaction(self) -> Iterator[None]: 

573 """Context manager supporting `Butler` transactions. 

574 

575 Transactions can be nested. 
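
A hedged sketch of the intended usage (the dataset type, data ID, and
run name are placeholders)::

    with butler.transaction():
        butler.put(obj, "someDatasetType", dataId, run="u/alice/demo")
        # If an exception escapes this block, the registry insert and
        # the datastore write are both rolled back.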

576 """ 

577 with self.registry.transaction(): 

578 with self.datastore.transaction(): 

579 yield 

580 

581 def _standardizeArgs( 

582 self, 

583 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

584 dataId: Optional[DataId] = None, 

585 for_put: bool = True, 

586 **kwargs: Any, 

587 ) -> Tuple[DatasetType, Optional[DataId]]: 

588 """Standardize the arguments passed to several Butler APIs. 

589 

590 Parameters 

591 ---------- 

592 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

593 When `DatasetRef` the `dataId` should be `None`. 

594 Otherwise the `DatasetType` or name thereof. 

595 dataId : `dict` or `DataCoordinate` 

596 A `dict` of `Dimension` link name, value pairs that label the 

597 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

598 should be provided as the second argument. 

599 for_put : `bool`, optional 

600 If `True` this call is invoked as part of a `Butler.put()`. 

601 Otherwise it is assumed to be part of a `Butler.get()`. This 

602 parameter is only relevant if there is dataset type 

603 inconsistency. 

604 **kwargs 

605 Additional keyword arguments used to augment or construct a 

606 `DataCoordinate`. See `DataCoordinate.standardize` 

607 parameters. 

608 

609 Returns 

610 ------- 

611 datasetType : `DatasetType` 

612 A `DatasetType` instance extracted from ``datasetRefOrType``. 

613 dataId : `dict` or `DataId`, optional 

614 Argument that can be used (along with ``kwargs``) to construct a 

615 `DataId`. 

616 

617 Notes 

618 ----- 

619 Butler APIs that conceptually need a DatasetRef also allow passing a 

620 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

621 keyword arguments that can be used to construct one) separately. This 

622 method accepts those arguments and always returns a true `DatasetType` 

623 and a `DataId` or `dict`. 

624 

625 Standardization of `dict` vs `DataId` is best handled by passing the 

626 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

627 generally similarly flexible. 
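
Examples
--------
The two calling styles this method reconciles, sketched with
illustrative names and data ID values::

    butler.get("bias", instrument="HSC", detector=50, exposure=903334)
    butler.get(ref)  # ``ref`` is an existing `DatasetRef`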

628 """ 

629 externalDatasetType: Optional[DatasetType] = None 

630 internalDatasetType: Optional[DatasetType] = None 

631 if isinstance(datasetRefOrType, DatasetRef): 

632 if dataId is not None or kwargs: 

633 raise ValueError("DatasetRef given, cannot use dataId as well") 

634 externalDatasetType = datasetRefOrType.datasetType 

635 dataId = datasetRefOrType.dataId 

636 else: 

637 # Don't check whether DataId is provided, because Registry APIs 

638 # can usually construct a better error message when it wasn't. 

639 if isinstance(datasetRefOrType, DatasetType): 

640 externalDatasetType = datasetRefOrType 

641 else: 

642 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

643 

644 # Check that they are self-consistent 

645 if externalDatasetType is not None: 

646 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

647 if externalDatasetType != internalDatasetType: 

648 # We can allow differences if they are compatible, depending 

649 # on whether this is a get or a put. A get requires that 

650 # the python type associated with the datastore can be 

651 # converted to the user type. A put requires that the user 

652 # supplied python type can be converted to the internal 

653 # type expected by registry. 

654 relevantDatasetType = internalDatasetType 

655 if for_put: 

656 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

657 else: 

658 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

659 relevantDatasetType = externalDatasetType 

660 if not is_compatible: 

661 raise ValueError( 

662 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

663 f"registry definition ({internalDatasetType})" 

664 ) 

665 # Override the internal definition. 

666 internalDatasetType = relevantDatasetType 

667 

668 assert internalDatasetType is not None 

669 return internalDatasetType, dataId 

670 

671 def _rewrite_data_id( 

672 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any 

673 ) -> Tuple[Optional[DataId], Dict[str, Any]]: 

674 """Rewrite a data ID taking into account dimension records. 

675 

676 Take a Data ID and keyword args and rewrite it if necessary to 

677 allow the user to specify dimension records rather than dimension 

678 primary values. 

679 

680 This allows a user to include a dataId dict with keys of 

681 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

682 the integer exposure ID. It also allows a string to be given 

683 for a dimension value rather than the integer ID if that is more 

684 convenient. For example, rather than having to specify the 

685 detector with ``detector.full_name``, a string given for ``detector`` 

686 will be interpreted as the full name and converted to the integer 

687 value. 

688 

689 Keyword arguments can also use strings for dimensions like detector 

690 and exposure but python does not allow them to include ``.`` and 

691 so the ``exposure.day_obs`` syntax can not be used in a keyword 

692 argument. 

693 

694 Parameters 

695 ---------- 

696 dataId : `dict` or `DataCoordinate` 

697 A `dict` of `Dimension` link name, value pairs that will label the 

698 `DatasetRef` within a Collection. 

699 datasetType : `DatasetType` 

700 The dataset type associated with this dataId. Required to 

701 determine the relevant dimensions. 

702 **kwargs 

703 Additional keyword arguments used to augment or construct a 

704 `DataId`. See `DataId` parameters. 

705 

706 Returns 

707 ------- 

708 dataId : `dict` or `DataCoordinate` 

709 The dataId, possibly rewritten. If given a `DataCoordinate` and 

710 no keyword arguments, the original dataId will be returned 

711 unchanged. 

712 **kwargs : `dict` 

713 Any unused keyword arguments (would normally be empty dict). 
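
Examples
--------
A sketch of the rewriting described above (values are illustrative)::

    {"exposure.day_obs": 20221120, "exposure.seq_num": 42}

is rewritten to ``{"exposure": <exposure id>}`` once a single matching
``exposure`` dimension record has been found in the registry.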

714 """ 

715 # Do nothing if we have a standalone DataCoordinate. 

716 if isinstance(dataId, DataCoordinate) and not kwargs: 

717 return dataId, kwargs 

718 

719 # Process dimension records that are using record information 

720 # rather than ids 

721 newDataId: Dict[str, DataIdValue] = {} 

722 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

723 

724 # If all of the dataId comes from keyword parameters we do not need 

725 # to do anything here because the keys cannot be of the form 

726 # exposure.obs_id, since a "." is not allowed in a keyword parameter. 

727 if dataId: 

728 for k, v in dataId.items(): 

729 # If we have a Dimension we do not need to do anything 

730 # because it cannot be a compound key. 

731 if isinstance(k, str) and "." in k: 

732 # Someone is using a more human-readable dataId 

733 dimensionName, record = k.split(".", 1) 

734 byRecord[dimensionName][record] = v 

735 elif isinstance(k, Dimension): 

736 newDataId[k.name] = v 

737 else: 

738 newDataId[k] = v 

739 

740 # Go through the updated dataId and check the type in case someone is 

741 # using an alternate key. We have already filtered out the compound 

742 # keys in dimension.record format. 

743 not_dimensions = {} 

744 

745 # Will need to look in the dataId and the keyword arguments 

746 # and will remove them if they need to be fixed or are unrecognized. 

747 for dataIdDict in (newDataId, kwargs): 

748 # Use a list so we can adjust the dict safely in the loop 

749 for dimensionName in list(dataIdDict): 

750 value = dataIdDict[dimensionName] 

751 try: 

752 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

753 except KeyError: 

754 # This is not a real dimension 

755 not_dimensions[dimensionName] = value 

756 del dataIdDict[dimensionName] 

757 continue 

758 

759 # Convert an integral type to an explicit int to simplify 

760 # comparisons here 

761 if isinstance(value, numbers.Integral): 

762 value = int(value) 

763 

764 if not isinstance(value, dimension.primaryKey.getPythonType()): 

765 for alternate in dimension.alternateKeys: 

766 if isinstance(value, alternate.getPythonType()): 

767 byRecord[dimensionName][alternate.name] = value 

768 del dataIdDict[dimensionName] 

769 log.debug( 

770 "Converting dimension %s to %s.%s=%s", 

771 dimensionName, 

772 dimensionName, 

773 alternate.name, 

774 value, 

775 ) 

776 break 

777 else: 

778 log.warning( 

779 "Type mismatch found for value '%r' provided for dimension %s. " 

780 "Could not find matching alternative (primary key has type %s) " 

781 "so attempting to use as-is.", 

782 value, 

783 dimensionName, 

784 dimension.primaryKey.getPythonType(), 

785 ) 

786 

787 # By this point kwargs and newDataId should only include valid 

788 # dimensions. Merge kwargs in to the new dataId and log if there 

789 # are dimensions in both (rather than calling update). 

790 for k, v in kwargs.items(): 

791 if k in newDataId and newDataId[k] != v: 

792 log.debug( 

793 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

794 ) 

795 newDataId[k] = v 

796 # No need to retain any values in kwargs now. 

797 kwargs = {} 

798 

799 # If we have some unrecognized dimensions we have to try to connect 

800 # them to records in other dimensions. This is made more complicated 

801 # by some dimensions having records with clashing names. A mitigation 

802 # is that we can tell by this point which dimensions are missing 

803 # for the DatasetType but this does not work for calibrations 

804 # where additional dimensions can be used to constrain the temporal 

805 # axis. 

806 if not_dimensions: 

807 # Search for all dimensions even if we have been given a value 

808 # explicitly. In some cases records are given as well as the 

809 # actual dimension and this should not be an error if they 

810 # match. 

811 mandatoryDimensions = datasetType.dimensions.names # - provided 

812 

813 candidateDimensions: Set[str] = set() 

814 candidateDimensions.update(mandatoryDimensions) 

815 

816 # For calibrations we may well be needing temporal dimensions 

817 # so rather than always including all dimensions in the scan 

818 # restrict things a little. It is still possible for there 

819 # to be confusion over day_obs in visit vs exposure for example. 

820 # If we are not searching calibration collections things may 

821 # fail but they are going to fail anyway because of the 

822 # ambiguity of the dataId... 

823 if datasetType.isCalibration(): 

824 for dim in self.registry.dimensions.getStaticDimensions(): 

825 if dim.temporal: 

826 candidateDimensions.add(str(dim)) 

827 

828 # Look up table for the first association with a dimension 

829 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

830 

831 # Keep track of whether an item is associated with multiple 

832 # dimensions. 

833 counter: Counter[str] = Counter() 

834 assigned: Dict[str, Set[str]] = defaultdict(set) 

835 

836 # Go through the missing dimensions and associate the 

837 # given names with records within those dimensions 

838 matched_dims = set() 

839 for dimensionName in candidateDimensions: 

840 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

841 fields = dimension.metadata.names | dimension.uniqueKeys.names 

842 for field in not_dimensions: 

843 if field in fields: 

844 guessedAssociation[dimensionName][field] = not_dimensions[field] 

845 counter[dimensionName] += 1 

846 assigned[field].add(dimensionName) 

847 matched_dims.add(field) 

848 

849 # Calculate the fields that matched nothing. 

850 never_found = set(not_dimensions) - matched_dims 

851 

852 if never_found: 

853 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

854 

855 # There is a chance we have allocated a single dataId item 

856 # to multiple dimensions. Need to decide which should be retained. 

857 # For now assume that the most popular alternative wins. 

858 # This means that day_obs with seq_num will result in 

859 # exposure.day_obs and not visit.day_obs 

860 # Also prefer an explicitly missing dimension over an inferred 

861 # temporal dimension. 

862 for fieldName, assignedDimensions in assigned.items(): 

863 if len(assignedDimensions) > 1: 

864 # Pick the most popular (preferring mandatory dimensions) 

865 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

866 if requiredButMissing: 

867 candidateDimensions = requiredButMissing 

868 else: 

869 candidateDimensions = assignedDimensions 

870 

871 # If this is a choice between visit and exposure and 

872 # neither was a required part of the dataset type, 

873 # (hence in this branch) always prefer exposure over 

874 # visit since exposures are always defined and visits 

875 # are defined from exposures. 

876 if candidateDimensions == {"exposure", "visit"}: 

877 candidateDimensions = {"exposure"} 

878 

879 # Select the relevant items and get a new restricted 

880 # counter. 

881 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

882 duplicatesCounter: Counter[str] = Counter() 

883 duplicatesCounter.update(theseCounts) 

884 

885 # Choose the most common. If they are equally common 

886 # we will pick the one that was found first. 

887 # Returns a list of tuples 

888 selected = duplicatesCounter.most_common(1)[0][0] 

889 

890 log.debug( 

891 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

892 " Removed ambiguity by choosing dimension %s.", 

893 fieldName, 

894 ", ".join(assignedDimensions), 

895 selected, 

896 ) 

897 

898 for candidateDimension in assignedDimensions: 

899 if candidateDimension != selected: 

900 del guessedAssociation[candidateDimension][fieldName] 

901 

902 # Update the record look up dict with the new associations 

903 for dimensionName, values in guessedAssociation.items(): 

904 if values: # A dict might now be empty 

905 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

906 byRecord[dimensionName].update(values) 

907 

908 if byRecord: 

909 # Some record specifiers were found so we need to convert 

910 # them to the Id form 

911 for dimensionName, values in byRecord.items(): 

912 if dimensionName in newDataId: 

913 log.debug( 

914 "DataId specified explicit %s dimension value of %s in addition to" 

915 " general record specifiers for it of %s. Ignoring record information.", 

916 dimensionName, 

917 newDataId[dimensionName], 

918 str(values), 

919 ) 

920 # Get the actual record and compare with these values. 

921 try: 

922 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

923 except DataIdError: 

924 raise ValueError( 

925 f"Could not find dimension '{dimensionName}'" 

926 f" with dataId {newDataId} as part of comparing with" 

927 f" record values {byRecord[dimensionName]}" 

928 ) from None 

929 if len(recs) == 1: 

930 errmsg: List[str] = [] 

931 for k, v in values.items(): 

932 if (recval := getattr(recs[0], k)) != v: 

933 errmsg.append(f"{k}({recval} != {v})") 

934 if errmsg: 

935 raise ValueError( 

936 f"Dimension {dimensionName} in dataId has explicit value" 

937 " inconsistent with records: " + ", ".join(errmsg) 

938 ) 

939 else: 

940 # Multiple matches for an explicit dimension 

941 # should never happen but let downstream complain. 

942 pass 

943 continue 

944 

945 # Build up a WHERE expression 

946 bind = {k: v for k, v in values.items()} 

947 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

948 

949 # Hopefully we get a single record that matches 

950 records = set( 

951 self.registry.queryDimensionRecords( 

952 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

953 ) 

954 ) 

955 

956 if len(records) != 1: 

957 if len(records) > 1: 

958 # visit can have an ambiguous answer without involving 

959 # visit_system. The default visit_system is defined 

960 # by the instrument. 

961 if ( 

962 dimensionName == "visit" 

963 and "visit_system_membership" in self.registry.dimensions 

964 and "visit_system" 

965 in self.registry.dimensions["instrument"].metadata # type: ignore 

966 ): 

967 instrument_records = list( 

968 self.registry.queryDimensionRecords( 

969 "instrument", 

970 dataId=newDataId, 

971 **kwargs, 

972 ) 

973 ) 

974 if len(instrument_records) == 1: 

975 visit_system = instrument_records[0].visit_system 

976 if visit_system is None: 

977 # Set to a value that will never match. 

978 visit_system = -1 

979 

980 # Look up each visit in the 

981 # visit_system_membership records. 

982 for rec in records: 

983 membership = list( 

984 self.registry.queryDimensionRecords( 

985 # Use bind to allow zero results. 

986 # This is a fully-specified query. 

987 "visit_system_membership", 

988 where="instrument = inst AND visit_system = system AND visit = v", 

989 bind=dict( 

990 inst=instrument_records[0].name, system=visit_system, v=rec.id 

991 ), 

992 ) 

993 ) 

994 if membership: 

995 # This record is the right answer. 

996 records = set([rec]) 

997 break 

998 

999 # The ambiguity may have been resolved so check again. 

1000 if len(records) > 1: 

1001 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

1002 for r in records: 

1003 log.debug("- %s", str(r)) 

1004 raise ValueError( 

1005 f"DataId specification for dimension {dimensionName} is not" 

1006 f" uniquely constrained to a single dataset by {values}." 

1007 f" Got {len(records)} results." 

1008 ) 

1009 else: 

1010 raise ValueError( 

1011 f"DataId specification for dimension {dimensionName} matched no" 

1012 f" records when constrained by {values}" 

1013 ) 

1014 

1015 # Get the primary key from the real dimension object 

1016 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

1017 if not isinstance(dimension, Dimension): 

1018 raise RuntimeError( 

1019 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

1020 ) 

1021 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

1022 

1023 return newDataId, kwargs 

1024 

1025 def _findDatasetRef( 

1026 self, 

1027 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1028 dataId: Optional[DataId] = None, 

1029 *, 

1030 collections: Any = None, 

1031 allowUnresolved: bool = False, 

1032 **kwargs: Any, 

1033 ) -> DatasetRef: 

1034 """Shared logic for methods that start with a search for a dataset in 

1035 the registry. 

1036 

1037 Parameters 

1038 ---------- 

1039 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1040 When `DatasetRef` the `dataId` should be `None`. 

1041 Otherwise the `DatasetType` or name thereof. 

1042 dataId : `dict` or `DataCoordinate`, optional 

1043 A `dict` of `Dimension` link name, value pairs that label the 

1044 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1045 should be provided as the first argument. 

1046 collections : Any, optional 

1047 Collections to be searched, overriding ``self.collections``. 

1048 Can be any of the types supported by the ``collections`` argument 

1049 to butler construction. 

1050 allowUnresolved : `bool`, optional 

1051 If `True`, return an unresolved `DatasetRef` if finding a resolved 

1052 one in the `Registry` fails. Defaults to `False`. 

1053 **kwargs 

1054 Additional keyword arguments used to augment or construct a 

1055 `DataId`. See `DataId` parameters. 

1056 

1057 Returns 

1058 ------- 

1059 ref : `DatasetRef` 

1060 A reference to the dataset identified by the given arguments. 

1061 

1062 Raises 

1063 ------ 

1064 LookupError 

1065 Raised if no matching dataset exists in the `Registry` (and 

1066 ``allowUnresolved is False``). 

1067 ValueError 

1068 Raised if a resolved `DatasetRef` was passed as an input, but it 

1069 differs from the one found in the registry. 

1070 TypeError 

1071 Raised if no collections were provided. 

1072 """ 

1073 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1074 if isinstance(datasetRefOrType, DatasetRef): 

1075 idNumber = datasetRefOrType.id 

1076 else: 

1077 idNumber = None 

1078 timespan: Optional[Timespan] = None 

1079 

1080 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1081 

1082 if datasetType.isCalibration(): 

1083 # Because this is a calibration dataset, first try to 

1084 # standardize the data ID without restricting the dimensions to 

1085 # those of the dataset type requested, because there may be extra 

1086 # dimensions that provide temporal information for a validity-range 

1087 # lookup. 

1088 dataId = DataCoordinate.standardize( 

1089 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1090 ) 

1091 if dataId.graph.temporal: 

1092 dataId = self.registry.expandDataId(dataId) 

1093 timespan = dataId.timespan 

1094 else: 

1095 # Standardize the data ID to just the dimensions of the dataset 

1096 # type instead of letting registry.findDataset do it, so we get the 

1097 # result even if no dataset is found. 

1098 dataId = DataCoordinate.standardize( 

1099 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1100 ) 

1101 # Always look up the DatasetRef, even if one is given, to ensure it is 

1102 # present in the current collection. 

1103 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1104 if ref is None: 

1105 if allowUnresolved: 

1106 return DatasetRef(datasetType, dataId) 

1107 else: 

1108 if collections is None: 

1109 collections = self.registry.defaults.collections 

1110 raise LookupError( 

1111 f"Dataset {datasetType.name} with data ID {dataId} " 

1112 f"could not be found in collections {collections}." 

1113 ) 

1114 if idNumber is not None and idNumber != ref.id: 

1115 if collections is None: 

1116 collections = self.registry.defaults.collections 

1117 raise ValueError( 

1118 f"DatasetRef.id provided ({idNumber}) does not match " 

1119 f"id ({ref.id}) in registry in collections {collections}." 

1120 ) 

1121 if datasetType != ref.datasetType: 

1122 # If they differ it is because the user explicitly specified 

1123 # a compatible dataset type to this call rather than using the 

1124 # registry definition. The DatasetRef must therefore be recreated 

1125 # using the user definition such that the expected type is 

1126 # returned. 

1127 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1128 

1129 return ref 

1130 

1131 @transactional 

1132 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef: 

1133 # Docstring inherited. 

1134 (imported_ref,) = self.registry._importDatasets( 

1135 [ref], 

1136 expand=True, 

1137 ) 

1138 if imported_ref.id != ref.getCheckedId(): 

1139 raise RuntimeError("This registry configuration does not support putDirect.") 

1140 self.datastore.put(obj, ref) 

1141 return ref 

1142 

1143 @transactional 

1144 def put( 

1145 self, 

1146 obj: Any, 

1147 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1148 dataId: Optional[DataId] = None, 

1149 *, 

1150 run: Optional[str] = None, 

1151 **kwargs: Any, 

1152 ) -> DatasetRef: 

1153 """Store and register a dataset. 

1154 

1155 Parameters 

1156 ---------- 

1157 obj : `object` 

1158 The dataset. 

1159 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1160 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1161 Otherwise the `DatasetType` or name thereof. 

1162 dataId : `dict` or `DataCoordinate` 

1163 A `dict` of `Dimension` link name, value pairs that label the 

1164 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1165 should be provided as the second argument. 

1166 run : `str`, optional 

1167 The name of the run the dataset should be added to, overriding 

1168 ``self.run``. 

1169 **kwargs 

1170 Additional keyword arguments used to augment or construct a 

1171 `DataCoordinate`. See `DataCoordinate.standardize` 

1172 parameters. 

1173 

1174 Returns 

1175 ------- 

1176 ref : `DatasetRef` 

1177 A reference to the stored dataset, updated with the correct id if 

1178 given. 

1179 

1180 Raises 

1181 ------ 

1182 TypeError 

1183 Raised if the butler is read-only or if no run has been provided. 
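
Examples
--------
A hedged sketch; the dataset type and data ID values are assumed to
already be defined in the registry::

    butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
    ref = butler.put(obj, "someDatasetType", instrument="HSC", visit=903334)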

1184 """ 

1185 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1186 if not self.isWriteable(): 

1187 raise TypeError("Butler is read-only.") 

1188 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1189 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1190 raise ValueError("DatasetRef must not be in registry, must have None id") 

1191 

1192 # Handle dimension records in dataId 

1193 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1194 

1195 # Add Registry Dataset entry. 

1196 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1197 

1198 # For an execution butler the datasets will be pre-defined. 

1199 # If the butler is configured that way datasets should only be inserted 

1200 # if they do not already exist in registry. Trying and catching 

1201 # ConflictingDefinitionError will not work because the transaction 

1202 # will be corrupted. Instead, in this mode always check first. 

1203 ref = None 

1204 ref_is_predefined = False 

1205 if self._allow_put_of_predefined_dataset: 

1206 # Get the matching ref for this run. 

1207 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId) 

1208 

1209 if ref: 

1210 # Must be expanded form for datastore templating 

1211 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

1212 ref = ref.expanded(dataId) 

1213 ref_is_predefined = True 

1214 

1215 if not ref: 

1216 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1217 

1218 # If the ref is predefined it is possible that the datastore also 

1219 # has the record. Asking the datastore to put it again would recreate 

1220 # the artifact, overwriting the previous one, and the subsequent 

1221 # failure to write the record would then cause the artifact 

1222 # to be removed. Much safer to ask first before attempting to 

1223 # overwrite. Race conditions should not be an issue for the 

1224 # execution butler environment. 

1225 if ref_is_predefined: 

1226 if self.datastore.knows(ref): 

1227 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.") 

1228 

1229 self.datastore.put(obj, ref) 

1230 

1231 return ref 

1232 

1233 def getDirect( 

1234 self, 

1235 ref: DatasetRef, 

1236 *, 

1237 parameters: Optional[Dict[str, Any]] = None, 

1238 storageClass: Optional[Union[StorageClass, str]] = None, 

1239 ) -> Any: 

1240 """Retrieve a stored dataset. 

1241 

1242 Unlike `Butler.get`, this method allows datasets outside the Butler's 

1243 collection to be read as long as the `DatasetRef` that identifies them 

1244 can be obtained separately. 

1245 

1246 Parameters 

1247 ---------- 

1248 ref : `DatasetRef` 

1249 Resolved reference to an already stored dataset. 

1250 parameters : `dict` 

1251 Additional StorageClass-defined options to control reading, 

1252 typically used to efficiently read only a subset of the dataset. 

1253 storageClass : `StorageClass` or `str`, optional 

1254 The storage class to be used to override the Python type 

1255 returned by this method. By default the returned type matches 

1256 the dataset type definition for this dataset. Specifying a 

1257 read `StorageClass` can force a different type to be returned. 

1258 This type must be compatible with the original type. 

1259 

1260 Returns 

1261 ------- 

1262 obj : `object` 

1263 The dataset. 
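
Examples
--------
A sketch pairing this method with a registry query that yields
resolved references (the dataset type name is illustrative)::

    for ref in butler.registry.queryDatasets("calexp", collections=...):
        calexp = butler.getDirect(ref)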

1264 """ 

1265 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1266 

1267 def getDirectDeferred( 

1268 self, 

1269 ref: DatasetRef, 

1270 *, 

1271 parameters: Union[dict, None] = None, 

1272 storageClass: str | StorageClass | None = None, 

1273 ) -> DeferredDatasetHandle: 

1274 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1275 from a resolved `DatasetRef`. 

1276 

1277 Parameters 

1278 ---------- 

1279 ref : `DatasetRef` 

1280 Resolved reference to an already stored dataset. 

1281 parameters : `dict` 

1282 Additional StorageClass-defined options to control reading, 

1283 typically used to efficiently read only a subset of the dataset. 

1284 storageClass : `StorageClass` or `str`, optional 

1285 The storage class to be used to override the Python type 

1286 returned by this method. By default the returned type matches 

1287 the dataset type definition for this dataset. Specifying a 

1288 read `StorageClass` can force a different type to be returned. 

1289 This type must be compatible with the original type. 

1290 

1291 Returns 

1292 ------- 

1293 obj : `DeferredDatasetHandle` 

1294 A handle which can be used to retrieve a dataset at a later time. 

1295 

1296 Raises 

1297 ------ 

1298 AmbiguousDatasetError 

1299 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

1300 """ 

1301 if ref.id is None: 

1302 raise AmbiguousDatasetError( 

1303 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1304 ) 

1305 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1306 

1307 def getDeferred( 

1308 self, 

1309 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1310 dataId: Optional[DataId] = None, 

1311 *, 

1312 parameters: Union[dict, None] = None, 

1313 collections: Any = None, 

1314 storageClass: str | StorageClass | None = None, 

1315 **kwargs: Any, 

1316 ) -> DeferredDatasetHandle: 

1317 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1318 after an immediate registry lookup. 

1319 

1320 Parameters 

1321 ---------- 

1322 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1323 When `DatasetRef` the `dataId` should be `None`. 

1324 Otherwise the `DatasetType` or name thereof. 

1325 dataId : `dict` or `DataCoordinate`, optional 

1326 A `dict` of `Dimension` link name, value pairs that label the 

1327 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1328 should be provided as the first argument. 

1329 parameters : `dict` 

1330 Additional StorageClass-defined options to control reading, 

1331 typically used to efficiently read only a subset of the dataset. 

1332 collections : Any, optional 

1333 Collections to be searched, overriding ``self.collections``. 

1334 Can be any of the types supported by the ``collections`` argument 

1335 to butler construction. 

1336 storageClass : `StorageClass` or `str`, optional 

1337 The storage class to be used to override the Python type 

1338 returned by this method. By default the returned type matches 

1339 the dataset type definition for this dataset. Specifying a 

1340 read `StorageClass` can force a different type to be returned. 

1341 This type must be compatible with the original type. 

1342 **kwargs 

1343 Additional keyword arguments used to augment or construct a 

1344 `DataId`. See `DataId` parameters. 

1345 

1346 Returns 

1347 ------- 

1348 obj : `DeferredDatasetHandle` 

1349 A handle which can be used to retrieve a dataset at a later time. 

1350 

1351 Raises 

1352 ------ 

1353 LookupError 

1354 Raised if no matching dataset exists in the `Registry` (and 

1355 ``allowUnresolved is False``). 

1356 ValueError 

1357 Raised if a resolved `DatasetRef` was passed as an input, but it 

1358 differs from the one found in the registry. 

1359 TypeError 

1360 Raised if no collections were provided. 
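
Examples
--------
A sketch of deferring the actual read; names and data ID values are
illustrative, and the handle's ``get()`` performs the retrieval later::

    handle = butler.getDeferred("calexp", instrument="HSC", visit=903334, detector=50)
    ...
    calexp = handle.get()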

1361 """ 

1362 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1363 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1364 

1365 def get( 

1366 self, 

1367 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1368 dataId: Optional[DataId] = None, 

1369 *, 

1370 parameters: Optional[Dict[str, Any]] = None, 

1371 collections: Any = None, 

1372 storageClass: Optional[Union[StorageClass, str]] = None, 

1373 **kwargs: Any, 

1374 ) -> Any: 

1375 """Retrieve a stored dataset. 

1376 

1377 Parameters 

1378 ---------- 

1379 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1380 When `DatasetRef` the `dataId` should be `None`. 

1381 Otherwise the `DatasetType` or name thereof. 

1382 dataId : `dict` or `DataCoordinate` 

1383 A `dict` of `Dimension` link name, value pairs that label the 

1384 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1385 should be provided as the first argument. 

1386 parameters : `dict` 

1387 Additional StorageClass-defined options to control reading, 

1388 typically used to efficiently read only a subset of the dataset. 

1389 collections : Any, optional 

1390 Collections to be searched, overriding ``self.collections``. 

1391 Can be any of the types supported by the ``collections`` argument 

1392 to butler construction. 

1393 storageClass : `StorageClass` or `str`, optional 

1394 The storage class to be used to override the Python type 

1395 returned by this method. By default the returned type matches 

1396 the dataset type definition for this dataset. Specifying a 

1397 read `StorageClass` can force a different type to be returned. 

1398 This type must be compatible with the original type. 

1399 **kwargs 

1400 Additional keyword arguments used to augment or construct a 

1401 `DataCoordinate`. See `DataCoordinate.standardize` 

1402 parameters. 

1403 

1404 Returns 

1405 ------- 

1406 obj : `object` 

1407 The dataset. 

1408 

1409 Raises 

1410 ------ 

1411 ValueError 

1412 Raised if a resolved `DatasetRef` was passed as an input, but it 

1413 differs from the one found in the registry. 

1414 LookupError 

1415 Raised if no matching dataset exists in the `Registry`. 

1416 TypeError 

1417 Raised if no collections were provided. 

1418 

1419 Notes 

1420 ----- 

1421 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1422 this method requires that the given data ID include temporal dimensions 

1423 beyond the dimensions of the dataset type itself, in order to find the 

1424 dataset with the appropriate validity range. For example, a "bias" 

1425 dataset with native dimensions ``{instrument, detector}`` could be 

1426 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1427 ``exposure`` is a temporal dimension. 

1428 """ 

1429 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1430 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1431 return self.getDirect(ref, parameters=parameters, storageClass=storageClass) 

1432 
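# Illustrative usage sketch (commented; not part of the module source),
# assuming an existing Butler instance ``butler`` with default collections.
# The dataset type name, data ID values, and the "bbox" read parameter are
# hypothetical; valid parameters depend on the dataset's StorageClass.
#
#     calexp = butler.get("calexp", instrument="HSC", visit=903334,
#                         detector=16)
#     # Read only a cutout via a StorageClass-defined parameter, where
#     # ``bbox`` is a region object defined elsewhere:
#     cutout = butler.get("calexp", instrument="HSC", visit=903334,
#                         detector=16, parameters={"bbox": bbox})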

1433 def getURIs( 

1434 self, 

1435 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1436 dataId: Optional[DataId] = None, 

1437 *, 

1438 predict: bool = False, 

1439 collections: Any = None, 

1440 run: Optional[str] = None, 

1441 **kwargs: Any, 

1442 ) -> DatasetRefURIs: 

1443 """Returns the URIs associated with the dataset. 

1444 

1445 Parameters 

1446 ---------- 

1447 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1448 When `DatasetRef` the `dataId` should be `None`. 

1449 Otherwise the `DatasetType` or name thereof. 

1450 dataId : `dict` or `DataCoordinate` 

1451 A `dict` of `Dimension` link name, value pairs that label the 

1452 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1453 should be provided as the first argument. 

1454 predict : `bool` 

1455 If `True`, allow URIs to be returned for datasets that have not

1456 yet been written.

1457 collections : Any, optional 

1458 Collections to be searched, overriding ``self.collections``. 

1459 Can be any of the types supported by the ``collections`` argument 

1460 to butler construction. 

1461 run : `str`, optional 

1462 Run to use for predictions, overriding ``self.run``. 

1463 **kwargs 

1464 Additional keyword arguments used to augment or construct a 

1465 `DataCoordinate`. See `DataCoordinate.standardize` 

1466 parameters. 

1467 

1468 Returns 

1469 ------- 

1470 uris : `DatasetRefURIs` 

1471 The URI to the primary artifact associated with this dataset (if

1472 the dataset was disassembled within the datastore this may be

1473 `None`), and the URIs to any components associated with the dataset

1474 artifact (which can be empty if there are no components).

1475 """ 

1476 ref = self._findDatasetRef( 

1477 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs 

1478 ) 

1479 if ref.id is None: # only possible if predict is True 

1480 if run is None: 

1481 run = self.run 

1482 if run is None: 

1483 raise TypeError("Cannot predict location with run=None.") 

1484 # Lie about ID, because we can't guess it, and only 

1485 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1486 ref = ref.resolved(id=0, run=run) 

1487 return self.datastore.getURIs(ref, predict) 

1488 
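# Illustrative sketch (commented; not part of the module source), assuming an
# existing Butler instance ``butler``. For a disassembled composite the
# primary URI may be `None` while the component mapping is populated. The
# data ID values are hypothetical.
#
#     primary, components = butler.getURIs("calexp", instrument="HSC",
#                                          visit=903334, detector=16)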

1489 def getURI( 

1490 self, 

1491 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1492 dataId: Optional[DataId] = None, 

1493 *, 

1494 predict: bool = False, 

1495 collections: Any = None, 

1496 run: Optional[str] = None, 

1497 **kwargs: Any, 

1498 ) -> ResourcePath: 

1499 """Return the URI to the Dataset. 

1500 

1501 Parameters 

1502 ---------- 

1503 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1504 When `DatasetRef` the `dataId` should be `None`. 

1505 Otherwise the `DatasetType` or name thereof. 

1506 dataId : `dict` or `DataCoordinate` 

1507 A `dict` of `Dimension` link name, value pairs that label the 

1508 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1509 should be provided as the first argument. 

1510 predict : `bool` 

1511 If `True`, allow URIs to be returned for datasets that have not

1512 yet been written.

1513 collections : Any, optional 

1514 Collections to be searched, overriding ``self.collections``. 

1515 Can be any of the types supported by the ``collections`` argument 

1516 to butler construction. 

1517 run : `str`, optional 

1518 Run to use for predictions, overriding ``self.run``. 

1519 **kwargs 

1520 Additional keyword arguments used to augment or construct a 

1521 `DataCoordinate`. See `DataCoordinate.standardize` 

1522 parameters. 

1523 

1524 Returns 

1525 ------- 

1526 uri : `lsst.resources.ResourcePath` 

1527 URI pointing to the Dataset within the datastore. If the 

1528 Dataset does not exist in the datastore, and if ``predict`` is 

1529 `True`, the URI will be a prediction and will include a URI 

1530 fragment "#predicted". 

1531 If the datastore does not have entities that relate well 

1532 to the concept of a URI, the returned URI string will be

1533 descriptive. The returned URI is not guaranteed to be obtainable. 

1534 

1535 Raises 

1536 ------ 

1537 LookupError 

1538 Raised if a URI has been requested for a dataset that does not

1539 exist and guessing is not allowed.

1540 ValueError 

1541 Raised if a resolved `DatasetRef` was passed as an input, but it 

1542 differs from the one found in the registry. 

1543 TypeError 

1544 Raised if no collections were provided. 

1545 RuntimeError 

1546 Raised if a URI is requested for a dataset that consists of 

1547 multiple artifacts. 

1548 """ 

1549 primary, components = self.getURIs( 

1550 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1551 ) 

1552 

1553 if primary is None or components: 

1554 raise RuntimeError( 

1555 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1556 "Use Butler.getURIs() instead." 

1557 ) 

1558 return primary 

1559 
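# Illustrative sketch (commented; not part of the module source), assuming an
# existing Butler instance ``butler``. With predict=True a URI can be
# obtained before the dataset is written; the result carries a "#predicted"
# fragment. The run name and data ID values are hypothetical.
#
#     uri = butler.getURI("calexp", instrument="HSC", visit=903334,
#                         detector=16, predict=True, run="my/run")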

1560 def retrieveArtifacts( 

1561 self, 

1562 refs: Iterable[DatasetRef], 

1563 destination: ResourcePathExpression, 

1564 transfer: str = "auto", 

1565 preserve_path: bool = True, 

1566 overwrite: bool = False, 

1567 ) -> List[ResourcePath]: 

1568 """Retrieve the artifacts associated with the supplied refs. 

1569 

1570 Parameters 

1571 ---------- 

1572 refs : iterable of `DatasetRef` 

1573 The datasets for which artifacts are to be retrieved. 

1574 A single ref can result in multiple artifacts. The refs must 

1575 be resolved. 

1576 destination : `lsst.resources.ResourcePath` or `str` 

1577 Location to write the artifacts. 

1578 transfer : `str`, optional 

1579 Method to use to transfer the artifacts. Must be one of the options 

1580 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1581 "move" is not allowed. 

1582 preserve_path : `bool`, optional 

1583 If `True` the full path of the artifact within the datastore 

1584 is preserved. If `False` the final file component of the path 

1585 is used. 

1586 overwrite : `bool`, optional 

1587 If `True` allow transfers to overwrite existing files at the 

1588 destination. 

1589 

1590 Returns 

1591 ------- 

1592 targets : `list` of `lsst.resources.ResourcePath` 

1593 URIs of file artifacts in destination location. Order is not 

1594 preserved. 

1595 

1596 Notes 

1597 ----- 

1598 For non-file datastores the artifacts written to the destination 

1599 may not match the representation inside the datastore. For example 

1600 a hierarchical data structure in a NoSQL database may well be stored 

1601 as a JSON file. 

1602 """ 

1603 return self.datastore.retrieveArtifacts( 

1604 refs, 

1605 ResourcePath(destination), 

1606 transfer=transfer, 

1607 preserve_path=preserve_path, 

1608 overwrite=overwrite, 

1609 ) 

1610 
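# Illustrative sketch (commented; not part of the module source): copy the
# file artifacts behind a query result to a local directory. The collection
# name and destination path are hypothetical.
#
#     refs = butler.registry.queryDatasets("calexp", collections="my/run")
#     paths = butler.retrieveArtifacts(refs, destination="/tmp/export",
#                                      transfer="copy", preserve_path=True)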

1611 def datasetExists( 

1612 self, 

1613 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1614 dataId: Optional[DataId] = None, 

1615 *, 

1616 collections: Any = None, 

1617 **kwargs: Any, 

1618 ) -> bool: 

1619 """Return True if the Dataset is actually present in the Datastore. 

1620 

1621 Parameters 

1622 ---------- 

1623 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1624 When `DatasetRef` the `dataId` should be `None`. 

1625 Otherwise the `DatasetType` or name thereof. 

1626 dataId : `dict` or `DataCoordinate` 

1627 A `dict` of `Dimension` link name, value pairs that label the 

1628 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1629 should be provided as the first argument. 

1630 collections : Any, optional 

1631 Collections to be searched, overriding ``self.collections``. 

1632 Can be any of the types supported by the ``collections`` argument 

1633 to butler construction. 

1634 **kwargs 

1635 Additional keyword arguments used to augment or construct a 

1636 `DataCoordinate`. See `DataCoordinate.standardize` 

1637 parameters. 

1638 

1639 Raises 

1640 ------ 

1641 LookupError 

1642 Raised if the dataset is not even present in the Registry. 

1643 ValueError 

1644 Raised if a resolved `DatasetRef` was passed as an input, but it 

1645 differs from the one found in the registry. 

1646 TypeError 

1647 Raised if no collections were provided. 

1648 """ 

1649 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1650 return self.datastore.exists(ref) 

1651 
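# Illustrative sketch (commented; not part of the module source), assuming an
# existing Butler instance ``butler``. Note that a dataset unknown to the
# registry raises LookupError rather than returning `False`. The data ID
# values are hypothetical.
#
#     if butler.datasetExists("calexp", instrument="HSC", visit=903334,
#                             detector=16):
#         calexp = butler.get("calexp", instrument="HSC", visit=903334,
#                             detector=16)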

1652 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1653 """Remove one or more `~CollectionType.RUN` collections and the 

1654 datasets within them. 

1655 

1656 Parameters 

1657 ---------- 

1658 names : `Iterable` [ `str` ] 

1659 The names of the collections to remove. 

1660 unstore : `bool`, optional 

1661 If `True` (default), delete datasets from all datastores in which 

1662 they are present, and attempt to roll back the registry deletions if

1663 datastore deletions fail (which may not always be possible). If 

1664 `False`, datastore records for these datasets are still removed, 

1665 but any artifacts (e.g. files) will not be. 

1666 

1667 Raises 

1668 ------ 

1669 TypeError 

1670 Raised if one or more collections are not of type 

1671 `~CollectionType.RUN`. 

1672 """ 

1673 if not self.isWriteable(): 

1674 raise TypeError("Butler is read-only.") 

1675 names = list(names) 

1676 refs: List[DatasetRef] = [] 

1677 for name in names: 

1678 collectionType = self.registry.getCollectionType(name) 

1679 if collectionType is not CollectionType.RUN: 

1680 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1681 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1682 with self.datastore.transaction(): 

1683 with self.registry.transaction(): 

1684 if unstore: 

1685 self.datastore.trash(refs) 

1686 else: 

1687 self.datastore.forget(refs) 

1688 for name in names: 

1689 self.registry.removeCollection(name) 

1690 if unstore: 

1691 # Point of no return for removing artifacts 

1692 self.datastore.emptyTrash() 

1693 
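# Illustrative sketch (commented; not part of the module source): remove two
# RUN collections and their stored artifacts. The repository path and run
# names are hypothetical; a writeable butler is required.
#
#     butler = Butler("/path/to/repo", writeable=True)
#     butler.removeRuns(["u/user/scratch-1", "u/user/scratch-2"],
#                       unstore=True)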

1694 def pruneCollection( 

1695 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None 

1696 ) -> None: 

1697 """Remove a collection and possibly prune datasets within it. 

1698 

1699 Parameters 

1700 ---------- 

1701 name : `str` 

1702 Name of the collection to remove. If this is a 

1703 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1704 datasets within the collection are not modified unless ``unstore`` 

1705 is `True`. If this is a `~CollectionType.RUN` collection, 

1706 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1707 are fully removed from the data repository. 

1708 purge : `bool`, optional 

1709 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1710 fully removing datasets within them. Requires ``unstore=True`` as 

1711 well as an added precaution against accidental deletion. Must be 

1712 `False` (default) if the collection is not a ``RUN``. 

1713 unstore : `bool`, optional

1714 If `True`, remove all datasets in the collection from all 

1715 datastores in which they appear. 

1716 unlink : `list` [`str`], optional

1717 Before removing the given collection, unlink it from these

1718 parent collections.

1719 

1720 Raises 

1721 ------ 

1722 TypeError 

1723 Raised if the butler is read-only or arguments are mutually 

1724 inconsistent. 

1725 """ 

1726 # See pruneDatasets comments for more information about the logic here; 

1727 # the cases are almost the same, but here we can rely on Registry to 

1728 # take care of everything but Datastore deletion when we remove the

1729 # collection. 

1730 if not self.isWriteable(): 

1731 raise TypeError("Butler is read-only.") 

1732 collectionType = self.registry.getCollectionType(name) 

1733 if purge and not unstore: 

1734 raise PurgeWithoutUnstorePruneCollectionsError() 

1735 if collectionType is CollectionType.RUN and not purge: 

1736 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1737 if collectionType is not CollectionType.RUN and purge: 

1738 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1739 

1740 def remove(child: str, parent: str) -> None: 

1741 """Remove a child collection from a parent collection.""" 

1742 # Remove child from parent. 

1743 chain = list(self.registry.getCollectionChain(parent)) 

1744 try: 

1745 chain.remove(name) 

1746 except ValueError as e: 

1747 raise RuntimeError(f"{name} is not a child of {parent}") from e 

1748 self.registry.setCollectionChain(parent, chain) 

1749 

1750 with self.datastore.transaction(): 

1751 with self.registry.transaction(): 

1752 if unlink: 

1753 for parent in unlink: 

1754 remove(name, parent) 

1755 if unstore: 

1756 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1757 self.datastore.trash(refs) 

1758 self.registry.removeCollection(name) 

1759 

1760 if unstore: 

1761 # Point of no return for removing artifacts 

1762 self.datastore.emptyTrash() 

1763 
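# Illustrative sketch (commented; not part of the module source), mirroring
# the argument rules above: a RUN collection requires purge=True and
# unstore=True, while a TAGGED collection can be removed without them.
# Collection names are hypothetical.
#
#     butler.pruneCollection("u/user/old-run", purge=True, unstore=True)
#     butler.pruneCollection("my/tagged-selection", unlink=["my/chain"])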

1764 def pruneDatasets( 

1765 self, 

1766 refs: Iterable[DatasetRef], 

1767 *, 

1768 disassociate: bool = True, 

1769 unstore: bool = False, 

1770 tags: Iterable[str] = (), 

1771 purge: bool = False, 

1772 ) -> None: 

1773 # docstring inherited from LimitedButler 

1774 

1775 if not self.isWriteable(): 

1776 raise TypeError("Butler is read-only.") 

1777 if purge: 

1778 if not disassociate: 

1779 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1780 if not unstore: 

1781 raise TypeError("Cannot pass purge=True without unstore=True.") 

1782 elif disassociate: 

1783 tags = tuple(tags) 

1784 if not tags: 

1785 raise TypeError("No tags provided but disassociate=True.") 

1786 for tag in tags: 

1787 collectionType = self.registry.getCollectionType(tag) 

1788 if collectionType is not CollectionType.TAGGED: 

1789 raise TypeError( 

1790 f"Cannot disassociate from collection '{tag}' " 

1791 f"of non-TAGGED type {collectionType.name}." 

1792 ) 

1793 # For an execution butler we want to keep existing UUIDs for the 

1794 # datasets, for that we need to keep them in the collections but 

1795 # remove from datastore. 

1796 if self._allow_put_of_predefined_dataset and purge: 

1797 purge = False 

1798 disassociate = False 

1799 # Transform possibly-single-pass iterable into something we can iterate 

1800 # over multiple times. 

1801 refs = list(refs) 

1802 # Pruning a component of a DatasetRef makes no sense since the registry

1803 # doesn't know about components and the datastore might not store

1804 # components in a separate file.

1805 for ref in refs: 

1806 if ref.datasetType.component(): 

1807 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1808 # We don't need an unreliable Datastore transaction for this, because 

1809 # we've been extra careful to ensure that Datastore.trash only involves 

1810 # mutating the Registry (it can _look_ at Datastore-specific things, 

1811 # but shouldn't change them), and hence all operations here are 

1812 # Registry operations. 

1813 with self.datastore.transaction(): 

1814 with self.registry.transaction(): 

1815 if unstore: 

1816 self.datastore.trash(refs) 

1817 if purge: 

1818 self.registry.removeDatasets(refs) 

1819 elif disassociate: 

1820 assert tags, "Guaranteed by earlier logic in this function." 

1821 for tag in tags: 

1822 self.registry.disassociate(tag, refs) 

1823 # We've exited the Registry transaction, and apparently committed. 

1824 # (if there was an exception, everything rolled back, and it's as if 

1825 # nothing happened - and we never get here). 

1826 # Datastore artifacts are not yet gone, but they're clearly marked 

1827 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1828 # problems we can try again later, and if manual administrative 

1829 # intervention is required, it's pretty clear what that should entail: 

1830 # deleting everything on disk and in private Datastore tables that is 

1831 # in the dataset_location_trash table. 

1832 if unstore: 

1833 # Point of no return for removing artifacts 

1834 self.datastore.emptyTrash() 

1835 
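# Illustrative sketch (commented; not part of the module source): fully
# remove the datasets matched by a query, deleting both registry entries and
# datastore artifacts. The dataset type and collection name are hypothetical.
#
#     refs = butler.registry.queryDatasets("calexp", collections="my/run")
#     butler.pruneDatasets(refs, purge=True, unstore=True)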

1836 @transactional 

1837 def ingest( 

1838 self, 

1839 *datasets: FileDataset, 

1840 transfer: Optional[str] = "auto", 

1841 run: Optional[str] = None, 

1842 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1843 record_validation_info: bool = True, 

1844 ) -> None: 

1845 """Store and register one or more datasets that already exist on disk. 

1846 

1847 Parameters 

1848 ---------- 

1849 datasets : `FileDataset` 

1850 Each positional argument is a struct containing information about 

1851 a file to be ingested, including its URI (either absolute or 

1852 relative to the datastore root, if applicable), a `DatasetRef`, 

1853 and optionally a formatter class or its fully-qualified string 

1854 name. If a formatter is not provided, the formatter that would be 

1855 used for `put` is assumed. On successful return, all 

1856 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1857 attribute populated and all `FileDataset.formatter` attributes will 

1858 be set to the formatter class used. `FileDataset.path` attributes 

1859 may be modified to put paths in whatever the datastore considers a 

1860 standardized form. 

1861 transfer : `str`, optional 

1862 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1863 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1864 transfer the file. 

1865 run : `str`, optional 

1866 The name of the run ingested datasets should be added to, 

1867 overriding ``self.run``. 

1868 idGenerationMode : `DatasetIdGenEnum`, optional 

1869 Specifies option for generating dataset IDs. By default unique IDs 

1870 are generated for each inserted dataset. 

1871 record_validation_info : `bool`, optional 

1872 If `True`, the default, the datastore can record validation 

1873 information associated with the file. If `False` the datastore 

1874 will not attempt to track any information such as checksums 

1875 or file sizes. This can be useful if such information is tracked 

1876 in an external system or if the file is to be compressed in place. 

1877 It is up to the datastore whether this parameter is relevant. 

1878 

1879 Raises 

1880 ------ 

1881 TypeError 

1882 Raised if the butler is read-only or if no run was provided. 

1883 NotImplementedError 

1884 Raised if the `Datastore` does not support the given transfer mode. 

1885 DatasetTypeNotSupportedError 

1886 Raised if one or more files to be ingested have a dataset type that 

1887 is not supported by the `Datastore`.

1888 FileNotFoundError 

1889 Raised if one of the given files does not exist. 

1890 FileExistsError 

1891 Raised if transfer is not `None` but the (internal) location the 

1892 file would be moved to is already occupied. 

1893 

1894 Notes 

1895 ----- 

1896 This operation is not fully exception safe: if a database operation 

1897 fails, the given `FileDataset` instances may be only partially updated. 

1898 

1899 It is atomic in terms of database operations (they will either all 

1900 succeed or all fail) providing the database engine implements 

1901 transactions correctly. It will attempt to be atomic in terms of 

1902 filesystem operations as well, but this cannot be implemented 

1903 rigorously for most datastores. 

1904 """ 

1905 if not self.isWriteable(): 

1906 raise TypeError("Butler is read-only.") 

1907 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1908 # Reorganize the inputs so they're grouped by DatasetType and then 

1909 # data ID. We also include a list of DatasetRefs for each FileDataset 

1910 # to hold the resolved DatasetRefs returned by the Registry, before 

1911 # it's safe to swap them into FileDataset.refs. 

1912 # Some type annotation aliases to make that clearer: 

1913 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1914 GroupedData = MutableMapping[DatasetType, GroupForType] 

1915 # The actual data structure: 

1916 groupedData: GroupedData = defaultdict(dict) 

1917 # And the nested loop that populates it: 

1918 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1919 # This list intentionally shared across the inner loop, since it's 

1920 # associated with `dataset`. 

1921 resolvedRefs: List[DatasetRef] = [] 

1922 

1923 # Somewhere to store pre-existing refs if we have an 

1924 # execution butler. 

1925 existingRefs: List[DatasetRef] = [] 

1926 

1927 for ref in dataset.refs: 

1928 if ref.dataId in groupedData[ref.datasetType]: 

1929 raise ConflictingDefinitionError( 

1930 f"Ingest conflict. Dataset {dataset.path} has same" 

1931 " DataId as other ingest dataset" 

1932 f" {groupedData[ref.datasetType][ref.dataId][0].path} " 

1933 f" ({ref.dataId})" 

1934 ) 

1935 if self._allow_put_of_predefined_dataset: 

1936 existing_ref = self.registry.findDataset( 

1937 ref.datasetType, dataId=ref.dataId, collections=run 

1938 ) 

1939 if existing_ref: 

1940 if self.datastore.knows(existing_ref): 

1941 raise ConflictingDefinitionError( 

1942 f"Dataset associated with path {dataset.path}" 

1943 f" already exists as {existing_ref}." 

1944 ) 

1945 # Store this ref elsewhere since it already exists 

1946 # and we do not want to remake it but we do want 

1947 # to store it in the datastore. 

1948 existingRefs.append(existing_ref) 

1949 

1950 # Nothing else to do until we have finished 

1951 # iterating. 

1952 continue 

1953 

1954 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1955 

1956 if existingRefs: 

1957 

1958 if len(dataset.refs) != len(existingRefs): 

1959 # Keeping track of partially pre-existing datasets is hard 

1960 # and should generally never happen. For now don't allow 

1961 # it. 

1962 raise ConflictingDefinitionError( 

1963 f"For dataset {dataset.path} some dataIds already exist" 

1964 " in registry but others do not. This is not supported." 

1965 ) 

1966 

1967 # Attach the resolved refs if we found them. 

1968 dataset.refs = existingRefs 

1969 

1970 # Now we can bulk-insert into Registry for each DatasetType. 

1971 for datasetType, groupForType in progress.iter_item_chunks( 

1972 groupedData.items(), desc="Bulk-inserting datasets by type" 

1973 ): 

1974 refs = self.registry.insertDatasets( 

1975 datasetType, 

1976 dataIds=groupForType.keys(), 

1977 run=run, 

1978 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1979 idGenerationMode=idGenerationMode, 

1980 ) 

1981 # Append those resolved DatasetRefs to the new lists we set up for 

1982 # them. 

1983 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1984 resolvedRefs.append(ref) 

1985 

1986 # Go back to the original FileDatasets to replace their refs with the 

1987 # new resolved ones. 

1988 for groupForType in progress.iter_chunks( 

1989 groupedData.values(), desc="Reassociating resolved dataset refs with files" 

1990 ): 

1991 for dataset, resolvedRefs in groupForType.values(): 

1992 dataset.refs = resolvedRefs 

1993 

1994 # Bulk-insert everything into Datastore. 

1995 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 

1996 
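# Illustrative sketch (commented; not part of the module source): ingest one
# existing file by wrapping it in a FileDataset with an unresolved
# DatasetRef. The file path, dataset type, data ID, and run are hypothetical,
# the dataset type is assumed to be registered already, and a plain dict is
# used for the data ID for brevity.
#
#     datasetType = butler.registry.getDatasetType("raw")
#     dataId = {"instrument": "HSC", "exposure": 903334, "detector": 16}
#     butler.ingest(
#         FileDataset(path="/data/raw.fits",
#                     refs=DatasetRef(datasetType, dataId)),
#         transfer="copy",
#         run="HSC/raw/all",
#     )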

1997 @contextlib.contextmanager 

1998 def export( 

1999 self, 

2000 *, 

2001 directory: Optional[str] = None, 

2002 filename: Optional[str] = None, 

2003 format: Optional[str] = None, 

2004 transfer: Optional[str] = None, 

2005 ) -> Iterator[RepoExportContext]: 

2006 """Export datasets from the repository represented by this `Butler`. 

2007 

2008 This method is a context manager that returns a helper object 

2009 (`RepoExportContext`) that is used to indicate what information from 

2010 the repository should be exported. 

2011 

2012 Parameters 

2013 ---------- 

2014 directory : `str`, optional 

2015 Directory dataset files should be written to if ``transfer`` is not 

2016 `None`. 

2017 filename : `str`, optional 

2018 Name for the file that will include database information associated 

2019 with the exported datasets. If this is not an absolute path and 

2020 ``directory`` is not `None`, it will be written to ``directory`` 

2021 instead of the current working directory. Defaults to 

2022 "export.{format}". 

2023 format : `str`, optional 

2024 File format for the database information file. If `None`, the 

2025 extension of ``filename`` will be used. 

2026 transfer : `str`, optional 

2027 Transfer mode passed to `Datastore.export`. 

2028 

2029 Raises 

2030 ------ 

2031 TypeError 

2032 Raised if the set of arguments passed is inconsistent. 

2033 

2034 Examples 

2035 -------- 

2036 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

2037 methods are used to provide the iterables over data IDs and/or datasets 

2038 to be exported:: 

2039 

2040 with butler.export("exports.yaml") as export: 

2041 # Export all flats, but none of the dimension element rows 

2042 # (i.e. data ID information) associated with them. 

2043 export.saveDatasets(butler.registry.queryDatasets("flat"), 

2044 elements=()) 

2045 # Export all datasets that start with "deepCoadd_" and all of 

2046 # their associated data ID information. 

2047 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2048 """ 

2049 if directory is None and transfer is not None: 

2050 raise TypeError("Cannot transfer without providing a directory.") 

2051 if transfer == "move": 

2052 raise TypeError("Transfer may not be 'move': export is read-only") 

2053 if format is None: 

2054 if filename is None: 

2055 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2056 else: 

2057 _, format = os.path.splitext(filename) 

2058 elif filename is None: 

2059 filename = f"export.{format}" 

2060 if directory is not None: 

2061 filename = os.path.join(directory, filename) 

2062 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"]) 

2063 with open(filename, "w") as stream: 

2064 backend = BackendClass(stream, universe=self.registry.dimensions) 

2065 try: 

2066 helper = RepoExportContext( 

2067 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

2068 ) 

2069 yield helper 

2070 except BaseException: 

2071 raise 

2072 else: 

2073 helper._finish() 

2074 

2075 def import_( 

2076 self, 

2077 *, 

2078 directory: Optional[str] = None, 

2079 filename: Union[str, TextIO, None] = None, 

2080 format: Optional[str] = None, 

2081 transfer: Optional[str] = None, 

2082 skip_dimensions: Optional[Set] = None, 

2083 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

2084 reuseIds: bool = False, 

2085 ) -> None: 

2086 """Import datasets into this repository that were exported from a 

2087 different butler repository via `~lsst.daf.butler.Butler.export`. 

2088 

2089 Parameters 

2090 ---------- 

2091 directory : `str`, optional 

2092 Directory containing dataset files to import from. If `None`, 

2093 ``filename`` and all dataset file paths specified therein must 

2094 be absolute. 

2095 filename : `str` or `TextIO`, optional 

2096 A stream or name of file that contains database information 

2097 associated with the exported datasets, typically generated by 

2098 `~lsst.daf.butler.Butler.export`. If this is a string (name) and

2099 is not an absolute path, does not exist in the current working 

2100 directory, and ``directory`` is not `None`, it is assumed to be in 

2101 ``directory``. Defaults to "export.{format}". 

2102 format : `str`, optional 

2103 File format for ``filename``. If `None`, the extension of 

2104 ``filename`` will be used. 

2105 transfer : `str`, optional 

2106 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2107 skip_dimensions : `set`, optional 

2108 Names of dimensions that should be skipped and not imported. 

2109 idGenerationMode : `DatasetIdGenEnum`, optional 

2110 Specifies option for generating dataset IDs when IDs are not 

2111 provided or their type does not match backend type. By default 

2112 unique IDs are generated for each inserted dataset. 

2113 reuseIds : `bool`, optional 

2114 If `True`, force re-use of imported dataset IDs for integer

2115 IDs, which are normally generated as auto-incremented; an exception

2116 will be raised if imported IDs clash with existing ones. This

2117 option has no effect on the use of globally-unique IDs, which are

2118 always re-used (or generated if integer IDs are being imported).

2119 

2120 Raises 

2121 ------ 

2122 TypeError 

2123 Raised if the set of arguments passed is inconsistent, or if the 

2124 butler is read-only. 

2125 """ 

2126 if not self.isWriteable(): 

2127 raise TypeError("Butler is read-only.") 

2128 if format is None: 

2129 if filename is None: 

2130 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2131 else: 

2132 _, format = os.path.splitext(filename) # type: ignore 

2133 elif filename is None: 

2134 filename = f"export.{format}" 

2135 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

2136 filename = os.path.join(directory, filename) 

2137 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

2138 

2139 def doImport(importStream: TextIO) -> None: 

2140 backend = BackendClass(importStream, self.registry) 

2141 backend.register() 

2142 with self.transaction(): 

2143 backend.load( 

2144 self.datastore, 

2145 directory=directory, 

2146 transfer=transfer, 

2147 skip_dimensions=skip_dimensions, 

2148 idGenerationMode=idGenerationMode, 

2149 reuseIds=reuseIds, 

2150 ) 

2151 

2152 if isinstance(filename, str): 

2153 with open(filename, "r") as stream: 

2154 doImport(stream) 

2155 else: 

2156 doImport(filename) 

2157 
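# Illustrative sketch (commented; not part of the module source): import an
# export file previously produced by Butler.export() into this repository.
# The directory and filename are hypothetical; the format is inferred from
# the filename extension.
#
#     butler.import_(directory="/path/to/exported/files",
#                    filename="export.yaml", transfer="copy")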

2158 def transfer_from( 

2159 self, 

2160 source_butler: Butler, 

2161 source_refs: Iterable[DatasetRef], 

2162 transfer: str = "auto", 

2163 id_gen_map: Dict[str, DatasetIdGenEnum] | None = None, 

2164 skip_missing: bool = True, 

2165 register_dataset_types: bool = False, 

2166 transfer_dimensions: bool = False, 

2167 ) -> List[DatasetRef]: 

2168 """Transfer datasets to this Butler from a run in another Butler. 

2169 

2170 Parameters 

2171 ---------- 

2172 source_butler : `Butler` 

2173 Butler from which the datasets are to be transferred. 

2174 source_refs : iterable of `DatasetRef` 

2175 Datasets defined in the source butler that should be transferred to 

2176 this butler. 

2177 transfer : `str`, optional 

2178 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2179 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional 

2180 A mapping of dataset type to ID generation mode. Only used if 

2181 the source butler is using integer IDs. Should not be used 

2182 if the receiving butler uses integer IDs. Without this, dataset

2183 import always uses `DatasetIdGenEnum.UNIQUE`.

2184 skip_missing : `bool` 

2185 If `True`, datasets with no datastore artifact associated with 

2186 them are not transferred. If `False` a registry entry will be 

2187 created even if no datastore record is created (and so will 

2188 look equivalent to the dataset being unstored). 

2189 register_dataset_types : `bool` 

2190 If `True` any missing dataset types are registered. Otherwise 

2191 an exception is raised. 

2192 transfer_dimensions : `bool`, optional 

2193 If `True`, dimension record data associated with the new datasets 

2194 will be transferred. 

2195 

2196 Returns 

2197 ------- 

2198 refs : `list` of `DatasetRef` 

2199 The refs added to this Butler. 

2200 

2201 Notes 

2202 ----- 

2203 Requires that any dimension definitions are already present in the 

2204 receiving Butler. The datastore artifact has to exist for a transfer 

2205 to be made but non-existence is not an error. 

2206 

2207 Datasets that already exist in this run will be skipped. 

2208 

2209 The datasets are imported as part of a transaction, although 

2210 dataset types are registered before the transaction is started. 

2211 This means that it is possible for a dataset type to be registered 

2212 even though transfer has failed. 

2213 """ 

2214 if not self.isWriteable(): 

2215 raise TypeError("Butler is read-only.") 

2216 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2217 

2218 # Will iterate through the refs multiple times so need to convert 

2219 # to a list if this isn't a collection. 

2220 if not isinstance(source_refs, collections.abc.Collection): 

2221 source_refs = list(source_refs) 

2222 

2223 original_count = len(source_refs) 

2224 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2225 

2226 if id_gen_map is None: 

2227 id_gen_map = {} 

2228 

2229 # In some situations the datastore artifact may be missing 

2230 # and we do not want that registry entry to be imported. 

2231 # Asking datastore is not sufficient, the records may have been 

2232 # purged, we have to ask for the (predicted) URI and check 

2233 # existence explicitly. Execution butler is set up exactly like 

2234 # this with no datastore records. 

2235 artifact_existence: Dict[ResourcePath, bool] = {} 

2236 if skip_missing: 

2237 dataset_existence = source_butler.datastore.mexists( 

2238 source_refs, artifact_existence=artifact_existence 

2239 ) 

2240 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2241 filtered_count = len(source_refs) 

2242 log.verbose( 

2243 "%d datasets removed because the artifact does not exist. Now have %d.", 

2244 original_count - filtered_count, 

2245 filtered_count, 

2246 ) 

2247 

2248 # Importing requires that we group the refs by dataset type and run 

2249 # before doing the import. 

2250 source_dataset_types = set() 

2251 grouped_refs = defaultdict(list) 

2252 grouped_indices = defaultdict(list) 

2253 for i, ref in enumerate(source_refs): 

2254 grouped_refs[ref.datasetType, ref.run].append(ref) 

2255 grouped_indices[ref.datasetType, ref.run].append(i) 

2256 source_dataset_types.add(ref.datasetType) 

2257 

2258 # Check to see if the dataset type in the source butler has 

2259 # the same definition in the target butler and register missing 

2260 # ones if requested. Registration must happen outside a transaction. 

2261 newly_registered_dataset_types = set() 

2262 for datasetType in source_dataset_types: 

2263 if register_dataset_types: 

2264 # Let this raise immediately if inconsistent. Continuing 

2265 # on to find additional inconsistent dataset types 

2266 # might result in additional unwanted dataset types being 

2267 # registered. 

2268 if self.registry.registerDatasetType(datasetType): 

2269 newly_registered_dataset_types.add(datasetType) 

2270 else: 

2271 # If the dataset type is missing, let it fail immediately. 

2272 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2273 if target_dataset_type != datasetType: 

2274 raise ConflictingDefinitionError( 

2275 "Source butler dataset type differs from definition" 

2276 f" in target butler: {datasetType} !=" 

2277 f" {target_dataset_type}" 

2278 ) 

2279 if newly_registered_dataset_types: 

2280 # We may have registered some even if there were inconsistencies 

2281 # but should let people know (or else remove them again). 

2282 log.log( 

2283 VERBOSE, 

2284 "Registered the following dataset types in the target Butler: %s", 

2285 ", ".join(d.name for d in newly_registered_dataset_types), 

2286 ) 

2287 else: 

2288 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2289 

2290 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2291 if transfer_dimensions: 

2292 # Collect all the dimension records for these refs. 

2293 # All dimensions are to be copied but the list of valid dimensions 

2294 # come from this butler's universe. 

2295 elements = frozenset( 

2296 element 

2297 for element in self.registry.dimensions.getStaticElements() 

2298 if element.hasTable() and element.viewOf is None 

2299 ) 

2300 dataIds = set(ref.dataId for ref in source_refs) 

2301 # This logic comes from saveDataIds. 

2302 for dataId in dataIds: 

2303 # Should be a no-op if the ref has already been expanded. 

2304 dataId = source_butler.registry.expandDataId(dataId) 

2305 # If this butler doesn't know about a dimension in the source 

2306 # butler things will break later. 

2307 for record in dataId.records.values(): 

2308 if record is not None and record.definition in elements: 

2309 dimension_records[record.definition].setdefault(record.dataId, record) 

2310 

2311 # The returned refs should be identical for UUIDs. 

2312 # For now must also support integers and so need to retain the 

2313 # newly-created refs from this registry. 

2314 # Pre-size it so we can assign refs into the correct slots 

2315 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs) 

2316 default_id_gen = DatasetIdGenEnum.UNIQUE 

2317 

2318 handled_collections: Set[str] = set() 

2319 

2320 # Do all the importing in a single transaction. 

2321 with self.transaction(): 

2322 if dimension_records: 

2323 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2324 for element, r in dimension_records.items(): 

2325 records = [r[dataId] for dataId in r] 

2326 # Assume that if the record is already present that we can 

2327 # use it without having to check that the record metadata 

2328 # is consistent. 

2329 self.registry.insertDimensionData(element, *records, skip_existing=True) 

2330 

2331 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2332 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2333 ): 

2334 if run not in handled_collections: 

2335 run_doc = source_butler.registry.getCollectionDocumentation(run) 

2336 registered = self.registry.registerRun(run, doc=run_doc) 

2337 handled_collections.add(run) 

2338 if registered: 

2339 log.log(VERBOSE, "Creating output run %s", run) 

2340 

2341 id_generation_mode = default_id_gen 

2342 if isinstance(refs_to_import[0].id, int): 

2343 # ID generation mode might need to be overridden when 

2344 # targeting UUID

2345 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen) 

2346 

2347 n_refs = len(refs_to_import) 

2348 log.verbose( 

2349 "Importing %d ref%s of dataset type %s into run %s", 

2350 n_refs, 

2351 "" if n_refs == 1 else "s", 

2352 datasetType.name, 

2353 run, 

2354 ) 

2355 

2356 # No way to know if this butler's registry uses UUID. 

2357 # We have to trust the caller on this. If it fails they will 

2358 # have to change their approach. We can't catch the exception 

2359 # and retry with unique because that will mess up the 

2360 # transaction handling. We aren't allowed to ask the registry 

2361 # manager what type of ID it is using. 

2362 imported_refs = self.registry._importDatasets( 

2363 refs_to_import, idGenerationMode=id_generation_mode, expand=False 

2364 ) 

2365 

2366 # Map them into the correct slots to match the initial order 

2367 for i, ref in zip(grouped_indices[datasetType, run], imported_refs): 

2368 transferred_refs_tmp[i] = ref 

2369 

2370 # Mypy insists that we might have None in here so we have to make 

2371 # that explicit by assigning to a new variable and filtering out 

2372 # something that won't be there. 

2373 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None] 

2374 

2375 # Check consistency 

2376 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given" 

2377 

2378 log.verbose("Imported %d datasets into destination butler", len(transferred_refs)) 

2379 

2380 # The transferred refs need to be reordered to match the original 

2381 # ordering given by the caller. Without this the datastore transfer 

2382 # will be broken. 

2383 

2384 # Ask the datastore to transfer. The datastore has to check that 

2385 # the source datastore is compatible with the target datastore. 

2386 self.datastore.transfer_from( 

2387 source_butler.datastore, 

2388 source_refs, 

2389 local_refs=transferred_refs, 

2390 transfer=transfer, 

2391 artifact_existence=artifact_existence, 

2392 ) 

2393 

2394 return transferred_refs 

2395 
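# Illustrative sketch (commented; not part of the module source): transfer a
# query result from another repository, registering any missing dataset
# types and copying the associated dimension records. Repository paths and
# the collection name are hypothetical.
#
#     source = Butler("/path/to/source/repo")
#     refs = source.registry.queryDatasets("calexp", collections="some/run")
#     target = Butler("/path/to/target/repo", writeable=True)
#     target.transfer_from(source, refs, transfer="copy",
#                          register_dataset_types=True,
#                          transfer_dimensions=True)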

2396 def validateConfiguration( 

2397 self, 

2398 logFailures: bool = False, 

2399 datasetTypeNames: Optional[Iterable[str]] = None, 

2400 ignore: Iterable[str] | None = None, 

2401 ) -> None: 

2402 """Validate butler configuration. 

2403 

2404 Checks that each `DatasetType` can be stored in the `Datastore`. 

2405 

2406 Parameters 

2407 ---------- 

2408 logFailures : `bool`, optional 

2409 If `True`, output a log message for every validation error 

2410 detected. 

2411 datasetTypeNames : iterable of `str`, optional 

2412 The `DatasetType` names that should be checked. This allows 

2413 only a subset to be selected. 

2414 ignore : iterable of `str`, optional 

2415 Names of DatasetTypes to skip over. This can be used to skip 

2416 known problems. If a named `DatasetType` corresponds to a 

2417 composite, all components of that `DatasetType` will also be 

2418 ignored. 

2419 

2420 Raises 

2421 ------ 

2422 ButlerValidationError 

2423 Raised if there is some inconsistency with how this Butler 

2424 is configured. 

2425 """ 

2426 if datasetTypeNames: 

2427 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2428 else: 

2429 datasetTypes = list(self.registry.queryDatasetTypes()) 

2430 

2431 # filter out anything from the ignore list 

2432 if ignore: 

2433 ignore = set(ignore) 

2434 datasetTypes = [ 

2435 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2436 ] 

2437 else: 

2438 ignore = set() 

2439 

2440 # Find all the registered instruments 

2441 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2442 

2443 # For each datasetType that has an instrument dimension, create 

2444 # a DatasetRef for each defined instrument 

2445 datasetRefs = [] 

2446 

2447 for datasetType in datasetTypes: 

2448 if "instrument" in datasetType.dimensions: 

2449 for instrument in instruments: 

2450 datasetRef = DatasetRef( 

2451 datasetType, {"instrument": instrument}, conform=False # type: ignore 

2452 ) 

2453 datasetRefs.append(datasetRef) 

2454 

2455 entities: List[Union[DatasetType, DatasetRef]] = [] 

2456 entities.extend(datasetTypes) 

2457 entities.extend(datasetRefs) 

2458 

2459 datastoreErrorStr = None 

2460 try: 

2461 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2462 except ValidationError as e: 

2463 datastoreErrorStr = str(e) 

2464 

2465 # Also check that the LookupKeys used by the datastores match 

2466 # registry and storage class definitions 

2467 keys = self.datastore.getLookupKeys() 

2468 

2469 failedNames = set() 

2470 failedDataId = set() 

2471 for key in keys: 

2472 if key.name is not None: 

2473 if key.name in ignore: 

2474 continue 

2475 

2476 # skip if specific datasetType names were requested and this 

2477 # name does not match 

2478 if datasetTypeNames and key.name not in datasetTypeNames: 

2479 continue 

2480 

2481 # See if it is a StorageClass or a DatasetType 

2482 if key.name in self.storageClasses: 

2483 pass 

2484 else: 

2485 try: 

2486 self.registry.getDatasetType(key.name) 

2487 except KeyError: 

2488 if logFailures: 

2489 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2490 failedNames.add(key) 

2491 else: 

2492 # Dimensions are checked for consistency when the Butler 

2493 # is created and rendezvoused with a universe. 

2494 pass 

2495 

2496 # Check that the instrument is a valid instrument 

2497 # Currently only support instrument so check for that 

2498 if key.dataId: 

2499 dataIdKeys = set(key.dataId) 

2500 if set(["instrument"]) != dataIdKeys: 

2501 if logFailures: 

2502 log.critical("Key '%s' has unsupported DataId override", key) 

2503 failedDataId.add(key) 

2504 elif key.dataId["instrument"] not in instruments: 

2505 if logFailures: 

2506 log.critical("Key '%s' has unknown instrument", key) 

2507 failedDataId.add(key) 

2508 

2509 messages = [] 

2510 

2511 if datastoreErrorStr: 

2512 messages.append(datastoreErrorStr) 

2513 

2514 for failed, msg in ( 

2515 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2516 (failedDataId, "Keys with bad DataId entries: "), 

2517 ): 

2518 if failed: 

2519 msg += ", ".join(str(k) for k in failed) 

2520 messages.append(msg) 

2521 

2522 if messages: 

2523 raise ValidationError(";\n".join(messages)) 

2524 
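# Illustrative sketch (commented; not part of the module source): validate a
# subset of dataset types, logging each failure and skipping a
# known-problematic type. The dataset type names are hypothetical.
#
#     try:
#         butler.validateConfiguration(logFailures=True,
#                                      datasetTypeNames=["calexp", "raw"],
#                                      ignore=["packages"])
#     except ValidationError as err:
#         print(f"Configuration problems:\n{err}")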

2525 @property 

2526 def collections(self) -> Sequence[str]: 

2527 """The collections to search by default, in order 

2528 (`Sequence` [ `str` ]). 

2529 

2530 This is an alias for ``self.registry.defaults.collections``. It cannot 

2531 be set directly in isolation, but all defaults may be changed together 

2532 by assigning a new `RegistryDefaults` instance to 

2533 ``self.registry.defaults``. 

2534 """ 

2535 return self.registry.defaults.collections 

2536 

2537 @property 

2538 def run(self) -> Optional[str]: 

2539 """Name of the run this butler writes outputs to by default (`str` or 

2540 `None`). 

2541 

2542 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2543 directly in isolation, but all defaults may be changed together by 

2544 assigning a new `RegistryDefaults` instance to 

2545 ``self.registry.defaults``. 

2546 """ 

2547 return self.registry.defaults.run 

2548 

2549 @property 

2550 def dimensions(self) -> DimensionUniverse: 

2551 # Docstring inherited. 

2552 return self.registry.dimensions 

2553 

2554 registry: Registry 

2555 """The object that manages dataset metadata and relationships (`Registry`). 

2556 

2557 Most operations that don't involve reading or writing butler datasets are 

2558 accessible only via `Registry` methods. 

2559 """ 

2560 

2561 datastore: Datastore 

2562 """The object that manages actual dataset storage (`Datastore`). 

2563 

2564 Direct user access to the datastore should rarely be necessary; the primary 

2565 exception is the case where a `Datastore` implementation provides extra 

2566 functionality beyond what the base class defines. 

2567 """ 

2568 

2569 storageClasses: StorageClassFactory 

2570 """An object that maps known storage class names to objects that fully 

2571 describe them (`StorageClassFactory`). 

2572 """ 

2573 

2574 _allow_put_of_predefined_dataset: bool 

2575 """Allow a put to succeed even if there is already a registry entry for it 

2576 but not a datastore record. (`bool`)."""