Coverage for python/lsst/daf/butler/_butler.py: 11%

724 statements  

coverage.py v7.3.1, created at 2023-10-02 08:00 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Butler top level classes. 

29""" 

30from __future__ import annotations 

31 

32__all__ = ( 

33 "Butler", 

34 "ButlerValidationError", 

35) 

36 

37import collections.abc 

38import contextlib 

39import io 

40import logging 

41import numbers 

42import os 

43import warnings 

44from collections import Counter, defaultdict 

45from collections.abc import Iterable, Iterator, MutableMapping, Sequence 

46from typing import TYPE_CHECKING, Any, ClassVar, TextIO 

47 

48from deprecated.sphinx import deprecated 

49from lsst.resources import ResourcePath, ResourcePathExpression 

50from lsst.utils import doImportType 

51from lsst.utils.introspection import get_class_of 

52from lsst.utils.logging import VERBOSE, getLogger 

53from sqlalchemy.exc import IntegrityError 

54 

55from ._butlerConfig import ButlerConfig 

56from ._butlerRepoIndex import ButlerRepoIndex 

57from ._dataset_existence import DatasetExistence 

58from ._deferredDatasetHandle import DeferredDatasetHandle 

59from ._limited_butler import LimitedButler 

60from ._registry_shim import RegistryShim 

61from .core import ( 

62 Config, 

63 ConfigSubset, 

64 DataCoordinate, 

65 DataId, 

66 DataIdValue, 

67 DatasetIdGenEnum, 

68 DatasetRef, 

69 DatasetRefURIs, 

70 DatasetType, 

71 Datastore, 

72 Dimension, 

73 DimensionConfig, 

74 DimensionElement, 

75 DimensionRecord, 

76 DimensionUniverse, 

77 FileDataset, 

78 NullDatastore, 

79 Progress, 

80 StorageClass, 

81 StorageClassFactory, 

82 Timespan, 

83 ValidationError, 

84) 

85from .core.repoRelocation import BUTLER_ROOT_TAG 

86from .core.utils import transactional 

87from .registry import ( 

88 CollectionType, 

89 ConflictingDefinitionError, 

90 DataIdError, 

91 MissingDatasetTypeError, 

92 NoDefaultCollectionError, 

93 Registry, 

94 RegistryConfig, 

95 RegistryDefaults, 

96 _ButlerRegistry, 

97 _RegistryFactory, 

98) 

99from .transfers import RepoExportContext 

100 

101if TYPE_CHECKING: 

102 from lsst.resources import ResourceHandleProtocol 

103 

104 from .transfers import RepoImportBackend 

105 

106log = getLogger(__name__) 

107 

108 

109class ButlerValidationError(ValidationError): 

110 """There is a problem with the Butler configuration.""" 

111 

112 pass 

113 

114 

115class Butler(LimitedButler): 

116 """Main entry point for the data access system. 

117 

118 Parameters 

119 ---------- 

120 config : `ButlerConfig`, `Config` or `str`, optional

121 Configuration. Anything acceptable to the 

122 `ButlerConfig` constructor. If a directory path 

123 is given the configuration will be read from a ``butler.yaml`` file in 

124 that location. If `None` is given default values will be used. 

125 butler : `Butler`, optional

126 If provided, construct a new Butler that uses the same registry and 

127 datastore as the given one, but with the given collection and run. 

128 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

129 arguments. 

130 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional 

131 An expression specifying the collections to be searched (in order) when 

132 reading datasets. 

133 This may be a `str` collection name or an iterable thereof. 

134 See :ref:`daf_butler_collection_expressions` for more information. 

135 These collections are not registered automatically and must be

136 registered manually before any method uses them, though they may be

137 registered after the `Butler` is initialized.

138 run : `str`, optional 

139 Name of the `~CollectionType.RUN` collection new datasets should be 

140 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

141 ``collections`` will be set to ``[run]``. If not `None`, this 

142 collection will automatically be registered. If this is not set (and 

143 ``writeable`` is not set either), a read-only butler will be created. 

144 searchPaths : `list` of `str`, optional 

145 Directory paths to search when calculating the full Butler 

146 configuration. Not used if the supplied config is already a 

147 `ButlerConfig`. 

148 writeable : `bool`, optional 

149 Explicitly sets whether the butler supports write operations. If not 

150 provided, a read-write butler is created if ``run`` is set and a

151 read-only butler otherwise.

152 inferDefaults : `bool`, optional 

153 If `True` (default) infer default data ID values from the values 

154 present in the datasets in ``collections``: if all collections have the 

155 same value (or no value) for a governor dimension, that value will be 

156 the default for that dimension. Nonexistent collections are ignored. 

157 If a default value is provided explicitly for a governor dimension via 

158 ``**kwargs``, no default will be inferred for that dimension. 

159 without_datastore : `bool`, optional 

160 If `True` do not attach a datastore to this butler. Any attempts 

161 to use a datastore will fail. 

162 **kwargs : `str` 

163 Default data ID key-value pairs. These may only identify "governor" 

164 dimensions like ``instrument`` and ``skymap``. 

165 

166 Examples 

167 -------- 

168 While there are many ways to control exactly how a `Butler` interacts with 

169 the collections in its `Registry`, the most common cases are still simple. 

170 

171 For a read-only `Butler` that searches one collection, do:: 

172 

173 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

174 

175 For a read-write `Butler` that writes to and reads from a 

176 `~CollectionType.RUN` collection:: 

177 

178 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

179 

180 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

181 because we want to write to one `~CollectionType.RUN` collection but read 

182 from several others (as well):: 

183 

184 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

185 collections=["u/alice/DM-50000/a", 

186 "u/bob/DM-49998", 

187 "HSC/defaults"]) 

188 

189 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

190 Datasets will be read first from that run (since it appears first in the 

191 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

192 

193 Finally, one can always create a `Butler` with no collections:: 

194 

195 butler = Butler("/path/to/repo", writeable=True) 

196 

197 This can be extremely useful when you just want to use ``butler.registry``, 

198 e.g. for inserting dimension data or managing collections, or when the 

199 collections you want to use with the butler are not consistent. 

200 Passing ``writeable`` explicitly here is only necessary if you want to be 

201 able to make changes to the repo - usually the value for ``writeable`` can 

202 be guessed from the collection arguments provided, but it defaults to 

203 `False` when there are not collection arguments. 

204 """ 

205 

206 def __init__( 

207 self, 

208 config: Config | ResourcePathExpression | None = None, 

209 *, 

210 butler: Butler | None = None, 

211 collections: Any = None, 

212 run: str | None = None, 

213 searchPaths: Sequence[ResourcePathExpression] | None = None, 

214 writeable: bool | None = None, 

215 inferDefaults: bool = True, 

216 without_datastore: bool = False, 

217 **kwargs: str, 

218 ): 

219 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

220 # Load registry, datastore, etc. from config or existing butler. 

221 if butler is not None: 

222 if config is not None or searchPaths is not None or writeable is not None: 

223 raise TypeError( 

224 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

225 ) 

226 self._registry = butler._registry.copy(defaults) 

227 self._datastore = butler._datastore 

228 self.storageClasses = butler.storageClasses 

229 self._config: ButlerConfig = butler._config 

230 else: 

231 self._config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore) 

232 try: 

233 butlerRoot = self._config.get("root", self._config.configDir) 

234 if writeable is None: 

235 writeable = run is not None 

236 self._registry = _RegistryFactory(self._config).from_config( 

237 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

238 ) 

239 if without_datastore: 

240 self._datastore = NullDatastore(None, None) 

241 else: 

242 self._datastore = Datastore.fromConfig( 

243 self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

244 ) 

245 self.storageClasses = StorageClassFactory() 

246 self.storageClasses.addFromConfig(self._config) 

247 except Exception: 

248 # Failures here usually mean that configuration is incomplete, 

249 # just issue an error message which includes config file URI. 

250 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

251 raise 

252 

253 # For execution butler the datastore needs a special 

254 # dependency-inversion trick. This is not used by regular butler, 

255 # but we do not have a way to distinguish regular butler from execution 

256 # butler. 

257 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

258 

259 if "run" in self._config or "collection" in self._config: 

260 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

261 

262 self._registry_shim = RegistryShim(self) 

263 

264 GENERATION: ClassVar[int] = 3 

265 """This is a Generation 3 Butler. 

266 

267 This attribute may be removed in the future, once the Generation 2 Butler 

268 interface has been fully retired; it should only be used in transitional 

269 code. 

270 """ 

271 

272 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

273 """Return DatasetType defined in registry given dataset type name.""" 

274 try: 

275 return self._registry.getDatasetType(name) 

276 except MissingDatasetTypeError: 

277 return None 

278 

279 @classmethod 

280 def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath: 

281 """Look up the label in a butler repository index. 

282 

283 Parameters 

284 ---------- 

285 label : `str` 

286 Label of the Butler repository to look up. 

287 return_label : `bool`, optional 

288 If ``label`` cannot be found in the repository index (either 

289 because the index is not defined or ``label`` is not in the index) and

290 ``return_label`` is `True` then return ``ResourcePath(label)``. 

291 If ``return_label`` is `False` (default) then an exception will be 

292 raised instead. 

293 

294 Returns 

295 ------- 

296 uri : `lsst.resources.ResourcePath` 

297 URI to the Butler repository associated with the given label, or

298 ``ResourcePath(label)`` if the label is unknown and ``return_label`` is `True`.

299 

300 Raises 

301 ------ 

302 KeyError 

303 Raised if the label is not found in the index, or if an index 

304 is not defined, and ``return_label`` is `False`. 

305 

306 Notes 

307 ----- 

308 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

309 information is discovered. 

310 """ 

311 return ButlerRepoIndex.get_repo_uri(label, return_label) 

312 

313 @classmethod 

314 def get_known_repos(cls) -> set[str]: 

315 """Retrieve the list of known repository labels. 

316 

317 Returns 

318 ------- 

319 repos : `set` of `str` 

320 All the known labels. Can be empty if no index can be found. 

321 

322 Notes 

323 ----- 

324 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

325 information is discovered. 

326 """ 

327 return ButlerRepoIndex.get_known_repos() 
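Illustrative usage sketch (not part of the measured source): resolving labels via the repository-index helpers above. The labels returned depend entirely on the site's repository index configuration.

    from lsst.daf.butler import Butler

    # Print every label in the repository index together with its URI.
    # With return_label=True an unknown label falls back to ResourcePath(label).
    for label in Butler.get_known_repos():
        print(label, Butler.get_repo_uri(label, return_label=True))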

328 

329 @staticmethod 

330 def makeRepo( 

331 root: ResourcePathExpression, 

332 config: Config | str | None = None, 

333 dimensionConfig: Config | str | None = None, 

334 standalone: bool = False, 

335 searchPaths: list[str] | None = None, 

336 forceConfigRoot: bool = True, 

337 outfile: ResourcePathExpression | None = None, 

338 overwrite: bool = False, 

339 ) -> Config: 

340 """Create an empty data repository by adding a butler.yaml config 

341 to a repository root directory. 

342 

343 Parameters 

344 ---------- 

345 root : `lsst.resources.ResourcePathExpression` 

346 Path or URI to the root location of the new repository. Will be 

347 created if it does not exist. 

348 config : `Config` or `str`, optional 

349 Configuration to write to the repository, after setting any 

350 root-dependent Registry or Datastore config options. Can not 

351 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

352 configuration will be used. Root-dependent config options 

353 specified in this config are overwritten if ``forceConfigRoot`` 

354 is `True`. 

355 dimensionConfig : `Config` or `str`, optional 

356 Configuration for dimensions, will be used to initialize registry 

357 database. 

358 standalone : `bool` 

359 If True, write all expanded defaults, not just customized or 

360 repository-specific settings. 

361 This (mostly) decouples the repository from the default 

362 configuration, insulating it from changes to the defaults (which 

363 may be good or bad, depending on the nature of the changes). 

364 Future *additions* to the defaults will still be picked up when 

365 initializing `Butlers` to repos created with ``standalone=True``. 

366 searchPaths : `list` of `str`, optional 

367 Directory paths to search when calculating the full butler 

368 configuration. 

369 forceConfigRoot : `bool`, optional 

370 If `False`, any values present in the supplied ``config`` that 

371 would normally be reset are not overridden and will appear 

372 directly in the output config. This allows non-standard overrides 

373 of the root directory for a datastore or registry to be given. 

374 If this parameter is `True` the values for ``root`` will be 

375 forced into the resulting config if appropriate. 

376 outfile : `lsst.resources.ResourcePathExpression`, optional

377 If not-`None`, the output configuration will be written to this 

378 location rather than into the repository itself. Can be a URI 

379 string. Can refer to a directory that will be used to write 

380 ``butler.yaml``. 

381 overwrite : `bool`, optional 

382 Create a new configuration file even if one already exists 

383 in the specified output location. Default is to raise 

384 an exception. 

385 

386 Returns 

387 ------- 

388 config : `Config` 

389 The updated `Config` instance written to the repo. 

390 

391 Raises 

392 ------ 

393 ValueError 

394 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

395 regular Config (as these subclasses would make it impossible to 

396 support ``standalone=False``). 

397 FileExistsError 

398 Raised if the output config file already exists. 

399 os.error 

400 Raised if the directory does not exist, exists but is not a 

401 directory, or cannot be created. 

402 

403 Notes 

404 ----- 

405 Note that when ``standalone=False`` (the default), the configuration 

406 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

407 construct the repository should also be used to construct any Butlers 

408 to avoid configuration inconsistencies. 

409 """ 

410 if isinstance(config, ButlerConfig | ConfigSubset): 

411 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

412 

413 # Ensure that the root of the repository exists or can be made 

414 root_uri = ResourcePath(root, forceDirectory=True) 

415 root_uri.mkdir() 

416 

417 config = Config(config) 

418 

419 # If we are creating a new repo from scratch with relative roots, 

420 # do not propagate an explicit root from the config file 

421 if "root" in config: 

422 del config["root"] 

423 

424 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

425 imported_class = doImportType(full["datastore", "cls"]) 

426 if not issubclass(imported_class, Datastore): 

427 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

428 datastoreClass: type[Datastore] = imported_class 

429 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

430 

431 # if key exists in given config, parse it, otherwise parse the defaults 

432 # in the expanded config 

433 if config.get(("registry", "db")): 

434 registryConfig = RegistryConfig(config) 

435 else: 

436 registryConfig = RegistryConfig(full) 

437 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

438 if defaultDatabaseUri is not None: 

439 Config.updateParameters( 

440 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

441 ) 

442 else: 

443 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

444 

445 if standalone: 

446 config.merge(full) 

447 else: 

448 # Always expand the registry.managers section into the per-repo 

449 # config, because after the database schema is created, it's not 

450 # allowed to change anymore. Note that in the standalone=True 

451 # branch, _everything_ in the config is expanded, so there's no 

452 # need to special case this. 

453 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

454 configURI: ResourcePathExpression 

455 if outfile is not None: 

456 # When writing to a separate location we must include 

457 # the root of the butler repo in the config else it won't know 

458 # where to look. 

459 config["root"] = root_uri.geturl() 

460 configURI = outfile 

461 else: 

462 configURI = root_uri 

463 # Strip obscore configuration, if it is present, before writing config 

464 # to a file; the obscore config will be stored in the registry.

465 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config: 

466 config_to_write = config.copy() 

467 del config_to_write[obscore_config_key] 

468 config_to_write.dumpToUri(configURI, overwrite=overwrite) 

469 # The configFile attribute is updated; copy it back to the original config.

470 config.configFile = config_to_write.configFile 

471 else: 

472 config.dumpToUri(configURI, overwrite=overwrite) 

473 

474 # Create Registry and populate tables 

475 registryConfig = RegistryConfig(config.get("registry")) 

476 dimensionConfig = DimensionConfig(dimensionConfig) 

477 _RegistryFactory(registryConfig).create_from_config( 

478 dimensionConfig=dimensionConfig, butlerRoot=root_uri 

479 ) 

480 

481 log.verbose("Wrote new Butler configuration file to %s", configURI) 

482 

483 return config 
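Illustrative usage sketch (not part of the measured source): creating an empty repository with the defaults and then constructing a writeable `Butler` against it. The repository path is hypothetical.

    from lsst.daf.butler import Butler

    # Write a default butler.yaml (and create the registry schema) under the root.
    Butler.makeRepo("/tmp/demo_repo")
    # Connect to the new repository; writeable=True allows registry changes.
    butler = Butler("/tmp/demo_repo", writeable=True)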

484 

485 @classmethod 

486 def _unpickle( 

487 cls, 

488 config: ButlerConfig, 

489 collections: tuple[str, ...] | None, 

490 run: str | None, 

491 defaultDataId: dict[str, str], 

492 writeable: bool, 

493 ) -> Butler: 

494 """Callable used to unpickle a Butler. 

495 

496 We prefer not to use ``Butler.__init__`` directly so we can force some 

497 of its many arguments to be keyword-only (note that ``__reduce__`` 

498 can only invoke callables with positional arguments). 

499 

500 Parameters 

501 ---------- 

502 config : `ButlerConfig` 

503 Butler configuration, already coerced into a true `ButlerConfig` 

504 instance (and hence after any search paths for overrides have been 

505 utilized). 

506 collections : `tuple` [ `str` ] 

507 Names of the default collections to read from. 

508 run : `str`, optional 

509 Name of the default `~CollectionType.RUN` collection to write to. 

510 defaultDataId : `dict` [ `str`, `str` ] 

511 Default data ID values. 

512 writeable : `bool` 

513 Whether the Butler should support write operations. 

514 

515 Returns 

516 ------- 

517 butler : `Butler` 

518 A new `Butler` instance. 

519 """ 

520 # MyPy doesn't recognize that the kwargs below are totally valid; it 

521 # seems to think ``**defaultDataId`` is a _positional_ argument!

522 return cls( 

523 config=config, 

524 collections=collections, 

525 run=run, 

526 writeable=writeable, 

527 **defaultDataId, # type: ignore 

528 ) 

529 

530 def __reduce__(self) -> tuple: 

531 """Support pickling.""" 

532 return ( 

533 Butler._unpickle, 

534 ( 

535 self._config, 

536 self.collections, 

537 self.run, 

538 self._registry.defaults.dataId.byName(), 

539 self._registry.isWriteable(), 

540 ), 

541 ) 

542 

543 def __str__(self) -> str: 

544 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

545 self.collections, self.run, self._datastore, self._registry 

546 ) 

547 

548 def isWriteable(self) -> bool: 

549 """Return `True` if this `Butler` supports write operations.""" 

550 return self._registry.isWriteable() 

551 

552 @contextlib.contextmanager 

553 def transaction(self) -> Iterator[None]: 

554 """Context manager supporting `Butler` transactions. 

555 

556 Transactions can be nested. 

557 """ 

558 with self._registry.transaction(), self._datastore.transaction(): 

559 yield 
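Illustrative usage sketch (not part of the measured source): because `transaction` nests the registry and datastore transactions, a failure inside the block rolls back both. The object, dataset type name, and data ID values below are hypothetical.

    with butler.transaction():
        # A failure anywhere in this block undoes both the registry insert and
        # the datastore write performed by put().
        ref = butler.put(catalog, "sourceCatalog", instrument="HSC", visit=903334, detector=20)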

560 

561 def _standardizeArgs( 

562 self, 

563 datasetRefOrType: DatasetRef | DatasetType | str, 

564 dataId: DataId | None = None, 

565 for_put: bool = True, 

566 **kwargs: Any, 

567 ) -> tuple[DatasetType, DataId | None]: 

568 """Standardize the arguments passed to several Butler APIs. 

569 

570 Parameters 

571 ---------- 

572 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

573 When `DatasetRef` the `dataId` should be `None`. 

574 Otherwise the `DatasetType` or name thereof. 

575 dataId : `dict` or `DataCoordinate` 

576 A `dict` of `Dimension` link name, value pairs that label the 

577 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

578 should be provided as the first argument.

579 for_put : `bool`, optional 

580 If `True` this call is invoked as part of a `Butler.put()`. 

581 Otherwise it is assumed to be part of a `Butler.get()`. This 

582 parameter is only relevant if there is dataset type 

583 inconsistency. 

584 **kwargs 

585 Additional keyword arguments used to augment or construct a 

586 `DataCoordinate`. See `DataCoordinate.standardize` 

587 parameters. 

588 

589 Returns 

590 ------- 

591 datasetType : `DatasetType` 

592 A `DatasetType` instance extracted from ``datasetRefOrType``. 

593 dataId : `dict` or `DataId`, optional 

594 Argument that can be used (along with ``kwargs``) to construct a 

595 `DataId`. 

596 

597 Notes 

598 ----- 

599 Butler APIs that conceptually need a DatasetRef also allow passing a 

600 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

601 keyword arguments that can be used to construct one) separately. This 

602 method accepts those arguments and always returns a true `DatasetType` 

603 and a `DataId` or `dict`. 

604 

605 Standardization of `dict` vs `DataId` is best handled by passing the 

606 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

607 generally similarly flexible. 

608 """ 

609 externalDatasetType: DatasetType | None = None 

610 internalDatasetType: DatasetType | None = None 

611 if isinstance(datasetRefOrType, DatasetRef): 

612 if dataId is not None or kwargs: 

613 raise ValueError("DatasetRef given, cannot use dataId as well") 

614 externalDatasetType = datasetRefOrType.datasetType 

615 dataId = datasetRefOrType.dataId 

616 else: 

617 # Don't check whether DataId is provided, because Registry APIs 

618 # can usually construct a better error message when it wasn't. 

619 if isinstance(datasetRefOrType, DatasetType): 

620 externalDatasetType = datasetRefOrType 

621 else: 

622 internalDatasetType = self._registry.getDatasetType(datasetRefOrType) 

623 

624 # Check that they are self-consistent 

625 if externalDatasetType is not None: 

626 internalDatasetType = self._registry.getDatasetType(externalDatasetType.name) 

627 if externalDatasetType != internalDatasetType: 

628 # We can allow differences if they are compatible, depending 

629 # on whether this is a get or a put. A get requires that 

630 # the python type associated with the datastore can be 

631 # converted to the user type. A put requires that the user 

632 # supplied python type can be converted to the internal 

633 # type expected by registry. 

634 relevantDatasetType = internalDatasetType 

635 if for_put: 

636 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

637 else: 

638 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

639 relevantDatasetType = externalDatasetType 

640 if not is_compatible: 

641 raise ValueError( 

642 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

643 f"registry definition ({internalDatasetType})" 

644 ) 

645 # Override the internal definition. 

646 internalDatasetType = relevantDatasetType 

647 

648 assert internalDatasetType is not None 

649 return internalDatasetType, dataId 

650 

651 def _rewrite_data_id( 

652 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any 

653 ) -> tuple[DataId | None, dict[str, Any]]: 

654 """Rewrite a data ID taking into account dimension records. 

655 

656 Take a Data ID and keyword args and rewrite it if necessary to 

657 allow the user to specify dimension records rather than dimension 

658 primary values. 

659 

660 This allows a user to include a dataId dict with keys of 

661 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

662 the integer exposure ID. It also allows a string to be given 

663 for a dimension value rather than the integer ID if that is more 

664 convenient. For example, rather than having to specify the

665 detector with ``detector.full_name``, a string given for ``detector`` 

666 will be interpreted as the full name and converted to the integer 

667 value. 

668 

669 Keyword arguments can also use strings for dimensions like detector 

670 and exposure but python does not allow them to include ``.`` and 

671 so the ``exposure.day_obs`` syntax cannot be used in a keyword

672 argument. 

673 

674 Parameters 

675 ---------- 

676 dataId : `dict` or `DataCoordinate` 

677 A `dict` of `Dimension` link name, value pairs that will label the 

678 `DatasetRef` within a Collection. 

679 datasetType : `DatasetType` 

680 The dataset type associated with this dataId. Required to 

681 determine the relevant dimensions. 

682 **kwargs 

683 Additional keyword arguments used to augment or construct a 

684 `DataId`. See `DataId` parameters. 

685 

686 Returns 

687 ------- 

688 dataId : `dict` or `DataCoordinate` 

689 The (possibly rewritten) dataId. If given a `DataCoordinate` and

690 no keyword arguments, the original dataId will be returned 

691 unchanged. 

692 **kwargs : `dict` 

693 Any unused keyword arguments (would normally be empty dict). 

694 """ 

695 # Do nothing if we have a standalone DataCoordinate. 

696 if isinstance(dataId, DataCoordinate) and not kwargs: 

697 return dataId, kwargs 

698 

699 # Process dimension records that are using record information 

700 # rather than ids 

701 newDataId: dict[str, DataIdValue] = {} 

702 byRecord: dict[str, dict[str, Any]] = defaultdict(dict) 

703 

704 # if all the dataId comes from keyword parameters we do not need 

705 # to do anything here because they can't be of the form 

706 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

707 if dataId: 

708 for k, v in dataId.items(): 

709 # If we have a Dimension we do not need to do anything 

710 # because it cannot be a compound key. 

711 if isinstance(k, str) and "." in k: 

712 # Someone is using a more human-readable dataId 

713 dimensionName, record = k.split(".", 1) 

714 byRecord[dimensionName][record] = v 

715 elif isinstance(k, Dimension): 

716 newDataId[k.name] = v 

717 else: 

718 newDataId[k] = v 

719 

720 # Go through the updated dataId and check the type in case someone is 

721 # using an alternate key. We have already filtered out the compound 

722 # ``dimension.record`` keys.

723 not_dimensions = {} 

724 

725 # Will need to look in the dataId and the keyword arguments 

726 # and will remove them if they need to be fixed or are unrecognized. 

727 for dataIdDict in (newDataId, kwargs): 

728 # Use a list so we can adjust the dict safely in the loop 

729 for dimensionName in list(dataIdDict): 

730 value = dataIdDict[dimensionName] 

731 try: 

732 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

733 except KeyError: 

734 # This is not a real dimension 

735 not_dimensions[dimensionName] = value 

736 del dataIdDict[dimensionName] 

737 continue 

738 

739 # Convert an integral type to an explicit int to simplify 

740 # comparisons here 

741 if isinstance(value, numbers.Integral): 

742 value = int(value) 

743 

744 if not isinstance(value, dimension.primaryKey.getPythonType()): 

745 for alternate in dimension.alternateKeys: 

746 if isinstance(value, alternate.getPythonType()): 

747 byRecord[dimensionName][alternate.name] = value 

748 del dataIdDict[dimensionName] 

749 log.debug( 

750 "Converting dimension %s to %s.%s=%s", 

751 dimensionName, 

752 dimensionName, 

753 alternate.name, 

754 value, 

755 ) 

756 break 

757 else: 

758 log.warning( 

759 "Type mismatch found for value '%r' provided for dimension %s. " 

760 "Could not find matching alternative (primary key has type %s) " 

761 "so attempting to use as-is.", 

762 value, 

763 dimensionName, 

764 dimension.primaryKey.getPythonType(), 

765 ) 

766 

767 # By this point kwargs and newDataId should only include valid 

768 # dimensions. Merge kwargs in to the new dataId and log if there 

769 # are dimensions in both (rather than calling update). 

770 for k, v in kwargs.items(): 

771 if k in newDataId and newDataId[k] != v: 

772 log.debug( 

773 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

774 ) 

775 newDataId[k] = v 

776 # No need to retain any values in kwargs now. 

777 kwargs = {} 

778 

779 # If we have some unrecognized dimensions we have to try to connect 

780 # them to records in other dimensions. This is made more complicated 

781 # by some dimensions having records with clashing names. A mitigation 

782 # is that we can tell by this point which dimensions are missing 

783 # for the DatasetType but this does not work for calibrations 

784 # where additional dimensions can be used to constrain the temporal 

785 # axis. 

786 if not_dimensions: 

787 # Search for all dimensions even if we have been given a value 

788 # explicitly. In some cases records are given as well as the 

789 # actual dimension and this should not be an error if they

790 # match. 

791 mandatoryDimensions = datasetType.dimensions.names # - provided 

792 

793 candidateDimensions: set[str] = set() 

794 candidateDimensions.update(mandatoryDimensions) 

795 

796 # For calibrations we may well need temporal dimensions

797 # so rather than always including all dimensions in the scan 

798 # restrict things a little. It is still possible for there 

799 # to be confusion over day_obs in visit vs exposure for example. 

800 # If we are not searching calibration collections things may 

801 # fail but they are going to fail anyway because of the 

802 # ambiguity of the dataId...

803 if datasetType.isCalibration(): 

804 for dim in self.dimensions.getStaticDimensions(): 

805 if dim.temporal: 

806 candidateDimensions.add(str(dim)) 

807 

808 # Look up table for the first association with a dimension 

809 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) 

810 

811 # Keep track of whether an item is associated with multiple 

812 # dimensions. 

813 counter: Counter[str] = Counter() 

814 assigned: dict[str, set[str]] = defaultdict(set) 

815 

816 # Go through the missing dimensions and associate the 

817 # given names with records within those dimensions 

818 matched_dims = set() 

819 for dimensionName in candidateDimensions: 

820 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

821 fields = dimension.metadata.names | dimension.uniqueKeys.names 

822 for field in not_dimensions: 

823 if field in fields: 

824 guessedAssociation[dimensionName][field] = not_dimensions[field] 

825 counter[dimensionName] += 1 

826 assigned[field].add(dimensionName) 

827 matched_dims.add(field) 

828 

829 # Calculate the fields that matched nothing. 

830 never_found = set(not_dimensions) - matched_dims 

831 

832 if never_found: 

833 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

834 

835 # There is a chance we have allocated a single dataId item 

836 # to multiple dimensions. Need to decide which should be retained. 

837 # For now assume that the most popular alternative wins. 

838 # This means that day_obs with seq_num will result in 

839 # exposure.day_obs and not visit.day_obs 

840 # Also prefer an explicitly missing dimension over an inferred 

841 # temporal dimension. 

842 for fieldName, assignedDimensions in assigned.items(): 

843 if len(assignedDimensions) > 1: 

844 # Pick the most popular (preferring mandatory dimensions) 

845 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

846 if requiredButMissing: 

847 candidateDimensions = requiredButMissing 

848 else: 

849 candidateDimensions = assignedDimensions 

850 

851 # If this is a choice between visit and exposure and 

852 # neither was a required part of the dataset type, 

853 # (hence in this branch) always prefer exposure over 

854 # visit since exposures are always defined and visits 

855 # are defined from exposures. 

856 if candidateDimensions == {"exposure", "visit"}: 

857 candidateDimensions = {"exposure"} 

858 

859 # Select the relevant items and get a new restricted 

860 # counter. 

861 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

862 duplicatesCounter: Counter[str] = Counter() 

863 duplicatesCounter.update(theseCounts) 

864 

865 # Choose the most common. If they are equally common 

866 # we will pick the one that was found first. 

867 # most_common() returns a list of (key, count) tuples.

868 selected = duplicatesCounter.most_common(1)[0][0] 

869 

870 log.debug( 

871 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

872 " Removed ambiguity by choosing dimension %s.", 

873 fieldName, 

874 ", ".join(assignedDimensions), 

875 selected, 

876 ) 

877 

878 for candidateDimension in assignedDimensions: 

879 if candidateDimension != selected: 

880 del guessedAssociation[candidateDimension][fieldName] 

881 

882 # Update the record look up dict with the new associations 

883 for dimensionName, values in guessedAssociation.items(): 

884 if values: # A dict might now be empty 

885 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

886 byRecord[dimensionName].update(values) 

887 

888 if byRecord: 

889 # Some record specifiers were found so we need to convert 

890 # them to the Id form 

891 for dimensionName, values in byRecord.items(): 

892 if dimensionName in newDataId: 

893 log.debug( 

894 "DataId specified explicit %s dimension value of %s in addition to" 

895 " general record specifiers for it of %s. Ignoring record information.", 

896 dimensionName, 

897 newDataId[dimensionName], 

898 str(values), 

899 ) 

900 # Get the actual record and compare with these values. 

901 try: 

902 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

903 except DataIdError: 

904 raise ValueError( 

905 f"Could not find dimension '{dimensionName}'" 

906 f" with dataId {newDataId} as part of comparing with" 

907 f" record values {byRecord[dimensionName]}" 

908 ) from None 

909 if len(recs) == 1: 

910 errmsg: list[str] = [] 

911 for k, v in values.items(): 

912 if (recval := getattr(recs[0], k)) != v: 

913 errmsg.append(f"{k}({recval} != {v})") 

914 if errmsg: 

915 raise ValueError( 

916 f"Dimension {dimensionName} in dataId has explicit value" 

917 " inconsistent with records: " + ", ".join(errmsg) 

918 ) 

919 else: 

920 # Multiple matches for an explicit dimension 

921 # should never happen but let downstream complain. 

922 pass 

923 continue 

924 

925 # Build up a WHERE expression 

926 bind = dict(values.items()) 

927 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

928 

929 # Hopefully we get a single record that matches 

930 records = set( 

931 self._registry.queryDimensionRecords( 

932 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

933 ) 

934 ) 

935 

936 if len(records) != 1: 

937 if len(records) > 1: 

938 # visit can have an ambiguous answer without involving 

939 # visit_system. The default visit_system is defined 

940 # by the instrument. 

941 if ( 

942 dimensionName == "visit" 

943 and "visit_system_membership" in self.dimensions 

944 and "visit_system" in self.dimensions["instrument"].metadata 

945 ): 

946 instrument_records = list( 

947 self._registry.queryDimensionRecords( 

948 "instrument", 

949 dataId=newDataId, 

950 **kwargs, 

951 ) 

952 ) 

953 if len(instrument_records) == 1: 

954 visit_system = instrument_records[0].visit_system 

955 if visit_system is None: 

956 # Set to a value that will never match. 

957 visit_system = -1 

958 

959 # Look up each visit in the 

960 # visit_system_membership records. 

961 for rec in records: 

962 membership = list( 

963 self._registry.queryDimensionRecords( 

964 # Use bind to allow zero results. 

965 # This is a fully-specified query. 

966 "visit_system_membership", 

967 where="instrument = inst AND visit_system = system AND visit = v", 

968 bind=dict( 

969 inst=instrument_records[0].name, system=visit_system, v=rec.id 

970 ), 

971 ) 

972 ) 

973 if membership: 

974 # This record is the right answer. 

975 records = {rec} 

976 break 

977 

978 # The ambiguity may have been resolved so check again. 

979 if len(records) > 1: 

980 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

981 for r in records: 

982 log.debug("- %s", str(r)) 

983 raise ValueError( 

984 f"DataId specification for dimension {dimensionName} is not" 

985 f" uniquely constrained to a single dataset by {values}." 

986 f" Got {len(records)} results." 

987 ) 

988 else: 

989 raise ValueError( 

990 f"DataId specification for dimension {dimensionName} matched no" 

991 f" records when constrained by {values}" 

992 ) 

993 

994 # Get the primary key from the real dimension object 

995 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

996 if not isinstance(dimension, Dimension): 

997 raise RuntimeError( 

998 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

999 ) 

1000 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

1001 

1002 return newDataId, kwargs 
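Illustrative usage sketch (not part of the measured source): the rewriting above is what lets callers pass record-based keys instead of primary-key values. The dataset type, instrument, data ID values, and detector name here are hypothetical.

    # Compound ``dimension.record`` keys must go in the dataId dict because a
    # "." is not allowed in a Python keyword argument; a string detector is
    # interpreted as the full name and converted to its integer ID.
    raw = butler.get(
        "raw",
        {"instrument": "LATISS", "exposure.day_obs": 20230101, "exposure.seq_num": 42},
        detector="RXX_S00",
    )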

1003 

1004 def _findDatasetRef( 

1005 self, 

1006 datasetRefOrType: DatasetRef | DatasetType | str, 

1007 dataId: DataId | None = None, 

1008 *, 

1009 collections: Any = None, 

1010 predict: bool = False, 

1011 run: str | None = None, 

1012 **kwargs: Any, 

1013 ) -> DatasetRef: 

1014 """Shared logic for methods that start with a search for a dataset in 

1015 the registry. 

1016 

1017 Parameters 

1018 ---------- 

1019 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1020 When `DatasetRef` the `dataId` should be `None`. 

1021 Otherwise the `DatasetType` or name thereof. 

1022 dataId : `dict` or `DataCoordinate`, optional 

1023 A `dict` of `Dimension` link name, value pairs that label the 

1024 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1025 should be provided as the first argument. 

1026 collections : Any, optional 

1027 Collections to be searched, overriding ``self.collections``. 

1028 Can be any of the types supported by the ``collections`` argument 

1029 to butler construction. 

1030 predict : `bool`, optional 

1031 If `True`, return a newly created `DatasetRef` with a unique 

1032 dataset ID if finding a reference in the `Registry` fails. 

1033 Defaults to `False`. 

1034 run : `str`, optional 

1035 Run collection name to use for creating `DatasetRef` for predicted 

1036 datasets. Only used if ``predict`` is `True`. 

1037 **kwargs 

1038 Additional keyword arguments used to augment or construct a 

1039 `DataId`. See `DataId` parameters. 

1040 

1041 Returns 

1042 ------- 

1043 ref : `DatasetRef` 

1044 A reference to the dataset identified by the given arguments. 

1045 This can be the same dataset reference as given if it was 

1046 resolved. 

1047 

1048 Raises 

1049 ------ 

1050 LookupError 

1051 Raised if no matching dataset exists in the `Registry` (and 

1052 ``predict`` is `False`). 

1053 ValueError 

1054 Raised if a resolved `DatasetRef` was passed as an input, but it 

1055 differs from the one found in the registry. 

1056 TypeError 

1057 Raised if no collections were provided. 

1058 """ 

1059 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1060 if isinstance(datasetRefOrType, DatasetRef): 

1061 if collections is not None: 

1062 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) 

1063 return datasetRefOrType 

1064 timespan: Timespan | None = None 

1065 

1066 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1067 

1068 if datasetType.isCalibration(): 

1069 # Because this is a calibration dataset, first try to

1070 # standardize the data ID without restricting the dimensions to 

1071 # those of the dataset type requested, because there may be extra 

1072 # dimensions that provide temporal information for a validity-range 

1073 # lookup. 

1074 dataId = DataCoordinate.standardize( 

1075 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

1076 ) 

1077 if dataId.graph.temporal: 

1078 dataId = self._registry.expandDataId(dataId) 

1079 timespan = dataId.timespan 

1080 else: 

1081 # Standardize the data ID to just the dimensions of the dataset 

1082 # type instead of letting registry.findDataset do it, so we get the 

1083 # result even if no dataset is found. 

1084 dataId = DataCoordinate.standardize( 

1085 dataId, graph=datasetType.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

1086 ) 

1087 # Always lookup the DatasetRef, even if one is given, to ensure it is 

1088 # present in the current collection. 

1089 ref = self._registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1090 if ref is None: 

1091 if predict: 

1092 if run is None: 

1093 run = self.run 

1094 if run is None: 

1095 raise TypeError("Cannot predict dataset ID/location with run=None.") 

1096 return DatasetRef(datasetType, dataId, run=run) 

1097 else: 

1098 if collections is None: 

1099 collections = self._registry.defaults.collections 

1100 raise LookupError( 

1101 f"Dataset {datasetType.name} with data ID {dataId} " 

1102 f"could not be found in collections {collections}." 

1103 ) 

1104 if datasetType != ref.datasetType: 

1105 # If they differ it is because the user explicitly specified 

1106 # a compatible dataset type to this call rather than using the 

1107 # registry definition. The DatasetRef must therefore be recreated 

1108 # using the user definition such that the expected type is 

1109 # returned. 

1110 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1111 

1112 return ref 

1113 

1114 # TODO: remove on DM-40067. 

1115 @transactional 

1116 @deprecated( 

1117 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." 

1118 " Please use Butler.put(). Be aware that you may need to adjust your usage if you" 

1119 " were relying on the run parameter to determine the run." 

1120 " Will be removed after v26.0.", 

1121 version="v26.0", 

1122 category=FutureWarning, 

1123 ) 

1124 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

1125 # Docstring inherited. 

1126 return self.put(obj, ref) 

1127 

1128 @transactional 

1129 def put( 

1130 self, 

1131 obj: Any, 

1132 datasetRefOrType: DatasetRef | DatasetType | str, 

1133 /, 

1134 dataId: DataId | None = None, 

1135 *, 

1136 run: str | None = None, 

1137 **kwargs: Any, 

1138 ) -> DatasetRef: 

1139 """Store and register a dataset. 

1140 

1141 Parameters 

1142 ---------- 

1143 obj : `object` 

1144 The dataset. 

1145 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1146 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1147 Otherwise the `DatasetType` or name thereof. If a fully resolved 

1148 `DatasetRef` is given the run and ID are used directly. 

1149 dataId : `dict` or `DataCoordinate` 

1150 A `dict` of `Dimension` link name, value pairs that label the 

1151 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1152 should be provided as the second argument. 

1153 run : `str`, optional 

1154 The name of the run the dataset should be added to, overriding 

1155 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

1156 **kwargs 

1157 Additional keyword arguments used to augment or construct a 

1158 `DataCoordinate`. See `DataCoordinate.standardize` 

1159 parameters. Not used if a resolved `DatasetRef` is provided.

1160 

1161 Returns 

1162 ------- 

1163 ref : `DatasetRef` 

1164 A reference to the stored dataset, updated with the correct id if 

1165 given. 

1166 

1167 Raises 

1168 ------ 

1169 TypeError 

1170 Raised if the butler is read-only or if no run has been provided. 

1171 """ 

1172 if isinstance(datasetRefOrType, DatasetRef): 

1173 # This is a direct put of predefined DatasetRef. 

1174 log.debug("Butler put direct: %s", datasetRefOrType) 

1175 if run is not None: 

1176 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) 

1177 # If registry already has a dataset with the same dataset ID, 

1178 # dataset type and DataId, then _importDatasets will do nothing and 

1179 # just return the original ref. We still have to raise in that case;

1180 # the datastore check below handles it.

1181 self._registry._importDatasets([datasetRefOrType], expand=True) 

1182 # Before trying to write to the datastore check that it does not 

1183 # know this dataset. This is prone to races, of course. 

1184 if self._datastore.knows(datasetRefOrType): 

1185 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

1186 # Try to write dataset to the datastore, if it fails due to a race 

1187 # with another write, the content of stored data may be 

1188 # unpredictable. 

1189 try: 

1190 self._datastore.put(obj, datasetRefOrType) 

1191 except IntegrityError as e: 

1192 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e 

1193 return datasetRefOrType 

1194 

1195 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1196 if not self.isWriteable(): 

1197 raise TypeError("Butler is read-only.") 

1198 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1199 

1200 # Handle dimension records in dataId 

1201 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1202 

1203 # Add Registry Dataset entry. 

1204 dataId = self._registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1205 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1206 self._datastore.put(obj, ref) 

1207 

1208 return ref 
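Illustrative usage sketch (not part of the measured source): a typical put() call with an unresolved dataset type name; the run comes from the Butler constructed with ``run=``. The object, repository path, dataset type name, and data ID values are hypothetical.

    butler = Butler("/path/to/repo", run="u/alice/example-run")
    # Registers the dataset in the run collection and writes it to the datastore.
    ref = butler.put(my_object, "exampleDatasetType", instrument="HSC", visit=903334, detector=20)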

1209 

1210 # TODO: remove on DM-40067. 

1211 @deprecated( 

1212 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

1213 " Please use Butler.get(). Will be removed after v26.0.", 

1214 version="v26.0", 

1215 category=FutureWarning, 

1216 ) 

1217 def getDirect( 

1218 self, 

1219 ref: DatasetRef, 

1220 *, 

1221 parameters: dict[str, Any] | None = None, 

1222 storageClass: StorageClass | str | None = None, 

1223 ) -> Any: 

1224 """Retrieve a stored dataset. 

1225 

1226 Parameters 

1227 ---------- 

1228 ref : `DatasetRef` 

1229 Resolved reference to an already stored dataset. 

1230 parameters : `dict` 

1231 Additional StorageClass-defined options to control reading, 

1232 typically used to efficiently read only a subset of the dataset. 

1233 storageClass : `StorageClass` or `str`, optional 

1234 The storage class to be used to override the Python type 

1235 returned by this method. By default the returned type matches 

1236 the dataset type definition for this dataset. Specifying a 

1237 read `StorageClass` can force a different type to be returned. 

1238 This type must be compatible with the original type. 

1239 

1240 Returns 

1241 ------- 

1242 obj : `object` 

1243 The dataset. 

1244 """ 

1245 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1246 

1247 # TODO: remove on DM-40067. 

1248 @deprecated( 

1249 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

1250 "Please use Butler.getDeferred(). Will be removed after v26.0.", 

1251 version="v26.0", 

1252 category=FutureWarning, 

1253 ) 

1254 def getDirectDeferred( 

1255 self, 

1256 ref: DatasetRef, 

1257 *, 

1258 parameters: dict | None = None, 

1259 storageClass: str | StorageClass | None = None, 

1260 ) -> DeferredDatasetHandle: 

1261 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1262 from a resolved `DatasetRef`. 

1263 

1264 Parameters 

1265 ---------- 

1266 ref : `DatasetRef` 

1267 Resolved reference to an already stored dataset. 

1268 parameters : `dict` 

1269 Additional StorageClass-defined options to control reading, 

1270 typically used to efficiently read only a subset of the dataset. 

1271 storageClass : `StorageClass` or `str`, optional 

1272 The storage class to be used to override the Python type 

1273 returned by this method. By default the returned type matches 

1274 the dataset type definition for this dataset. Specifying a 

1275 read `StorageClass` can force a different type to be returned. 

1276 This type must be compatible with the original type. 

1277 

1278 Returns 

1279 ------- 

1280 obj : `DeferredDatasetHandle` 

1281 A handle which can be used to retrieve a dataset at a later time. 

1282 

1283 Raises 

1284 ------ 

1285 LookupError 

1286 Raised if no matching dataset exists in the `Registry`. 

1287 """ 

1288 # Check that dataset is known to the datastore. 

1289 if not self._datastore.knows(ref): 

1290 raise LookupError(f"Dataset reference {ref} is not known to datastore.") 

1291 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1292 

1293 def getDeferred( 

1294 self, 

1295 datasetRefOrType: DatasetRef | DatasetType | str, 

1296 /, 

1297 dataId: DataId | None = None, 

1298 *, 

1299 parameters: dict | None = None, 

1300 collections: Any = None, 

1301 storageClass: str | StorageClass | None = None, 

1302 **kwargs: Any, 

1303 ) -> DeferredDatasetHandle: 

1304 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1305 after an immediate registry lookup. 

1306 

1307 Parameters 

1308 ---------- 

1309 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1310 When `DatasetRef` the `dataId` should be `None`. 

1311 Otherwise the `DatasetType` or name thereof. 

1312 dataId : `dict` or `DataCoordinate`, optional 

1313 A `dict` of `Dimension` link name, value pairs that label the 

1314 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1315 should be provided as the first argument. 

1316 parameters : `dict` 

1317 Additional StorageClass-defined options to control reading, 

1318 typically used to efficiently read only a subset of the dataset. 

1319 collections : Any, optional 

1320 Collections to be searched, overriding ``self.collections``. 

1321 Can be any of the types supported by the ``collections`` argument 

1322 to butler construction. 

1323 storageClass : `StorageClass` or `str`, optional 

1324 The storage class to be used to override the Python type 

1325 returned by this method. By default the returned type matches 

1326 the dataset type definition for this dataset. Specifying a 

1327 read `StorageClass` can force a different type to be returned. 

1328 This type must be compatible with the original type. 

1329 **kwargs 

1330 Additional keyword arguments used to augment or construct a 

1331 `DataId`. See `DataId` parameters. 

1332 

1333 Returns 

1334 ------- 

1335 obj : `DeferredDatasetHandle` 

1336 A handle which can be used to retrieve a dataset at a later time. 

1337 

1338 Raises 

1339 ------ 

1340 LookupError 

1341 Raised if no matching dataset exists in the `Registry` or 

1342 datastore. 

1343 ValueError 

1344 Raised if a resolved `DatasetRef` was passed as an input, but it 

1345 differs from the one found in the registry. 

1346 TypeError 

1347 Raised if no collections were provided. 

1348 """ 

1349 if isinstance(datasetRefOrType, DatasetRef): 

1350 # Do the quick check first and if that fails, check for artifact 

1351 # existence. This is necessary for datastores that are configured 

1352 # in trust mode where there won't be a record but there will be 

1353 # a file. 

1354 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType): 

1355 ref = datasetRefOrType 

1356 else: 

1357 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1358 else: 

1359 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1360 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 
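Illustrative usage sketch (not part of the measured source): a deferred handle performs the registry lookup immediately but postpones the datastore read until ``get()`` is called on the handle. Dataset type, collection, and data ID values are hypothetical.

    handle = butler.getDeferred("calexp", instrument="HSC", visit=903334, detector=20,
                                collections="HSC/defaults")
    # ... later, possibly after filtering many handles ...
    image = handle.get()  # the datastore read happens here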

1361 

1362 def get( 

1363 self, 

1364 datasetRefOrType: DatasetRef | DatasetType | str, 

1365 /, 

1366 dataId: DataId | None = None, 

1367 *, 

1368 parameters: dict[str, Any] | None = None, 

1369 collections: Any = None, 

1370 storageClass: StorageClass | str | None = None, 

1371 **kwargs: Any, 

1372 ) -> Any: 

1373 """Retrieve a stored dataset. 

1374 

1375 Parameters 

1376 ---------- 

1377 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1378 When `DatasetRef` the `dataId` should be `None`. 

1379 Otherwise the `DatasetType` or name thereof. 

1380 If a resolved `DatasetRef`, the associated dataset 

1381 is returned directly without additional querying. 

1382 dataId : `dict` or `DataCoordinate` 

1383 A `dict` of `Dimension` link name, value pairs that label the 

1384 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1385 should be provided as the first argument. 

1386 parameters : `dict` 

1387 Additional StorageClass-defined options to control reading, 

1388 typically used to efficiently read only a subset of the dataset. 

1389 collections : Any, optional 

1390 Collections to be searched, overriding ``self.collections``. 

1391 Can be any of the types supported by the ``collections`` argument 

1392 to butler construction. 

1393 storageClass : `StorageClass` or `str`, optional 

1394 The storage class to be used to override the Python type 

1395 returned by this method. By default the returned type matches 

1396 the dataset type definition for this dataset. Specifying a 

1397 read `StorageClass` can force a different type to be returned. 

1398 This type must be compatible with the original type. 

1399 **kwargs 

1400 Additional keyword arguments used to augment or construct a 

1401 `DataCoordinate`. See `DataCoordinate.standardize` 

1402 parameters. 

1403 

1404 Returns 

1405 ------- 

1406 obj : `object` 

1407 The dataset. 

1408 

1409 Raises 

1410 ------ 

1411 LookupError 

1412 Raised if no matching dataset exists in the `Registry`. 

1413 TypeError 

1414 Raised if no collections were provided. 

1415 

1416 Notes 

1417 ----- 

1418 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1419 this method requires that the given data ID include temporal dimensions 

1420 beyond the dimensions of the dataset type itself, in order to find the 

1421 dataset with the appropriate validity range. For example, a "bias" 

1422 dataset with native dimensions ``{instrument, detector}`` could be 

1423 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1424 ``exposure`` is a temporal dimension. 
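
 Examples 
 -------- 
 A minimal sketch of a direct read; the dataset type ``"calexp"``, the 
 data ID values, and the collection name are illustrative assumptions:: 

 exposure = butler.get("calexp", instrument="HSC", visit=903334, detector=42) 
 # The default collections can be overridden per call: 
 exposure = butler.get("calexp", instrument="HSC", visit=903334, detector=42, collections="HSC/runs/demo") 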

1425 """ 

1426 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1427 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1428 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1429 

1430 def getURIs( 

1431 self, 

1432 datasetRefOrType: DatasetRef | DatasetType | str, 

1433 /, 

1434 dataId: DataId | None = None, 

1435 *, 

1436 predict: bool = False, 

1437 collections: Any = None, 

1438 run: str | None = None, 

1439 **kwargs: Any, 

1440 ) -> DatasetRefURIs: 

1441 """Return the URIs associated with the dataset. 

1442 

1443 Parameters 

1444 ---------- 

1445 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1446 When `DatasetRef` the `dataId` should be `None`. 

1447 Otherwise the `DatasetType` or name thereof. 

1448 dataId : `dict` or `DataCoordinate` 

1449 A `dict` of `Dimension` link name, value pairs that label the 

1450 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1451 should be provided as the first argument. 

1452 predict : `bool` 

1453 If `True`, allow URIs to be returned of datasets that have not 

1454 been written. 

1455 collections : Any, optional 

1456 Collections to be searched, overriding ``self.collections``. 

1457 Can be any of the types supported by the ``collections`` argument 

1458 to butler construction. 

1459 run : `str`, optional 

1460 Run to use for predictions, overriding ``self.run``. 

1461 **kwargs 

1462 Additional keyword arguments used to augment or construct a 

1463 `DataCoordinate`. See `DataCoordinate.standardize` 

1464 parameters. 

1465 

1466 Returns 

1467 ------- 

1468 uris : `DatasetRefURIs` 

1469 The URI to the primary artifact associated with this dataset (if 

1470 the dataset was disassembled within the datastore this may be 

1471 `None`), and the URIs to any components associated with the dataset 

1472 artifact (can be empty if there are no components). 
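
 Examples 
 -------- 
 A sketch that unpacks the returned `DatasetRefURIs` as a two-item 
 sequence, the same way `getURI` does internally; the dataset type and 
 data ID values are illustrative assumptions:: 

 primary, components = butler.getURIs("calexp", instrument="HSC", visit=903334, detector=42) 
 # ``primary`` may be `None` for a disassembled dataset; ``components`` 
 # maps component name to URI and can be empty. 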

1473 """ 

1474 ref = self._findDatasetRef( 

1475 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1476 ) 

1477 return self._datastore.getURIs(ref, predict) 

1478 

1479 def getURI( 

1480 self, 

1481 datasetRefOrType: DatasetRef | DatasetType | str, 

1482 /, 

1483 dataId: DataId | None = None, 

1484 *, 

1485 predict: bool = False, 

1486 collections: Any = None, 

1487 run: str | None = None, 

1488 **kwargs: Any, 

1489 ) -> ResourcePath: 

1490 """Return the URI to the Dataset. 

1491 

1492 Parameters 

1493 ---------- 

1494 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1495 When `DatasetRef` the `dataId` should be `None`. 

1496 Otherwise the `DatasetType` or name thereof. 

1497 dataId : `dict` or `DataCoordinate` 

1498 A `dict` of `Dimension` link name, value pairs that label the 

1499 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1500 should be provided as the first argument. 

1501 predict : `bool` 

1502 If `True`, allow URIs to be returned of datasets that have not 

1503 been written. 

1504 collections : Any, optional 

1505 Collections to be searched, overriding ``self.collections``. 

1506 Can be any of the types supported by the ``collections`` argument 

1507 to butler construction. 

1508 run : `str`, optional 

1509 Run to use for predictions, overriding ``self.run``. 

1510 **kwargs 

1511 Additional keyword arguments used to augment or construct a 

1512 `DataCoordinate`. See `DataCoordinate.standardize` 

1513 parameters. 

1514 

1515 Returns 

1516 ------- 

1517 uri : `lsst.resources.ResourcePath` 

1518 URI pointing to the Dataset within the datastore. If the 

1519 Dataset does not exist in the datastore, and if ``predict`` is 

1520 `True`, the URI will be a prediction and will include a URI 

1521 fragment "#predicted". 

1522 If the datastore does not have entities that relate well 

1523 to the concept of a URI the returned URI string will be 

1524 descriptive. The returned URI is not guaranteed to be obtainable. 

1525 

1526 Raises 

1527 ------ 

1528 LookupError 

1529 Raised if a URI has been requested for a dataset that does not exist and 

1530 guessing is not allowed. 

1531 ValueError 

1532 Raised if a resolved `DatasetRef` was passed as an input, but it 

1533 differs from the one found in the registry. 

1534 TypeError 

1535 Raised if no collections were provided. 

1536 RuntimeError 

1537 Raised if a URI is requested for a dataset that consists of 

1538 multiple artifacts. 
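
 Examples 
 -------- 
 A sketch; the dataset type, data ID values, and run name are 
 illustrative assumptions:: 

 uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=42) 
 # Predict the location of a dataset that has not been written yet: 
 uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=42, predict=True, run="HSC/runs/demo") 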

1539 """ 

1540 primary, components = self.getURIs( 

1541 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1542 ) 

1543 

1544 if primary is None or components: 

1545 raise RuntimeError( 

1546 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1547 "Use Butler.getURIs() instead." 

1548 ) 

1549 return primary 

1550 

1551 def retrieveArtifacts( 

1552 self, 

1553 refs: Iterable[DatasetRef], 

1554 destination: ResourcePathExpression, 

1555 transfer: str = "auto", 

1556 preserve_path: bool = True, 

1557 overwrite: bool = False, 

1558 ) -> list[ResourcePath]: 

1559 """Retrieve the artifacts associated with the supplied refs. 

1560 

1561 Parameters 

1562 ---------- 

1563 refs : iterable of `DatasetRef` 

1564 The datasets for which artifacts are to be retrieved. 

1565 A single ref can result in multiple artifacts. The refs must 

1566 be resolved. 

1567 destination : `lsst.resources.ResourcePath` or `str` 

1568 Location to write the artifacts. 

1569 transfer : `str`, optional 

1570 Method to use to transfer the artifacts. Must be one of the options 

1571 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1572 "move" is not allowed. 

1573 preserve_path : `bool`, optional 

1574 If `True` the full path of the artifact within the datastore 

1575 is preserved. If `False` the final file component of the path 

1576 is used. 

1577 overwrite : `bool`, optional 

1578 If `True` allow transfers to overwrite existing files at the 

1579 destination. 

1580 

1581 Returns 

1582 ------- 

1583 targets : `list` of `lsst.resources.ResourcePath` 

1584 URIs of file artifacts in destination location. Order is not 

1585 preserved. 

1586 

1587 Notes 

1588 ----- 

1589 For non-file datastores the artifacts written to the destination 

1590 may not match the representation inside the datastore. For example 

1591 a hierarchical data structure in a NoSQL database may well be stored 

1592 as a JSON file. 
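
 Examples 
 -------- 
 A sketch that copies the artifacts behind a query result into a local 
 directory; the dataset type, collection name, and destination are 
 illustrative assumptions:: 

 refs = butler.registry.queryDatasets("calexp", collections="HSC/runs/demo") 
 paths = butler.retrieveArtifacts(refs, destination="./artifacts", transfer="copy") 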

1593 """ 

1594 return self._datastore.retrieveArtifacts( 

1595 refs, 

1596 ResourcePath(destination), 

1597 transfer=transfer, 

1598 preserve_path=preserve_path, 

1599 overwrite=overwrite, 

1600 ) 

1601 

1602 def exists( 

1603 self, 

1604 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1605 /, 

1606 data_id: DataId | None = None, 

1607 *, 

1608 full_check: bool = True, 

1609 collections: Any = None, 

1610 **kwargs: Any, 

1611 ) -> DatasetExistence: 

1612 """Indicate whether a dataset is known to Butler registry and 

1613 datastore. 

1614 

1615 Parameters 

1616 ---------- 

1617 dataset_ref_or_type : `DatasetRef`, `DatasetType`, or `str` 

1618 When `DatasetRef` the `dataId` should be `None`. 

1619 Otherwise the `DatasetType` or name thereof. 

1620 data_id : `dict` or `DataCoordinate` 

1621 A `dict` of `Dimension` link name, value pairs that label the 

1622 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1623 should be provided as the first argument. 

1624 full_check : `bool`, optional 

1625 If `True`, an additional check will be made for dataset artifact 

1626 existence. This will involve additional overhead due to the need 

1627 to query an external system. If `False` registry and datastore 

1628 will solely be asked if they know about the dataset but no 

1629 check for the artifact will be performed. 

1630 collections : Any, optional 

1631 Collections to be searched, overriding ``self.collections``. 

1632 Can be any of the types supported by the ``collections`` argument 

1633 to butler construction. 

1634 **kwargs 

1635 Additional keyword arguments used to augment or construct a 

1636 `DataCoordinate`. See `DataCoordinate.standardize` 

1637 parameters. 

1638 

1639 Returns 

1640 ------- 

1641 existence : `DatasetExistence` 

1642 Object indicating whether the dataset is known to registry and 

1643 datastore. Evaluates to `True` if the dataset is present and known 

1644 to both. 
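
 Examples 
 -------- 
 A sketch; the dataset type and data ID values are illustrative 
 assumptions:: 

 existence = butler.exists("calexp", instrument="HSC", visit=903334, detector=42) 
 stored = bool(existence)  # True only if known to registry and datastore 
 # Skip the potentially slow artifact check: 
 existence = butler.exists("calexp", instrument="HSC", visit=903334, detector=42, full_check=False) 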

1645 """ 

1646 existence = DatasetExistence.UNRECOGNIZED 

1647 

1648 if isinstance(dataset_ref_or_type, DatasetRef): 

1649 if collections is not None: 

1650 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1651 if data_id is not None: 

1652 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1653 ref = dataset_ref_or_type 

1654 registry_ref = self._registry.getDataset(dataset_ref_or_type.id) 

1655 if registry_ref is not None: 

1656 existence |= DatasetExistence.RECORDED 

1657 

1658 if dataset_ref_or_type != registry_ref: 

1659 # This could mean that storage classes differ, so we should 

1660 # check for that but use the registry ref for the rest of 

1661 # the method. 

1662 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1663 # Use the registry version from now on. 

1664 ref = registry_ref 

1665 else: 

1666 raise ValueError( 

1667 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1668 f"in registry but has different incompatible values ({registry_ref})." 

1669 ) 

1670 else: 

1671 try: 

1672 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1673 except (LookupError, TypeError, NoDefaultCollectionError): 

1674 return existence 

1675 existence |= DatasetExistence.RECORDED 

1676 

1677 if self._datastore.knows(ref): 

1678 existence |= DatasetExistence.DATASTORE 

1679 

1680 if full_check: 

1681 if self._datastore.exists(ref): 

1682 existence |= DatasetExistence._ARTIFACT 

1683 elif existence.value != DatasetExistence.UNRECOGNIZED.value: 

1684 # Do not add this flag if we have no other idea about a dataset. 

1685 existence |= DatasetExistence(DatasetExistence._ASSUMED) 

1686 

1687 return existence 

1688 

1689 def _exists_many( 

1690 self, 

1691 refs: Iterable[DatasetRef], 

1692 /, 

1693 *, 

1694 full_check: bool = True, 

1695 ) -> dict[DatasetRef, DatasetExistence]: 

1696 """Indicate whether multiple datasets are known to Butler registry and 

1697 datastore. 

1698 

1699 This is an experimental API that may change at any moment. 

1700 

1701 Parameters 

1702 ---------- 

1703 refs : iterable of `DatasetRef` 

1704 The datasets to be checked. 

1705 full_check : `bool`, optional 

1706 If `True`, an additional check will be made for dataset artifact 

1707 existence. This will involve additional overhead due to the need 

1708 to query an external system. If `False` registry and datastore 

1709 will solely be asked if they know about the dataset but no 

1710 check for the artifact will be performed. 

1711 

1712 Returns 

1713 ------- 

1714 existence : dict of [`DatasetRef`, `DatasetExistence`] 

1715 Mapping from the given dataset refs to an enum indicating the 

1716 status of the dataset in registry and datastore. 

1717 Each value evaluates to `True` if the dataset is present and known 

1718 to both. 

1719 """ 

1720 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1721 

1722 # Registry does not have a bulk API to check for a ref. 

1723 for ref in refs: 

1724 registry_ref = self._registry.getDataset(ref.id) 

1725 if registry_ref is not None: 

1726 # It is possible, albeit unlikely, that the given ref does 

1727 # not match the one in registry even though the UUID matches. 

1728 # When checking a single ref we raise, but it's impolite to 

1729 # do that when potentially hundreds of refs are being checked. 

1730 # We could change the API to only accept UUIDs and that would 

1731 # remove the ability to even check and remove the worry 

1732 # about differing storage classes. Given the ongoing discussion 

1733 # on refs vs UUIDs and whether to raise or have a new 

1734 # private flag, treat this as a private API for now. 

1735 existence[ref] |= DatasetExistence.RECORDED 

1736 

1737 # Ask datastore if it knows about these refs. 

1738 knows = self._datastore.knows_these(refs) 

1739 for ref, known in knows.items(): 

1740 if known: 

1741 existence[ref] |= DatasetExistence.DATASTORE 

1742 

1743 if full_check: 

1744 mexists = self._datastore.mexists(refs) 

1745 for ref, exists in mexists.items(): 

1746 if exists: 

1747 existence[ref] |= DatasetExistence._ARTIFACT 

1748 else: 

1749 # Do not set this flag if nothing is known about the dataset. 

1750 for ref in existence: 

1751 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1752 existence[ref] |= DatasetExistence._ASSUMED 

1753 

1754 return existence 

1755 

1756 # TODO: remove on DM-40079. 

1757 @deprecated( 

1758 reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v26.0.", 

1759 version="v26.0", 

1760 category=FutureWarning, 

1761 ) 

1762 def datasetExists( 

1763 self, 

1764 datasetRefOrType: DatasetRef | DatasetType | str, 

1765 dataId: DataId | None = None, 

1766 *, 

1767 collections: Any = None, 

1768 **kwargs: Any, 

1769 ) -> bool: 

1770 """Return True if the Dataset is actually present in the Datastore. 

1771 

1772 Parameters 

1773 ---------- 

1774 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1775 When `DatasetRef` the `dataId` should be `None`. 

1776 Otherwise the `DatasetType` or name thereof. 

1777 dataId : `dict` or `DataCoordinate` 

1778 A `dict` of `Dimension` link name, value pairs that label the 

1779 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1780 should be provided as the first argument. 

1781 collections : Any, optional 

1782 Collections to be searched, overriding ``self.collections``. 

1783 Can be any of the types supported by the ``collections`` argument 

1784 to butler construction. 

1785 **kwargs 

1786 Additional keyword arguments used to augment or construct a 

1787 `DataCoordinate`. See `DataCoordinate.standardize` 

1788 parameters. 

1789 

1790 Raises 

1791 ------ 

1792 LookupError 

1793 Raised if the dataset is not even present in the Registry. 

1794 ValueError 

1795 Raised if a resolved `DatasetRef` was passed as an input, but it 

1796 differs from the one found in the registry. 

1797 NoDefaultCollectionError 

1798 Raised if no collections were provided. 

1799 """ 

1800 # A resolved ref may be given that is not known to this butler. 

1801 if isinstance(datasetRefOrType, DatasetRef): 

1802 ref = self._registry.getDataset(datasetRefOrType.id) 

1803 if ref is None: 

1804 raise LookupError( 

1805 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." 

1806 ) 

1807 else: 

1808 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1809 return self._datastore.exists(ref) 

1810 

1811 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1812 """Remove one or more `~CollectionType.RUN` collections and the 

1813 datasets within them. 

1814 

1815 Parameters 

1816 ---------- 

1817 names : `~collections.abc.Iterable` [ `str` ] 

1818 The names of the collections to remove. 

1819 unstore : `bool`, optional 

1820 If `True` (default), delete datasets from all datastores in which 

1821 they are present, and attempt to rollback the registry deletions if 

1822 datastore deletions fail (which may not always be possible). If 

1823 `False`, datastore records for these datasets are still removed, 

1824 but any artifacts (e.g. files) will not be. 

1825 

1826 Raises 

1827 ------ 

1828 TypeError 

1829 Raised if one or more collections are not of type 

1830 `~CollectionType.RUN`. 
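
 Examples 
 -------- 
 A sketch removing one scratch run and the artifacts stored in it; the 
 run name is an illustrative assumption:: 

 butler.removeRuns(["u/someone/scratch"], unstore=True) 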

1831 """ 

1832 if not self.isWriteable(): 

1833 raise TypeError("Butler is read-only.") 

1834 names = list(names) 

1835 refs: list[DatasetRef] = [] 

1836 for name in names: 

1837 collectionType = self._registry.getCollectionType(name) 

1838 if collectionType is not CollectionType.RUN: 

1839 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1840 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True)) 

1841 with self._datastore.transaction(), self._registry.transaction(): 

1842 if unstore: 

1843 self._datastore.trash(refs) 

1844 else: 

1845 self._datastore.forget(refs) 

1846 for name in names: 

1847 self._registry.removeCollection(name) 

1848 if unstore: 

1849 # Point of no return for removing artifacts 

1850 self._datastore.emptyTrash() 

1851 

1852 def pruneDatasets( 

1853 self, 

1854 refs: Iterable[DatasetRef], 

1855 *, 

1856 disassociate: bool = True, 

1857 unstore: bool = False, 

1858 tags: Iterable[str] = (), 

1859 purge: bool = False, 

1860 ) -> None: 

1861 # docstring inherited from LimitedButler 

1862 

1863 if not self.isWriteable(): 

1864 raise TypeError("Butler is read-only.") 

1865 if purge: 

1866 if not disassociate: 

1867 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1868 if not unstore: 

1869 raise TypeError("Cannot pass purge=True without unstore=True.") 

1870 elif disassociate: 

1871 tags = tuple(tags) 

1872 if not tags: 

1873 raise TypeError("No tags provided but disassociate=True.") 

1874 for tag in tags: 

1875 collectionType = self._registry.getCollectionType(tag) 

1876 if collectionType is not CollectionType.TAGGED: 

1877 raise TypeError( 

1878 f"Cannot disassociate from collection '{tag}' " 

1879 f"of non-TAGGED type {collectionType.name}." 

1880 ) 

1881 # Transform possibly-single-pass iterable into something we can iterate 

1882 # over multiple times. 

1883 refs = list(refs) 

1884 # Pruning a component of a DatasetRef makes no sense since registry 

1885 # doesn't know about components and datastore might not store 

1886 # components in a separate file 

1887 for ref in refs: 

1888 if ref.datasetType.component(): 

1889 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1890 # We don't need an unreliable Datastore transaction for this, because 

1891 # we've been extra careful to ensure that Datastore.trash only involves 

1892 # mutating the Registry (it can _look_ at Datastore-specific things, 

1893 # but shouldn't change them), and hence all operations here are 

1894 # Registry operations. 

1895 with self._datastore.transaction(), self._registry.transaction(): 

1896 if unstore: 

1897 self._datastore.trash(refs) 

1898 if purge: 

1899 self._registry.removeDatasets(refs) 

1900 elif disassociate: 

1901 assert tags, "Guaranteed by earlier logic in this function." 

1902 for tag in tags: 

1903 self._registry.disassociate(tag, refs) 

1904 # We've exited the Registry transaction, and apparently committed. 

1905 # (if there was an exception, everything rolled back, and it's as if 

1906 # nothing happened - and we never get here). 

1907 # Datastore artifacts are not yet gone, but they're clearly marked 

1908 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1909 # problems we can try again later, and if manual administrative 

1910 # intervention is required, it's pretty clear what that should entail: 

1911 # deleting everything on disk and in private Datastore tables that is 

1912 # in the dataset_location_trash table. 

1913 if unstore: 

1914 # Point of no return for removing artifacts 

1915 self._datastore.emptyTrash() 

1916 

1917 @transactional 

1918 def ingest( 

1919 self, 

1920 *datasets: FileDataset, 

1921 transfer: str | None = "auto", 

1922 run: str | None = None, 

1923 idGenerationMode: DatasetIdGenEnum | None = None, 

1924 record_validation_info: bool = True, 

1925 ) -> None: 

1926 """Store and register one or more datasets that already exist on disk. 

1927 

1928 Parameters 

1929 ---------- 

1930 datasets : `FileDataset` 

1931 Each positional argument is a struct containing information about 

1932 a file to be ingested, including its URI (either absolute or 

1933 relative to the datastore root, if applicable), a resolved 

1934 `DatasetRef`, and optionally a formatter class or its 

1935 fully-qualified string name. If a formatter is not provided, the 

1936 formatter that would be used for `put` is assumed. On successful 

1937 ingest all `FileDataset.formatter` attributes will be set to the 

1938 formatter class used. `FileDataset.path` attributes may be modified 

1939 to put paths in whatever the datastore considers a standardized 

1940 form. 

1941 transfer : `str`, optional 

1942 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1943 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1944 transfer the file. 

1945 run : `str`, optional 

1946 The name of the run ingested datasets should be added to, 

1947 overriding ``self.run``. This parameter is now deprecated since 

1948 the run is encoded in the ``FileDataset``. 

1949 idGenerationMode : `DatasetIdGenEnum`, optional 

1950 Specifies option for generating dataset IDs. Parameter is 

1951 deprecated. 

1952 record_validation_info : `bool`, optional 

1953 If `True`, the default, the datastore can record validation 

1954 information associated with the file. If `False` the datastore 

1955 will not attempt to track any information such as checksums 

1956 or file sizes. This can be useful if such information is tracked 

1957 in an external system or if the file is to be compressed in place. 

1958 It is up to the datastore whether this parameter is relevant. 

1959 

1960 Raises 

1961 ------ 

1962 TypeError 

1963 Raised if the butler is read-only or if no run was provided. 

1964 NotImplementedError 

1965 Raised if the `Datastore` does not support the given transfer mode. 

1966 DatasetTypeNotSupportedError 

1967 Raised if one or more files to be ingested have a dataset type that 

1968 is not supported by the `Datastore`. 

1969 FileNotFoundError 

1970 Raised if one of the given files does not exist. 

1971 FileExistsError 

1972 Raised if transfer is not `None` but the (internal) location the 

1973 file would be moved to is already occupied. 

1974 

1975 Notes 

1976 ----- 

1977 This operation is not fully exception safe: if a database operation 

1978 fails, the given `FileDataset` instances may be only partially updated. 

1979 

1980 It is atomic in terms of database operations (they will either all 

1981 succeed or all fail) providing the database engine implements 

1982 transactions correctly. It will attempt to be atomic in terms of 

1983 filesystem operations as well, but this cannot be implemented 

1984 rigorously for most datastores. 
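
 Examples 
 -------- 
 A sketch of ingesting a single externally produced file. The dataset 
 type ``"raw"``, the data ID values, the run name, and the file path 
 are illustrative assumptions, and the dataset type is assumed to be 
 registered already:: 

 dataset_type = butler.registry.getDatasetType("raw") 
 data_id = {"instrument": "HSC", "exposure": 903334, "detector": 42} 
 data_id = DataCoordinate.standardize(data_id, graph=dataset_type.dimensions) 
 butler.registry.registerRun("HSC/raw/demo") 
 ref = DatasetRef(dataset_type, data_id, run="HSC/raw/demo") 
 butler.ingest(FileDataset(path="/data/file.fits", refs=[ref]), transfer="copy") 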

1985 """ 

1986 if not self.isWriteable(): 

1987 raise TypeError("Butler is read-only.") 

1988 

1989 log.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1990 if not datasets: 

1991 return 

1992 

1993 if idGenerationMode is not None: 

1994 warnings.warn( 

1995 "The idGenerationMode parameter is no longer used and is ignored. " 

1996 " Will be removed after v26.0", 

1997 FutureWarning, 

1998 stacklevel=2, 

1999 ) 

2000 

2001 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

2002 

2003 # We need to reorganize all the inputs so that they are grouped 

2004 # by dataset type and run. Multiple refs in a single FileDataset 

2005 # are required to share the run and dataset type. 

2006 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]] 

2007 groupedData: GroupedData = defaultdict(list) 

2008 

2009 # Track DataIDs that are being ingested so we can spot issues early 

2010 # with duplication. Retain previous FileDataset so we can report it. 

2011 groupedDataIds: MutableMapping[ 

2012 tuple[DatasetType, str], dict[DataCoordinate, FileDataset] 

2013 ] = defaultdict(dict) 

2014 

2015 used_run = False 

2016 

2017 # And the nested loop that populates it: 

2018 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

2019 # Somewhere to store pre-existing refs if we have an 

2020 # execution butler. 

2021 existingRefs: list[DatasetRef] = [] 

2022 

2023 for ref in dataset.refs: 

2024 assert ref.run is not None # For mypy 

2025 group_key = (ref.datasetType, ref.run) 

2026 

2027 if ref.dataId in groupedDataIds[group_key]: 

2028 raise ConflictingDefinitionError( 

2029 f"Ingest conflict. Dataset {dataset.path} has same" 

2030 " DataId as other ingest dataset" 

2031 f" {groupedDataIds[group_key][ref.dataId].path} " 

2032 f" ({ref.dataId})" 

2033 ) 

2034 

2035 groupedDataIds[group_key][ref.dataId] = dataset 

2036 

2037 if existingRefs: 

2038 if len(dataset.refs) != len(existingRefs): 

2039 # Keeping track of partially pre-existing datasets is hard 

2040 # and should generally never happen. For now don't allow 

2041 # it. 

2042 raise ConflictingDefinitionError( 

2043 f"For dataset {dataset.path} some dataIds already exist" 

2044 " in registry but others do not. This is not supported." 

2045 ) 

2046 

2047 # Store expanded form in the original FileDataset. 

2048 dataset.refs = existingRefs 

2049 else: 

2050 groupedData[group_key].append(dataset) 

2051 

2052 if not used_run and run is not None: 

2053 warnings.warn( 

2054 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the " 

2055 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.", 

2056 category=FutureWarning, 

2057 stacklevel=3, # Take into account the @transactional decorator. 

2058 ) 

2059 

2060 # Now we can bulk-insert into Registry for each DatasetType. 

2061 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

2062 groupedData.items(), desc="Bulk-inserting datasets by type" 

2063 ): 

2064 refs_to_import = [] 

2065 for dataset in grouped_datasets: 

2066 refs_to_import.extend(dataset.refs) 

2067 

2068 n_refs = len(refs_to_import) 

2069 log.verbose( 

2070 "Importing %d ref%s of dataset type %r into run %r", 

2071 n_refs, 

2072 "" if n_refs == 1 else "s", 

2073 datasetType.name, 

2074 this_run, 

2075 ) 

2076 

2077 # Import the refs and expand the DataCoordinates since we can't 

2078 # guarantee that they are expanded and Datastore will need 

2079 # the records. 

2080 imported_refs = self._registry._importDatasets(refs_to_import, expand=True) 

2081 assert set(imported_refs) == set(refs_to_import) 

2082 

2083 # Replace all the refs in the FileDataset with expanded versions. 

2084 # Pull them off in the order we put them on the list. 

2085 for dataset in grouped_datasets: 

2086 n_dataset_refs = len(dataset.refs) 

2087 dataset.refs = imported_refs[:n_dataset_refs] 

2088 del imported_refs[:n_dataset_refs] 

2089 

2090 # Bulk-insert everything into Datastore. 

2091 # We do not know if any of the registry entries already existed 

2092 # (_importDatasets only complains if they exist but differ) so 

2093 # we have to catch IntegrityError explicitly. 

2094 try: 

2095 self._datastore.ingest( 

2096 *datasets, transfer=transfer, record_validation_info=record_validation_info 

2097 ) 

2098 except IntegrityError as e: 

2099 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e 

2100 

2101 @contextlib.contextmanager 

2102 def export( 

2103 self, 

2104 *, 

2105 directory: str | None = None, 

2106 filename: str | None = None, 

2107 format: str | None = None, 

2108 transfer: str | None = None, 

2109 ) -> Iterator[RepoExportContext]: 

2110 """Export datasets from the repository represented by this `Butler`. 

2111 

2112 This method is a context manager that returns a helper object 

2113 (`RepoExportContext`) that is used to indicate what information from 

2114 the repository should be exported. 

2115 

2116 Parameters 

2117 ---------- 

2118 directory : `str`, optional 

2119 Directory dataset files should be written to if ``transfer`` is not 

2120 `None`. 

2121 filename : `str`, optional 

2122 Name for the file that will include database information associated 

2123 with the exported datasets. If this is not an absolute path and 

2124 ``directory`` is not `None`, it will be written to ``directory`` 

2125 instead of the current working directory. Defaults to 

2126 "export.{format}". 

2127 format : `str`, optional 

2128 File format for the database information file. If `None`, the 

2129 extension of ``filename`` will be used. 

2130 transfer : `str`, optional 

2131 Transfer mode passed to `Datastore.export`. 

2132 

2133 Raises 

2134 ------ 

2135 TypeError 

2136 Raised if the set of arguments passed is inconsistent. 

2137 

2138 Examples 

2139 -------- 

2140 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

2141 methods are used to provide the iterables over data IDs and/or datasets 

2142 to be exported:: 

2143 

2144 with butler.export(filename="exports.yaml") as export: 

2145 # Export all flats, but none of the dimension element rows 

2146 # (i.e. data ID information) associated with them. 

2147 export.saveDatasets(butler.registry.queryDatasets("flat"), 

2148 elements=()) 

2149 # Export all datasets that start with "deepCoadd_" and all of 

2150 # their associated data ID information. 

2151 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2152 """ 

2153 if directory is None and transfer is not None: 

2154 raise TypeError("Cannot transfer without providing a directory.") 

2155 if transfer == "move": 

2156 raise TypeError("Transfer may not be 'move': export is read-only") 

2157 if format is None: 

2158 if filename is None: 

2159 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2160 else: 

2161 _, format = os.path.splitext(filename) 

2162 if not format: 

2163 raise ValueError("Please specify a file extension to determine export format.") 

2164 format = format[1:] # Strip leading "." 

2165 elif filename is None: 

2166 filename = f"export.{format}" 

2167 if directory is not None: 

2168 filename = os.path.join(directory, filename) 

2169 formats = self._config["repo_transfer_formats"] 

2170 if format not in formats: 

2171 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

2172 BackendClass = get_class_of(formats[format, "export"]) 

2173 with open(filename, "w") as stream: 

2174 backend = BackendClass(stream, universe=self.dimensions) 

2175 try: 

2176 helper = RepoExportContext( 

2177 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer 

2178 ) 

2179 yield helper 

2180 except BaseException: 

2181 raise 

2182 else: 

2183 helper._finish() 

2184 

2185 def import_( 

2186 self, 

2187 *, 

2188 directory: ResourcePathExpression | None = None, 

2189 filename: ResourcePathExpression | TextIO | None = None, 

2190 format: str | None = None, 

2191 transfer: str | None = None, 

2192 skip_dimensions: set | None = None, 

2193 ) -> None: 

2194 """Import datasets into this repository that were exported from a 

2195 different butler repository via `~lsst.daf.butler.Butler.export`. 

2196 

2197 Parameters 

2198 ---------- 

2199 directory : `~lsst.resources.ResourcePathExpression`, optional 

2200 Directory containing dataset files to import from. If `None`, 

2201 ``filename`` and all dataset file paths specified therein must 

2202 be absolute. 

2203 filename : `~lsst.resources.ResourcePathExpression` or `TextIO` 

2204 A stream or name of file that contains database information 

2205 associated with the exported datasets, typically generated by 

2206 `~lsst.daf.butler.Butler.export`. If this is a string (name) or 

2207 `~lsst.resources.ResourcePath` and is not an absolute path, 

2208 it will first be looked for relative to ``directory`` and if not 

2209 found there it will be looked for in the current working 

2210 directory. Defaults to "export.{format}". 

2211 format : `str`, optional 

2212 File format for ``filename``. If `None`, the extension of 

2213 ``filename`` will be used. 

2214 transfer : `str`, optional 

2215 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2216 skip_dimensions : `set`, optional 

2217 Names of dimensions that should be skipped and not imported. 

2218 

2219 Raises 

2220 ------ 

2221 TypeError 

2222 Raised if the set of arguments passed is inconsistent, or if the 

2223 butler is read-only. 
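
 Examples 
 -------- 
 A sketch importing a previously exported repository subset; the 
 directory and file name are illustrative assumptions:: 

 butler.import_(directory="/path/to/export", filename="export.yaml", transfer="copy") 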

2224 """ 

2225 if not self.isWriteable(): 

2226 raise TypeError("Butler is read-only.") 

2227 if format is None: 

2228 if filename is None: 

2229 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2230 else: 

2231 _, format = os.path.splitext(filename) # type: ignore 

2232 elif filename is None: 

2233 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

2234 if directory is not None: 

2235 directory = ResourcePath(directory, forceDirectory=True) 

2236 # mypy doesn't think this will work but it does in python >= 3.10. 

2237 if isinstance(filename, ResourcePathExpression): # type: ignore 

2238 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

2239 if not filename.isabs() and directory is not None: 

2240 potential = directory.join(filename) 

2241 exists_in_cwd = filename.exists() 

2242 exists_in_dir = potential.exists() 

2243 if exists_in_cwd and exists_in_dir: 

2244 log.warning( 

2245 "A relative path for filename was specified (%s) which exists relative to cwd. " 

2246 "Additionally, the file exists relative to the given search directory (%s). " 

2247 "Using the export file in the given directory.", 

2248 filename, 

2249 potential, 

2250 ) 

2251 # Given they specified an explicit directory and that 

2252 # directory has the export file in it, assume that that 

2253 # is what was meant despite the file in cwd. 

2254 filename = potential 

2255 elif exists_in_dir: 

2256 filename = potential 

2257 elif not exists_in_cwd and not exists_in_dir: 

2258 # Raise early. 

2259 raise FileNotFoundError( 

2260 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

2261 ) 

2262 BackendClass: type[RepoImportBackend] = get_class_of( 

2263 self._config["repo_transfer_formats"][format]["import"] 

2264 ) 

2265 

2266 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

2267 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg] 

2268 backend.register() 

2269 with self.transaction(): 

2270 backend.load( 

2271 self._datastore, 

2272 directory=directory, 

2273 transfer=transfer, 

2274 skip_dimensions=skip_dimensions, 

2275 ) 

2276 

2277 if isinstance(filename, ResourcePath): 

2278 # We can not use open() here at the moment because of 

2279 # DM-38589 since yaml does stream.read(8192) in a loop. 

2280 stream = io.StringIO(filename.read().decode()) 

2281 doImport(stream) 

2282 else: 

2283 doImport(filename) # type: ignore 

2284 

2285 def transfer_from( 

2286 self, 

2287 source_butler: LimitedButler, 

2288 source_refs: Iterable[DatasetRef], 

2289 transfer: str = "auto", 

2290 skip_missing: bool = True, 

2291 register_dataset_types: bool = False, 

2292 transfer_dimensions: bool = False, 

2293 ) -> collections.abc.Collection[DatasetRef]: 

2294 """Transfer datasets to this Butler from a run in another Butler. 

2295 

2296 Parameters 

2297 ---------- 

2298 source_butler : `LimitedButler` 

2299 Butler from which the datasets are to be transferred. If data IDs 

2300 in ``source_refs`` are not expanded then this has to be a full 

2301 `Butler` whose registry will be used to expand data IDs. 

2302 source_refs : iterable of `DatasetRef` 

2303 Datasets defined in the source butler that should be transferred to 

2304 this butler. 

2305 transfer : `str`, optional 

2306 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2307 skip_missing : `bool` 

2308 If `True`, datasets with no datastore artifact associated with 

2309 them are not transferred. If `False` a registry entry will be 

2310 created even if no datastore record is created (and so will 

2311 look equivalent to the dataset being unstored). 

2312 register_dataset_types : `bool` 

2313 If `True` any missing dataset types are registered. Otherwise 

2314 an exception is raised. 

2315 transfer_dimensions : `bool`, optional 

2316 If `True`, dimension record data associated with the new datasets 

2317 will be transferred. 

2318 

2319 Returns 

2320 ------- 

2321 refs : `list` of `DatasetRef` 

2322 The refs added to this Butler. 

2323 

2324 Notes 

2325 ----- 

2326 The datastore artifact has to exist for a transfer 

2327 to be made but non-existence is not an error. 

2328 

2329 Datasets that already exist in this run will be skipped. 

2330 

2331 The datasets are imported as part of a transaction, although 

2332 dataset types are registered before the transaction is started. 

2333 This means that it is possible for a dataset type to be registered 

2334 even though transfer has failed. 
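
 Examples 
 -------- 
 A sketch transferring the results of a query from another repository; 
 the source repository path, dataset type, and collection name are 
 illustrative assumptions:: 

 source = Butler("/path/to/source/repo") 
 refs = source.registry.queryDatasets("calexp", collections="HSC/runs/demo") 
 transferred = butler.transfer_from(source, refs, transfer="copy", register_dataset_types=True) 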

2335 """ 

2336 if not self.isWriteable(): 

2337 raise TypeError("Butler is read-only.") 

2338 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2339 

2340 # Will iterate through the refs multiple times so need to convert 

2341 # to a list if this isn't a collection. 

2342 if not isinstance(source_refs, collections.abc.Collection): 

2343 source_refs = list(source_refs) 

2344 

2345 original_count = len(source_refs) 

2346 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2347 

2348 # In some situations the datastore artifact may be missing 

2349 # and we do not want that registry entry to be imported. 

2350 # Asking datastore is not sufficient, the records may have been 

2351 # purged, we have to ask for the (predicted) URI and check 

2352 # existence explicitly. Execution butler is set up exactly like 

2353 # this with no datastore records. 

2354 artifact_existence: dict[ResourcePath, bool] = {} 

2355 if skip_missing: 

2356 dataset_existence = source_butler._datastore.mexists( 

2357 source_refs, artifact_existence=artifact_existence 

2358 ) 

2359 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2360 filtered_count = len(source_refs) 

2361 n_missing = original_count - filtered_count 

2362 log.verbose( 

2363 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

2364 n_missing, 

2365 "" if n_missing == 1 else "s", 

2366 filtered_count, 

2367 ) 

2368 

2369 # Importing requires that we group the refs by dataset type and run 

2370 # before doing the import. 

2371 source_dataset_types = set() 

2372 grouped_refs = defaultdict(list) 

2373 for ref in source_refs: 

2374 grouped_refs[ref.datasetType, ref.run].append(ref) 

2375 source_dataset_types.add(ref.datasetType) 

2376 

2377 # Check to see if the dataset type in the source butler has 

2378 # the same definition in the target butler and register missing 

2379 # ones if requested. Registration must happen outside a transaction. 

2380 newly_registered_dataset_types = set() 

2381 for datasetType in source_dataset_types: 

2382 if register_dataset_types: 

2383 # Let this raise immediately if inconsistent. Continuing 

2384 # on to find additional inconsistent dataset types 

2385 # might result in additional unwanted dataset types being 

2386 # registered. 

2387 if self._registry.registerDatasetType(datasetType): 

2388 newly_registered_dataset_types.add(datasetType) 

2389 else: 

2390 # If the dataset type is missing, let it fail immediately. 

2391 target_dataset_type = self._registry.getDatasetType(datasetType.name) 

2392 if target_dataset_type != datasetType: 

2393 raise ConflictingDefinitionError( 

2394 "Source butler dataset type differs from definition" 

2395 f" in target butler: {datasetType} !=" 

2396 f" {target_dataset_type}" 

2397 ) 

2398 if newly_registered_dataset_types: 

2399 # We may have registered some even if there were inconsistencies 

2400 # but should let people know (or else remove them again). 

2401 log.log( 

2402 VERBOSE, 

2403 "Registered the following dataset types in the target Butler: %s", 

2404 ", ".join(d.name for d in newly_registered_dataset_types), 

2405 ) 

2406 else: 

2407 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2408 

2409 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2410 if transfer_dimensions: 

2411 # Collect all the dimension records for these refs. 

2412 # All dimensions are to be copied but the list of valid dimensions 

2413 # come from this butler's universe. 

2414 elements = frozenset( 

2415 element 

2416 for element in self.dimensions.getStaticElements() 

2417 if element.hasTable() and element.viewOf is None 

2418 ) 

2419 dataIds = {ref.dataId for ref in source_refs} 

2420 # This logic comes from saveDataIds. 

2421 for dataId in dataIds: 

2422 # Need an expanded record; if not already expanded we need a full 

2423 # butler with registry (allow mocks with registry too). 

2424 if not dataId.hasRecords(): 

2425 if registry := getattr(source_butler, "registry", None): 

2426 dataId = registry.expandDataId(dataId) 

2427 else: 

2428 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

2429 # If this butler doesn't know about a dimension in the source 

2430 # butler things will break later. 

2431 for record in dataId.records.values(): 

2432 if record is not None and record.definition in elements: 

2433 dimension_records[record.definition].setdefault(record.dataId, record) 

2434 

2435 handled_collections: set[str] = set() 

2436 

2437 # Do all the importing in a single transaction. 

2438 with self.transaction(): 

2439 if dimension_records: 

2440 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2441 for element, r in dimension_records.items(): 

2442 records = [r[dataId] for dataId in r] 

2443 # Assume that if the record is already present that we can 

2444 # use it without having to check that the record metadata 

2445 # is consistent. 

2446 self._registry.insertDimensionData(element, *records, skip_existing=True) 

2447 

2448 n_imported = 0 

2449 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2450 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2451 ): 

2452 if run not in handled_collections: 

2453 # May need to create output collection. If source butler 

2454 # has a registry, ask for documentation string. 

2455 run_doc = None 

2456 if registry := getattr(source_butler, "registry", None): 

2457 run_doc = registry.getCollectionDocumentation(run) 

2458 registered = self._registry.registerRun(run, doc=run_doc) 

2459 handled_collections.add(run) 

2460 if registered: 

2461 log.log(VERBOSE, "Creating output run %s", run) 

2462 

2463 n_refs = len(refs_to_import) 

2464 log.verbose( 

2465 "Importing %d ref%s of dataset type %s into run %s", 

2466 n_refs, 

2467 "" if n_refs == 1 else "s", 

2468 datasetType.name, 

2469 run, 

2470 ) 

2471 

2472 # Assume we are using UUIDs and the source refs will match 

2473 # those imported. 

2474 imported_refs = self._registry._importDatasets(refs_to_import, expand=False) 

2475 assert set(imported_refs) == set(refs_to_import) 

2476 n_imported += len(imported_refs) 

2477 

2478 assert len(source_refs) == n_imported 

2479 log.verbose("Imported %d datasets into destination butler", n_imported) 

2480 

2481 # Ask the datastore to transfer. The datastore has to check that 

2482 # the source datastore is compatible with the target datastore. 

2483 accepted, rejected = self._datastore.transfer_from( 

2484 source_butler._datastore, 

2485 source_refs, 

2486 transfer=transfer, 

2487 artifact_existence=artifact_existence, 

2488 ) 

2489 if rejected: 

2490 # For now, accept the registry entries but not the files. 

2491 log.warning( 

2492 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2493 len(rejected), 

2494 len(accepted), 

2495 datasetType, 

2496 run, 

2497 ) 

2498 

2499 return source_refs 

2500 

2501 def validateConfiguration( 

2502 self, 

2503 logFailures: bool = False, 

2504 datasetTypeNames: Iterable[str] | None = None, 

2505 ignore: Iterable[str] | None = None, 

2506 ) -> None: 

2507 """Validate butler configuration. 

2508 

2509 Checks that each `DatasetType` can be stored in the `Datastore`. 

2510 

2511 Parameters 

2512 ---------- 

2513 logFailures : `bool`, optional 

2514 If `True`, output a log message for every validation error 

2515 detected. 

2516 datasetTypeNames : iterable of `str`, optional 

2517 The `DatasetType` names that should be checked. This allows 

2518 only a subset to be selected. 

2519 ignore : iterable of `str`, optional 

2520 Names of DatasetTypes to skip over. This can be used to skip 

2521 known problems. If a named `DatasetType` corresponds to a 

2522 composite, all components of that `DatasetType` will also be 

2523 ignored. 

2524 

2525 Raises 

2526 ------ 

2527 ButlerValidationError 

2528 Raised if there is some inconsistency with how this Butler 

2529 is configured. 
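
 Examples 
 -------- 
 A sketch; the ignored dataset type name is an illustrative assumption 
 and the example catches the `ValidationError` base class:: 

 try: 
     butler.validateConfiguration(logFailures=True, ignore=["raw"]) 
 except ValidationError as err: 
     print("Butler configuration problems:", err) 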

2530 """ 

2531 if datasetTypeNames: 

2532 datasetTypes = [self._registry.getDatasetType(name) for name in datasetTypeNames] 

2533 else: 

2534 datasetTypes = list(self._registry.queryDatasetTypes()) 

2535 

2536 # filter out anything from the ignore list 

2537 if ignore: 

2538 ignore = set(ignore) 

2539 datasetTypes = [ 

2540 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2541 ] 

2542 else: 

2543 ignore = set() 

2544 

2545 # For each datasetType that has an instrument dimension, create 

2546 # a DatasetRef for each defined instrument 

2547 datasetRefs = [] 

2548 

2549 # Find all the registered instruments (if "instrument" is in the 

2550 # universe). 

2551 if "instrument" in self.dimensions: 

2552 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")} 

2553 

2554 for datasetType in datasetTypes: 

2555 if "instrument" in datasetType.dimensions: 

2556 # In order to create a conforming dataset ref, create 

2557 # fake DataCoordinate values for the non-instrument 

2558 # dimensions. The type of the value does not matter here. 

2559 dataId = {dim.name: 1 for dim in datasetType.dimensions if dim.name != "instrument"} 

2560 

2561 for instrument in instruments: 

2562 datasetRef = DatasetRef( 

2563 datasetType, 

2564 DataCoordinate.standardize( 

2565 dataId, instrument=instrument, graph=datasetType.dimensions 

2566 ), 

2567 run="validate", 

2568 ) 

2569 datasetRefs.append(datasetRef) 

2570 

2571 entities: list[DatasetType | DatasetRef] = [] 

2572 entities.extend(datasetTypes) 

2573 entities.extend(datasetRefs) 

2574 

2575 datastoreErrorStr = None 

2576 try: 

2577 self._datastore.validateConfiguration(entities, logFailures=logFailures) 

2578 except ValidationError as e: 

2579 datastoreErrorStr = str(e) 

2580 

2581 # Also check that the LookupKeys used by the datastores match 

2582 # registry and storage class definitions 

2583 keys = self._datastore.getLookupKeys() 

2584 

2585 failedNames = set() 

2586 failedDataId = set() 

2587 for key in keys: 

2588 if key.name is not None: 

2589 if key.name in ignore: 

2590 continue 

2591 

2592 # skip if specific datasetType names were requested and this 

2593 # name does not match 

2594 if datasetTypeNames and key.name not in datasetTypeNames: 

2595 continue 

2596 

2597 # See if it is a StorageClass or a DatasetType 

2598 if key.name in self.storageClasses: 

2599 pass 

2600 else: 

2601 try: 

2602 self._registry.getDatasetType(key.name) 

2603 except KeyError: 

2604 if logFailures: 

2605 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2606 failedNames.add(key) 

2607 else: 

2608 # Dimensions are checked for consistency when the Butler 

2609 # is created and rendezvoused with a universe. 

2610 pass 

2611 

2612 # Check that the instrument is a valid instrument 

2613 # Currently only support instrument so check for that 

2614 if key.dataId: 

2615 dataIdKeys = set(key.dataId) 

2616 if {"instrument"} != dataIdKeys: 

2617 if logFailures: 

2618 log.critical("Key '%s' has unsupported DataId override", key) 

2619 failedDataId.add(key) 

2620 elif key.dataId["instrument"] not in instruments: 

2621 if logFailures: 

2622 log.critical("Key '%s' has unknown instrument", key) 

2623 failedDataId.add(key) 

2624 

2625 messages = [] 

2626 

2627 if datastoreErrorStr: 

2628 messages.append(datastoreErrorStr) 

2629 

2630 for failed, msg in ( 

2631 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2632 (failedDataId, "Keys with bad DataId entries: "), 

2633 ): 

2634 if failed: 

2635 msg += ", ".join(str(k) for k in failed) 

2636 messages.append(msg) 

2637 

2638 if messages: 

2639 raise ValidationError(";\n".join(messages)) 

2640 

2641 @property 

2642 def collections(self) -> Sequence[str]: 

2643 """The collections to search by default, in order 

2644 (`~collections.abc.Sequence` [ `str` ]). 

2645 

2646 This is an alias for ``self.registry.defaults.collections``. It cannot 

2647 be set directly in isolation, but all defaults may be changed together 

2648 by assigning a new `RegistryDefaults` instance to 

2649 ``self.registry.defaults``. 

2650 """ 

2651 return self._registry.defaults.collections 

2652 

2653 @property 

2654 def run(self) -> str | None: 

2655 """Name of the run this butler writes outputs to by default (`str` or 

2656 `None`). 

2657 

2658 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2659 directly in isolation, but all defaults may be changed together by 

2660 assigning a new `RegistryDefaults` instance to 

2661 ``self.registry.defaults``. 

2662 """ 

2663 return self._registry.defaults.run 

2664 

2665 @property 

2666 def registry(self) -> Registry: 

2667 """The object that manages dataset metadata and relationships 

2668 (`Registry`). 

2669 

2670 Many operations that don't involve reading or writing butler datasets 

2671 are accessible only via `Registry` methods. Eventually these methods 

2672 will be replaced by equivalent `Butler` methods. 

2673 """ 

2674 return self._registry_shim 

2675 

2676 @property 

2677 def dimensions(self) -> DimensionUniverse: 

2678 # Docstring inherited. 

2679 return self._registry.dimensions 

2680 

2681 _registry: _ButlerRegistry 

2682 """The object that manages dataset metadata and relationships 

2683 (`_ButlerRegistry`). 

2684 

2685 Most operations that don't involve reading or writing butler datasets are 

2686 accessible only via `Registry` methods. 

2687 """ 

2688 

2689 datastore: Datastore 

2690 """The object that manages actual dataset storage (`Datastore`). 

2691 

2692 Direct user access to the datastore should rarely be necessary; the primary 

2693 exception is the case where a `Datastore` implementation provides extra 

2694 functionality beyond what the base class defines. 

2695 """ 

2696 

2697 storageClasses: StorageClassFactory 

2698 """An object that maps known storage class names to objects that fully 

2699 describe them (`StorageClassFactory`). 

2700 """