Coverage for python/lsst/daf/butler/_butler.py: 8%

716 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-14 09:11 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30) 

31 

32import collections.abc 

33import contextlib 

34import io 

35import logging 

36import numbers 

37import os 

38import warnings 

39from collections import Counter, defaultdict 

40from collections.abc import Iterable, Iterator, MutableMapping, Sequence 

41from typing import TYPE_CHECKING, Any, ClassVar, TextIO 

42 

43from deprecated.sphinx import deprecated 

44from lsst.resources import ResourcePath, ResourcePathExpression 

45from lsst.utils import doImportType 

46from lsst.utils.introspection import get_class_of 

47from lsst.utils.logging import VERBOSE, getLogger 

48from sqlalchemy.exc import IntegrityError 

49 

50from ._butlerConfig import ButlerConfig 

51from ._butlerRepoIndex import ButlerRepoIndex 

52from ._dataset_existence import DatasetExistence 

53from ._deferredDatasetHandle import DeferredDatasetHandle 

54from ._limited_butler import LimitedButler 

55from .core import ( 

56 Config, 

57 ConfigSubset, 

58 DataCoordinate, 

59 DataId, 

60 DataIdValue, 

61 DatasetIdGenEnum, 

62 DatasetRef, 

63 DatasetRefURIs, 

64 DatasetType, 

65 Datastore, 

66 Dimension, 

67 DimensionConfig, 

68 DimensionElement, 

69 DimensionRecord, 

70 DimensionUniverse, 

71 FileDataset, 

72 Progress, 

73 StorageClass, 

74 StorageClassFactory, 

75 Timespan, 

76 ValidationError, 

77) 

78from .core.repoRelocation import BUTLER_ROOT_TAG 

79from .core.utils import transactional 

80from .registry import ( 

81 CollectionType, 

82 ConflictingDefinitionError, 

83 DataIdError, 

84 MissingDatasetTypeError, 

85 NoDefaultCollectionError, 

86 Registry, 

87 RegistryConfig, 

88 RegistryDefaults, 

89) 

90from .transfers import RepoExportContext 

91 

92if TYPE_CHECKING: 

93 from lsst.resources import ResourceHandleProtocol 

94 

95 from .transfers import RepoImportBackend 

96 

97log = getLogger(__name__) 

98 

99 

100class ButlerValidationError(ValidationError): 

101 """There is a problem with the Butler configuration.""" 

102 

103 pass 

104 

105 

106class Butler(LimitedButler): 

107 """Main entry point for the data access system. 

108 

109 Parameters 

110 ---------- 

111 config : `ButlerConfig`, `Config` or `str`, optional. 

112 Configuration. Anything acceptable to the 

113 `ButlerConfig` constructor. If a directory path 

114 is given the configuration will be read from a ``butler.yaml`` file in 

115 that location. If `None` is given default values will be used. 

116 butler : `Butler`, optional. 

117 If provided, construct a new Butler that uses the same registry and 

118 datastore as the given one, but with the given collection and run. 

119 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

120 arguments. 

121 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional 

122 An expression specifying the collections to be searched (in order) when 

123 reading datasets. 

124 This may be a `str` collection name or an iterable thereof. 

125 See :ref:`daf_butler_collection_expressions` for more information. 

126 These collections are not registered automatically and must be 

127 manually registered before they are used by any method, but they may be 

128 manually registered after the `Butler` is initialized. 

129 run : `str`, optional 

130 Name of the `~CollectionType.RUN` collection new datasets should be 

131 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

132 ``collections`` will be set to ``[run]``. If not `None`, this 

133 collection will automatically be registered. If this is not set (and 

134 ``writeable`` is not set either), a read-only butler will be created. 

135 searchPaths : `list` of `str`, optional 

136 Directory paths to search when calculating the full Butler 

137 configuration. Not used if the supplied config is already a 

138 `ButlerConfig`. 

139 writeable : `bool`, optional 

140 Explicitly sets whether the butler supports write operations. If not 

141 provided, a read-write butler is created if any of ``run``, ``tags``, 

142 or ``chains`` is non-empty. 

143 inferDefaults : `bool`, optional 

144 If `True` (default) infer default data ID values from the values 

145 present in the datasets in ``collections``: if all collections have the 

146 same value (or no value) for a governor dimension, that value will be 

147 the default for that dimension. Nonexistent collections are ignored. 

148 If a default value is provided explicitly for a governor dimension via 

149 ``**kwargs``, no default will be inferred for that dimension. 

150 **kwargs : `str` 

151 Default data ID key-value pairs. These may only identify "governor" 

152 dimensions like ``instrument`` and ``skymap``. 

153 

154 Examples 

155 -------- 

156 While there are many ways to control exactly how a `Butler` interacts with 

157 the collections in its `Registry`, the most common cases are still simple. 

158 

159 For a read-only `Butler` that searches one collection, do:: 

160 

161 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

162 

163 For a read-write `Butler` that writes to and reads from a 

164 `~CollectionType.RUN` collection:: 

165 

166 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

167 

168 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

169 because we want to write to one `~CollectionType.RUN` collection but read 

170 from several others (as well):: 

171 

172 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

173 collections=["u/alice/DM-50000/a", 

174 "u/bob/DM-49998", 

175 "HSC/defaults"]) 

176 

177 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

178 Datasets will be read first from that run (since it appears first in the 

179 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

180 

181 Finally, one can always create a `Butler` with no collections:: 

182 

183 butler = Butler("/path/to/repo", writeable=True) 

184 

185 This can be extremely useful when you just want to use ``butler.registry``, 

186 e.g. for inserting dimension data or managing collections, or when the 

187 collections you want to use with the butler are not consistent. 

188 Passing ``writeable`` explicitly here is only necessary if you want to be 

189 able to make changes to the repo - usually the value for ``writeable`` can 

190 be guessed from the collection arguments provided, but it defaults to 

191 `False` when there are not collection arguments. 

192 """ 

193 

194 def __init__( 

195 self, 

196 config: Config | ResourcePathExpression | None = None, 

197 *, 

198 butler: Butler | None = None, 

199 collections: Any = None, 

200 run: str | None = None, 

201 searchPaths: Sequence[ResourcePathExpression] | None = None, 

202 writeable: bool | None = None, 

203 inferDefaults: bool = True, 

204 **kwargs: str, 

205 ): 

206 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

207 # Load registry, datastore, etc. from config or existing butler. 

208 if butler is not None: 

209 if config is not None or searchPaths is not None or writeable is not None: 

210 raise TypeError( 

211 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

212 ) 

213 self.registry = butler.registry.copy(defaults) 

214 self.datastore = butler.datastore 

215 self.storageClasses = butler.storageClasses 

216 self._config: ButlerConfig = butler._config 

217 else: 

218 self._config = ButlerConfig(config, searchPaths=searchPaths) 

219 try: 

220 if "root" in self._config: 

221 butlerRoot = self._config["root"] 

222 else: 

223 butlerRoot = self._config.configDir 

224 if writeable is None: 

225 writeable = run is not None 

226 self.registry = Registry.fromConfig( 

227 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

228 ) 

229 self.datastore = Datastore.fromConfig( 

230 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

231 ) 

232 self.storageClasses = StorageClassFactory() 

233 self.storageClasses.addFromConfig(self._config) 

234 except Exception: 

235 # Failures here usually mean that configuration is incomplete, 

236 # just issue an error message which includes config file URI. 

237 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

238 raise 

239 

240 # For execution butler the datastore needs a special 

241 # dependency-inversion trick. This is not used by regular butler, 

242 # but we do not have a way to distinguish regular butler from execution 

243 # butler. 

244 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

245 

246 if "run" in self._config or "collection" in self._config: 

247 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

248 

249 GENERATION: ClassVar[int] = 3 

250 """This is a Generation 3 Butler. 

251 

252 This attribute may be removed in the future, once the Generation 2 Butler 

253 interface has been fully retired; it should only be used in transitional 

254 code. 

255 """ 

256 

257 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

258 """Return DatasetType defined in registry given dataset type name.""" 

259 try: 

260 return self.registry.getDatasetType(name) 

261 except MissingDatasetTypeError: 

262 return None 

263 

264 @classmethod 

265 def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath: 

266 """Look up the label in a butler repository index. 

267 

268 Parameters 

269 ---------- 

270 label : `str` 

271 Label of the Butler repository to look up. 

272 return_label : `bool`, optional 

273 If ``label`` cannot be found in the repository index (either 

274 because index is not defined or ``label`` is not in the index) and 

275 ``return_label`` is `True` then return ``ResourcePath(label)``. 

276 If ``return_label`` is `False` (default) then an exception will be 

277 raised instead. 

278 

279 Returns 

280 ------- 

281 uri : `lsst.resources.ResourcePath` 

282 URI to the Butler repository associated with the given label or 

283 default value if it is provided. 

284 

285 Raises 

286 ------ 

287 KeyError 

288 Raised if the label is not found in the index, or if an index 

289 is not defined, and ``return_label`` is `False`. 

290 

291 Notes 

292 ----- 

293 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

294 information is discovered. 

295 """ 

296 return ButlerRepoIndex.get_repo_uri(label, return_label) 

297 

298 @classmethod 

299 def get_known_repos(cls) -> set[str]: 

300 """Retrieve the list of known repository labels. 

301 

302 Returns 

303 ------- 

304 repos : `set` of `str` 

305 All the known labels. Can be empty if no index can be found. 

306 

307 Notes 

308 ----- 

309 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

310 information is discovered. 

311 """ 

312 return ButlerRepoIndex.get_known_repos() 

313 

314 @staticmethod 

315 def makeRepo( 

316 root: ResourcePathExpression, 

317 config: Config | str | None = None, 

318 dimensionConfig: Config | str | None = None, 

319 standalone: bool = False, 

320 searchPaths: list[str] | None = None, 

321 forceConfigRoot: bool = True, 

322 outfile: ResourcePathExpression | None = None, 

323 overwrite: bool = False, 

324 ) -> Config: 

325 """Create an empty data repository by adding a butler.yaml config 

326 to a repository root directory. 

327 

328 Parameters 

329 ---------- 

330 root : `lsst.resources.ResourcePathExpression` 

331 Path or URI to the root location of the new repository. Will be 

332 created if it does not exist. 

333 config : `Config` or `str`, optional 

334 Configuration to write to the repository, after setting any 

335 root-dependent Registry or Datastore config options. Can not 

336 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

337 configuration will be used. Root-dependent config options 

338 specified in this config are overwritten if ``forceConfigRoot`` 

339 is `True`. 

340 dimensionConfig : `Config` or `str`, optional 

341 Configuration for dimensions, will be used to initialize registry 

342 database. 

343 standalone : `bool` 

344 If True, write all expanded defaults, not just customized or 

345 repository-specific settings. 

346 This (mostly) decouples the repository from the default 

347 configuration, insulating it from changes to the defaults (which 

348 may be good or bad, depending on the nature of the changes). 

349 Future *additions* to the defaults will still be picked up when 

350 initializing `Butlers` to repos created with ``standalone=True``. 

351 searchPaths : `list` of `str`, optional 

352 Directory paths to search when calculating the full butler 

353 configuration. 

354 forceConfigRoot : `bool`, optional 

355 If `False`, any values present in the supplied ``config`` that 

356 would normally be reset are not overridden and will appear 

357 directly in the output config. This allows non-standard overrides 

358 of the root directory for a datastore or registry to be given. 

359 If this parameter is `True` the values for ``root`` will be 

360 forced into the resulting config if appropriate. 

361 outfile : `lss.resources.ResourcePathExpression`, optional 

362 If not-`None`, the output configuration will be written to this 

363 location rather than into the repository itself. Can be a URI 

364 string. Can refer to a directory that will be used to write 

365 ``butler.yaml``. 

366 overwrite : `bool`, optional 

367 Create a new configuration file even if one already exists 

368 in the specified output location. Default is to raise 

369 an exception. 

370 

371 Returns 

372 ------- 

373 config : `Config` 

374 The updated `Config` instance written to the repo. 

375 

376 Raises 

377 ------ 

378 ValueError 

379 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

380 regular Config (as these subclasses would make it impossible to 

381 support ``standalone=False``). 

382 FileExistsError 

383 Raised if the output config file already exists. 

384 os.error 

385 Raised if the directory does not exist, exists but is not a 

386 directory, or cannot be created. 

387 

388 Notes 

389 ----- 

390 Note that when ``standalone=False`` (the default), the configuration 

391 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

392 construct the repository should also be used to construct any Butlers 

393 to avoid configuration inconsistencies. 

394 """ 

395 if isinstance(config, (ButlerConfig, ConfigSubset)): 

396 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

397 

398 # Ensure that the root of the repository exists or can be made 

399 root_uri = ResourcePath(root, forceDirectory=True) 

400 root_uri.mkdir() 

401 

402 config = Config(config) 

403 

404 # If we are creating a new repo from scratch with relative roots, 

405 # do not propagate an explicit root from the config file 

406 if "root" in config: 

407 del config["root"] 

408 

409 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

410 imported_class = doImportType(full["datastore", "cls"]) 

411 if not issubclass(imported_class, Datastore): 

412 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

413 datastoreClass: type[Datastore] = imported_class 

414 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

415 

416 # if key exists in given config, parse it, otherwise parse the defaults 

417 # in the expanded config 

418 if config.get(("registry", "db")): 

419 registryConfig = RegistryConfig(config) 

420 else: 

421 registryConfig = RegistryConfig(full) 

422 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

423 if defaultDatabaseUri is not None: 

424 Config.updateParameters( 

425 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

426 ) 

427 else: 

428 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

429 

430 if standalone: 

431 config.merge(full) 

432 else: 

433 # Always expand the registry.managers section into the per-repo 

434 # config, because after the database schema is created, it's not 

435 # allowed to change anymore. Note that in the standalone=True 

436 # branch, _everything_ in the config is expanded, so there's no 

437 # need to special case this. 

438 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

439 configURI: ResourcePathExpression 

440 if outfile is not None: 

441 # When writing to a separate location we must include 

442 # the root of the butler repo in the config else it won't know 

443 # where to look. 

444 config["root"] = root_uri.geturl() 

445 configURI = outfile 

446 else: 

447 configURI = root_uri 

448 # Strip obscore configuration, if it is present, before writing config 

449 # to a file, obscore config will be stored in registry. 

450 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config: 

451 config_to_write = config.copy() 

452 del config_to_write[obscore_config_key] 

453 config_to_write.dumpToUri(configURI, overwrite=overwrite) 

454 # configFile attribute is updated, need to copy it to original. 

455 config.configFile = config_to_write.configFile 

456 else: 

457 config.dumpToUri(configURI, overwrite=overwrite) 

458 

459 # Create Registry and populate tables 

460 registryConfig = RegistryConfig(config.get("registry")) 

461 dimensionConfig = DimensionConfig(dimensionConfig) 

462 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

463 

464 log.verbose("Wrote new Butler configuration file to %s", configURI) 

465 

466 return config 

467 

468 @classmethod 

469 def _unpickle( 

470 cls, 

471 config: ButlerConfig, 

472 collections: tuple[str, ...] | None, 

473 run: str | None, 

474 defaultDataId: dict[str, str], 

475 writeable: bool, 

476 ) -> Butler: 

477 """Callable used to unpickle a Butler. 

478 

479 We prefer not to use ``Butler.__init__`` directly so we can force some 

480 of its many arguments to be keyword-only (note that ``__reduce__`` 

481 can only invoke callables with positional arguments). 

482 

483 Parameters 

484 ---------- 

485 config : `ButlerConfig` 

486 Butler configuration, already coerced into a true `ButlerConfig` 

487 instance (and hence after any search paths for overrides have been 

488 utilized). 

489 collections : `tuple` [ `str` ] 

490 Names of the default collections to read from. 

491 run : `str`, optional 

492 Name of the default `~CollectionType.RUN` collection to write to. 

493 defaultDataId : `dict` [ `str`, `str` ] 

494 Default data ID values. 

495 writeable : `bool` 

496 Whether the Butler should support write operations. 

497 

498 Returns 

499 ------- 

500 butler : `Butler` 

501 A new `Butler` instance. 

502 """ 

503 # MyPy doesn't recognize that the kwargs below are totally valid; it 

504 # seems to think '**defaultDataId* is a _positional_ argument! 

505 return cls( 

506 config=config, 

507 collections=collections, 

508 run=run, 

509 writeable=writeable, 

510 **defaultDataId, # type: ignore 

511 ) 

512 

513 def __reduce__(self) -> tuple: 

514 """Support pickling.""" 

515 return ( 

516 Butler._unpickle, 

517 ( 

518 self._config, 

519 self.collections, 

520 self.run, 

521 self.registry.defaults.dataId.byName(), 

522 self.registry.isWriteable(), 

523 ), 

524 ) 

525 

526 def __str__(self) -> str: 

527 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

528 self.collections, self.run, self.datastore, self.registry 

529 ) 

530 

531 def isWriteable(self) -> bool: 

532 """Return `True` if this `Butler` supports write operations.""" 

533 return self.registry.isWriteable() 

534 

535 @contextlib.contextmanager 

536 def transaction(self) -> Iterator[None]: 

537 """Context manager supporting `Butler` transactions. 

538 

539 Transactions can be nested. 

540 """ 

541 with self.registry.transaction(): 

542 with self.datastore.transaction(): 

543 yield 

544 

545 def _standardizeArgs( 

546 self, 

547 datasetRefOrType: DatasetRef | DatasetType | str, 

548 dataId: DataId | None = None, 

549 for_put: bool = True, 

550 **kwargs: Any, 

551 ) -> tuple[DatasetType, DataId | None]: 

552 """Standardize the arguments passed to several Butler APIs. 

553 

554 Parameters 

555 ---------- 

556 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

557 When `DatasetRef` the `dataId` should be `None`. 

558 Otherwise the `DatasetType` or name thereof. 

559 dataId : `dict` or `DataCoordinate` 

560 A `dict` of `Dimension` link name, value pairs that label the 

561 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

562 should be provided as the second argument. 

563 for_put : `bool`, optional 

564 If `True` this call is invoked as part of a `Butler.put()`. 

565 Otherwise it is assumed to be part of a `Butler.get()`. This 

566 parameter is only relevant if there is dataset type 

567 inconsistency. 

568 **kwargs 

569 Additional keyword arguments used to augment or construct a 

570 `DataCoordinate`. See `DataCoordinate.standardize` 

571 parameters. 

572 

573 Returns 

574 ------- 

575 datasetType : `DatasetType` 

576 A `DatasetType` instance extracted from ``datasetRefOrType``. 

577 dataId : `dict` or `DataId`, optional 

578 Argument that can be used (along with ``kwargs``) to construct a 

579 `DataId`. 

580 

581 Notes 

582 ----- 

583 Butler APIs that conceptually need a DatasetRef also allow passing a 

584 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

585 keyword arguments that can be used to construct one) separately. This 

586 method accepts those arguments and always returns a true `DatasetType` 

587 and a `DataId` or `dict`. 

588 

589 Standardization of `dict` vs `DataId` is best handled by passing the 

590 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

591 generally similarly flexible. 

592 """ 

593 externalDatasetType: DatasetType | None = None 

594 internalDatasetType: DatasetType | None = None 

595 if isinstance(datasetRefOrType, DatasetRef): 

596 if dataId is not None or kwargs: 

597 raise ValueError("DatasetRef given, cannot use dataId as well") 

598 externalDatasetType = datasetRefOrType.datasetType 

599 dataId = datasetRefOrType.dataId 

600 else: 

601 # Don't check whether DataId is provided, because Registry APIs 

602 # can usually construct a better error message when it wasn't. 

603 if isinstance(datasetRefOrType, DatasetType): 

604 externalDatasetType = datasetRefOrType 

605 else: 

606 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

607 

608 # Check that they are self-consistent 

609 if externalDatasetType is not None: 

610 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

611 if externalDatasetType != internalDatasetType: 

612 # We can allow differences if they are compatible, depending 

613 # on whether this is a get or a put. A get requires that 

614 # the python type associated with the datastore can be 

615 # converted to the user type. A put requires that the user 

616 # supplied python type can be converted to the internal 

617 # type expected by registry. 

618 relevantDatasetType = internalDatasetType 

619 if for_put: 

620 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

621 else: 

622 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

623 relevantDatasetType = externalDatasetType 

624 if not is_compatible: 

625 raise ValueError( 

626 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

627 f"registry definition ({internalDatasetType})" 

628 ) 

629 # Override the internal definition. 

630 internalDatasetType = relevantDatasetType 

631 

632 assert internalDatasetType is not None 

633 return internalDatasetType, dataId 

634 

635 def _rewrite_data_id( 

636 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any 

637 ) -> tuple[DataId | None, dict[str, Any]]: 

638 """Rewrite a data ID taking into account dimension records. 

639 

640 Take a Data ID and keyword args and rewrite it if necessary to 

641 allow the user to specify dimension records rather than dimension 

642 primary values. 

643 

644 This allows a user to include a dataId dict with keys of 

645 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

646 the integer exposure ID. It also allows a string to be given 

647 for a dimension value rather than the integer ID if that is more 

648 convenient. For example, rather than having to specifyin the 

649 detector with ``detector.full_name``, a string given for ``detector`` 

650 will be interpreted as the full name and converted to the integer 

651 value. 

652 

653 Keyword arguments can also use strings for dimensions like detector 

654 and exposure but python does not allow them to include ``.`` and 

655 so the ``exposure.day_obs`` syntax can not be used in a keyword 

656 argument. 

657 

658 Parameters 

659 ---------- 

660 dataId : `dict` or `DataCoordinate` 

661 A `dict` of `Dimension` link name, value pairs that will label the 

662 `DatasetRef` within a Collection. 

663 datasetType : `DatasetType` 

664 The dataset type associated with this dataId. Required to 

665 determine the relevant dimensions. 

666 **kwargs 

667 Additional keyword arguments used to augment or construct a 

668 `DataId`. See `DataId` parameters. 

669 

670 Returns 

671 ------- 

672 dataId : `dict` or `DataCoordinate` 

673 The, possibly rewritten, dataId. If given a `DataCoordinate` and 

674 no keyword arguments, the original dataId will be returned 

675 unchanged. 

676 **kwargs : `dict` 

677 Any unused keyword arguments (would normally be empty dict). 

678 """ 

679 # Do nothing if we have a standalone DataCoordinate. 

680 if isinstance(dataId, DataCoordinate) and not kwargs: 

681 return dataId, kwargs 

682 

683 # Process dimension records that are using record information 

684 # rather than ids 

685 newDataId: dict[str, DataIdValue] = {} 

686 byRecord: dict[str, dict[str, Any]] = defaultdict(dict) 

687 

688 # if all the dataId comes from keyword parameters we do not need 

689 # to do anything here because they can't be of the form 

690 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

691 if dataId: 

692 for k, v in dataId.items(): 

693 # If we have a Dimension we do not need to do anything 

694 # because it cannot be a compound key. 

695 if isinstance(k, str) and "." in k: 

696 # Someone is using a more human-readable dataId 

697 dimensionName, record = k.split(".", 1) 

698 byRecord[dimensionName][record] = v 

699 elif isinstance(k, Dimension): 

700 newDataId[k.name] = v 

701 else: 

702 newDataId[k] = v 

703 

704 # Go through the updated dataId and check the type in case someone is 

705 # using an alternate key. We have already filtered out the compound 

706 # keys dimensions.record format. 

707 not_dimensions = {} 

708 

709 # Will need to look in the dataId and the keyword arguments 

710 # and will remove them if they need to be fixed or are unrecognized. 

711 for dataIdDict in (newDataId, kwargs): 

712 # Use a list so we can adjust the dict safely in the loop 

713 for dimensionName in list(dataIdDict): 

714 value = dataIdDict[dimensionName] 

715 try: 

716 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

717 except KeyError: 

718 # This is not a real dimension 

719 not_dimensions[dimensionName] = value 

720 del dataIdDict[dimensionName] 

721 continue 

722 

723 # Convert an integral type to an explicit int to simplify 

724 # comparisons here 

725 if isinstance(value, numbers.Integral): 

726 value = int(value) 

727 

728 if not isinstance(value, dimension.primaryKey.getPythonType()): 

729 for alternate in dimension.alternateKeys: 

730 if isinstance(value, alternate.getPythonType()): 

731 byRecord[dimensionName][alternate.name] = value 

732 del dataIdDict[dimensionName] 

733 log.debug( 

734 "Converting dimension %s to %s.%s=%s", 

735 dimensionName, 

736 dimensionName, 

737 alternate.name, 

738 value, 

739 ) 

740 break 

741 else: 

742 log.warning( 

743 "Type mismatch found for value '%r' provided for dimension %s. " 

744 "Could not find matching alternative (primary key has type %s) " 

745 "so attempting to use as-is.", 

746 value, 

747 dimensionName, 

748 dimension.primaryKey.getPythonType(), 

749 ) 

750 

751 # By this point kwargs and newDataId should only include valid 

752 # dimensions. Merge kwargs in to the new dataId and log if there 

753 # are dimensions in both (rather than calling update). 

754 for k, v in kwargs.items(): 

755 if k in newDataId and newDataId[k] != v: 

756 log.debug( 

757 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

758 ) 

759 newDataId[k] = v 

760 # No need to retain any values in kwargs now. 

761 kwargs = {} 

762 

763 # If we have some unrecognized dimensions we have to try to connect 

764 # them to records in other dimensions. This is made more complicated 

765 # by some dimensions having records with clashing names. A mitigation 

766 # is that we can tell by this point which dimensions are missing 

767 # for the DatasetType but this does not work for calibrations 

768 # where additional dimensions can be used to constrain the temporal 

769 # axis. 

770 if not_dimensions: 

771 # Search for all dimensions even if we have been given a value 

772 # explicitly. In some cases records are given as well as the 

773 # actually dimension and this should not be an error if they 

774 # match. 

775 mandatoryDimensions = datasetType.dimensions.names # - provided 

776 

777 candidateDimensions: set[str] = set() 

778 candidateDimensions.update(mandatoryDimensions) 

779 

780 # For calibrations we may well be needing temporal dimensions 

781 # so rather than always including all dimensions in the scan 

782 # restrict things a little. It is still possible for there 

783 # to be confusion over day_obs in visit vs exposure for example. 

784 # If we are not searching calibration collections things may 

785 # fail but they are going to fail anyway because of the 

786 # ambiguousness of the dataId... 

787 if datasetType.isCalibration(): 

788 for dim in self.dimensions.getStaticDimensions(): 

789 if dim.temporal: 

790 candidateDimensions.add(str(dim)) 

791 

792 # Look up table for the first association with a dimension 

793 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) 

794 

795 # Keep track of whether an item is associated with multiple 

796 # dimensions. 

797 counter: Counter[str] = Counter() 

798 assigned: dict[str, set[str]] = defaultdict(set) 

799 

800 # Go through the missing dimensions and associate the 

801 # given names with records within those dimensions 

802 matched_dims = set() 

803 for dimensionName in candidateDimensions: 

804 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

805 fields = dimension.metadata.names | dimension.uniqueKeys.names 

806 for field in not_dimensions: 

807 if field in fields: 

808 guessedAssociation[dimensionName][field] = not_dimensions[field] 

809 counter[dimensionName] += 1 

810 assigned[field].add(dimensionName) 

811 matched_dims.add(field) 

812 

813 # Calculate the fields that matched nothing. 

814 never_found = set(not_dimensions) - matched_dims 

815 

816 if never_found: 

817 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

818 

819 # There is a chance we have allocated a single dataId item 

820 # to multiple dimensions. Need to decide which should be retained. 

821 # For now assume that the most popular alternative wins. 

822 # This means that day_obs with seq_num will result in 

823 # exposure.day_obs and not visit.day_obs 

824 # Also prefer an explicitly missing dimension over an inferred 

825 # temporal dimension. 

826 for fieldName, assignedDimensions in assigned.items(): 

827 if len(assignedDimensions) > 1: 

828 # Pick the most popular (preferring mandatory dimensions) 

829 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

830 if requiredButMissing: 

831 candidateDimensions = requiredButMissing 

832 else: 

833 candidateDimensions = assignedDimensions 

834 

835 # If this is a choice between visit and exposure and 

836 # neither was a required part of the dataset type, 

837 # (hence in this branch) always prefer exposure over 

838 # visit since exposures are always defined and visits 

839 # are defined from exposures. 

840 if candidateDimensions == {"exposure", "visit"}: 

841 candidateDimensions = {"exposure"} 

842 

843 # Select the relevant items and get a new restricted 

844 # counter. 

845 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

846 duplicatesCounter: Counter[str] = Counter() 

847 duplicatesCounter.update(theseCounts) 

848 

849 # Choose the most common. If they are equally common 

850 # we will pick the one that was found first. 

851 # Returns a list of tuples 

852 selected = duplicatesCounter.most_common(1)[0][0] 

853 

854 log.debug( 

855 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

856 " Removed ambiguity by choosing dimension %s.", 

857 fieldName, 

858 ", ".join(assignedDimensions), 

859 selected, 

860 ) 

861 

862 for candidateDimension in assignedDimensions: 

863 if candidateDimension != selected: 

864 del guessedAssociation[candidateDimension][fieldName] 

865 

866 # Update the record look up dict with the new associations 

867 for dimensionName, values in guessedAssociation.items(): 

868 if values: # A dict might now be empty 

869 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

870 byRecord[dimensionName].update(values) 

871 

872 if byRecord: 

873 # Some record specifiers were found so we need to convert 

874 # them to the Id form 

875 for dimensionName, values in byRecord.items(): 

876 if dimensionName in newDataId: 

877 log.debug( 

878 "DataId specified explicit %s dimension value of %s in addition to" 

879 " general record specifiers for it of %s. Ignoring record information.", 

880 dimensionName, 

881 newDataId[dimensionName], 

882 str(values), 

883 ) 

884 # Get the actual record and compare with these values. 

885 try: 

886 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

887 except DataIdError: 

888 raise ValueError( 

889 f"Could not find dimension '{dimensionName}'" 

890 f" with dataId {newDataId} as part of comparing with" 

891 f" record values {byRecord[dimensionName]}" 

892 ) from None 

893 if len(recs) == 1: 

894 errmsg: list[str] = [] 

895 for k, v in values.items(): 

896 if (recval := getattr(recs[0], k)) != v: 

897 errmsg.append(f"{k}({recval} != {v})") 

898 if errmsg: 

899 raise ValueError( 

900 f"Dimension {dimensionName} in dataId has explicit value" 

901 " inconsistent with records: " + ", ".join(errmsg) 

902 ) 

903 else: 

904 # Multiple matches for an explicit dimension 

905 # should never happen but let downstream complain. 

906 pass 

907 continue 

908 

909 # Build up a WHERE expression 

910 bind = {k: v for k, v in values.items()} 

911 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

912 

913 # Hopefully we get a single record that matches 

914 records = set( 

915 self.registry.queryDimensionRecords( 

916 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

917 ) 

918 ) 

919 

920 if len(records) != 1: 

921 if len(records) > 1: 

922 # visit can have an ambiguous answer without involving 

923 # visit_system. The default visit_system is defined 

924 # by the instrument. 

925 if ( 

926 dimensionName == "visit" 

927 and "visit_system_membership" in self.dimensions 

928 and "visit_system" in self.dimensions["instrument"].metadata 

929 ): 

930 instrument_records = list( 

931 self.registry.queryDimensionRecords( 

932 "instrument", 

933 dataId=newDataId, 

934 **kwargs, 

935 ) 

936 ) 

937 if len(instrument_records) == 1: 

938 visit_system = instrument_records[0].visit_system 

939 if visit_system is None: 

940 # Set to a value that will never match. 

941 visit_system = -1 

942 

943 # Look up each visit in the 

944 # visit_system_membership records. 

945 for rec in records: 

946 membership = list( 

947 self.registry.queryDimensionRecords( 

948 # Use bind to allow zero results. 

949 # This is a fully-specified query. 

950 "visit_system_membership", 

951 where="instrument = inst AND visit_system = system AND visit = v", 

952 bind=dict( 

953 inst=instrument_records[0].name, system=visit_system, v=rec.id 

954 ), 

955 ) 

956 ) 

957 if membership: 

958 # This record is the right answer. 

959 records = {rec} 

960 break 

961 

962 # The ambiguity may have been resolved so check again. 

963 if len(records) > 1: 

964 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

965 for r in records: 

966 log.debug("- %s", str(r)) 

967 raise ValueError( 

968 f"DataId specification for dimension {dimensionName} is not" 

969 f" uniquely constrained to a single dataset by {values}." 

970 f" Got {len(records)} results." 

971 ) 

972 else: 

973 raise ValueError( 

974 f"DataId specification for dimension {dimensionName} matched no" 

975 f" records when constrained by {values}" 

976 ) 

977 

978 # Get the primary key from the real dimension object 

979 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

980 if not isinstance(dimension, Dimension): 

981 raise RuntimeError( 

982 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

983 ) 

984 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

985 

986 return newDataId, kwargs 

987 

988 def _findDatasetRef( 

989 self, 

990 datasetRefOrType: DatasetRef | DatasetType | str, 

991 dataId: DataId | None = None, 

992 *, 

993 collections: Any = None, 

994 predict: bool = False, 

995 run: str | None = None, 

996 **kwargs: Any, 

997 ) -> DatasetRef: 

998 """Shared logic for methods that start with a search for a dataset in 

999 the registry. 

1000 

1001 Parameters 

1002 ---------- 

1003 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1004 When `DatasetRef` the `dataId` should be `None`. 

1005 Otherwise the `DatasetType` or name thereof. 

1006 dataId : `dict` or `DataCoordinate`, optional 

1007 A `dict` of `Dimension` link name, value pairs that label the 

1008 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1009 should be provided as the first argument. 

1010 collections : Any, optional 

1011 Collections to be searched, overriding ``self.collections``. 

1012 Can be any of the types supported by the ``collections`` argument 

1013 to butler construction. 

1014 predict : `bool`, optional 

1015 If `True`, return a newly created `DatasetRef` with a unique 

1016 dataset ID if finding a reference in the `Registry` fails. 

1017 Defaults to `False`. 

1018 run : `str`, optional 

1019 Run collection name to use for creating `DatasetRef` for predicted 

1020 datasets. Only used if ``predict`` is `True`. 

1021 **kwargs 

1022 Additional keyword arguments used to augment or construct a 

1023 `DataId`. See `DataId` parameters. 

1024 

1025 Returns 

1026 ------- 

1027 ref : `DatasetRef` 

1028 A reference to the dataset identified by the given arguments. 

1029 This can be the same dataset reference as given if it was 

1030 resolved. 

1031 

1032 Raises 

1033 ------ 

1034 LookupError 

1035 Raised if no matching dataset exists in the `Registry` (and 

1036 ``predict`` is `False`). 

1037 ValueError 

1038 Raised if a resolved `DatasetRef` was passed as an input, but it 

1039 differs from the one found in the registry. 

1040 TypeError 

1041 Raised if no collections were provided. 

1042 """ 

1043 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1044 if isinstance(datasetRefOrType, DatasetRef): 

1045 if collections is not None: 

1046 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) 

1047 return datasetRefOrType 

1048 timespan: Timespan | None = None 

1049 

1050 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1051 

1052 if datasetType.isCalibration(): 

1053 # Because this is a calibration dataset, first try to make a 

1054 # standardize the data ID without restricting the dimensions to 

1055 # those of the dataset type requested, because there may be extra 

1056 # dimensions that provide temporal information for a validity-range 

1057 # lookup. 

1058 dataId = DataCoordinate.standardize( 

1059 dataId, universe=self.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1060 ) 

1061 if dataId.graph.temporal: 

1062 dataId = self.registry.expandDataId(dataId) 

1063 timespan = dataId.timespan 

1064 else: 

1065 # Standardize the data ID to just the dimensions of the dataset 

1066 # type instead of letting registry.findDataset do it, so we get the 

1067 # result even if no dataset is found. 

1068 dataId = DataCoordinate.standardize( 

1069 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1070 ) 

1071 # Always lookup the DatasetRef, even if one is given, to ensure it is 

1072 # present in the current collection. 

1073 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1074 if ref is None: 

1075 if predict: 

1076 if run is None: 

1077 run = self.run 

1078 if run is None: 

1079 raise TypeError("Cannot predict dataset ID/location with run=None.") 

1080 return DatasetRef(datasetType, dataId, run=run) 

1081 else: 

1082 if collections is None: 

1083 collections = self.registry.defaults.collections 

1084 raise LookupError( 

1085 f"Dataset {datasetType.name} with data ID {dataId} " 

1086 f"could not be found in collections {collections}." 

1087 ) 

1088 if datasetType != ref.datasetType: 

1089 # If they differ it is because the user explicitly specified 

1090 # a compatible dataset type to this call rather than using the 

1091 # registry definition. The DatasetRef must therefore be recreated 

1092 # using the user definition such that the expected type is 

1093 # returned. 

1094 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1095 

1096 return ref 

1097 

1098 @transactional 

1099 @deprecated( 

1100 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." 

1101 " Please use Butler.put(). Be aware that you may need to adjust your usage if you" 

1102 " were relying on the run parameter to determine the run." 

1103 " Will be removed after v27.0.", 

1104 version="v26.0", 

1105 category=FutureWarning, 

1106 ) 

1107 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

1108 # Docstring inherited. 

1109 return self.put(obj, ref) 

1110 

1111 @transactional 

1112 def put( 

1113 self, 

1114 obj: Any, 

1115 datasetRefOrType: DatasetRef | DatasetType | str, 

1116 /, 

1117 dataId: DataId | None = None, 

1118 *, 

1119 run: str | None = None, 

1120 **kwargs: Any, 

1121 ) -> DatasetRef: 

1122 """Store and register a dataset. 

1123 

1124 Parameters 

1125 ---------- 

1126 obj : `object` 

1127 The dataset. 

1128 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1129 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1130 Otherwise the `DatasetType` or name thereof. If a fully resolved 

1131 `DatasetRef` is given the run and ID are used directly. 

1132 dataId : `dict` or `DataCoordinate` 

1133 A `dict` of `Dimension` link name, value pairs that label the 

1134 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1135 should be provided as the second argument. 

1136 run : `str`, optional 

1137 The name of the run the dataset should be added to, overriding 

1138 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

1139 **kwargs 

1140 Additional keyword arguments used to augment or construct a 

1141 `DataCoordinate`. See `DataCoordinate.standardize` 

1142 parameters. Not used if a resolve `DatasetRef` is provided. 

1143 

1144 Returns 

1145 ------- 

1146 ref : `DatasetRef` 

1147 A reference to the stored dataset, updated with the correct id if 

1148 given. 

1149 

1150 Raises 

1151 ------ 

1152 TypeError 

1153 Raised if the butler is read-only or if no run has been provided. 

1154 """ 

1155 if isinstance(datasetRefOrType, DatasetRef): 

1156 # This is a direct put of predefined DatasetRef. 

1157 log.debug("Butler put direct: %s", datasetRefOrType) 

1158 if run is not None: 

1159 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) 

1160 # If registry already has a dataset with the same dataset ID, 

1161 # dataset type and DataId, then _importDatasets will do nothing and 

1162 # just return an original ref. We have to raise in this case, there 

1163 # is a datastore check below for that. 

1164 self.registry._importDatasets([datasetRefOrType], expand=True) 

1165 # Before trying to write to the datastore check that it does not 

1166 # know this dataset. This is prone to races, of course. 

1167 if self.datastore.knows(datasetRefOrType): 

1168 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

1169 # Try to write dataset to the datastore, if it fails due to a race 

1170 # with another write, the content of stored data may be 

1171 # unpredictable. 

1172 try: 

1173 self.datastore.put(obj, datasetRefOrType) 

1174 except IntegrityError as e: 

1175 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") 

1176 return datasetRefOrType 

1177 

1178 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1179 if not self.isWriteable(): 

1180 raise TypeError("Butler is read-only.") 

1181 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1182 

1183 # Handle dimension records in dataId 

1184 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1185 

1186 # Add Registry Dataset entry. 

1187 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1188 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1189 self.datastore.put(obj, ref) 

1190 

1191 return ref 

1192 

1193 @deprecated( 

1194 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

1195 " Please use Butler.get(). Will be removed after v27.0.", 

1196 version="v26.0", 

1197 category=FutureWarning, 

1198 ) 

1199 def getDirect( 

1200 self, 

1201 ref: DatasetRef, 

1202 *, 

1203 parameters: dict[str, Any] | None = None, 

1204 storageClass: StorageClass | str | None = None, 

1205 ) -> Any: 

1206 """Retrieve a stored dataset. 

1207 

1208 Parameters 

1209 ---------- 

1210 ref : `DatasetRef` 

1211 Resolved reference to an already stored dataset. 

1212 parameters : `dict` 

1213 Additional StorageClass-defined options to control reading, 

1214 typically used to efficiently read only a subset of the dataset. 

1215 storageClass : `StorageClass` or `str`, optional 

1216 The storage class to be used to override the Python type 

1217 returned by this method. By default the returned type matches 

1218 the dataset type definition for this dataset. Specifying a 

1219 read `StorageClass` can force a different type to be returned. 

1220 This type must be compatible with the original type. 

1221 

1222 Returns 

1223 ------- 

1224 obj : `object` 

1225 The dataset. 

1226 """ 

1227 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1228 

1229 @deprecated( 

1230 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

1231 "Please use Butler.getDeferred(). Will be removed after v27.0.", 

1232 version="v26.0", 

1233 category=FutureWarning, 

1234 ) 

1235 def getDirectDeferred( 

1236 self, 

1237 ref: DatasetRef, 

1238 *, 

1239 parameters: dict | None = None, 

1240 storageClass: str | StorageClass | None = None, 

1241 ) -> DeferredDatasetHandle: 

1242 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1243 from a resolved `DatasetRef`. 

1244 

1245 Parameters 

1246 ---------- 

1247 ref : `DatasetRef` 

1248 Resolved reference to an already stored dataset. 

1249 parameters : `dict` 

1250 Additional StorageClass-defined options to control reading, 

1251 typically used to efficiently read only a subset of the dataset. 

1252 storageClass : `StorageClass` or `str`, optional 

1253 The storage class to be used to override the Python type 

1254 returned by this method. By default the returned type matches 

1255 the dataset type definition for this dataset. Specifying a 

1256 read `StorageClass` can force a different type to be returned. 

1257 This type must be compatible with the original type. 

1258 

1259 Returns 

1260 ------- 

1261 obj : `DeferredDatasetHandle` 

1262 A handle which can be used to retrieve a dataset at a later time. 

1263 

1264 Raises 

1265 ------ 

1266 LookupError 

1267 Raised if no matching dataset exists in the `Registry`. 

1268 """ 

1269 # Check thad dataset actuall exists. 

1270 if not self.datastore.exists(ref): 

1271 raise LookupError(f"Dataset reference {ref} does not exist.") 

1272 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1273 

1274 def getDeferred( 

1275 self, 

1276 datasetRefOrType: DatasetRef | DatasetType | str, 

1277 /, 

1278 dataId: DataId | None = None, 

1279 *, 

1280 parameters: dict | None = None, 

1281 collections: Any = None, 

1282 storageClass: str | StorageClass | None = None, 

1283 **kwargs: Any, 

1284 ) -> DeferredDatasetHandle: 

1285 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1286 after an immediate registry lookup. 

1287 

1288 Parameters 

1289 ---------- 

1290 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1291 When `DatasetRef` the `dataId` should be `None`. 

1292 Otherwise the `DatasetType` or name thereof. 

1293 dataId : `dict` or `DataCoordinate`, optional 

1294 A `dict` of `Dimension` link name, value pairs that label the 

1295 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1296 should be provided as the first argument. 

1297 parameters : `dict` 

1298 Additional StorageClass-defined options to control reading, 

1299 typically used to efficiently read only a subset of the dataset. 

1300 collections : Any, optional 

1301 Collections to be searched, overriding ``self.collections``. 

1302 Can be any of the types supported by the ``collections`` argument 

1303 to butler construction. 

1304 storageClass : `StorageClass` or `str`, optional 

1305 The storage class to be used to override the Python type 

1306 returned by this method. By default the returned type matches 

1307 the dataset type definition for this dataset. Specifying a 

1308 read `StorageClass` can force a different type to be returned. 

1309 This type must be compatible with the original type. 

1310 **kwargs 

1311 Additional keyword arguments used to augment or construct a 

1312 `DataId`. See `DataId` parameters. 

1313 

1314 Returns 

1315 ------- 

1316 obj : `DeferredDatasetHandle` 

1317 A handle which can be used to retrieve a dataset at a later time. 

1318 

1319 Raises 

1320 ------ 

1321 LookupError 

1322 Raised if no matching dataset exists in the `Registry`. 

1323 ValueError 

1324 Raised if a resolved `DatasetRef` was passed as an input, but it 

1325 differs from the one found in the registry. 

1326 TypeError 

1327 Raised if no collections were provided. 

1328 """ 

1329 if isinstance(datasetRefOrType, DatasetRef) and not self.datastore.exists(datasetRefOrType): 

1330 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1331 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1332 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1333 

1334 def get( 

1335 self, 

1336 datasetRefOrType: DatasetRef | DatasetType | str, 

1337 /, 

1338 dataId: DataId | None = None, 

1339 *, 

1340 parameters: dict[str, Any] | None = None, 

1341 collections: Any = None, 

1342 storageClass: StorageClass | str | None = None, 

1343 **kwargs: Any, 

1344 ) -> Any: 

1345 """Retrieve a stored dataset. 

1346 

1347 Parameters 

1348 ---------- 

1349 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1350 When `DatasetRef` the `dataId` should be `None`. 

1351 Otherwise the `DatasetType` or name thereof. 

1352 If a resolved `DatasetRef`, the associated dataset 

1353 is returned directly without additional querying. 

1354 dataId : `dict` or `DataCoordinate` 

1355 A `dict` of `Dimension` link name, value pairs that label the 

1356 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1357 should be provided as the first argument. 

1358 parameters : `dict` 

1359 Additional StorageClass-defined options to control reading, 

1360 typically used to efficiently read only a subset of the dataset. 

1361 collections : Any, optional 

1362 Collections to be searched, overriding ``self.collections``. 

1363 Can be any of the types supported by the ``collections`` argument 

1364 to butler construction. 

1365 storageClass : `StorageClass` or `str`, optional 

1366 The storage class to be used to override the Python type 

1367 returned by this method. By default the returned type matches 

1368 the dataset type definition for this dataset. Specifying a 

1369 read `StorageClass` can force a different type to be returned. 

1370 This type must be compatible with the original type. 

1371 **kwargs 

1372 Additional keyword arguments used to augment or construct a 

1373 `DataCoordinate`. See `DataCoordinate.standardize` 

1374 parameters. 

1375 

1376 Returns 

1377 ------- 

1378 obj : `object` 

1379 The dataset. 

1380 

1381 Raises 

1382 ------ 

1383 LookupError 

1384 Raised if no matching dataset exists in the `Registry`. 

1385 TypeError 

1386 Raised if no collections were provided. 

1387 

1388 Notes 

1389 ----- 

1390 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1391 this method requires that the given data ID include temporal dimensions 

1392 beyond the dimensions of the dataset type itself, in order to find the 

1393 dataset with the appropriate validity range. For example, a "bias" 

1394 dataset with native dimensions ``{instrument, detector}`` could be 

1395 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1396 ``exposure`` is a temporal dimension. 

1397 """ 

1398 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1399 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1400 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1401 

1402 def getURIs( 

1403 self, 

1404 datasetRefOrType: DatasetRef | DatasetType | str, 

1405 /, 

1406 dataId: DataId | None = None, 

1407 *, 

1408 predict: bool = False, 

1409 collections: Any = None, 

1410 run: str | None = None, 

1411 **kwargs: Any, 

1412 ) -> DatasetRefURIs: 

1413 """Returns the URIs associated with the dataset. 

1414 

1415 Parameters 

1416 ---------- 

1417 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1418 When `DatasetRef` the `dataId` should be `None`. 

1419 Otherwise the `DatasetType` or name thereof. 

1420 dataId : `dict` or `DataCoordinate` 

1421 A `dict` of `Dimension` link name, value pairs that label the 

1422 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1423 should be provided as the first argument. 

1424 predict : `bool` 

1425 If `True`, allow URIs to be returned of datasets that have not 

1426 been written. 

1427 collections : Any, optional 

1428 Collections to be searched, overriding ``self.collections``. 

1429 Can be any of the types supported by the ``collections`` argument 

1430 to butler construction. 

1431 run : `str`, optional 

1432 Run to use for predictions, overriding ``self.run``. 

1433 **kwargs 

1434 Additional keyword arguments used to augment or construct a 

1435 `DataCoordinate`. See `DataCoordinate.standardize` 

1436 parameters. 

1437 

1438 Returns 

1439 ------- 

1440 uris : `DatasetRefURIs` 

1441 The URI to the primary artifact associated with this dataset (if 

1442 the dataset was disassembled within the datastore this may be 

1443 `None`), and the URIs to any components associated with the dataset 

1444 artifact. (can be empty if there are no components). 

1445 """ 

1446 ref = self._findDatasetRef( 

1447 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1448 ) 

1449 return self.datastore.getURIs(ref, predict) 

1450 

1451 def getURI( 

1452 self, 

1453 datasetRefOrType: DatasetRef | DatasetType | str, 

1454 /, 

1455 dataId: DataId | None = None, 

1456 *, 

1457 predict: bool = False, 

1458 collections: Any = None, 

1459 run: str | None = None, 

1460 **kwargs: Any, 

1461 ) -> ResourcePath: 

1462 """Return the URI to the Dataset. 

1463 

1464 Parameters 

1465 ---------- 

1466 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1467 When `DatasetRef` the `dataId` should be `None`. 

1468 Otherwise the `DatasetType` or name thereof. 

1469 dataId : `dict` or `DataCoordinate` 

1470 A `dict` of `Dimension` link name, value pairs that label the 

1471 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1472 should be provided as the first argument. 

1473 predict : `bool` 

1474 If `True`, allow URIs to be returned of datasets that have not 

1475 been written. 

1476 collections : Any, optional 

1477 Collections to be searched, overriding ``self.collections``. 

1478 Can be any of the types supported by the ``collections`` argument 

1479 to butler construction. 

1480 run : `str`, optional 

1481 Run to use for predictions, overriding ``self.run``. 

1482 **kwargs 

1483 Additional keyword arguments used to augment or construct a 

1484 `DataCoordinate`. See `DataCoordinate.standardize` 

1485 parameters. 

1486 

1487 Returns 

1488 ------- 

1489 uri : `lsst.resources.ResourcePath` 

1490 URI pointing to the Dataset within the datastore. If the 

1491 Dataset does not exist in the datastore, and if ``predict`` is 

1492 `True`, the URI will be a prediction and will include a URI 

1493 fragment "#predicted". 

1494 If the datastore does not have entities that relate well 

1495 to the concept of a URI the returned URI string will be 

1496 descriptive. The returned URI is not guaranteed to be obtainable. 

1497 

1498 Raises 

1499 ------ 

1500 LookupError 

1501 A URI has been requested for a dataset that does not exist and 

1502 guessing is not allowed. 

1503 ValueError 

1504 Raised if a resolved `DatasetRef` was passed as an input, but it 

1505 differs from the one found in the registry. 

1506 TypeError 

1507 Raised if no collections were provided. 

1508 RuntimeError 

1509 Raised if a URI is requested for a dataset that consists of 

1510 multiple artifacts. 

1511 """ 

1512 primary, components = self.getURIs( 

1513 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1514 ) 

1515 

1516 if primary is None or components: 

1517 raise RuntimeError( 

1518 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1519 "Use Butler.getURIs() instead." 

1520 ) 

1521 return primary 

1522 

1523 def retrieveArtifacts( 

1524 self, 

1525 refs: Iterable[DatasetRef], 

1526 destination: ResourcePathExpression, 

1527 transfer: str = "auto", 

1528 preserve_path: bool = True, 

1529 overwrite: bool = False, 

1530 ) -> list[ResourcePath]: 

1531 """Retrieve the artifacts associated with the supplied refs. 

1532 

1533 Parameters 

1534 ---------- 

1535 refs : iterable of `DatasetRef` 

1536 The datasets for which artifacts are to be retrieved. 

1537 A single ref can result in multiple artifacts. The refs must 

1538 be resolved. 

1539 destination : `lsst.resources.ResourcePath` or `str` 

1540 Location to write the artifacts. 

1541 transfer : `str`, optional 

1542 Method to use to transfer the artifacts. Must be one of the options 

1543 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1544 "move" is not allowed. 

1545 preserve_path : `bool`, optional 

1546 If `True` the full path of the artifact within the datastore 

1547 is preserved. If `False` the final file component of the path 

1548 is used. 

1549 overwrite : `bool`, optional 

1550 If `True` allow transfers to overwrite existing files at the 

1551 destination. 

1552 

1553 Returns 

1554 ------- 

1555 targets : `list` of `lsst.resources.ResourcePath` 

1556 URIs of file artifacts in destination location. Order is not 

1557 preserved. 

1558 

1559 Notes 

1560 ----- 

1561 For non-file datastores the artifacts written to the destination 

1562 may not match the representation inside the datastore. For example 

1563 a hierarchical data structure in a NoSQL database may well be stored 

1564 as a JSON file. 

1565 """ 

1566 return self.datastore.retrieveArtifacts( 

1567 refs, 

1568 ResourcePath(destination), 

1569 transfer=transfer, 

1570 preserve_path=preserve_path, 

1571 overwrite=overwrite, 

1572 ) 

1573 

1574 def exists( 

1575 self, 

1576 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1577 /, 

1578 data_id: DataId | None = None, 

1579 *, 

1580 full_check: bool = True, 

1581 collections: Any = None, 

1582 **kwargs: Any, 

1583 ) -> DatasetExistence: 

1584 """Indicate whether a dataset is known to Butler registry and 

1585 datastore. 

1586 

1587 Parameters 

1588 ---------- 

1589 dataset_ref_or_type : `DatasetRef`, `DatasetType`, or `str` 

1590 When `DatasetRef` the `dataId` should be `None`. 

1591 Otherwise the `DatasetType` or name thereof. 

1592 data_id : `dict` or `DataCoordinate` 

1593 A `dict` of `Dimension` link name, value pairs that label the 

1594 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1595 should be provided as the first argument. 

1596 full_check : `bool`, optional 

1597 If `True`, an additional check will be made for dataset artifact 

1598 existence. This will involve additional overhead due to the need 

1599 to query an external system. If `False` registry and datastore 

1600 will solely be asked if they know about the dataset but no 

1601 check for the artifact will be performed. 

1602 collections : Any, optional 

1603 Collections to be searched, overriding ``self.collections``. 

1604 Can be any of the types supported by the ``collections`` argument 

1605 to butler construction. 

1606 **kwargs 

1607 Additional keyword arguments used to augment or construct a 

1608 `DataCoordinate`. See `DataCoordinate.standardize` 

1609 parameters. 

1610 

1611 Returns 

1612 ------- 

1613 existence : `DatasetExistence` 

1614 Object indicating whether the dataset is known to registry and 

1615 datastore. Evaluates to `True` if the dataset is present and known 

1616 to both. 

1617 """ 

1618 existence = DatasetExistence.UNRECOGNIZED 

1619 

1620 if isinstance(dataset_ref_or_type, DatasetRef): 

1621 if collections is not None: 

1622 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1623 if data_id is not None: 

1624 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1625 ref = dataset_ref_or_type 

1626 registry_ref = self.registry.getDataset(dataset_ref_or_type.id) 

1627 if registry_ref is not None: 

1628 existence |= DatasetExistence.RECORDED 

1629 

1630 if dataset_ref_or_type != registry_ref: 

1631 # This could mean that storage classes differ, so we should 

1632 # check for that but use the registry ref for the rest of 

1633 # the method. 

1634 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1635 # Use the registry version from now on. 

1636 ref = registry_ref 

1637 else: 

1638 raise ValueError( 

1639 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1640 f"in registry but has different incompatible values ({registry_ref})." 

1641 ) 

1642 else: 

1643 try: 

1644 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1645 except (LookupError, TypeError, NoDefaultCollectionError): 

1646 return existence 

1647 existence |= DatasetExistence.RECORDED 

1648 

1649 if self.datastore.knows(ref): 

1650 existence |= DatasetExistence.DATASTORE 

1651 

1652 if full_check: 

1653 if self.datastore.exists(ref): 

1654 existence |= DatasetExistence._ARTIFACT 

1655 elif existence != DatasetExistence.UNRECOGNIZED: 

1656 # Do not add this flag if we have no other idea about a dataset. 

1657 existence |= DatasetExistence._ASSUMED 

1658 

1659 return existence 

1660 

1661 def _exists_many( 

1662 self, 

1663 refs: Iterable[DatasetRef], 

1664 /, 

1665 *, 

1666 full_check: bool = True, 

1667 ) -> dict[DatasetRef, DatasetExistence]: 

1668 """Indicate whether multiple datasets are known to Butler registry and 

1669 datastore. 

1670 

1671 This is an experimental API that may change at any moment. 

1672 

1673 Parameters 

1674 ---------- 

1675 refs : iterable of `DatasetRef` 

1676 The datasets to be checked. 

1677 full_check : `bool`, optional 

1678 If `True`, an additional check will be made for dataset artifact 

1679 existence. This will involve additional overhead due to the need 

1680 to query an external system. If `False` registry and datastore 

1681 will solely be asked if they know about the dataset but no 

1682 check for the artifact will be performed. 

1683 

1684 Returns 

1685 ------- 

1686 existence : dict of [`DatasetRef`, `DatasetExistence`] 

1687 Mapping from the given dataset refs to an enum indicating the 

1688 status of the dataset in registry and datastore. 

1689 Each value evaluates to `True` if the dataset is present and known 

1690 to both. 

1691 """ 

1692 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1693 

1694 # Registry does not have a bulk API to check for a ref. 

1695 for ref in refs: 

1696 registry_ref = self.registry.getDataset(ref.id) 

1697 if registry_ref is not None: 

1698 # It is possible, albeit unlikely, that the given ref does 

1699 # not match the one in registry even though the UUID matches. 

1700 # When checking a single ref we raise, but it's impolite to 

1701 # do that when potentially hundreds of refs are being checked. 

1702 # We could change the API to only accept UUIDs and that would 

1703 # remove the ability to even check and remove the worry 

1704 # about differing storage classes. Given the ongoing discussion 

1705 # on refs vs UUIDs and whether to raise or have a new 

1706 # private flag, treat this as a private API for now. 

1707 existence[ref] |= DatasetExistence.RECORDED 

1708 

1709 # Ask datastore if it knows about these refs. 

1710 knows = self.datastore.knows_these(refs) 

1711 for ref, known in knows.items(): 

1712 if known: 

1713 existence[ref] |= DatasetExistence.DATASTORE 

1714 

1715 if full_check: 

1716 mexists = self.datastore.mexists(refs) 

1717 for ref, exists in mexists.items(): 

1718 if exists: 

1719 existence[ref] |= DatasetExistence._ARTIFACT 

1720 else: 

1721 # Do not set this flag if nothing is known about the dataset. 

1722 for ref in existence.keys(): 

1723 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1724 existence[ref] |= DatasetExistence._ASSUMED 

1725 

1726 return existence 

1727 

1728 @deprecated( 

1729 reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v27.0.", 

1730 version="v26.0", 

1731 category=FutureWarning, 

1732 ) 

1733 def datasetExists( 

1734 self, 

1735 datasetRefOrType: DatasetRef | DatasetType | str, 

1736 dataId: DataId | None = None, 

1737 *, 

1738 collections: Any = None, 

1739 **kwargs: Any, 

1740 ) -> bool: 

1741 """Return True if the Dataset is actually present in the Datastore. 

1742 

1743 Parameters 

1744 ---------- 

1745 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1746 When `DatasetRef` the `dataId` should be `None`. 

1747 Otherwise the `DatasetType` or name thereof. 

1748 dataId : `dict` or `DataCoordinate` 

1749 A `dict` of `Dimension` link name, value pairs that label the 

1750 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1751 should be provided as the first argument. 

1752 collections : Any, optional 

1753 Collections to be searched, overriding ``self.collections``. 

1754 Can be any of the types supported by the ``collections`` argument 

1755 to butler construction. 

1756 **kwargs 

1757 Additional keyword arguments used to augment or construct a 

1758 `DataCoordinate`. See `DataCoordinate.standardize` 

1759 parameters. 

1760 

1761 Raises 

1762 ------ 

1763 LookupError 

1764 Raised if the dataset is not even present in the Registry. 

1765 ValueError 

1766 Raised if a resolved `DatasetRef` was passed as an input, but it 

1767 differs from the one found in the registry. 

1768 NoDefaultCollectionError 

1769 Raised if no collections were provided. 

1770 """ 

1771 # A resolved ref may be given that is not known to this butler. 

1772 if isinstance(datasetRefOrType, DatasetRef): 

1773 ref = self.registry.getDataset(datasetRefOrType.id) 

1774 if ref is None: 

1775 raise LookupError( 

1776 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." 

1777 ) 

1778 else: 

1779 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1780 return self.datastore.exists(ref) 

1781 

1782 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1783 """Remove one or more `~CollectionType.RUN` collections and the 

1784 datasets within them. 

1785 

1786 Parameters 

1787 ---------- 

1788 names : `~collections.abc.Iterable` [ `str` ] 

1789 The names of the collections to remove. 

1790 unstore : `bool`, optional 

1791 If `True` (default), delete datasets from all datastores in which 

1792 they are present, and attempt to rollback the registry deletions if 

1793 datastore deletions fail (which may not always be possible). If 

1794 `False`, datastore records for these datasets are still removed, 

1795 but any artifacts (e.g. files) will not be. 

1796 

1797 Raises 

1798 ------ 

1799 TypeError 

1800 Raised if one or more collections are not of type 

1801 `~CollectionType.RUN`. 

1802 """ 

1803 if not self.isWriteable(): 

1804 raise TypeError("Butler is read-only.") 

1805 names = list(names) 

1806 refs: list[DatasetRef] = [] 

1807 for name in names: 

1808 collectionType = self.registry.getCollectionType(name) 

1809 if collectionType is not CollectionType.RUN: 

1810 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1811 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1812 with self.datastore.transaction(): 

1813 with self.registry.transaction(): 

1814 if unstore: 

1815 self.datastore.trash(refs) 

1816 else: 

1817 self.datastore.forget(refs) 

1818 for name in names: 

1819 self.registry.removeCollection(name) 

1820 if unstore: 

1821 # Point of no return for removing artifacts 

1822 self.datastore.emptyTrash() 

1823 

1824 def pruneDatasets( 

1825 self, 

1826 refs: Iterable[DatasetRef], 

1827 *, 

1828 disassociate: bool = True, 

1829 unstore: bool = False, 

1830 tags: Iterable[str] = (), 

1831 purge: bool = False, 

1832 ) -> None: 

1833 # docstring inherited from LimitedButler 

1834 

1835 if not self.isWriteable(): 

1836 raise TypeError("Butler is read-only.") 

1837 if purge: 

1838 if not disassociate: 

1839 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1840 if not unstore: 

1841 raise TypeError("Cannot pass purge=True without unstore=True.") 

1842 elif disassociate: 

1843 tags = tuple(tags) 

1844 if not tags: 

1845 raise TypeError("No tags provided but disassociate=True.") 

1846 for tag in tags: 

1847 collectionType = self.registry.getCollectionType(tag) 

1848 if collectionType is not CollectionType.TAGGED: 

1849 raise TypeError( 

1850 f"Cannot disassociate from collection '{tag}' " 

1851 f"of non-TAGGED type {collectionType.name}." 

1852 ) 

1853 # Transform possibly-single-pass iterable into something we can iterate 

1854 # over multiple times. 

1855 refs = list(refs) 

1856 # Pruning a component of a DatasetRef makes no sense since registry 

1857 # doesn't know about components and datastore might not store 

1858 # components in a separate file 

1859 for ref in refs: 

1860 if ref.datasetType.component(): 

1861 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1862 # We don't need an unreliable Datastore transaction for this, because 

1863 # we've been extra careful to ensure that Datastore.trash only involves 

1864 # mutating the Registry (it can _look_ at Datastore-specific things, 

1865 # but shouldn't change them), and hence all operations here are 

1866 # Registry operations. 

1867 with self.datastore.transaction(): 

1868 with self.registry.transaction(): 

1869 if unstore: 

1870 self.datastore.trash(refs) 

1871 if purge: 

1872 self.registry.removeDatasets(refs) 

1873 elif disassociate: 

1874 assert tags, "Guaranteed by earlier logic in this function." 

1875 for tag in tags: 

1876 self.registry.disassociate(tag, refs) 

1877 # We've exited the Registry transaction, and apparently committed. 

1878 # (if there was an exception, everything rolled back, and it's as if 

1879 # nothing happened - and we never get here). 

1880 # Datastore artifacts are not yet gone, but they're clearly marked 

1881 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1882 # problems we can try again later, and if manual administrative 

1883 # intervention is required, it's pretty clear what that should entail: 

1884 # deleting everything on disk and in private Datastore tables that is 

1885 # in the dataset_location_trash table. 

1886 if unstore: 

1887 # Point of no return for removing artifacts 

1888 self.datastore.emptyTrash() 

1889 

1890 @transactional 

1891 def ingest( 

1892 self, 

1893 *datasets: FileDataset, 

1894 transfer: str | None = "auto", 

1895 run: str | None = None, 

1896 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1897 record_validation_info: bool = True, 

1898 ) -> None: 

1899 """Store and register one or more datasets that already exist on disk. 

1900 

1901 Parameters 

1902 ---------- 

1903 datasets : `FileDataset` 

1904 Each positional argument is a struct containing information about 

1905 a file to be ingested, including its URI (either absolute or 

1906 relative to the datastore root, if applicable), a resolved 

1907 `DatasetRef`, and optionally a formatter class or its 

1908 fully-qualified string name. If a formatter is not provided, the 

1909 formatter that would be used for `put` is assumed. On successful 

1910 ingest all `FileDataset.formatter` attributes will be set to the 

1911 formatter class used. `FileDataset.path` attributes may be modified 

1912 to put paths in whatever the datastore considers a standardized 

1913 form. 

1914 transfer : `str`, optional 

1915 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1916 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1917 transfer the file. 

1918 run : `str`, optional 

1919 The name of the run ingested datasets should be added to, 

1920 overriding ``self.run``. This parameter is now deprecated since 

1921 the run is encoded in the ``FileDataset``. 

1922 idGenerationMode : `DatasetIdGenEnum`, optional 

1923 Specifies option for generating dataset IDs. By default unique IDs 

1924 are generated for each inserted dataset. 

1925 record_validation_info : `bool`, optional 

1926 If `True`, the default, the datastore can record validation 

1927 information associated with the file. If `False` the datastore 

1928 will not attempt to track any information such as checksums 

1929 or file sizes. This can be useful if such information is tracked 

1930 in an external system or if the file is to be compressed in place. 

1931 It is up to the datastore whether this parameter is relevant. 

1932 

1933 Raises 

1934 ------ 

1935 TypeError 

1936 Raised if the butler is read-only or if no run was provided. 

1937 NotImplementedError 

1938 Raised if the `Datastore` does not support the given transfer mode. 

1939 DatasetTypeNotSupportedError 

1940 Raised if one or more files to be ingested have a dataset type that 

1941 is not supported by the `Datastore`.. 

1942 FileNotFoundError 

1943 Raised if one of the given files does not exist. 

1944 FileExistsError 

1945 Raised if transfer is not `None` but the (internal) location the 

1946 file would be moved to is already occupied. 

1947 

1948 Notes 

1949 ----- 

1950 This operation is not fully exception safe: if a database operation 

1951 fails, the given `FileDataset` instances may be only partially updated. 

1952 

1953 It is atomic in terms of database operations (they will either all 

1954 succeed or all fail) providing the database engine implements 

1955 transactions correctly. It will attempt to be atomic in terms of 

1956 filesystem operations as well, but this cannot be implemented 

1957 rigorously for most datastores. 

1958 """ 

1959 if not self.isWriteable(): 

1960 raise TypeError("Butler is read-only.") 

1961 

1962 log.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1963 if not datasets: 

1964 return 

1965 

1966 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1967 

1968 # We need to reorganize all the inputs so that they are grouped 

1969 # by dataset type and run. Multiple refs in a single FileDataset 

1970 # are required to share the run and dataset type. 

1971 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]] 

1972 groupedData: GroupedData = defaultdict(list) 

1973 

1974 # Track DataIDs that are being ingested so we can spot issues early 

1975 # with duplication. Retain previous FileDataset so we can report it. 

1976 groupedDataIds: MutableMapping[ 

1977 tuple[DatasetType, str], dict[DataCoordinate, FileDataset] 

1978 ] = defaultdict(dict) 

1979 

1980 used_run = False 

1981 

1982 # And the nested loop that populates it: 

1983 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1984 # Somewhere to store pre-existing refs if we have an 

1985 # execution butler. 

1986 existingRefs: list[DatasetRef] = [] 

1987 

1988 for ref in dataset.refs: 

1989 assert ref.run is not None # For mypy 

1990 group_key = (ref.datasetType, ref.run) 

1991 

1992 if ref.dataId in groupedDataIds[group_key]: 

1993 raise ConflictingDefinitionError( 

1994 f"Ingest conflict. Dataset {dataset.path} has same" 

1995 " DataId as other ingest dataset" 

1996 f" {groupedDataIds[group_key][ref.dataId].path} " 

1997 f" ({ref.dataId})" 

1998 ) 

1999 

2000 groupedDataIds[group_key][ref.dataId] = dataset 

2001 

2002 if existingRefs: 

2003 if len(dataset.refs) != len(existingRefs): 

2004 # Keeping track of partially pre-existing datasets is hard 

2005 # and should generally never happen. For now don't allow 

2006 # it. 

2007 raise ConflictingDefinitionError( 

2008 f"For dataset {dataset.path} some dataIds already exist" 

2009 " in registry but others do not. This is not supported." 

2010 ) 

2011 

2012 # Store expanded form in the original FileDataset. 

2013 dataset.refs = existingRefs 

2014 else: 

2015 groupedData[group_key].append(dataset) 

2016 

2017 if not used_run and run is not None: 

2018 warnings.warn( 

2019 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the " 

2020 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.", 

2021 category=FutureWarning, 

2022 stacklevel=3, # Take into account the @transactional decorator. 

2023 ) 

2024 

2025 # Now we can bulk-insert into Registry for each DatasetType. 

2026 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

2027 groupedData.items(), desc="Bulk-inserting datasets by type" 

2028 ): 

2029 refs_to_import = [] 

2030 for dataset in grouped_datasets: 

2031 refs_to_import.extend(dataset.refs) 

2032 

2033 n_refs = len(refs_to_import) 

2034 log.verbose( 

2035 "Importing %d ref%s of dataset type %r into run %r", 

2036 n_refs, 

2037 "" if n_refs == 1 else "s", 

2038 datasetType.name, 

2039 this_run, 

2040 ) 

2041 

2042 # Import the refs and expand the DataCoordinates since we can't 

2043 # guarantee that they are expanded and Datastore will need 

2044 # the records. 

2045 imported_refs = self.registry._importDatasets(refs_to_import, expand=True) 

2046 assert set(imported_refs) == set(refs_to_import) 

2047 

2048 # Replace all the refs in the FileDataset with expanded versions. 

2049 # Pull them off in the order we put them on the list. 

2050 for dataset in grouped_datasets: 

2051 n_dataset_refs = len(dataset.refs) 

2052 dataset.refs = imported_refs[:n_dataset_refs] 

2053 del imported_refs[:n_dataset_refs] 

2054 

2055 # Bulk-insert everything into Datastore. 

2056 # We do not know if any of the registry entries already existed 

2057 # (_importDatasets only complains if they exist but differ) so 

2058 # we have to catch IntegrityError explicitly. 

2059 try: 

2060 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 

2061 except IntegrityError as e: 

2062 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") 

2063 

2064 @contextlib.contextmanager 

2065 def export( 

2066 self, 

2067 *, 

2068 directory: str | None = None, 

2069 filename: str | None = None, 

2070 format: str | None = None, 

2071 transfer: str | None = None, 

2072 ) -> Iterator[RepoExportContext]: 

2073 """Export datasets from the repository represented by this `Butler`. 

2074 

2075 This method is a context manager that returns a helper object 

2076 (`RepoExportContext`) that is used to indicate what information from 

2077 the repository should be exported. 

2078 

2079 Parameters 

2080 ---------- 

2081 directory : `str`, optional 

2082 Directory dataset files should be written to if ``transfer`` is not 

2083 `None`. 

2084 filename : `str`, optional 

2085 Name for the file that will include database information associated 

2086 with the exported datasets. If this is not an absolute path and 

2087 ``directory`` is not `None`, it will be written to ``directory`` 

2088 instead of the current working directory. Defaults to 

2089 "export.{format}". 

2090 format : `str`, optional 

2091 File format for the database information file. If `None`, the 

2092 extension of ``filename`` will be used. 

2093 transfer : `str`, optional 

2094 Transfer mode passed to `Datastore.export`. 

2095 

2096 Raises 

2097 ------ 

2098 TypeError 

2099 Raised if the set of arguments passed is inconsistent. 

2100 

2101 Examples 

2102 -------- 

2103 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

2104 methods are used to provide the iterables over data IDs and/or datasets 

2105 to be exported:: 

2106 

2107 with butler.export("exports.yaml") as export: 

2108 # Export all flats, but none of the dimension element rows 

2109 # (i.e. data ID information) associated with them. 

2110 export.saveDatasets(butler.registry.queryDatasets("flat"), 

2111 elements=()) 

2112 # Export all datasets that start with "deepCoadd_" and all of 

2113 # their associated data ID information. 

2114 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2115 """ 

2116 if directory is None and transfer is not None: 

2117 raise TypeError("Cannot transfer without providing a directory.") 

2118 if transfer == "move": 

2119 raise TypeError("Transfer may not be 'move': export is read-only") 

2120 if format is None: 

2121 if filename is None: 

2122 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2123 else: 

2124 _, format = os.path.splitext(filename) 

2125 if not format: 

2126 raise ValueError("Please specify a file extension to determine export format.") 

2127 format = format[1:] # Strip leading "."" 

2128 elif filename is None: 

2129 filename = f"export.{format}" 

2130 if directory is not None: 

2131 filename = os.path.join(directory, filename) 

2132 formats = self._config["repo_transfer_formats"] 

2133 if format not in formats: 

2134 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

2135 BackendClass = get_class_of(formats[format, "export"]) 

2136 with open(filename, "w") as stream: 

2137 backend = BackendClass(stream, universe=self.dimensions) 

2138 try: 

2139 helper = RepoExportContext( 

2140 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

2141 ) 

2142 yield helper 

2143 except BaseException: 

2144 raise 

2145 else: 

2146 helper._finish() 

2147 

2148 def import_( 

2149 self, 

2150 *, 

2151 directory: ResourcePathExpression | None = None, 

2152 filename: ResourcePathExpression | TextIO | None = None, 

2153 format: str | None = None, 

2154 transfer: str | None = None, 

2155 skip_dimensions: set | None = None, 

2156 ) -> None: 

2157 """Import datasets into this repository that were exported from a 

2158 different butler repository via `~lsst.daf.butler.Butler.export`. 

2159 

2160 Parameters 

2161 ---------- 

2162 directory : `~lsst.resources.ResourcePathExpression`, optional 

2163 Directory containing dataset files to import from. If `None`, 

2164 ``filename`` and all dataset file paths specified therein must 

2165 be absolute. 

2166 filename : `~lsst.resources.ResourcePathExpression` or `TextIO` 

2167 A stream or name of file that contains database information 

2168 associated with the exported datasets, typically generated by 

2169 `~lsst.daf.butler.Butler.export`. If this a string (name) or 

2170 `~lsst.resources.ResourcePath` and is not an absolute path, 

2171 it will first be looked for relative to ``directory`` and if not 

2172 found there it will be looked for in the current working 

2173 directory. Defaults to "export.{format}". 

2174 format : `str`, optional 

2175 File format for ``filename``. If `None`, the extension of 

2176 ``filename`` will be used. 

2177 transfer : `str`, optional 

2178 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2179 skip_dimensions : `set`, optional 

2180 Names of dimensions that should be skipped and not imported. 

2181 

2182 Raises 

2183 ------ 

2184 TypeError 

2185 Raised if the set of arguments passed is inconsistent, or if the 

2186 butler is read-only. 

2187 """ 

2188 if not self.isWriteable(): 

2189 raise TypeError("Butler is read-only.") 

2190 if format is None: 

2191 if filename is None: 

2192 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2193 else: 

2194 _, format = os.path.splitext(filename) # type: ignore 

2195 elif filename is None: 

2196 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

2197 if directory is not None: 

2198 directory = ResourcePath(directory, forceDirectory=True) 

2199 # mypy doesn't think this will work but it does in python >= 3.10. 

2200 if isinstance(filename, ResourcePathExpression): # type: ignore 

2201 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

2202 if not filename.isabs() and directory is not None: 

2203 potential = directory.join(filename) 

2204 exists_in_cwd = filename.exists() 

2205 exists_in_dir = potential.exists() 

2206 if exists_in_cwd and exists_in_dir: 

2207 log.warning( 

2208 "A relative path for filename was specified (%s) which exists relative to cwd. " 

2209 "Additionally, the file exists relative to the given search directory (%s). " 

2210 "Using the export file in the given directory.", 

2211 filename, 

2212 potential, 

2213 ) 

2214 # Given they specified an explicit directory and that 

2215 # directory has the export file in it, assume that that 

2216 # is what was meant despite the file in cwd. 

2217 filename = potential 

2218 elif exists_in_dir: 

2219 filename = potential 

2220 elif not exists_in_cwd and not exists_in_dir: 

2221 # Raise early. 

2222 raise FileNotFoundError( 

2223 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

2224 ) 

2225 BackendClass: type[RepoImportBackend] = get_class_of( 

2226 self._config["repo_transfer_formats"][format]["import"] 

2227 ) 

2228 

2229 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

2230 backend = BackendClass(importStream, self.registry) # type: ignore[call-arg] 

2231 backend.register() 

2232 with self.transaction(): 

2233 backend.load( 

2234 self.datastore, 

2235 directory=directory, 

2236 transfer=transfer, 

2237 skip_dimensions=skip_dimensions, 

2238 ) 

2239 

2240 if isinstance(filename, ResourcePath): 

2241 # We can not use open() here at the moment because of 

2242 # DM-38589 since yaml does stream.read(8192) in a loop. 

2243 stream = io.StringIO(filename.read().decode()) 

2244 doImport(stream) 

2245 else: 

2246 doImport(filename) # type: ignore 

2247 

2248 def transfer_from( 

2249 self, 

2250 source_butler: LimitedButler, 

2251 source_refs: Iterable[DatasetRef], 

2252 transfer: str = "auto", 

2253 skip_missing: bool = True, 

2254 register_dataset_types: bool = False, 

2255 transfer_dimensions: bool = False, 

2256 ) -> collections.abc.Collection[DatasetRef]: 

2257 """Transfer datasets to this Butler from a run in another Butler. 

2258 

2259 Parameters 

2260 ---------- 

2261 source_butler : `LimitedButler` 

2262 Butler from which the datasets are to be transferred. If data IDs 

2263 in ``source_refs`` are not expanded then this has to be a full 

2264 `Butler` whose registry will be used to expand data IDs. 

2265 source_refs : iterable of `DatasetRef` 

2266 Datasets defined in the source butler that should be transferred to 

2267 this butler. 

2268 transfer : `str`, optional 

2269 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2270 skip_missing : `bool` 

2271 If `True`, datasets with no datastore artifact associated with 

2272 them are not transferred. If `False` a registry entry will be 

2273 created even if no datastore record is created (and so will 

2274 look equivalent to the dataset being unstored). 

2275 register_dataset_types : `bool` 

2276 If `True` any missing dataset types are registered. Otherwise 

2277 an exception is raised. 

2278 transfer_dimensions : `bool`, optional 

2279 If `True`, dimension record data associated with the new datasets 

2280 will be transferred. 

2281 

2282 Returns 

2283 ------- 

2284 refs : `list` of `DatasetRef` 

2285 The refs added to this Butler. 

2286 

2287 Notes 

2288 ----- 

2289 The datastore artifact has to exist for a transfer 

2290 to be made but non-existence is not an error. 

2291 

2292 Datasets that already exist in this run will be skipped. 

2293 

2294 The datasets are imported as part of a transaction, although 

2295 dataset types are registered before the transaction is started. 

2296 This means that it is possible for a dataset type to be registered 

2297 even though transfer has failed. 

2298 """ 

2299 if not self.isWriteable(): 

2300 raise TypeError("Butler is read-only.") 

2301 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2302 

2303 # Will iterate through the refs multiple times so need to convert 

2304 # to a list if this isn't a collection. 

2305 if not isinstance(source_refs, collections.abc.Collection): 

2306 source_refs = list(source_refs) 

2307 

2308 original_count = len(source_refs) 

2309 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2310 

2311 # In some situations the datastore artifact may be missing 

2312 # and we do not want that registry entry to be imported. 

2313 # Asking datastore is not sufficient, the records may have been 

2314 # purged, we have to ask for the (predicted) URI and check 

2315 # existence explicitly. Execution butler is set up exactly like 

2316 # this with no datastore records. 

2317 artifact_existence: dict[ResourcePath, bool] = {} 

2318 if skip_missing: 

2319 dataset_existence = source_butler.datastore.mexists( 

2320 source_refs, artifact_existence=artifact_existence 

2321 ) 

2322 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2323 filtered_count = len(source_refs) 

2324 n_missing = original_count - filtered_count 

2325 log.verbose( 

2326 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

2327 n_missing, 

2328 "" if n_missing == 1 else "s", 

2329 filtered_count, 

2330 ) 

2331 

2332 # Importing requires that we group the refs by dataset type and run 

2333 # before doing the import. 

2334 source_dataset_types = set() 

2335 grouped_refs = defaultdict(list) 

2336 for ref in source_refs: 

2337 grouped_refs[ref.datasetType, ref.run].append(ref) 

2338 source_dataset_types.add(ref.datasetType) 

2339 

2340 # Check to see if the dataset type in the source butler has 

2341 # the same definition in the target butler and register missing 

2342 # ones if requested. Registration must happen outside a transaction. 

2343 newly_registered_dataset_types = set() 

2344 for datasetType in source_dataset_types: 

2345 if register_dataset_types: 

2346 # Let this raise immediately if inconsistent. Continuing 

2347 # on to find additional inconsistent dataset types 

2348 # might result in additional unwanted dataset types being 

2349 # registered. 

2350 if self.registry.registerDatasetType(datasetType): 

2351 newly_registered_dataset_types.add(datasetType) 

2352 else: 

2353 # If the dataset type is missing, let it fail immediately. 

2354 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2355 if target_dataset_type != datasetType: 

2356 raise ConflictingDefinitionError( 

2357 "Source butler dataset type differs from definition" 

2358 f" in target butler: {datasetType} !=" 

2359 f" {target_dataset_type}" 

2360 ) 

2361 if newly_registered_dataset_types: 

2362 # We may have registered some even if there were inconsistencies 

2363 # but should let people know (or else remove them again). 

2364 log.log( 

2365 VERBOSE, 

2366 "Registered the following dataset types in the target Butler: %s", 

2367 ", ".join(d.name for d in newly_registered_dataset_types), 

2368 ) 

2369 else: 

2370 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2371 

2372 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2373 if transfer_dimensions: 

2374 # Collect all the dimension records for these refs. 

2375 # All dimensions are to be copied but the list of valid dimensions 

2376 # come from this butler's universe. 

2377 elements = frozenset( 

2378 element 

2379 for element in self.dimensions.getStaticElements() 

2380 if element.hasTable() and element.viewOf is None 

2381 ) 

2382 dataIds = {ref.dataId for ref in source_refs} 

2383 # This logic comes from saveDataIds. 

2384 for dataId in dataIds: 

2385 # Need an expanded record, if not expanded that we need a full 

2386 # butler with registry (allow mocks with registry too). 

2387 if not dataId.hasRecords(): 

2388 if registry := getattr(source_butler, "registry", None): 

2389 dataId = registry.expandDataId(dataId) 

2390 else: 

2391 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

2392 # If this butler doesn't know about a dimension in the source 

2393 # butler things will break later. 

2394 for record in dataId.records.values(): 

2395 if record is not None and record.definition in elements: 

2396 dimension_records[record.definition].setdefault(record.dataId, record) 

2397 

2398 handled_collections: set[str] = set() 

2399 

2400 # Do all the importing in a single transaction. 

2401 with self.transaction(): 

2402 if dimension_records: 

2403 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2404 for element, r in dimension_records.items(): 

2405 records = [r[dataId] for dataId in r] 

2406 # Assume that if the record is already present that we can 

2407 # use it without having to check that the record metadata 

2408 # is consistent. 

2409 self.registry.insertDimensionData(element, *records, skip_existing=True) 

2410 

2411 n_imported = 0 

2412 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2413 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2414 ): 

2415 if run not in handled_collections: 

2416 # May need to create output collection. If source butler 

2417 # has a registry, ask for documentation string. 

2418 run_doc = None 

2419 if registry := getattr(source_butler, "registry", None): 

2420 run_doc = registry.getCollectionDocumentation(run) 

2421 registered = self.registry.registerRun(run, doc=run_doc) 

2422 handled_collections.add(run) 

2423 if registered: 

2424 log.log(VERBOSE, "Creating output run %s", run) 

2425 

2426 n_refs = len(refs_to_import) 

2427 log.verbose( 

2428 "Importing %d ref%s of dataset type %s into run %s", 

2429 n_refs, 

2430 "" if n_refs == 1 else "s", 

2431 datasetType.name, 

2432 run, 

2433 ) 

2434 

2435 # Assume we are using UUIDs and the source refs will match 

2436 # those imported. 

2437 imported_refs = self.registry._importDatasets(refs_to_import, expand=False) 

2438 assert set(imported_refs) == set(refs_to_import) 

2439 n_imported += len(imported_refs) 

2440 

2441 assert len(source_refs) == n_imported 

2442 log.verbose("Imported %d datasets into destination butler", n_imported) 

2443 

2444 # Ask the datastore to transfer. The datastore has to check that 

2445 # the source datastore is compatible with the target datastore. 

2446 accepted, rejected = self.datastore.transfer_from( 

2447 source_butler.datastore, 

2448 source_refs, 

2449 transfer=transfer, 

2450 artifact_existence=artifact_existence, 

2451 ) 

2452 if rejected: 

2453 # For now, accept the registry entries but not the files. 

2454 log.warning( 

2455 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2456 len(rejected), 

2457 len(accepted), 

2458 datasetType, 

2459 run, 

2460 ) 

2461 

2462 return source_refs 

2463 

2464 def validateConfiguration( 

2465 self, 

2466 logFailures: bool = False, 

2467 datasetTypeNames: Iterable[str] | None = None, 

2468 ignore: Iterable[str] | None = None, 

2469 ) -> None: 

2470 """Validate butler configuration. 

2471 

2472 Checks that each `DatasetType` can be stored in the `Datastore`. 

2473 

2474 Parameters 

2475 ---------- 

2476 logFailures : `bool`, optional 

2477 If `True`, output a log message for every validation error 

2478 detected. 

2479 datasetTypeNames : iterable of `str`, optional 

2480 The `DatasetType` names that should be checked. This allows 

2481 only a subset to be selected. 

2482 ignore : iterable of `str`, optional 

2483 Names of DatasetTypes to skip over. This can be used to skip 

2484 known problems. If a named `DatasetType` corresponds to a 

2485 composite, all components of that `DatasetType` will also be 

2486 ignored. 

2487 

2488 Raises 

2489 ------ 

2490 ButlerValidationError 

2491 Raised if there is some inconsistency with how this Butler 

2492 is configured. 

2493 """ 

2494 if datasetTypeNames: 

2495 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2496 else: 

2497 datasetTypes = list(self.registry.queryDatasetTypes()) 

2498 

2499 # filter out anything from the ignore list 

2500 if ignore: 

2501 ignore = set(ignore) 

2502 datasetTypes = [ 

2503 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2504 ] 

2505 else: 

2506 ignore = set() 

2507 

2508 # Find all the registered instruments 

2509 instruments = {record.name for record in self.registry.queryDimensionRecords("instrument")} 

2510 

2511 # For each datasetType that has an instrument dimension, create 

2512 # a DatasetRef for each defined instrument 

2513 datasetRefs = [] 

2514 

2515 for datasetType in datasetTypes: 

2516 if "instrument" in datasetType.dimensions: 

2517 for instrument in instruments: 

2518 datasetRef = DatasetRef( 

2519 datasetType, 

2520 {"instrument": instrument}, # type: ignore 

2521 conform=False, 

2522 run="validate", 

2523 ) 

2524 datasetRefs.append(datasetRef) 

2525 

2526 entities: list[DatasetType | DatasetRef] = [] 

2527 entities.extend(datasetTypes) 

2528 entities.extend(datasetRefs) 

2529 

2530 datastoreErrorStr = None 

2531 try: 

2532 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2533 except ValidationError as e: 

2534 datastoreErrorStr = str(e) 

2535 

2536 # Also check that the LookupKeys used by the datastores match 

2537 # registry and storage class definitions 

2538 keys = self.datastore.getLookupKeys() 

2539 

2540 failedNames = set() 

2541 failedDataId = set() 

2542 for key in keys: 

2543 if key.name is not None: 

2544 if key.name in ignore: 

2545 continue 

2546 

2547 # skip if specific datasetType names were requested and this 

2548 # name does not match 

2549 if datasetTypeNames and key.name not in datasetTypeNames: 

2550 continue 

2551 

2552 # See if it is a StorageClass or a DatasetType 

2553 if key.name in self.storageClasses: 

2554 pass 

2555 else: 

2556 try: 

2557 self.registry.getDatasetType(key.name) 

2558 except KeyError: 

2559 if logFailures: 

2560 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2561 failedNames.add(key) 

2562 else: 

2563 # Dimensions are checked for consistency when the Butler 

2564 # is created and rendezvoused with a universe. 

2565 pass 

2566 

2567 # Check that the instrument is a valid instrument 

2568 # Currently only support instrument so check for that 

2569 if key.dataId: 

2570 dataIdKeys = set(key.dataId) 

2571 if {"instrument"} != dataIdKeys: 

2572 if logFailures: 

2573 log.critical("Key '%s' has unsupported DataId override", key) 

2574 failedDataId.add(key) 

2575 elif key.dataId["instrument"] not in instruments: 

2576 if logFailures: 

2577 log.critical("Key '%s' has unknown instrument", key) 

2578 failedDataId.add(key) 

2579 

2580 messages = [] 

2581 

2582 if datastoreErrorStr: 

2583 messages.append(datastoreErrorStr) 

2584 

2585 for failed, msg in ( 

2586 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2587 (failedDataId, "Keys with bad DataId entries: "), 

2588 ): 

2589 if failed: 

2590 msg += ", ".join(str(k) for k in failed) 

2591 messages.append(msg) 

2592 

2593 if messages: 

2594 raise ValidationError(";\n".join(messages)) 

2595 

2596 @property 

2597 def collections(self) -> Sequence[str]: 

2598 """The collections to search by default, in order 

2599 (`~collections.abc.Sequence` [ `str` ]). 

2600 

2601 This is an alias for ``self.registry.defaults.collections``. It cannot 

2602 be set directly in isolation, but all defaults may be changed together 

2603 by assigning a new `RegistryDefaults` instance to 

2604 ``self.registry.defaults``. 

2605 """ 

2606 return self.registry.defaults.collections 

2607 

2608 @property 

2609 def run(self) -> str | None: 

2610 """Name of the run this butler writes outputs to by default (`str` or 

2611 `None`). 

2612 

2613 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2614 directly in isolation, but all defaults may be changed together by 

2615 assigning a new `RegistryDefaults` instance to 

2616 ``self.registry.defaults``. 

2617 """ 

2618 return self.registry.defaults.run 

2619 

2620 @property 

2621 def dimensions(self) -> DimensionUniverse: 

2622 # Docstring inherited. 

2623 return self.registry.dimensions 

2624 

2625 registry: Registry 

2626 """The object that manages dataset metadata and relationships (`Registry`). 

2627 

2628 Most operations that don't involve reading or writing butler datasets are 

2629 accessible only via `Registry` methods. 

2630 """ 

2631 

2632 datastore: Datastore 

2633 """The object that manages actual dataset storage (`Datastore`). 

2634 

2635 Direct user access to the datastore should rarely be necessary; the primary 

2636 exception is the case where a `Datastore` implementation provides extra 

2637 functionality beyond what the base class defines. 

2638 """ 

2639 

2640 storageClasses: StorageClassFactory 

2641 """An object that maps known storage class names to objects that fully 

2642 describe them (`StorageClassFactory`). 

2643 """