Coverage for python/lsst/daf/butler/_butler.py: 8%

718 statements  

coverage.py v7.2.7, created at 2023-07-12 10:56 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Butler top level classes. 

23""" 

24from __future__ import annotations 

25 

26__all__ = ( 

27 "Butler", 

28 "ButlerValidationError", 

29) 

30 

31import collections.abc 

32import contextlib 

33import io 

34import logging 

35import numbers 

36import os 

37import warnings 

38from collections import Counter, defaultdict 

39from collections.abc import Iterable, Iterator, MutableMapping, Sequence 

40from typing import TYPE_CHECKING, Any, ClassVar, TextIO 

41 

42from deprecated.sphinx import deprecated 

43from lsst.resources import ResourcePath, ResourcePathExpression 

44from lsst.utils import doImportType 

45from lsst.utils.introspection import get_class_of 

46from lsst.utils.logging import VERBOSE, getLogger 

47from sqlalchemy.exc import IntegrityError 

48 

49from ._butlerConfig import ButlerConfig 

50from ._butlerRepoIndex import ButlerRepoIndex 

51from ._dataset_existence import DatasetExistence 

52from ._deferredDatasetHandle import DeferredDatasetHandle 

53from ._limited_butler import LimitedButler 

54from .core import ( 

55 Config, 

56 ConfigSubset, 

57 DataCoordinate, 

58 DataId, 

59 DataIdValue, 

60 DatasetIdGenEnum, 

61 DatasetRef, 

62 DatasetRefURIs, 

63 DatasetType, 

64 Datastore, 

65 Dimension, 

66 DimensionConfig, 

67 DimensionElement, 

68 DimensionRecord, 

69 DimensionUniverse, 

70 FileDataset, 

71 Progress, 

72 StorageClass, 

73 StorageClassFactory, 

74 Timespan, 

75 ValidationError, 

76) 

77from .core.repoRelocation import BUTLER_ROOT_TAG 

78from .core.utils import transactional 

79from .registry import ( 

80 CollectionType, 

81 ConflictingDefinitionError, 

82 DataIdError, 

83 MissingDatasetTypeError, 

84 NoDefaultCollectionError, 

85 Registry, 

86 RegistryConfig, 

87 RegistryDefaults, 

88) 

89from .transfers import RepoExportContext 

90 

91if TYPE_CHECKING: 

92 from lsst.resources import ResourceHandleProtocol 

93 

94 from .transfers import RepoImportBackend 

95 

96log = getLogger(__name__) 

97 

98 

99class ButlerValidationError(ValidationError): 

100 """There is a problem with the Butler configuration.""" 

101 

102 pass 

103 

104 

105class Butler(LimitedButler): 

106 """Main entry point for the data access system. 

107 

108 Parameters 

109 ---------- 

110 config : `ButlerConfig`, `Config` or `str`, optional 

111 Configuration. Anything acceptable to the 

112 `ButlerConfig` constructor. If a directory path 

113 is given the configuration will be read from a ``butler.yaml`` file in 

114 that location. If `None` is given default values will be used. 

115 butler : `Butler`, optional 

116 If provided, construct a new Butler that uses the same registry and 

117 datastore as the given one, but with the given collection and run. 

118 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

119 arguments. 

120 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional 

121 An expression specifying the collections to be searched (in order) when 

122 reading datasets. 

123 This may be a `str` collection name or an iterable thereof. 

124 See :ref:`daf_butler_collection_expressions` for more information. 

125 These collections are not registered automatically and must be 

126 manually registered before they are used by any method, but they may be 

127 manually registered after the `Butler` is initialized. 

128 run : `str`, optional 

129 Name of the `~CollectionType.RUN` collection new datasets should be 

130 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

131 ``collections`` will be set to ``[run]``. If not `None`, this 

132 collection will automatically be registered. If this is not set (and 

133 ``writeable`` is not set either), a read-only butler will be created. 

134 searchPaths : `list` of `str`, optional 

135 Directory paths to search when calculating the full Butler 

136 configuration. Not used if the supplied config is already a 

137 `ButlerConfig`. 

138 writeable : `bool`, optional 

139 Explicitly sets whether the butler supports write operations. If not 

140 provided, a read-write butler is created if any of ``run``, ``tags``, 

141 or ``chains`` is non-empty. 

142 inferDefaults : `bool`, optional 

143 If `True` (default) infer default data ID values from the values 

144 present in the datasets in ``collections``: if all collections have the 

145 same value (or no value) for a governor dimension, that value will be 

146 the default for that dimension. Nonexistent collections are ignored. 

147 If a default value is provided explicitly for a governor dimension via 

148 ``**kwargs``, no default will be inferred for that dimension. 

149 **kwargs : `str` 

150 Default data ID key-value pairs. These may only identify "governor" 

151 dimensions like ``instrument`` and ``skymap``. 

152 

153 Examples 

154 -------- 

155 While there are many ways to control exactly how a `Butler` interacts with 

156 the collections in its `Registry`, the most common cases are still simple. 

157 

158 For a read-only `Butler` that searches one collection, do:: 

159 

160 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

161 

162 For a read-write `Butler` that writes to and reads from a 

163 `~CollectionType.RUN` collection:: 

164 

165 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

166 

167 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

168 because we want to write to one `~CollectionType.RUN` collection but read 

169 from several others (as well):: 

170 

171 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

172 collections=["u/alice/DM-50000/a", 

173 "u/bob/DM-49998", 

174 "HSC/defaults"]) 

175 

176 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

177 Datasets will be read first from that run (since it appears first in the 

178 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

179 

180 Finally, one can always create a `Butler` with no collections:: 

181 

182 butler = Butler("/path/to/repo", writeable=True) 

183 

184 This can be extremely useful when you just want to use ``butler.registry``, 

185 e.g. for inserting dimension data or managing collections, or when the 

186 collections you want to use with the butler are not consistent. 

187 Passing ``writeable`` explicitly here is only necessary if you want to be 

188 able to make changes to the repo - usually the value for ``writeable`` can 

189 be guessed from the collection arguments provided, but it defaults to 

190 `False` when there are no collection arguments. 

191 """ 

192 

193 def __init__( 

194 self, 

195 config: Config | ResourcePathExpression | None = None, 

196 *, 

197 butler: Butler | None = None, 

198 collections: Any = None, 

199 run: str | None = None, 

200 searchPaths: Sequence[ResourcePathExpression] | None = None, 

201 writeable: bool | None = None, 

202 inferDefaults: bool = True, 

203 **kwargs: str, 

204 ): 

205 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

206 # Load registry, datastore, etc. from config or existing butler. 

207 if butler is not None: 

208 if config is not None or searchPaths is not None or writeable is not None: 

209 raise TypeError( 

210 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

211 ) 

212 self.registry = butler.registry.copy(defaults) 

213 self._datastore = butler._datastore 

214 self.storageClasses = butler.storageClasses 

215 self._config: ButlerConfig = butler._config 

216 else: 

217 self._config = ButlerConfig(config, searchPaths=searchPaths) 

218 try: 

219 if "root" in self._config: 

220 butlerRoot = self._config["root"] 

221 else: 

222 butlerRoot = self._config.configDir 

223 if writeable is None: 

224 writeable = run is not None 

225 self.registry = Registry.fromConfig( 

226 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

227 ) 

228 self._datastore = Datastore.fromConfig( 

229 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

230 ) 

231 self.storageClasses = StorageClassFactory() 

232 self.storageClasses.addFromConfig(self._config) 

233 except Exception: 

234 # Failures here usually mean that configuration is incomplete, 

235 # just issue an error message which includes the config file URI. 

236 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

237 raise 

238 

239 # For execution butler the datastore needs a special 

240 # dependency-inversion trick. This is not used by regular butler, 

241 # but we do not have a way to distinguish regular butler from execution 

242 # butler. 

243 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

244 

245 if "run" in self._config or "collection" in self._config: 

246 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

247 

248 GENERATION: ClassVar[int] = 3 

249 """This is a Generation 3 Butler. 

250 

251 This attribute may be removed in the future, once the Generation 2 Butler 

252 interface has been fully retired; it should only be used in transitional 

253 code. 

254 """ 

255 

256 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

257 """Return DatasetType defined in registry given dataset type name.""" 

258 try: 

259 return self.registry.getDatasetType(name) 

260 except MissingDatasetTypeError: 

261 return None 

262 

263 @classmethod 

264 def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath: 

265 """Look up the label in a butler repository index. 

266 

267 Parameters 

268 ---------- 

269 label : `str` 

270 Label of the Butler repository to look up. 

271 return_label : `bool`, optional 

272 If ``label`` cannot be found in the repository index (either 

273 because index is not defined or ``label`` is not in the index) and 

274 ``return_label`` is `True` then return ``ResourcePath(label)``. 

275 If ``return_label`` is `False` (default) then an exception will be 

276 raised instead. 

277 

278 Returns 

279 ------- 

280 uri : `lsst.resources.ResourcePath` 

281 URI to the Butler repository associated with the given label or 

282 default value if it is provided. 

283 

284 Raises 

285 ------ 

286 KeyError 

287 Raised if the label is not found in the index, or if an index 

288 is not defined, and ``return_label`` is `False`. 

289 

290 Notes 

291 ----- 

292 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

293 information is discovered. 

294 """ 

295 return ButlerRepoIndex.get_repo_uri(label, return_label) 

296 

297 @classmethod 

298 def get_known_repos(cls) -> set[str]: 

299 """Retrieve the list of known repository labels. 

300 

301 Returns 

302 ------- 

303 repos : `set` of `str` 

304 All the known labels. Can be empty if no index can be found. 

305 

306 Notes 

307 ----- 

308 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

309 information is discovered. 

310 """ 

311 return ButlerRepoIndex.get_known_repos() 

312 
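# Illustrative sketch (not part of the original module): how the repository
# index helpers above are typically used. The label "main" and the fallback
# path are hypothetical and depend on a site-specific repository index being
# configured.
#
#     known = Butler.get_known_repos()        # e.g. {"main", "dp02"}
#     if "main" in known:
#         uri = Butler.get_repo_uri("main")   # ResourcePath to the repository
#     # With return_label=True an unknown label falls back to a plain path:
#     uri = Butler.get_repo_uri("/some/local/repo", return_label=True)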

313 @staticmethod 

314 def makeRepo( 

315 root: ResourcePathExpression, 

316 config: Config | str | None = None, 

317 dimensionConfig: Config | str | None = None, 

318 standalone: bool = False, 

319 searchPaths: list[str] | None = None, 

320 forceConfigRoot: bool = True, 

321 outfile: ResourcePathExpression | None = None, 

322 overwrite: bool = False, 

323 ) -> Config: 

324 """Create an empty data repository by adding a butler.yaml config 

325 to a repository root directory. 

326 

327 Parameters 

328 ---------- 

329 root : `lsst.resources.ResourcePathExpression` 

330 Path or URI to the root location of the new repository. Will be 

331 created if it does not exist. 

332 config : `Config` or `str`, optional 

333 Configuration to write to the repository, after setting any 

334 root-dependent Registry or Datastore config options. Can not 

335 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

336 configuration will be used. Root-dependent config options 

337 specified in this config are overwritten if ``forceConfigRoot`` 

338 is `True`. 

339 dimensionConfig : `Config` or `str`, optional 

340 Configuration for dimensions, will be used to initialize registry 

341 database. 

342 standalone : `bool` 

343 If True, write all expanded defaults, not just customized or 

344 repository-specific settings. 

345 This (mostly) decouples the repository from the default 

346 configuration, insulating it from changes to the defaults (which 

347 may be good or bad, depending on the nature of the changes). 

348 Future *additions* to the defaults will still be picked up when 

349 initializing `Butlers` to repos created with ``standalone=True``. 

350 searchPaths : `list` of `str`, optional 

351 Directory paths to search when calculating the full butler 

352 configuration. 

353 forceConfigRoot : `bool`, optional 

354 If `False`, any values present in the supplied ``config`` that 

355 would normally be reset are not overridden and will appear 

356 directly in the output config. This allows non-standard overrides 

357 of the root directory for a datastore or registry to be given. 

358 If this parameter is `True` the values for ``root`` will be 

359 forced into the resulting config if appropriate. 

360 outfile : `lsst.resources.ResourcePathExpression`, optional 

361 If not-`None`, the output configuration will be written to this 

362 location rather than into the repository itself. Can be a URI 

363 string. Can refer to a directory that will be used to write 

364 ``butler.yaml``. 

365 overwrite : `bool`, optional 

366 Create a new configuration file even if one already exists 

367 in the specified output location. Default is to raise 

368 an exception. 

369 

370 Returns 

371 ------- 

372 config : `Config` 

373 The updated `Config` instance written to the repo. 

374 

375 Raises 

376 ------ 

377 ValueError 

378 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

379 regular Config (as these subclasses would make it impossible to 

380 support ``standalone=False``). 

381 FileExistsError 

382 Raised if the output config file already exists. 

383 os.error 

384 Raised if the directory does not exist, exists but is not a 

385 directory, or cannot be created. 

386 

387 Notes 

388 ----- 

389 Note that when ``standalone=False`` (the default), the configuration 

390 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

391 construct the repository should also be used to construct any Butlers 

392 to avoid configuration inconsistencies. 

393 """ 

394 if isinstance(config, (ButlerConfig, ConfigSubset)): 

395 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

396 

397 # Ensure that the root of the repository exists or can be made 

398 root_uri = ResourcePath(root, forceDirectory=True) 

399 root_uri.mkdir() 

400 

401 config = Config(config) 

402 

403 # If we are creating a new repo from scratch with relative roots, 

404 # do not propagate an explicit root from the config file 

405 if "root" in config: 

406 del config["root"] 

407 

408 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

409 imported_class = doImportType(full["datastore", "cls"]) 

410 if not issubclass(imported_class, Datastore): 

411 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

412 datastoreClass: type[Datastore] = imported_class 

413 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

414 

415 # If the key exists in the given config, parse it; otherwise parse the defaults 

416 # in the expanded config 

417 if config.get(("registry", "db")): 

418 registryConfig = RegistryConfig(config) 

419 else: 

420 registryConfig = RegistryConfig(full) 

421 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

422 if defaultDatabaseUri is not None: 

423 Config.updateParameters( 

424 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

425 ) 

426 else: 

427 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

428 

429 if standalone: 

430 config.merge(full) 

431 else: 

432 # Always expand the registry.managers section into the per-repo 

433 # config, because after the database schema is created, it's not 

434 # allowed to change anymore. Note that in the standalone=True 

435 # branch, _everything_ in the config is expanded, so there's no 

436 # need to special case this. 

437 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

438 configURI: ResourcePathExpression 

439 if outfile is not None: 

440 # When writing to a separate location we must include 

441 # the root of the butler repo in the config else it won't know 

442 # where to look. 

443 config["root"] = root_uri.geturl() 

444 configURI = outfile 

445 else: 

446 configURI = root_uri 

447 # Strip obscore configuration, if it is present, before writing config 

448 # to a file; obscore config will be stored in the registry. 

449 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config: 

450 config_to_write = config.copy() 

451 del config_to_write[obscore_config_key] 

452 config_to_write.dumpToUri(configURI, overwrite=overwrite) 

453 # The configFile attribute is updated, so copy it back to the original. 

454 config.configFile = config_to_write.configFile 

455 else: 

456 config.dumpToUri(configURI, overwrite=overwrite) 

457 

458 # Create Registry and populate tables 

459 registryConfig = RegistryConfig(config.get("registry")) 

460 dimensionConfig = DimensionConfig(dimensionConfig) 

461 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

462 

463 log.verbose("Wrote new Butler configuration file to %s", configURI) 

464 

465 return config 

466 
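# Illustrative sketch (not part of the original module): creating a new
# repository with makeRepo() and then constructing a Butler on it. The path
# and run name are placeholders.
#
#     Butler.makeRepo("/path/to/new/repo")
#     butler = Butler("/path/to/new/repo", run="u/alice/ingest")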

467 @classmethod 

468 def _unpickle( 

469 cls, 

470 config: ButlerConfig, 

471 collections: tuple[str, ...] | None, 

472 run: str | None, 

473 defaultDataId: dict[str, str], 

474 writeable: bool, 

475 ) -> Butler: 

476 """Callable used to unpickle a Butler. 

477 

478 We prefer not to use ``Butler.__init__`` directly so we can force some 

479 of its many arguments to be keyword-only (note that ``__reduce__`` 

480 can only invoke callables with positional arguments). 

481 

482 Parameters 

483 ---------- 

484 config : `ButlerConfig` 

485 Butler configuration, already coerced into a true `ButlerConfig` 

486 instance (and hence after any search paths for overrides have been 

487 utilized). 

488 collections : `tuple` [ `str` ] 

489 Names of the default collections to read from. 

490 run : `str`, optional 

491 Name of the default `~CollectionType.RUN` collection to write to. 

492 defaultDataId : `dict` [ `str`, `str` ] 

493 Default data ID values. 

494 writeable : `bool` 

495 Whether the Butler should support write operations. 

496 

497 Returns 

498 ------- 

499 butler : `Butler` 

500 A new `Butler` instance. 

501 """ 

502 # MyPy doesn't recognize that the kwargs below are totally valid; it 

503 # seems to think ``**defaultDataId`` is a _positional_ argument! 

504 return cls( 

505 config=config, 

506 collections=collections, 

507 run=run, 

508 writeable=writeable, 

509 **defaultDataId, # type: ignore 

510 ) 

511 

512 def __reduce__(self) -> tuple: 

513 """Support pickling.""" 

514 return ( 

515 Butler._unpickle, 

516 ( 

517 self._config, 

518 self.collections, 

519 self.run, 

520 self.registry.defaults.dataId.byName(), 

521 self.registry.isWriteable(), 

522 ), 

523 ) 

524 
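# Illustrative sketch (not part of the original module): __reduce__ above
# makes a Butler picklable, preserving its collections, run, default data ID,
# and writeability. The ``butler`` variable is a placeholder.
#
#     import pickle
#     clone = pickle.loads(pickle.dumps(butler))
#     assert clone.collections == butler.collections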

525 def __str__(self) -> str: 

526 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

527 self.collections, self.run, self._datastore, self.registry 

528 ) 

529 

530 def isWriteable(self) -> bool: 

531 """Return `True` if this `Butler` supports write operations.""" 

532 return self.registry.isWriteable() 

533 

534 @contextlib.contextmanager 

535 def transaction(self) -> Iterator[None]: 

536 """Context manager supporting `Butler` transactions. 

537 

538 Transactions can be nested. 

539 """ 

540 with self.registry.transaction(): 

541 with self._datastore.transaction(): 

542 yield 

543 
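# Illustrative sketch (not part of the original module): using the
# transaction() context manager so that several puts are committed or rolled
# back together. Dataset type names and data ID values are placeholders.
#
#     with butler.transaction():
#         butler.put(catalog, "src", visit=12345, detector=42)
#         butler.put(image, "calexp", visit=12345, detector=42)
#     # An exception inside the block rolls back both registry and datastore.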

544 def _standardizeArgs( 

545 self, 

546 datasetRefOrType: DatasetRef | DatasetType | str, 

547 dataId: DataId | None = None, 

548 for_put: bool = True, 

549 **kwargs: Any, 

550 ) -> tuple[DatasetType, DataId | None]: 

551 """Standardize the arguments passed to several Butler APIs. 

552 

553 Parameters 

554 ---------- 

555 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

556 When `DatasetRef` the `dataId` should be `None`. 

557 Otherwise the `DatasetType` or name thereof. 

558 dataId : `dict` or `DataCoordinate` 

559 A `dict` of `Dimension` link name, value pairs that label the 

560 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

561 should be provided as the first argument. 

562 for_put : `bool`, optional 

563 If `True` this call is invoked as part of a `Butler.put()`. 

564 Otherwise it is assumed to be part of a `Butler.get()`. This 

565 parameter is only relevant if there is dataset type 

566 inconsistency. 

567 **kwargs 

568 Additional keyword arguments used to augment or construct a 

569 `DataCoordinate`. See `DataCoordinate.standardize` 

570 parameters. 

571 

572 Returns 

573 ------- 

574 datasetType : `DatasetType` 

575 A `DatasetType` instance extracted from ``datasetRefOrType``. 

576 dataId : `dict` or `DataId`, optional 

577 Argument that can be used (along with ``kwargs``) to construct a 

578 `DataId`. 

579 

580 Notes 

581 ----- 

582 Butler APIs that conceptually need a DatasetRef also allow passing a 

583 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

584 keyword arguments that can be used to construct one) separately. This 

585 method accepts those arguments and always returns a true `DatasetType` 

586 and a `DataId` or `dict`. 

587 

588 Standardization of `dict` vs `DataId` is best handled by passing the 

589 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

590 generally similarly flexible. 

591 """ 

592 externalDatasetType: DatasetType | None = None 

593 internalDatasetType: DatasetType | None = None 

594 if isinstance(datasetRefOrType, DatasetRef): 

595 if dataId is not None or kwargs: 

596 raise ValueError("DatasetRef given, cannot use dataId as well") 

597 externalDatasetType = datasetRefOrType.datasetType 

598 dataId = datasetRefOrType.dataId 

599 else: 

600 # Don't check whether DataId is provided, because Registry APIs 

601 # can usually construct a better error message when it wasn't. 

602 if isinstance(datasetRefOrType, DatasetType): 

603 externalDatasetType = datasetRefOrType 

604 else: 

605 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

606 

607 # Check that they are self-consistent 

608 if externalDatasetType is not None: 

609 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

610 if externalDatasetType != internalDatasetType: 

611 # We can allow differences if they are compatible, depending 

612 # on whether this is a get or a put. A get requires that 

613 # the python type associated with the datastore can be 

614 # converted to the user type. A put requires that the user 

615 # supplied python type can be converted to the internal 

616 # type expected by registry. 

617 relevantDatasetType = internalDatasetType 

618 if for_put: 

619 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

620 else: 

621 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

622 relevantDatasetType = externalDatasetType 

623 if not is_compatible: 

624 raise ValueError( 

625 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

626 f"registry definition ({internalDatasetType})" 

627 ) 

628 # Override the internal definition. 

629 internalDatasetType = relevantDatasetType 

630 

631 assert internalDatasetType is not None 

632 return internalDatasetType, dataId 

633 
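# Illustrative sketch (not part of the original module): the flexible call
# forms that _standardizeArgs() reconciles. All three retrieve the same
# dataset; names and data ID values are placeholders.
#
#     obj = butler.get("calexp", visit=12345, detector=42)               # str name
#     obj = butler.get(calexp_type, {"visit": 12345, "detector": 42})    # DatasetType
#     obj = butler.get(ref)                                              # resolved DatasetRef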

634 def _rewrite_data_id( 

635 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any 

636 ) -> tuple[DataId | None, dict[str, Any]]: 

637 """Rewrite a data ID taking into account dimension records. 

638 

639 Take a Data ID and keyword args and rewrite it if necessary to 

640 allow the user to specify dimension records rather than dimension 

641 primary values. 

642 

643 This allows a user to include a dataId dict with keys of 

644 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

645 the integer exposure ID. It also allows a string to be given 

646 for a dimension value rather than the integer ID if that is more 

647 convenient. For example, rather than having to specify the 

648 detector with ``detector.full_name``, a string given for ``detector`` 

649 will be interpreted as the full name and converted to the integer 

650 value. 

651 

652 Keyword arguments can also use strings for dimensions like detector 

653 and exposure but python does not allow them to include ``.`` and 

654 so the ``exposure.day_obs`` syntax can not be used in a keyword 

655 argument. 

656 

657 Parameters 

658 ---------- 

659 dataId : `dict` or `DataCoordinate` 

660 A `dict` of `Dimension` link name, value pairs that will label the 

661 `DatasetRef` within a Collection. 

662 datasetType : `DatasetType` 

663 The dataset type associated with this dataId. Required to 

664 determine the relevant dimensions. 

665 **kwargs 

666 Additional keyword arguments used to augment or construct a 

667 `DataId`. See `DataId` parameters. 

668 

669 Returns 

670 ------- 

671 dataId : `dict` or `DataCoordinate` 

672 The possibly rewritten dataId. If given a `DataCoordinate` and 

673 no keyword arguments, the original dataId will be returned 

674 unchanged. 

675 **kwargs : `dict` 

676 Any unused keyword arguments (normally an empty dict). 

677 """ 

678 # Do nothing if we have a standalone DataCoordinate. 

679 if isinstance(dataId, DataCoordinate) and not kwargs: 

680 return dataId, kwargs 

681 

682 # Process dimension records that are using record information 

683 # rather than ids 

684 newDataId: dict[str, DataIdValue] = {} 

685 byRecord: dict[str, dict[str, Any]] = defaultdict(dict) 

686 

687 # if all the dataId comes from keyword parameters we do not need 

688 # to do anything here because they can't be of the form 

689 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

690 if dataId: 

691 for k, v in dataId.items(): 

692 # If we have a Dimension we do not need to do anything 

693 # because it cannot be a compound key. 

694 if isinstance(k, str) and "." in k: 

695 # Someone is using a more human-readable dataId 

696 dimensionName, record = k.split(".", 1) 

697 byRecord[dimensionName][record] = v 

698 elif isinstance(k, Dimension): 

699 newDataId[k.name] = v 

700 else: 

701 newDataId[k] = v 

702 

703 # Go through the updated dataId and check the type in case someone is 

704 # using an alternate key. We have already filtered out the compound 

705 # keys in dimension.record format. 

706 not_dimensions = {} 

707 

708 # Will need to look in the dataId and the keyword arguments 

709 # and will remove them if they need to be fixed or are unrecognized. 

710 for dataIdDict in (newDataId, kwargs): 

711 # Use a list so we can adjust the dict safely in the loop 

712 for dimensionName in list(dataIdDict): 

713 value = dataIdDict[dimensionName] 

714 try: 

715 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

716 except KeyError: 

717 # This is not a real dimension 

718 not_dimensions[dimensionName] = value 

719 del dataIdDict[dimensionName] 

720 continue 

721 

722 # Convert an integral type to an explicit int to simplify 

723 # comparisons here 

724 if isinstance(value, numbers.Integral): 

725 value = int(value) 

726 

727 if not isinstance(value, dimension.primaryKey.getPythonType()): 

728 for alternate in dimension.alternateKeys: 

729 if isinstance(value, alternate.getPythonType()): 

730 byRecord[dimensionName][alternate.name] = value 

731 del dataIdDict[dimensionName] 

732 log.debug( 

733 "Converting dimension %s to %s.%s=%s", 

734 dimensionName, 

735 dimensionName, 

736 alternate.name, 

737 value, 

738 ) 

739 break 

740 else: 

741 log.warning( 

742 "Type mismatch found for value '%r' provided for dimension %s. " 

743 "Could not find matching alternative (primary key has type %s) " 

744 "so attempting to use as-is.", 

745 value, 

746 dimensionName, 

747 dimension.primaryKey.getPythonType(), 

748 ) 

749 

750 # By this point kwargs and newDataId should only include valid 

751 # dimensions. Merge kwargs in to the new dataId and log if there 

752 # are dimensions in both (rather than calling update). 

753 for k, v in kwargs.items(): 

754 if k in newDataId and newDataId[k] != v: 

755 log.debug( 

756 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

757 ) 

758 newDataId[k] = v 

759 # No need to retain any values in kwargs now. 

760 kwargs = {} 

761 

762 # If we have some unrecognized dimensions we have to try to connect 

763 # them to records in other dimensions. This is made more complicated 

764 # by some dimensions having records with clashing names. A mitigation 

765 # is that we can tell by this point which dimensions are missing 

766 # for the DatasetType but this does not work for calibrations 

767 # where additional dimensions can be used to constrain the temporal 

768 # axis. 

769 if not_dimensions: 

770 # Search for all dimensions even if we have been given a value 

771 # explicitly. In some cases records are given as well as the 

772 # actual dimension and this should not be an error if they 

773 # match. 

774 mandatoryDimensions = datasetType.dimensions.names # - provided 

775 

776 candidateDimensions: set[str] = set() 

777 candidateDimensions.update(mandatoryDimensions) 

778 

779 # For calibrations we may well be needing temporal dimensions 

780 # so rather than always including all dimensions in the scan 

781 # restrict things a little. It is still possible for there 

782 # to be confusion over day_obs in visit vs exposure for example. 

783 # If we are not searching calibration collections things may 

784 # fail but they are going to fail anyway because of the 

785 # ambiguity of the dataId... 

786 if datasetType.isCalibration(): 

787 for dim in self.dimensions.getStaticDimensions(): 

788 if dim.temporal: 

789 candidateDimensions.add(str(dim)) 

790 

791 # Look up table for the first association with a dimension 

792 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) 

793 

794 # Keep track of whether an item is associated with multiple 

795 # dimensions. 

796 counter: Counter[str] = Counter() 

797 assigned: dict[str, set[str]] = defaultdict(set) 

798 

799 # Go through the missing dimensions and associate the 

800 # given names with records within those dimensions 

801 matched_dims = set() 

802 for dimensionName in candidateDimensions: 

803 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

804 fields = dimension.metadata.names | dimension.uniqueKeys.names 

805 for field in not_dimensions: 

806 if field in fields: 

807 guessedAssociation[dimensionName][field] = not_dimensions[field] 

808 counter[dimensionName] += 1 

809 assigned[field].add(dimensionName) 

810 matched_dims.add(field) 

811 

812 # Calculate the fields that matched nothing. 

813 never_found = set(not_dimensions) - matched_dims 

814 

815 if never_found: 

816 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

817 

818 # There is a chance we have allocated a single dataId item 

819 # to multiple dimensions. Need to decide which should be retained. 

820 # For now assume that the most popular alternative wins. 

821 # This means that day_obs with seq_num will result in 

822 # exposure.day_obs and not visit.day_obs 

823 # Also prefer an explicitly missing dimension over an inferred 

824 # temporal dimension. 

825 for fieldName, assignedDimensions in assigned.items(): 

826 if len(assignedDimensions) > 1: 

827 # Pick the most popular (preferring mandatory dimensions) 

828 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

829 if requiredButMissing: 

830 candidateDimensions = requiredButMissing 

831 else: 

832 candidateDimensions = assignedDimensions 

833 

834 # If this is a choice between visit and exposure and 

835 # neither was a required part of the dataset type, 

836 # (hence in this branch) always prefer exposure over 

837 # visit since exposures are always defined and visits 

838 # are defined from exposures. 

839 if candidateDimensions == {"exposure", "visit"}: 

840 candidateDimensions = {"exposure"} 

841 

842 # Select the relevant items and get a new restricted 

843 # counter. 

844 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

845 duplicatesCounter: Counter[str] = Counter() 

846 duplicatesCounter.update(theseCounts) 

847 

848 # Choose the most common. If they are equally common 

849 # we will pick the one that was found first. 

850 # Returns a list of tuples 

851 selected = duplicatesCounter.most_common(1)[0][0] 

852 

853 log.debug( 

854 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

855 " Removed ambiguity by choosing dimension %s.", 

856 fieldName, 

857 ", ".join(assignedDimensions), 

858 selected, 

859 ) 

860 

861 for candidateDimension in assignedDimensions: 

862 if candidateDimension != selected: 

863 del guessedAssociation[candidateDimension][fieldName] 

864 

865 # Update the record look up dict with the new associations 

866 for dimensionName, values in guessedAssociation.items(): 

867 if values: # A dict might now be empty 

868 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

869 byRecord[dimensionName].update(values) 

870 

871 if byRecord: 

872 # Some record specifiers were found so we need to convert 

873 # them to the Id form 

874 for dimensionName, values in byRecord.items(): 

875 if dimensionName in newDataId: 

876 log.debug( 

877 "DataId specified explicit %s dimension value of %s in addition to" 

878 " general record specifiers for it of %s. Ignoring record information.", 

879 dimensionName, 

880 newDataId[dimensionName], 

881 str(values), 

882 ) 

883 # Get the actual record and compare with these values. 

884 try: 

885 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

886 except DataIdError: 

887 raise ValueError( 

888 f"Could not find dimension '{dimensionName}'" 

889 f" with dataId {newDataId} as part of comparing with" 

890 f" record values {byRecord[dimensionName]}" 

891 ) from None 

892 if len(recs) == 1: 

893 errmsg: list[str] = [] 

894 for k, v in values.items(): 

895 if (recval := getattr(recs[0], k)) != v: 

896 errmsg.append(f"{k}({recval} != {v})") 

897 if errmsg: 

898 raise ValueError( 

899 f"Dimension {dimensionName} in dataId has explicit value" 

900 " inconsistent with records: " + ", ".join(errmsg) 

901 ) 

902 else: 

903 # Multiple matches for an explicit dimension 

904 # should never happen but let downstream complain. 

905 pass 

906 continue 

907 

908 # Build up a WHERE expression 

909 bind = {k: v for k, v in values.items()} 

910 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

911 

912 # Hopefully we get a single record that matches 

913 records = set( 

914 self.registry.queryDimensionRecords( 

915 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

916 ) 

917 ) 

918 

919 if len(records) != 1: 

920 if len(records) > 1: 

921 # visit can have an ambiguous answer without involving 

922 # visit_system. The default visit_system is defined 

923 # by the instrument. 

924 if ( 

925 dimensionName == "visit" 

926 and "visit_system_membership" in self.dimensions 

927 and "visit_system" in self.dimensions["instrument"].metadata 

928 ): 

929 instrument_records = list( 

930 self.registry.queryDimensionRecords( 

931 "instrument", 

932 dataId=newDataId, 

933 **kwargs, 

934 ) 

935 ) 

936 if len(instrument_records) == 1: 

937 visit_system = instrument_records[0].visit_system 

938 if visit_system is None: 

939 # Set to a value that will never match. 

940 visit_system = -1 

941 

942 # Look up each visit in the 

943 # visit_system_membership records. 

944 for rec in records: 

945 membership = list( 

946 self.registry.queryDimensionRecords( 

947 # Use bind to allow zero results. 

948 # This is a fully-specified query. 

949 "visit_system_membership", 

950 where="instrument = inst AND visit_system = system AND visit = v", 

951 bind=dict( 

952 inst=instrument_records[0].name, system=visit_system, v=rec.id 

953 ), 

954 ) 

955 ) 

956 if membership: 

957 # This record is the right answer. 

958 records = {rec} 

959 break 

960 

961 # The ambiguity may have been resolved so check again. 

962 if len(records) > 1: 

963 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

964 for r in records: 

965 log.debug("- %s", str(r)) 

966 raise ValueError( 

967 f"DataId specification for dimension {dimensionName} is not" 

968 f" uniquely constrained to a single dataset by {values}." 

969 f" Got {len(records)} results." 

970 ) 

971 else: 

972 raise ValueError( 

973 f"DataId specification for dimension {dimensionName} matched no" 

974 f" records when constrained by {values}" 

975 ) 

976 

977 # Get the primary key from the real dimension object 

978 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

979 if not isinstance(dimension, Dimension): 

980 raise RuntimeError( 

981 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

982 ) 

983 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

984 

985 return newDataId, kwargs 

986 
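# Illustrative sketch (not part of the original module): the record-based
# data ID forms that _rewrite_data_id() resolves to primary-key values.
# The record names follow the docstring above; instrument, values, and the
# detector name are placeholders.
#
#     butler.get("raw", {"exposure.day_obs": 20230615, "exposure.seq_num": 7},
#                instrument="LSSTCam")
#     butler.get("raw", exposure=2023061500007, detector="R22_S11",
#                instrument="LSSTCam")   # string alternate key for detector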

987 def _findDatasetRef( 

988 self, 

989 datasetRefOrType: DatasetRef | DatasetType | str, 

990 dataId: DataId | None = None, 

991 *, 

992 collections: Any = None, 

993 predict: bool = False, 

994 run: str | None = None, 

995 **kwargs: Any, 

996 ) -> DatasetRef: 

997 """Shared logic for methods that start with a search for a dataset in 

998 the registry. 

999 

1000 Parameters 

1001 ---------- 

1002 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1003 When `DatasetRef` the `dataId` should be `None`. 

1004 Otherwise the `DatasetType` or name thereof. 

1005 dataId : `dict` or `DataCoordinate`, optional 

1006 A `dict` of `Dimension` link name, value pairs that label the 

1007 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1008 should be provided as the first argument. 

1009 collections : Any, optional 

1010 Collections to be searched, overriding ``self.collections``. 

1011 Can be any of the types supported by the ``collections`` argument 

1012 to butler construction. 

1013 predict : `bool`, optional 

1014 If `True`, return a newly created `DatasetRef` with a unique 

1015 dataset ID if finding a reference in the `Registry` fails. 

1016 Defaults to `False`. 

1017 run : `str`, optional 

1018 Run collection name to use for creating `DatasetRef` for predicted 

1019 datasets. Only used if ``predict`` is `True`. 

1020 **kwargs 

1021 Additional keyword arguments used to augment or construct a 

1022 `DataId`. See `DataId` parameters. 

1023 

1024 Returns 

1025 ------- 

1026 ref : `DatasetRef` 

1027 A reference to the dataset identified by the given arguments. 

1028 This can be the same dataset reference as given if it was 

1029 resolved. 

1030 

1031 Raises 

1032 ------ 

1033 LookupError 

1034 Raised if no matching dataset exists in the `Registry` (and 

1035 ``predict`` is `False`). 

1036 ValueError 

1037 Raised if a resolved `DatasetRef` was passed as an input, but it 

1038 differs from the one found in the registry. 

1039 TypeError 

1040 Raised if no collections were provided. 

1041 """ 

1042 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1043 if isinstance(datasetRefOrType, DatasetRef): 

1044 if collections is not None: 

1045 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) 

1046 return datasetRefOrType 

1047 timespan: Timespan | None = None 

1048 

1049 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1050 

1051 if datasetType.isCalibration(): 

1052 # Because this is a calibration dataset, first try to 

1053 # standardize the data ID without restricting the dimensions to 

1054 # those of the dataset type requested, because there may be extra 

1055 # dimensions that provide temporal information for a validity-range 

1056 # lookup. 

1057 dataId = DataCoordinate.standardize( 

1058 dataId, universe=self.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1059 ) 

1060 if dataId.graph.temporal: 

1061 dataId = self.registry.expandDataId(dataId) 

1062 timespan = dataId.timespan 

1063 else: 

1064 # Standardize the data ID to just the dimensions of the dataset 

1065 # type instead of letting registry.findDataset do it, so we get the 

1066 # result even if no dataset is found. 

1067 dataId = DataCoordinate.standardize( 

1068 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1069 ) 

1070 # Always look up the DatasetRef, even if one is given, to ensure it is 

1071 # present in the current collection. 

1072 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1073 if ref is None: 

1074 if predict: 

1075 if run is None: 

1076 run = self.run 

1077 if run is None: 

1078 raise TypeError("Cannot predict dataset ID/location with run=None.") 

1079 return DatasetRef(datasetType, dataId, run=run) 

1080 else: 

1081 if collections is None: 

1082 collections = self.registry.defaults.collections 

1083 raise LookupError( 

1084 f"Dataset {datasetType.name} with data ID {dataId} " 

1085 f"could not be found in collections {collections}." 

1086 ) 

1087 if datasetType != ref.datasetType: 

1088 # If they differ it is because the user explicitly specified 

1089 # a compatible dataset type to this call rather than using the 

1090 # registry definition. The DatasetRef must therefore be recreated 

1091 # using the user definition such that the expected type is 

1092 # returned. 

1093 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1094 

1095 return ref 

1096 

1097 @transactional 

1098 @deprecated( 

1099 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." 

1100 " Please use Butler.put(). Be aware that you may need to adjust your usage if you" 

1101 " were relying on the run parameter to determine the run." 

1102 " Will be removed after v27.0.", 

1103 version="v26.0", 

1104 category=FutureWarning, 

1105 ) 

1106 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

1107 # Docstring inherited. 

1108 return self.put(obj, ref) 

1109 

1110 @transactional 

1111 def put( 

1112 self, 

1113 obj: Any, 

1114 datasetRefOrType: DatasetRef | DatasetType | str, 

1115 /, 

1116 dataId: DataId | None = None, 

1117 *, 

1118 run: str | None = None, 

1119 **kwargs: Any, 

1120 ) -> DatasetRef: 

1121 """Store and register a dataset. 

1122 

1123 Parameters 

1124 ---------- 

1125 obj : `object` 

1126 The dataset. 

1127 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1128 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1129 Otherwise the `DatasetType` or name thereof. If a fully resolved 

1130 `DatasetRef` is given the run and ID are used directly. 

1131 dataId : `dict` or `DataCoordinate` 

1132 A `dict` of `Dimension` link name, value pairs that label the 

1133 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1134 should be provided as the second argument. 

1135 run : `str`, optional 

1136 The name of the run the dataset should be added to, overriding 

1137 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

1138 **kwargs 

1139 Additional keyword arguments used to augment or construct a 

1140 `DataCoordinate`. See `DataCoordinate.standardize` 

1141 parameters. Not used if a resolved `DatasetRef` is provided. 

1142 

1143 Returns 

1144 ------- 

1145 ref : `DatasetRef` 

1146 A reference to the stored dataset, updated with the correct id if 

1147 given. 

1148 

1149 Raises 

1150 ------ 

1151 TypeError 

1152 Raised if the butler is read-only or if no run has been provided. 

1153 """ 

1154 if isinstance(datasetRefOrType, DatasetRef): 

1155 # This is a direct put of predefined DatasetRef. 

1156 log.debug("Butler put direct: %s", datasetRefOrType) 

1157 if run is not None: 

1158 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) 

1159 # If registry already has a dataset with the same dataset ID, 

1160 # dataset type and DataId, then _importDatasets will do nothing and 

1161 # just return the original ref. We still have to raise in this case; 

1162 # the datastore check below handles that. 

1163 self.registry._importDatasets([datasetRefOrType], expand=True) 

1164 # Before trying to write to the datastore check that it does not 

1165 # know this dataset. This is prone to races, of course. 

1166 if self._datastore.knows(datasetRefOrType): 

1167 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

1168 # Try to write the dataset to the datastore; if it fails due to a race 

1169 # with another write, the content of stored data may be 

1170 # unpredictable. 

1171 try: 

1172 self._datastore.put(obj, datasetRefOrType) 

1173 except IntegrityError as e: 

1174 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") 

1175 return datasetRefOrType 

1176 

1177 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1178 if not self.isWriteable(): 

1179 raise TypeError("Butler is read-only.") 

1180 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1181 

1182 # Handle dimension records in dataId 

1183 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1184 

1185 # Add Registry Dataset entry. 

1186 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1187 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1188 self._datastore.put(obj, ref) 

1189 

1190 return ref 

1191 
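# Illustrative sketch (not part of the original module): the two main put()
# call forms described above. Run, dataset type names, and variables are
# placeholders.
#
#     ref = butler.put(catalog, "src", visit=12345, detector=42,
#                      run="u/alice/run1")   # registers a new dataset
#     butler.put(catalog, existing_ref)      # direct put of a resolved DatasetRef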

1192 @deprecated( 

1193 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

1194 " Please use Butler.get(). Will be removed after v27.0.", 

1195 version="v26.0", 

1196 category=FutureWarning, 

1197 ) 

1198 def getDirect( 

1199 self, 

1200 ref: DatasetRef, 

1201 *, 

1202 parameters: dict[str, Any] | None = None, 

1203 storageClass: StorageClass | str | None = None, 

1204 ) -> Any: 

1205 """Retrieve a stored dataset. 

1206 

1207 Parameters 

1208 ---------- 

1209 ref : `DatasetRef` 

1210 Resolved reference to an already stored dataset. 

1211 parameters : `dict` 

1212 Additional StorageClass-defined options to control reading, 

1213 typically used to efficiently read only a subset of the dataset. 

1214 storageClass : `StorageClass` or `str`, optional 

1215 The storage class to be used to override the Python type 

1216 returned by this method. By default the returned type matches 

1217 the dataset type definition for this dataset. Specifying a 

1218 read `StorageClass` can force a different type to be returned. 

1219 This type must be compatible with the original type. 

1220 

1221 Returns 

1222 ------- 

1223 obj : `object` 

1224 The dataset. 

1225 """ 

1226 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1227 

1228 @deprecated( 

1229 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

1230 "Please use Butler.getDeferred(). Will be removed after v27.0.", 

1231 version="v26.0", 

1232 category=FutureWarning, 

1233 ) 

1234 def getDirectDeferred( 

1235 self, 

1236 ref: DatasetRef, 

1237 *, 

1238 parameters: dict | None = None, 

1239 storageClass: str | StorageClass | None = None, 

1240 ) -> DeferredDatasetHandle: 

1241 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1242 from a resolved `DatasetRef`. 

1243 

1244 Parameters 

1245 ---------- 

1246 ref : `DatasetRef` 

1247 Resolved reference to an already stored dataset. 

1248 parameters : `dict` 

1249 Additional StorageClass-defined options to control reading, 

1250 typically used to efficiently read only a subset of the dataset. 

1251 storageClass : `StorageClass` or `str`, optional 

1252 The storage class to be used to override the Python type 

1253 returned by this method. By default the returned type matches 

1254 the dataset type definition for this dataset. Specifying a 

1255 read `StorageClass` can force a different type to be returned. 

1256 This type must be compatible with the original type. 

1257 

1258 Returns 

1259 ------- 

1260 obj : `DeferredDatasetHandle` 

1261 A handle which can be used to retrieve a dataset at a later time. 

1262 

1263 Raises 

1264 ------ 

1265 LookupError 

1266 Raised if no matching dataset exists in the `Registry`. 

1267 """ 

1268 # Check that dataset actually exists. 

1269 if not self._datastore.exists(ref): 

1270 raise LookupError(f"Dataset reference {ref} does not exist.") 

1271 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1272 

1273 def getDeferred( 

1274 self, 

1275 datasetRefOrType: DatasetRef | DatasetType | str, 

1276 /, 

1277 dataId: DataId | None = None, 

1278 *, 

1279 parameters: dict | None = None, 

1280 collections: Any = None, 

1281 storageClass: str | StorageClass | None = None, 

1282 **kwargs: Any, 

1283 ) -> DeferredDatasetHandle: 

1284 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1285 after an immediate registry lookup. 

1286 

1287 Parameters 

1288 ---------- 

1289 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1290 When `DatasetRef` the `dataId` should be `None`. 

1291 Otherwise the `DatasetType` or name thereof. 

1292 dataId : `dict` or `DataCoordinate`, optional 

1293 A `dict` of `Dimension` link name, value pairs that label the 

1294 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1295 should be provided as the first argument. 

1296 parameters : `dict` 

1297 Additional StorageClass-defined options to control reading, 

1298 typically used to efficiently read only a subset of the dataset. 

1299 collections : Any, optional 

1300 Collections to be searched, overriding ``self.collections``. 

1301 Can be any of the types supported by the ``collections`` argument 

1302 to butler construction. 

1303 storageClass : `StorageClass` or `str`, optional 

1304 The storage class to be used to override the Python type 

1305 returned by this method. By default the returned type matches 

1306 the dataset type definition for this dataset. Specifying a 

1307 read `StorageClass` can force a different type to be returned. 

1308 This type must be compatible with the original type. 

1309 **kwargs 

1310 Additional keyword arguments used to augment or construct a 

1311 `DataId`. See `DataId` parameters. 

1312 

1313 Returns 

1314 ------- 

1315 obj : `DeferredDatasetHandle` 

1316 A handle which can be used to retrieve a dataset at a later time. 

1317 

1318 Raises 

1319 ------ 

1320 LookupError 

1321 Raised if no matching dataset exists in the `Registry`. 

1322 ValueError 

1323 Raised if a resolved `DatasetRef` was passed as an input, but it 

1324 differs from the one found in the registry. 

1325 TypeError 

1326 Raised if no collections were provided. 

1327 """ 

1328 if isinstance(datasetRefOrType, DatasetRef) and not self._datastore.exists(datasetRefOrType): 

1329 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1330 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1331 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1332 
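# Illustrative sketch (not part of the original module): deferring the read
# until the data are actually needed. Names and values are placeholders; the
# handle's get() call is what triggers the datastore read.
#
#     handle = butler.getDeferred("calexp", visit=12345, detector=42)
#     ...  # decide later whether the pixels are needed
#     image = handle.get()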

1333 def get( 

1334 self, 

1335 datasetRefOrType: DatasetRef | DatasetType | str, 

1336 /, 

1337 dataId: DataId | None = None, 

1338 *, 

1339 parameters: dict[str, Any] | None = None, 

1340 collections: Any = None, 

1341 storageClass: StorageClass | str | None = None, 

1342 **kwargs: Any, 

1343 ) -> Any: 

1344 """Retrieve a stored dataset. 

1345 

1346 Parameters 

1347 ---------- 

1348 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1349 When `DatasetRef` the `dataId` should be `None`. 

1350 Otherwise the `DatasetType` or name thereof. 

1351 If a resolved `DatasetRef`, the associated dataset 

1352 is returned directly without additional querying. 

1353 dataId : `dict` or `DataCoordinate` 

1354 A `dict` of `Dimension` link name, value pairs that label the 

1355 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1356 should be provided as the first argument. 

1357 parameters : `dict` 

1358 Additional StorageClass-defined options to control reading, 

1359 typically used to efficiently read only a subset of the dataset. 

1360 collections : Any, optional 

1361 Collections to be searched, overriding ``self.collections``. 

1362 Can be any of the types supported by the ``collections`` argument 

1363 to butler construction. 

1364 storageClass : `StorageClass` or `str`, optional 

1365 The storage class to be used to override the Python type 

1366 returned by this method. By default the returned type matches 

1367 the dataset type definition for this dataset. Specifying a 

1368 read `StorageClass` can force a different type to be returned. 

1369 This type must be compatible with the original type. 

1370 **kwargs 

1371 Additional keyword arguments used to augment or construct a 

1372 `DataCoordinate`. See `DataCoordinate.standardize` 

1373 parameters. 

1374 

1375 Returns 

1376 ------- 

1377 obj : `object` 

1378 The dataset. 

1379 

1380 Raises 

1381 ------ 

1382 LookupError 

1383 Raised if no matching dataset exists in the `Registry`. 

1384 TypeError 

1385 Raised if no collections were provided. 

1386 

1387 Notes 

1388 ----- 

1389 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1390 this method requires that the given data ID include temporal dimensions 

1391 beyond the dimensions of the dataset type itself, in order to find the 

1392 dataset with the appropriate validity range. For example, a "bias" 

1393 dataset with native dimensions ``{instrument, detector}`` could be 

1394 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1395 ``exposure`` is a temporal dimension. 

1396 """ 

1397 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1398 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1399 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1400 

1401 def getURIs( 

1402 self, 

1403 datasetRefOrType: DatasetRef | DatasetType | str, 

1404 /, 

1405 dataId: DataId | None = None, 

1406 *, 

1407 predict: bool = False, 

1408 collections: Any = None, 

1409 run: str | None = None, 

1410 **kwargs: Any, 

1411 ) -> DatasetRefURIs: 

1412 """Return the URIs associated with the dataset. 

1413 

1414 Parameters 

1415 ---------- 

1416 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1417 When `DatasetRef` the `dataId` should be `None`. 

1418 Otherwise the `DatasetType` or name thereof. 

1419 dataId : `dict` or `DataCoordinate` 

1420 A `dict` of `Dimension` link name, value pairs that label the 

1421 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1422 should be provided as the first argument. 

1423 predict : `bool` 

1424 If `True`, allow URIs to be returned for datasets that have not 

1425 been written. 

1426 collections : Any, optional 

1427 Collections to be searched, overriding ``self.collections``. 

1428 Can be any of the types supported by the ``collections`` argument 

1429 to butler construction. 

1430 run : `str`, optional 

1431 Run to use for predictions, overriding ``self.run``. 

1432 **kwargs 

1433 Additional keyword arguments used to augment or construct a 

1434 `DataCoordinate`. See `DataCoordinate.standardize` 

1435 parameters. 

1436 

1437 Returns 

1438 ------- 

1439 uris : `DatasetRefURIs` 

1440 The URI to the primary artifact associated with this dataset (if 

1441 the dataset was disassembled within the datastore this may be 

1442 `None`), and the URIs to any components associated with the dataset 

1443 artifact (this mapping can be empty if there are no components). 
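
Examples
--------
A minimal usage sketch (illustrative only; the repository path,
collection, and data ID values are hypothetical)::

    butler = Butler("/path/to/repo", collections=["HSC/defaults"])
    primary, components = butler.getURIs("calexp", instrument="HSC",
                                         detector=10, visit=903334)
    if primary is not None:
        print(primary.geturl())
    for component, uri in components.items():
        print(component, uri.geturl())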

1444 """ 

1445 ref = self._findDatasetRef( 

1446 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1447 ) 

1448 return self._datastore.getURIs(ref, predict) 

1449 

1450 def getURI( 

1451 self, 

1452 datasetRefOrType: DatasetRef | DatasetType | str, 

1453 /, 

1454 dataId: DataId | None = None, 

1455 *, 

1456 predict: bool = False, 

1457 collections: Any = None, 

1458 run: str | None = None, 

1459 **kwargs: Any, 

1460 ) -> ResourcePath: 

1461 """Return the URI to the Dataset. 

1462 

1463 Parameters 

1464 ---------- 

1465 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1466 When `DatasetRef` the `dataId` should be `None`. 

1467 Otherwise the `DatasetType` or name thereof. 

1468 dataId : `dict` or `DataCoordinate` 

1469 A `dict` of `Dimension` link name, value pairs that label the 

1470 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1471 should be provided as the first argument. 

1472 predict : `bool` 

1473 If `True`, allow URIs to be returned for datasets that have not 

1474 been written. 

1475 collections : Any, optional 

1476 Collections to be searched, overriding ``self.collections``. 

1477 Can be any of the types supported by the ``collections`` argument 

1478 to butler construction. 

1479 run : `str`, optional 

1480 Run to use for predictions, overriding ``self.run``. 

1481 **kwargs 

1482 Additional keyword arguments used to augment or construct a 

1483 `DataCoordinate`. See `DataCoordinate.standardize` 

1484 parameters. 

1485 

1486 Returns 

1487 ------- 

1488 uri : `lsst.resources.ResourcePath` 

1489 URI pointing to the Dataset within the datastore. If the 

1490 Dataset does not exist in the datastore, and if ``predict`` is 

1491 `True`, the URI will be a prediction and will include a URI 

1492 fragment "#predicted". 

1493 If the datastore does not have entities that map well to the 

1494 concept of a URI, the returned URI will be descriptive only 

1495 and is not guaranteed to be retrievable. 

1496 

1497 Raises 

1498 ------ 

1499 LookupError 

1500 Raised if a URI has been requested for a dataset that does not 

1501 exist and guessing is not allowed. 

1502 ValueError 

1503 Raised if a resolved `DatasetRef` was passed as an input, but it 

1504 differs from the one found in the registry. 

1505 TypeError 

1506 Raised if no collections were provided. 

1507 RuntimeError 

1508 Raised if a URI is requested for a dataset that consists of 

1509 multiple artifacts. 
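
Examples
--------
A minimal usage sketch (illustrative only; the repository path,
collection, and data ID values are hypothetical)::

    butler = Butler("/path/to/repo", collections=["HSC/defaults"])
    uri = butler.getURI("raw", instrument="HSC", detector=10,
                        exposure=903334)
    print(uri.geturl())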

1510 """ 

1511 primary, components = self.getURIs( 

1512 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1513 ) 

1514 

1515 if primary is None or components: 

1516 raise RuntimeError( 

1517 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1518 "Use Butler.getURIs() instead." 

1519 ) 

1520 return primary 

1521 

1522 def retrieveArtifacts( 

1523 self, 

1524 refs: Iterable[DatasetRef], 

1525 destination: ResourcePathExpression, 

1526 transfer: str = "auto", 

1527 preserve_path: bool = True, 

1528 overwrite: bool = False, 

1529 ) -> list[ResourcePath]: 

1530 """Retrieve the artifacts associated with the supplied refs. 

1531 

1532 Parameters 

1533 ---------- 

1534 refs : iterable of `DatasetRef` 

1535 The datasets for which artifacts are to be retrieved. 

1536 A single ref can result in multiple artifacts. The refs must 

1537 be resolved. 

1538 destination : `lsst.resources.ResourcePath` or `str` 

1539 Location to write the artifacts. 

1540 transfer : `str`, optional 

1541 Method to use to transfer the artifacts. Must be one of the options 

1542 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1543 "move" is not allowed. 

1544 preserve_path : `bool`, optional 

1545 If `True` the full path of the artifact within the datastore 

1546 is preserved. If `False` the final file component of the path 

1547 is used. 

1548 overwrite : `bool`, optional 

1549 If `True` allow transfers to overwrite existing files at the 

1550 destination. 

1551 

1552 Returns 

1553 ------- 

1554 targets : `list` of `lsst.resources.ResourcePath` 

1555 URIs of file artifacts in destination location. Order is not 

1556 preserved. 

1557 

1558 Notes 

1559 ----- 

1560 For non-file datastores the artifacts written to the destination 

1561 may not match the representation inside the datastore. For example 

1562 a hierarchical data structure in a NoSQL database may well be stored 

1563 as a JSON file. 
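
Examples
--------
A minimal usage sketch (illustrative only; the repository path,
collection, and dataset type names are hypothetical)::

    butler = Butler("/path/to/repo")
    refs = butler.registry.queryDatasets(
        "calexp", collections="HSC/runs/example"
    )
    paths = butler.retrieveArtifacts(
        refs, destination="/tmp/calexp-files", transfer="copy"
    )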

1564 """ 

1565 return self._datastore.retrieveArtifacts( 

1566 refs, 

1567 ResourcePath(destination), 

1568 transfer=transfer, 

1569 preserve_path=preserve_path, 

1570 overwrite=overwrite, 

1571 ) 

1572 

1573 def exists( 

1574 self, 

1575 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1576 /, 

1577 data_id: DataId | None = None, 

1578 *, 

1579 full_check: bool = True, 

1580 collections: Any = None, 

1581 **kwargs: Any, 

1582 ) -> DatasetExistence: 

1583 """Indicate whether a dataset is known to Butler registry and 

1584 datastore. 

1585 

1586 Parameters 

1587 ---------- 

1588 dataset_ref_or_type : `DatasetRef`, `DatasetType`, or `str` 

1589 When `DatasetRef` the `dataId` should be `None`. 

1590 Otherwise the `DatasetType` or name thereof. 

1591 data_id : `dict` or `DataCoordinate` 

1592 A `dict` of `Dimension` link name, value pairs that label the 

1593 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1594 should be provided as the first argument. 

1595 full_check : `bool`, optional 

1596 If `True`, an additional check will be made for dataset artifact 

1597 existence. This will involve additional overhead due to the need 

1598 to query an external system. If `False`, the registry and datastore 

1599 will only be asked whether they know about the dataset; no 

1600 check for the artifact itself will be performed. 

1601 collections : Any, optional 

1602 Collections to be searched, overriding ``self.collections``. 

1603 Can be any of the types supported by the ``collections`` argument 

1604 to butler construction. 

1605 **kwargs 

1606 Additional keyword arguments used to augment or construct a 

1607 `DataCoordinate`. See `DataCoordinate.standardize` 

1608 parameters. 

1609 

1610 Returns 

1611 ------- 

1612 existence : `DatasetExistence` 

1613 Object indicating whether the dataset is known to registry and 

1614 datastore. Evaluates to `True` if the dataset is present and known 

1615 to both. 
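
Examples
--------
A minimal usage sketch (illustrative only; the repository path,
collection, and data ID values are hypothetical)::

    butler = Butler("/path/to/repo", collections=["HSC/defaults"])
    existence = butler.exists("calexp", instrument="HSC",
                              detector=10, visit=903334)
    if existence:
        # The dataset is present and known to both registry and
        # datastore.
        ...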

1616 """ 

1617 existence = DatasetExistence.UNRECOGNIZED 

1618 

1619 if isinstance(dataset_ref_or_type, DatasetRef): 

1620 if collections is not None: 

1621 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1622 if data_id is not None: 

1623 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1624 ref = dataset_ref_or_type 

1625 registry_ref = self.registry.getDataset(dataset_ref_or_type.id) 

1626 if registry_ref is not None: 

1627 existence |= DatasetExistence.RECORDED 

1628 

1629 if dataset_ref_or_type != registry_ref: 

1630 # This could mean that storage classes differ, so we should 

1631 # check for that but use the registry ref for the rest of 

1632 # the method. 

1633 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1634 # Use the registry version from now on. 

1635 ref = registry_ref 

1636 else: 

1637 raise ValueError( 

1638 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1639 f"in registry but has different incompatible values ({registry_ref})." 

1640 ) 

1641 else: 

1642 try: 

1643 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1644 except (LookupError, TypeError, NoDefaultCollectionError): 

1645 return existence 

1646 existence |= DatasetExistence.RECORDED 

1647 

1648 if self._datastore.knows(ref): 

1649 existence |= DatasetExistence.DATASTORE 

1650 

1651 if full_check: 

1652 if self._datastore.exists(ref): 

1653 existence |= DatasetExistence._ARTIFACT 

1654 elif existence.value != DatasetExistence.UNRECOGNIZED.value: 

1655 # Do not add this flag if we have no other idea about a dataset. 

1656 existence |= DatasetExistence(DatasetExistence._ASSUMED) 

1657 

1658 return existence 

1659 

1660 def _exists_many( 

1661 self, 

1662 refs: Iterable[DatasetRef], 

1663 /, 

1664 *, 

1665 full_check: bool = True, 

1666 ) -> dict[DatasetRef, DatasetExistence]: 

1667 """Indicate whether multiple datasets are known to Butler registry and 

1668 datastore. 

1669 

1670 This is an experimental API that may change at any moment. 

1671 

1672 Parameters 

1673 ---------- 

1674 refs : iterable of `DatasetRef` 

1675 The datasets to be checked. 

1676 full_check : `bool`, optional 

1677 If `True`, an additional check will be made for dataset artifact 

1678 existence. This will involve additional overhead due to the need 

1679 to query an external system. If `False`, the registry and datastore 

1680 will only be asked whether they know about the dataset; no 

1681 check for the artifact itself will be performed. 

1682 

1683 Returns 

1684 ------- 

1685 existence : dict of [`DatasetRef`, `DatasetExistence`] 

1686 Mapping from the given dataset refs to an enum indicating the 

1687 status of the dataset in registry and datastore. 

1688 Each value evaluates to `True` if the dataset is present and known 

1689 to both. 

1690 """ 

1691 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1692 

1693 # Registry does not have a bulk API to check for a ref. 

1694 for ref in refs: 

1695 registry_ref = self.registry.getDataset(ref.id) 

1696 if registry_ref is not None: 

1697 # It is possible, albeit unlikely, that the given ref does 

1698 # not match the one in registry even though the UUID matches. 

1699 # When checking a single ref we raise, but it's impolite to 

1700 # do that when potentially hundreds of refs are being checked. 

1701 # We could change the API to only accept UUIDs and that would 

1702 # remove the ability to even check and remove the worry 

1703 # about differing storage classes. Given the ongoing discussion 

1704 # on refs vs UUIDs and whether to raise or have a new 

1705 # private flag, treat this as a private API for now. 

1706 existence[ref] |= DatasetExistence.RECORDED 

1707 

1708 # Ask datastore if it knows about these refs. 

1709 knows = self._datastore.knows_these(refs) 

1710 for ref, known in knows.items(): 

1711 if known: 

1712 existence[ref] |= DatasetExistence.DATASTORE 

1713 

1714 if full_check: 

1715 mexists = self._datastore.mexists(refs) 

1716 for ref, exists in mexists.items(): 

1717 if exists: 

1718 existence[ref] |= DatasetExistence._ARTIFACT 

1719 else: 

1720 # Do not set this flag if nothing is known about the dataset. 

1721 for ref in existence.keys(): 

1722 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1723 existence[ref] |= DatasetExistence._ASSUMED 

1724 

1725 return existence 

1726 

1727 @deprecated( 

1728 reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v27.0.", 

1729 version="v26.0", 

1730 category=FutureWarning, 

1731 ) 

1732 def datasetExists( 

1733 self, 

1734 datasetRefOrType: DatasetRef | DatasetType | str, 

1735 dataId: DataId | None = None, 

1736 *, 

1737 collections: Any = None, 

1738 **kwargs: Any, 

1739 ) -> bool: 

1740 """Return True if the Dataset is actually present in the Datastore. 

1741 

1742 Parameters 

1743 ---------- 

1744 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1745 When `DatasetRef` the `dataId` should be `None`. 

1746 Otherwise the `DatasetType` or name thereof. 

1747 dataId : `dict` or `DataCoordinate` 

1748 A `dict` of `Dimension` link name, value pairs that label the 

1749 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1750 should be provided as the first argument. 

1751 collections : Any, optional 

1752 Collections to be searched, overriding ``self.collections``. 

1753 Can be any of the types supported by the ``collections`` argument 

1754 to butler construction. 

1755 **kwargs 

1756 Additional keyword arguments used to augment or construct a 

1757 `DataCoordinate`. See `DataCoordinate.standardize` 

1758 parameters. 

1759 

1760 Raises 

1761 ------ 

1762 LookupError 

1763 Raised if the dataset is not even present in the Registry. 

1764 ValueError 

1765 Raised if a resolved `DatasetRef` was passed as an input, but it 

1766 differs from the one found in the registry. 

1767 NoDefaultCollectionError 

1768 Raised if no collections were provided. 

1769 """ 

1770 # A resolved ref may be given that is not known to this butler. 

1771 if isinstance(datasetRefOrType, DatasetRef): 

1772 ref = self.registry.getDataset(datasetRefOrType.id) 

1773 if ref is None: 

1774 raise LookupError( 

1775 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." 

1776 ) 

1777 else: 

1778 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1779 return self._datastore.exists(ref) 

1780 

1781 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1782 """Remove one or more `~CollectionType.RUN` collections and the 

1783 datasets within them. 

1784 

1785 Parameters 

1786 ---------- 

1787 names : `~collections.abc.Iterable` [ `str` ] 

1788 The names of the collections to remove. 

1789 unstore : `bool`, optional 

1790 If `True` (default), delete datasets from all datastores in which 

1791 they are present, and attempt to roll back the registry deletions if 

1792 datastore deletions fail (which may not always be possible). If 

1793 `False`, datastore records for these datasets are still removed, 

1794 but any artifacts (e.g. files) will not be. 

1795 

1796 Raises 

1797 ------ 

1798 TypeError 

1799 Raised if one or more collections are not of type 

1800 `~CollectionType.RUN`. 
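
Examples
--------
A minimal usage sketch (illustrative only; the run name is
hypothetical). With ``unstore=True`` the stored artifacts are
deleted along with the RUN collection itself::

    butler = Butler("/path/to/repo", writeable=True)
    butler.removeRuns(["u/someone/scratch-run"], unstore=True)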

1801 """ 

1802 if not self.isWriteable(): 

1803 raise TypeError("Butler is read-only.") 

1804 names = list(names) 

1805 refs: list[DatasetRef] = [] 

1806 for name in names: 

1807 collectionType = self.registry.getCollectionType(name) 

1808 if collectionType is not CollectionType.RUN: 

1809 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1810 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1811 with self._datastore.transaction(): 

1812 with self.registry.transaction(): 

1813 if unstore: 

1814 self._datastore.trash(refs) 

1815 else: 

1816 self._datastore.forget(refs) 

1817 for name in names: 

1818 self.registry.removeCollection(name) 

1819 if unstore: 

1820 # Point of no return for removing artifacts 

1821 self._datastore.emptyTrash() 

1822 

1823 def pruneDatasets( 

1824 self, 

1825 refs: Iterable[DatasetRef], 

1826 *, 

1827 disassociate: bool = True, 

1828 unstore: bool = False, 

1829 tags: Iterable[str] = (), 

1830 purge: bool = False, 

1831 ) -> None: 

1832 # docstring inherited from LimitedButler 

1833 

1834 if not self.isWriteable(): 

1835 raise TypeError("Butler is read-only.") 

1836 if purge: 

1837 if not disassociate: 

1838 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1839 if not unstore: 

1840 raise TypeError("Cannot pass purge=True without unstore=True.") 

1841 elif disassociate: 

1842 tags = tuple(tags) 

1843 if not tags: 

1844 raise TypeError("No tags provided but disassociate=True.") 

1845 for tag in tags: 

1846 collectionType = self.registry.getCollectionType(tag) 

1847 if collectionType is not CollectionType.TAGGED: 

1848 raise TypeError( 

1849 f"Cannot disassociate from collection '{tag}' " 

1850 f"of non-TAGGED type {collectionType.name}." 

1851 ) 

1852 # Transform possibly-single-pass iterable into something we can iterate 

1853 # over multiple times. 

1854 refs = list(refs) 

1855 # Pruning a component of a DatasetRef makes no sense since registry 

1856 # doesn't know about components and datastore might not store 

1857 # components in a separate file 

1858 for ref in refs: 

1859 if ref.datasetType.component(): 

1860 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1861 # We don't need an unreliable Datastore transaction for this, because 

1862 # we've been extra careful to ensure that Datastore.trash only involves 

1863 # mutating the Registry (it can _look_ at Datastore-specific things, 

1864 # but shouldn't change them), and hence all operations here are 

1865 # Registry operations. 

1866 with self._datastore.transaction(): 

1867 with self.registry.transaction(): 

1868 if unstore: 

1869 self._datastore.trash(refs) 

1870 if purge: 

1871 self.registry.removeDatasets(refs) 

1872 elif disassociate: 

1873 assert tags, "Guaranteed by earlier logic in this function." 

1874 for tag in tags: 

1875 self.registry.disassociate(tag, refs) 

1876 # We've exited the Registry transaction, and apparently committed. 

1877 # (if there was an exception, everything rolled back, and it's as if 

1878 # nothing happened - and we never get here). 

1879 # Datastore artifacts are not yet gone, but they're clearly marked 

1880 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1881 # problems we can try again later, and if manual administrative 

1882 # intervention is required, it's pretty clear what that should entail: 

1883 # deleting everything on disk and in private Datastore tables that is 

1884 # in the dataset_location_trash table. 

1885 if unstore: 

1886 # Point of no return for removing artifacts 

1887 self._datastore.emptyTrash() 

1888 

1889 @transactional 

1890 def ingest( 

1891 self, 

1892 *datasets: FileDataset, 

1893 transfer: str | None = "auto", 

1894 run: str | None = None, 

1895 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1896 record_validation_info: bool = True, 

1897 ) -> None: 

1898 """Store and register one or more datasets that already exist on disk. 

1899 

1900 Parameters 

1901 ---------- 

1902 datasets : `FileDataset` 

1903 Each positional argument is a struct containing information about 

1904 a file to be ingested, including its URI (either absolute or 

1905 relative to the datastore root, if applicable), a resolved 

1906 `DatasetRef`, and optionally a formatter class or its 

1907 fully-qualified string name. If a formatter is not provided, the 

1908 formatter that would be used for `put` is assumed. On successful 

1909 ingest all `FileDataset.formatter` attributes will be set to the 

1910 formatter class used. `FileDataset.path` attributes may be modified 

1911 to put paths in whatever the datastore considers a standardized 

1912 form. 

1913 transfer : `str`, optional 

1914 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1915 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1916 transfer the file. 

1917 run : `str`, optional 

1918 The name of the run ingested datasets should be added to, 

1919 overriding ``self.run``. This parameter is now deprecated since 

1920 the run is encoded in the ``FileDataset``. 

1921 idGenerationMode : `DatasetIdGenEnum`, optional 

1922 Specifies option for generating dataset IDs. By default unique IDs 

1923 are generated for each inserted dataset. 

1924 record_validation_info : `bool`, optional 

1925 If `True`, the default, the datastore can record validation 

1926 information associated with the file. If `False` the datastore 

1927 will not attempt to track any information such as checksums 

1928 or file sizes. This can be useful if such information is tracked 

1929 in an external system or if the file is to be compressed in place. 

1930 It is up to the datastore whether this parameter is relevant. 

1931 

1932 Raises 

1933 ------ 

1934 TypeError 

1935 Raised if the butler is read-only or if no run was provided. 

1936 NotImplementedError 

1937 Raised if the `Datastore` does not support the given transfer mode. 

1938 DatasetTypeNotSupportedError 

1939 Raised if one or more files to be ingested have a dataset type that 

1940 is not supported by the `Datastore`. 

1941 FileNotFoundError 

1942 Raised if one of the given files does not exist. 

1943 FileExistsError 

1944 Raised if transfer is not `None` but the (internal) location the 

1945 file would be moved to is already occupied. 

1946 

1947 Notes 

1948 ----- 

1949 This operation is not fully exception safe: if a database operation 

1950 fails, the given `FileDataset` instances may be only partially updated. 

1951 

1952 It is atomic in terms of database operations (they will either all 

1953 succeed or all fail) provided the database engine implements 

1954 transactions correctly. It will attempt to be atomic in terms of 

1955 filesystem operations as well, but this cannot be implemented 

1956 rigorously for most datastores. 
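
Examples
--------
A minimal usage sketch (illustrative only). It assumes ``ref`` is a
resolved `DatasetRef` created elsewhere whose run and data ID
describe the file being ingested; the file path is hypothetical::

    butler = Butler("/path/to/repo", writeable=True)
    dataset = FileDataset(path="/data/raw/file.fits", refs=[ref])
    butler.ingest(dataset, transfer="copy")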

1957 """ 

1958 if not self.isWriteable(): 

1959 raise TypeError("Butler is read-only.") 

1960 

1961 log.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1962 if not datasets: 

1963 return 

1964 

1965 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1966 

1967 # We need to reorganize all the inputs so that they are grouped 

1968 # by dataset type and run. Multiple refs in a single FileDataset 

1969 # are required to share the run and dataset type. 

1970 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]] 

1971 groupedData: GroupedData = defaultdict(list) 

1972 

1973 # Track DataIDs that are being ingested so we can spot issues early 

1974 # with duplication. Retain previous FileDataset so we can report it. 

1975 groupedDataIds: MutableMapping[ 

1976 tuple[DatasetType, str], dict[DataCoordinate, FileDataset] 

1977 ] = defaultdict(dict) 

1978 

1979 used_run = False 

1980 

1981 # And the nested loop that populates it: 

1982 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1983 # Somewhere to store pre-existing refs if we have an 

1984 # execution butler. 

1985 existingRefs: list[DatasetRef] = [] 

1986 

1987 for ref in dataset.refs: 

1988 assert ref.run is not None # For mypy 

1989 group_key = (ref.datasetType, ref.run) 

1990 

1991 if ref.dataId in groupedDataIds[group_key]: 

1992 raise ConflictingDefinitionError( 

1993 f"Ingest conflict. Dataset {dataset.path} has same" 

1994 " DataId as other ingest dataset" 

1995 f" {groupedDataIds[group_key][ref.dataId].path} " 

1996 f" ({ref.dataId})" 

1997 ) 

1998 

1999 groupedDataIds[group_key][ref.dataId] = dataset 

2000 

2001 if existingRefs: 

2002 if len(dataset.refs) != len(existingRefs): 

2003 # Keeping track of partially pre-existing datasets is hard 

2004 # and should generally never happen. For now don't allow 

2005 # it. 

2006 raise ConflictingDefinitionError( 

2007 f"For dataset {dataset.path} some dataIds already exist" 

2008 " in registry but others do not. This is not supported." 

2009 ) 

2010 

2011 # Store expanded form in the original FileDataset. 

2012 dataset.refs = existingRefs 

2013 else: 

2014 groupedData[group_key].append(dataset) 

2015 

2016 if not used_run and run is not None: 

2017 warnings.warn( 

2018 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the " 

2019 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.", 

2020 category=FutureWarning, 

2021 stacklevel=3, # Take into account the @transactional decorator. 

2022 ) 

2023 

2024 # Now we can bulk-insert into Registry for each DatasetType. 

2025 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

2026 groupedData.items(), desc="Bulk-inserting datasets by type" 

2027 ): 

2028 refs_to_import = [] 

2029 for dataset in grouped_datasets: 

2030 refs_to_import.extend(dataset.refs) 

2031 

2032 n_refs = len(refs_to_import) 

2033 log.verbose( 

2034 "Importing %d ref%s of dataset type %r into run %r", 

2035 n_refs, 

2036 "" if n_refs == 1 else "s", 

2037 datasetType.name, 

2038 this_run, 

2039 ) 

2040 

2041 # Import the refs and expand the DataCoordinates since we can't 

2042 # guarantee that they are expanded and Datastore will need 

2043 # the records. 

2044 imported_refs = self.registry._importDatasets(refs_to_import, expand=True) 

2045 assert set(imported_refs) == set(refs_to_import) 

2046 

2047 # Replace all the refs in the FileDataset with expanded versions. 

2048 # Pull them off in the order we put them on the list. 

2049 for dataset in grouped_datasets: 

2050 n_dataset_refs = len(dataset.refs) 

2051 dataset.refs = imported_refs[:n_dataset_refs] 

2052 del imported_refs[:n_dataset_refs] 

2053 

2054 # Bulk-insert everything into Datastore. 

2055 # We do not know if any of the registry entries already existed 

2056 # (_importDatasets only complains if they exist but differ) so 

2057 # we have to catch IntegrityError explicitly. 

2058 try: 

2059 self._datastore.ingest( 

2060 *datasets, transfer=transfer, record_validation_info=record_validation_info 

2061 ) 

2062 except IntegrityError as e: 

2063 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") 

2064 

2065 @contextlib.contextmanager 

2066 def export( 

2067 self, 

2068 *, 

2069 directory: str | None = None, 

2070 filename: str | None = None, 

2071 format: str | None = None, 

2072 transfer: str | None = None, 

2073 ) -> Iterator[RepoExportContext]: 

2074 """Export datasets from the repository represented by this `Butler`. 

2075 

2076 This method is a context manager that returns a helper object 

2077 (`RepoExportContext`) that is used to indicate what information from 

2078 the repository should be exported. 

2079 

2080 Parameters 

2081 ---------- 

2082 directory : `str`, optional 

2083 Directory dataset files should be written to if ``transfer`` is not 

2084 `None`. 

2085 filename : `str`, optional 

2086 Name for the file that will include database information associated 

2087 with the exported datasets. If this is not an absolute path and 

2088 ``directory`` is not `None`, it will be written to ``directory`` 

2089 instead of the current working directory. Defaults to 

2090 "export.{format}". 

2091 format : `str`, optional 

2092 File format for the database information file. If `None`, the 

2093 extension of ``filename`` will be used. 

2094 transfer : `str`, optional 

2095 Transfer mode passed to `Datastore.export`. 

2096 

2097 Raises 

2098 ------ 

2099 TypeError 

2100 Raised if the set of arguments passed is inconsistent. 

2101 

2102 Examples 

2103 -------- 

2104 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

2105 methods are used to provide the iterables over data IDs and/or datasets 

2106 to be exported:: 

2107 

2108 with butler.export(filename="exports.yaml") as export: 

2109 # Export all flats, but none of the dimension element rows 

2110 # (i.e. data ID information) associated with them. 

2111 export.saveDatasets(butler.registry.queryDatasets("flat"), 

2112 elements=()) 

2113 # Export all datasets that start with "deepCoadd_" and all of 

2114 # their associated data ID information. 

2115 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2116 """ 

2117 if directory is None and transfer is not None: 

2118 raise TypeError("Cannot transfer without providing a directory.") 

2119 if transfer == "move": 

2120 raise TypeError("Transfer may not be 'move': export is read-only") 

2121 if format is None: 

2122 if filename is None: 

2123 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2124 else: 

2125 _, format = os.path.splitext(filename) 

2126 if not format: 

2127 raise ValueError("Please specify a file extension to determine export format.") 

2128 format = format[1:] # Strip leading "." 

2129 elif filename is None: 

2130 filename = f"export.{format}" 

2131 if directory is not None: 

2132 filename = os.path.join(directory, filename) 

2133 formats = self._config["repo_transfer_formats"] 

2134 if format not in formats: 

2135 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

2136 BackendClass = get_class_of(formats[format, "export"]) 

2137 with open(filename, "w") as stream: 

2138 backend = BackendClass(stream, universe=self.dimensions) 

2139 try: 

2140 helper = RepoExportContext( 

2141 self.registry, self._datastore, backend=backend, directory=directory, transfer=transfer 

2142 ) 

2143 yield helper 

2144 except BaseException: 

2145 raise 

2146 else: 

2147 helper._finish() 

2148 

2149 def import_( 

2150 self, 

2151 *, 

2152 directory: ResourcePathExpression | None = None, 

2153 filename: ResourcePathExpression | TextIO | None = None, 

2154 format: str | None = None, 

2155 transfer: str | None = None, 

2156 skip_dimensions: set | None = None, 

2157 ) -> None: 

2158 """Import datasets into this repository that were exported from a 

2159 different butler repository via `~lsst.daf.butler.Butler.export`. 

2160 

2161 Parameters 

2162 ---------- 

2163 directory : `~lsst.resources.ResourcePathExpression`, optional 

2164 Directory containing dataset files to import from. If `None`, 

2165 ``filename`` and all dataset file paths specified therein must 

2166 be absolute. 

2167 filename : `~lsst.resources.ResourcePathExpression` or `TextIO` 

2168 A stream or name of file that contains database information 

2169 associated with the exported datasets, typically generated by 

2170 `~lsst.daf.butler.Butler.export`. If this is a string (name) or 

2171 `~lsst.resources.ResourcePath` and is not an absolute path, 

2172 it will first be looked for relative to ``directory`` and if not 

2173 found there it will be looked for in the current working 

2174 directory. Defaults to "export.{format}". 

2175 format : `str`, optional 

2176 File format for ``filename``. If `None`, the extension of 

2177 ``filename`` will be used. 

2178 transfer : `str`, optional 

2179 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2180 skip_dimensions : `set`, optional 

2181 Names of dimensions that should be skipped and not imported. 

2182 

2183 Raises 

2184 ------ 

2185 TypeError 

2186 Raised if the set of arguments passed is inconsistent, or if the 

2187 butler is read-only. 
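
Examples
--------
A minimal usage sketch (illustrative only; the paths are
hypothetical) loading an export file previously written by
`~lsst.daf.butler.Butler.export`::

    butler = Butler("/path/to/repo", writeable=True)
    butler.import_(directory="/path/to/exported/files",
                   filename="export.yaml", transfer="copy")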

2188 """ 

2189 if not self.isWriteable(): 

2190 raise TypeError("Butler is read-only.") 

2191 if format is None: 

2192 if filename is None: 

2193 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2194 else: 

2195 _, format = os.path.splitext(filename) # type: ignore 

2196 elif filename is None: 

2197 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

2198 if directory is not None: 

2199 directory = ResourcePath(directory, forceDirectory=True) 

2200 # mypy doesn't think this will work but it does in python >= 3.10. 

2201 if isinstance(filename, ResourcePathExpression): # type: ignore 

2202 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

2203 if not filename.isabs() and directory is not None: 

2204 potential = directory.join(filename) 

2205 exists_in_cwd = filename.exists() 

2206 exists_in_dir = potential.exists() 

2207 if exists_in_cwd and exists_in_dir: 

2208 log.warning( 

2209 "A relative path for filename was specified (%s) which exists relative to cwd. " 

2210 "Additionally, the file exists relative to the given search directory (%s). " 

2211 "Using the export file in the given directory.", 

2212 filename, 

2213 potential, 

2214 ) 

2215 # Given they specified an explicit directory and that 

2216 # directory has the export file in it, assume that that 

2217 # is what was meant despite the file in cwd. 

2218 filename = potential 

2219 elif exists_in_dir: 

2220 filename = potential 

2221 elif not exists_in_cwd and not exists_in_dir: 

2222 # Raise early. 

2223 raise FileNotFoundError( 

2224 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

2225 ) 

2226 BackendClass: type[RepoImportBackend] = get_class_of( 

2227 self._config["repo_transfer_formats"][format]["import"] 

2228 ) 

2229 

2230 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

2231 backend = BackendClass(importStream, self.registry) # type: ignore[call-arg] 

2232 backend.register() 

2233 with self.transaction(): 

2234 backend.load( 

2235 self._datastore, 

2236 directory=directory, 

2237 transfer=transfer, 

2238 skip_dimensions=skip_dimensions, 

2239 ) 

2240 

2241 if isinstance(filename, ResourcePath): 

2242 # We can not use open() here at the moment because of 

2243 # DM-38589 since yaml does stream.read(8192) in a loop. 

2244 stream = io.StringIO(filename.read().decode()) 

2245 doImport(stream) 

2246 else: 

2247 doImport(filename) # type: ignore 

2248 

2249 def transfer_from( 

2250 self, 

2251 source_butler: LimitedButler, 

2252 source_refs: Iterable[DatasetRef], 

2253 transfer: str = "auto", 

2254 skip_missing: bool = True, 

2255 register_dataset_types: bool = False, 

2256 transfer_dimensions: bool = False, 

2257 ) -> collections.abc.Collection[DatasetRef]: 

2258 """Transfer datasets to this Butler from a run in another Butler. 

2259 

2260 Parameters 

2261 ---------- 

2262 source_butler : `LimitedButler` 

2263 Butler from which the datasets are to be transferred. If data IDs 

2264 in ``source_refs`` are not expanded then this has to be a full 

2265 `Butler` whose registry will be used to expand data IDs. 

2266 source_refs : iterable of `DatasetRef` 

2267 Datasets defined in the source butler that should be transferred to 

2268 this butler. 

2269 transfer : `str`, optional 

2270 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2271 skip_missing : `bool` 

2272 If `True`, datasets with no datastore artifact associated with 

2273 them are not transferred. If `False` a registry entry will be 

2274 created even if no datastore record is created (and so will 

2275 look equivalent to the dataset being unstored). 

2276 register_dataset_types : `bool` 

2277 If `True` any missing dataset types are registered. Otherwise 

2278 an exception is raised. 

2279 transfer_dimensions : `bool`, optional 

2280 If `True`, dimension record data associated with the new datasets 

2281 will be transferred. 

2282 

2283 Returns 

2284 ------- 

2285 refs : `list` of `DatasetRef` 

2286 The refs added to this Butler. 

2287 

2288 Notes 

2289 ----- 

2290 The datastore artifact has to exist for a transfer 

2291 to be made but non-existence is not an error. 

2292 

2293 Datasets that already exist in this run will be skipped. 

2294 

2295 The datasets are imported as part of a transaction, although 

2296 dataset types are registered before the transaction is started. 

2297 This means that it is possible for a dataset type to be registered 

2298 even though transfer has failed. 
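
Examples
--------
A minimal usage sketch (illustrative only; the repository paths,
collection, and dataset type names are hypothetical)::

    source = Butler("/path/to/source/repo")
    target = Butler("/path/to/target/repo", writeable=True)
    refs = source.registry.queryDatasets(
        "calexp", collections="HSC/runs/example"
    )
    target.transfer_from(source, refs, transfer="copy",
                         register_dataset_types=True)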

2299 """ 

2300 if not self.isWriteable(): 

2301 raise TypeError("Butler is read-only.") 

2302 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2303 

2304 # Will iterate through the refs multiple times so need to convert 

2305 # to a list if this isn't a collection. 

2306 if not isinstance(source_refs, collections.abc.Collection): 

2307 source_refs = list(source_refs) 

2308 

2309 original_count = len(source_refs) 

2310 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2311 

2312 # In some situations the datastore artifact may be missing 

2313 # and we do not want that registry entry to be imported. 

2314 # Asking datastore is not sufficient, the records may have been 

2315 # purged, we have to ask for the (predicted) URI and check 

2316 # existence explicitly. Execution butler is set up exactly like 

2317 # this with no datastore records. 

2318 artifact_existence: dict[ResourcePath, bool] = {} 

2319 if skip_missing: 

2320 dataset_existence = source_butler._datastore.mexists( 

2321 source_refs, artifact_existence=artifact_existence 

2322 ) 

2323 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2324 filtered_count = len(source_refs) 

2325 n_missing = original_count - filtered_count 

2326 log.verbose( 

2327 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

2328 n_missing, 

2329 "" if n_missing == 1 else "s", 

2330 filtered_count, 

2331 ) 

2332 

2333 # Importing requires that we group the refs by dataset type and run 

2334 # before doing the import. 

2335 source_dataset_types = set() 

2336 grouped_refs = defaultdict(list) 

2337 for ref in source_refs: 

2338 grouped_refs[ref.datasetType, ref.run].append(ref) 

2339 source_dataset_types.add(ref.datasetType) 

2340 

2341 # Check to see if the dataset type in the source butler has 

2342 # the same definition in the target butler and register missing 

2343 # ones if requested. Registration must happen outside a transaction. 

2344 newly_registered_dataset_types = set() 

2345 for datasetType in source_dataset_types: 

2346 if register_dataset_types: 

2347 # Let this raise immediately if inconsistent. Continuing 

2348 # on to find additional inconsistent dataset types 

2349 # might result in additional unwanted dataset types being 

2350 # registered. 

2351 if self.registry.registerDatasetType(datasetType): 

2352 newly_registered_dataset_types.add(datasetType) 

2353 else: 

2354 # If the dataset type is missing, let it fail immediately. 

2355 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2356 if target_dataset_type != datasetType: 

2357 raise ConflictingDefinitionError( 

2358 "Source butler dataset type differs from definition" 

2359 f" in target butler: {datasetType} !=" 

2360 f" {target_dataset_type}" 

2361 ) 

2362 if newly_registered_dataset_types: 

2363 # We may have registered some even if there were inconsistencies 

2364 # but should let people know (or else remove them again). 

2365 log.log( 

2366 VERBOSE, 

2367 "Registered the following dataset types in the target Butler: %s", 

2368 ", ".join(d.name for d in newly_registered_dataset_types), 

2369 ) 

2370 else: 

2371 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2372 

2373 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2374 if transfer_dimensions: 

2375 # Collect all the dimension records for these refs. 

2376 # All dimensions are to be copied but the list of valid dimensions 

2377 # come from this butler's universe. 

2378 elements = frozenset( 

2379 element 

2380 for element in self.dimensions.getStaticElements() 

2381 if element.hasTable() and element.viewOf is None 

2382 ) 

2383 dataIds = {ref.dataId for ref in source_refs} 

2384 # This logic comes from saveDataIds. 

2385 for dataId in dataIds: 

2386 # Need an expanded record, if not expanded that we need a full 

2387 # butler with registry (allow mocks with registry too). 

2388 if not dataId.hasRecords(): 

2389 if registry := getattr(source_butler, "registry", None): 

2390 dataId = registry.expandDataId(dataId) 

2391 else: 

2392 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

2393 # If this butler doesn't know about a dimension in the source 

2394 # butler things will break later. 

2395 for record in dataId.records.values(): 

2396 if record is not None and record.definition in elements: 

2397 dimension_records[record.definition].setdefault(record.dataId, record) 

2398 

2399 handled_collections: set[str] = set() 

2400 

2401 # Do all the importing in a single transaction. 

2402 with self.transaction(): 

2403 if dimension_records: 

2404 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2405 for element, r in dimension_records.items(): 

2406 records = [r[dataId] for dataId in r] 

2407 # Assume that if the record is already present that we can 

2408 # use it without having to check that the record metadata 

2409 # is consistent. 

2410 self.registry.insertDimensionData(element, *records, skip_existing=True) 

2411 

2412 n_imported = 0 

2413 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2414 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2415 ): 

2416 if run not in handled_collections: 

2417 # May need to create output collection. If source butler 

2418 # has a registry, ask for documentation string. 

2419 run_doc = None 

2420 if registry := getattr(source_butler, "registry", None): 

2421 run_doc = registry.getCollectionDocumentation(run) 

2422 registered = self.registry.registerRun(run, doc=run_doc) 

2423 handled_collections.add(run) 

2424 if registered: 

2425 log.log(VERBOSE, "Creating output run %s", run) 

2426 

2427 n_refs = len(refs_to_import) 

2428 log.verbose( 

2429 "Importing %d ref%s of dataset type %s into run %s", 

2430 n_refs, 

2431 "" if n_refs == 1 else "s", 

2432 datasetType.name, 

2433 run, 

2434 ) 

2435 

2436 # Assume we are using UUIDs and the source refs will match 

2437 # those imported. 

2438 imported_refs = self.registry._importDatasets(refs_to_import, expand=False) 

2439 assert set(imported_refs) == set(refs_to_import) 

2440 n_imported += len(imported_refs) 

2441 

2442 assert len(source_refs) == n_imported 

2443 log.verbose("Imported %d datasets into destination butler", n_imported) 

2444 

2445 # Ask the datastore to transfer. The datastore has to check that 

2446 # the source datastore is compatible with the target datastore. 

2447 accepted, rejected = self._datastore.transfer_from( 

2448 source_butler._datastore, 

2449 source_refs, 

2450 transfer=transfer, 

2451 artifact_existence=artifact_existence, 

2452 ) 

2453 if rejected: 

2454 # For now, accept the registry entries but not the files. 

2455 log.warning( 

2456 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2457 len(rejected), 

2458 len(accepted), 

2459 datasetType, 

2460 run, 

2461 ) 

2462 

2463 return source_refs 

2464 

2465 def validateConfiguration( 

2466 self, 

2467 logFailures: bool = False, 

2468 datasetTypeNames: Iterable[str] | None = None, 

2469 ignore: Iterable[str] | None = None, 

2470 ) -> None: 

2471 """Validate butler configuration. 

2472 

2473 Checks that each `DatasetType` can be stored in the `Datastore`. 

2474 

2475 Parameters 

2476 ---------- 

2477 logFailures : `bool`, optional 

2478 If `True`, output a log message for every validation error 

2479 detected. 

2480 datasetTypeNames : iterable of `str`, optional 

2481 The `DatasetType` names that should be checked. This allows 

2482 only a subset to be selected. 

2483 ignore : iterable of `str`, optional 

2484 Names of DatasetTypes to skip over. This can be used to skip 

2485 known problems. If a named `DatasetType` corresponds to a 

2486 composite, all components of that `DatasetType` will also be 

2487 ignored. 

2488 

2489 Raises 

2490 ------ 

2491 ButlerValidationError 

2492 Raised if there is some inconsistency with how this Butler 

2493 is configured. 
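
Examples
--------
A minimal usage sketch (illustrative only; the dataset type name is
hypothetical)::

    butler = Butler("/path/to/repo")
    # Check the full configuration, logging every problem found.
    butler.validateConfiguration(logFailures=True)
    # Or restrict the check to specific dataset types.
    butler.validateConfiguration(datasetTypeNames=["calexp"])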

2494 """ 

2495 if datasetTypeNames: 

2496 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2497 else: 

2498 datasetTypes = list(self.registry.queryDatasetTypes()) 

2499 

2500 # filter out anything from the ignore list 

2501 if ignore: 

2502 ignore = set(ignore) 

2503 datasetTypes = [ 

2504 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2505 ] 

2506 else: 

2507 ignore = set() 

2508 

2509 # For each datasetType that has an instrument dimension, create 

2510 # a DatasetRef for each defined instrument 

2511 datasetRefs = [] 

2512 

2513 # Find all the registered instruments (if "instrument" is in the 

2514 # universe). 

2515 if "instrument" in self.dimensions: 

2516 instruments = {record.name for record in self.registry.queryDimensionRecords("instrument")} 

2517 

2518 for datasetType in datasetTypes: 

2519 if "instrument" in datasetType.dimensions: 

2520 # In order to create a conforming dataset ref, create 

2521 # fake DataCoordinate values for the non-instrument 

2522 # dimensions. The type of the value does not matter here. 

2523 dataId = {dim.name: 1 for dim in datasetType.dimensions if dim.name != "instrument"} 

2524 

2525 for instrument in instruments: 

2526 datasetRef = DatasetRef( 

2527 datasetType, 

2528 DataCoordinate.standardize( 

2529 dataId, instrument=instrument, graph=datasetType.dimensions 

2530 ), 

2531 run="validate", 

2532 ) 

2533 datasetRefs.append(datasetRef) 

2534 

2535 entities: list[DatasetType | DatasetRef] = [] 

2536 entities.extend(datasetTypes) 

2537 entities.extend(datasetRefs) 

2538 

2539 datastoreErrorStr = None 

2540 try: 

2541 self._datastore.validateConfiguration(entities, logFailures=logFailures) 

2542 except ValidationError as e: 

2543 datastoreErrorStr = str(e) 

2544 

2545 # Also check that the LookupKeys used by the datastores match 

2546 # registry and storage class definitions 

2547 keys = self._datastore.getLookupKeys() 

2548 

2549 failedNames = set() 

2550 failedDataId = set() 

2551 for key in keys: 

2552 if key.name is not None: 

2553 if key.name in ignore: 

2554 continue 

2555 

2556 # skip if specific datasetType names were requested and this 

2557 # name does not match 

2558 if datasetTypeNames and key.name not in datasetTypeNames: 

2559 continue 

2560 

2561 # See if it is a StorageClass or a DatasetType 

2562 if key.name in self.storageClasses: 

2563 pass 

2564 else: 

2565 try: 

2566 self.registry.getDatasetType(key.name) 

2567 except KeyError: 

2568 if logFailures: 

2569 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2570 failedNames.add(key) 

2571 else: 

2572 # Dimensions are checked for consistency when the Butler 

2573 # is created and rendezvoused with a universe. 

2574 pass 

2575 

2576 # Check that the instrument is a valid instrument 

2577 # Currently only support instrument so check for that 

2578 if key.dataId: 

2579 dataIdKeys = set(key.dataId) 

2580 if {"instrument"} != dataIdKeys: 

2581 if logFailures: 

2582 log.critical("Key '%s' has unsupported DataId override", key) 

2583 failedDataId.add(key) 

2584 elif key.dataId["instrument"] not in instruments: 

2585 if logFailures: 

2586 log.critical("Key '%s' has unknown instrument", key) 

2587 failedDataId.add(key) 

2588 

2589 messages = [] 

2590 

2591 if datastoreErrorStr: 

2592 messages.append(datastoreErrorStr) 

2593 

2594 for failed, msg in ( 

2595 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2596 (failedDataId, "Keys with bad DataId entries: "), 

2597 ): 

2598 if failed: 

2599 msg += ", ".join(str(k) for k in failed) 

2600 messages.append(msg) 

2601 

2602 if messages: 

2603 raise ValidationError(";\n".join(messages)) 

2604 

2605 @property 

2606 def collections(self) -> Sequence[str]: 

2607 """The collections to search by default, in order 

2608 (`~collections.abc.Sequence` [ `str` ]). 

2609 

2610 This is an alias for ``self.registry.defaults.collections``. It cannot 

2611 be set directly in isolation, but all defaults may be changed together 

2612 by assigning a new `RegistryDefaults` instance to 

2613 ``self.registry.defaults``. 

2614 """ 

2615 return self.registry.defaults.collections 

2616 

2617 @property 

2618 def run(self) -> str | None: 

2619 """Name of the run this butler writes outputs to by default (`str` or 

2620 `None`). 

2621 

2622 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2623 directly in isolation, but all defaults may be changed together by 

2624 assigning a new `RegistryDefaults` instance to 

2625 ``self.registry.defaults``. 

2626 """ 

2627 return self.registry.defaults.run 

2628 

2629 @property 

2630 def dimensions(self) -> DimensionUniverse: 

2631 # Docstring inherited. 

2632 return self.registry.dimensions 

2633 

2634 registry: Registry 

2635 """The object that manages dataset metadata and relationships (`Registry`). 

2636 

2637 Most operations that don't involve reading or writing butler datasets are 

2638 accessible only via `Registry` methods. 

2639 """ 

2640 

2641 datastore: Datastore 

2642 """The object that manages actual dataset storage (`Datastore`). 

2643 

2644 Direct user access to the datastore should rarely be necessary; the primary 

2645 exception is the case where a `Datastore` implementation provides extra 

2646 functionality beyond what the base class defines. 

2647 """ 

2648 

2649 storageClasses: StorageClassFactory 

2650 """An object that maps known storage class names to objects that fully 

2651 describe them (`StorageClassFactory`). 

2652 """