Coverage for python/lsst/daf/butler/_butler.py: 8%

685 statements  

coverage.py v7.2.3, created at 2023-04-19 03:42 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30) 

31 

32import collections.abc 

33import contextlib 

34import io 

35import logging 

36import numbers 

37import os 

38import uuid 

39from collections import defaultdict 

40from typing import ( 

41 TYPE_CHECKING, 

42 Any, 

43 ClassVar, 

44 Counter, 

45 Dict, 

46 Iterable, 

47 Iterator, 

48 List, 

49 MutableMapping, 

50 Optional, 

51 Sequence, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59from deprecated.sphinx import deprecated 

60from lsst.resources import ResourcePath, ResourcePathExpression 

61from lsst.utils import doImportType 

62from lsst.utils.introspection import get_class_of 

63from lsst.utils.logging import VERBOSE, getLogger 

64 

65from ._butlerConfig import ButlerConfig 

66from ._butlerRepoIndex import ButlerRepoIndex 

67from ._deferredDatasetHandle import DeferredDatasetHandle 

68from ._limited_butler import LimitedButler 

69from .core import ( 

70 AmbiguousDatasetError, 

71 Config, 

72 ConfigSubset, 

73 DataCoordinate, 

74 DataId, 

75 DataIdValue, 

76 DatasetRef, 

77 DatasetRefURIs, 

78 DatasetType, 

79 Datastore, 

80 Dimension, 

81 DimensionConfig, 

82 DimensionElement, 

83 DimensionRecord, 

84 DimensionUniverse, 

85 FileDataset, 

86 Progress, 

87 StorageClass, 

88 StorageClassFactory, 

89 Timespan, 

90 ValidationError, 

91) 

92from .core.repoRelocation import BUTLER_ROOT_TAG 

93from .core.utils import transactional 

94from .registry import ( 

95 CollectionType, 

96 ConflictingDefinitionError, 

97 DataIdError, 

98 DatasetIdGenEnum, 

99 MissingDatasetTypeError, 

100 Registry, 

101 RegistryConfig, 

102 RegistryDefaults, 

103) 

104from .transfers import RepoExportContext 

105 

106if TYPE_CHECKING: 

107 from lsst.resources import ResourceHandleProtocol 

108 

109log = getLogger(__name__) 

110 

111 

112class ButlerValidationError(ValidationError): 

113 """There is a problem with the Butler configuration.""" 

114 

115 pass 

116 

117 

118class Butler(LimitedButler): 

119 """Main entry point for the data access system. 

120 

121 Parameters 

122 ---------- 

123 config : `ButlerConfig`, `Config` or `str`, optional 

124 Configuration. Anything acceptable to the 

125 `ButlerConfig` constructor. If a directory path 

126 is given the configuration will be read from a ``butler.yaml`` file in 

127 that location. If `None` is given default values will be used. 

128 butler : `Butler`, optional 

129 If provided, construct a new Butler that uses the same registry and 

130 datastore as the given one, but with the given collection and run. 

131 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

132 arguments. 

133 collections : `str` or `Iterable` [ `str` ], optional 

134 An expression specifying the collections to be searched (in order) when 

135 reading datasets. 

136 This may be a `str` collection name or an iterable thereof. 

137 See :ref:`daf_butler_collection_expressions` for more information. 

138 These collections are not registered automatically and must be 

139 registered manually before they are used by any method, but that 

140 registration may happen after the `Butler` is initialized. 

141 run : `str`, optional 

142 Name of the `~CollectionType.RUN` collection new datasets should be 

143 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

144 ``collections`` will be set to ``[run]``. If not `None`, this 

145 collection will automatically be registered. If this is not set (and 

146 ``writeable`` is not set either), a read-only butler will be created. 

147 searchPaths : `list` of `str`, optional 

148 Directory paths to search when calculating the full Butler 

149 configuration. Not used if the supplied config is already a 

150 `ButlerConfig`. 

151 writeable : `bool`, optional 

152 Explicitly sets whether the butler supports write operations. If not 

153 provided, a read-write butler is created if ``run`` is not `None` 

154 and a read-only butler otherwise. 

155 inferDefaults : `bool`, optional 

156 If `True` (default) infer default data ID values from the values 

157 present in the datasets in ``collections``: if all collections have the 

158 same value (or no value) for a governor dimension, that value will be 

159 the default for that dimension. Nonexistent collections are ignored. 

160 If a default value is provided explicitly for a governor dimension via 

161 ``**kwargs``, no default will be inferred for that dimension. 

162 **kwargs : `str` 

163 Default data ID key-value pairs. These may only identify "governor" 

164 dimensions like ``instrument`` and ``skymap``. 

165 

166 Examples 

167 -------- 

168 While there are many ways to control exactly how a `Butler` interacts with 

169 the collections in its `Registry`, the most common cases are still simple. 

170 

171 For a read-only `Butler` that searches one collection, do:: 

172 

173 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

174 

175 For a read-write `Butler` that writes to and reads from a 

176 `~CollectionType.RUN` collection:: 

177 

178 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

179 

180 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

181 because we want to write to one `~CollectionType.RUN` collection but read 

182 from several others (as well):: 

183 

184 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

185 collections=["u/alice/DM-50000/a", 

186 "u/bob/DM-49998", 

187 "HSC/defaults"]) 

188 

189 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

190 Datasets will be read first from that run (since it appears first in the 

191 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

192 

193 Finally, one can always create a `Butler` with no collections:: 

194 

195 butler = Butler("/path/to/repo", writeable=True) 

196 

197 This can be extremely useful when you just want to use ``butler.registry``, 

198 e.g. for inserting dimension data or managing collections, or when the 

199 collections you want to use with the butler are not consistent. 

200 Passing ``writeable`` explicitly here is only necessary if you want to be 

201 able to make changes to the repo - usually the value for ``writeable`` can 

202 be guessed from the collection arguments provided, but it defaults to 

203 `False` when there are no collection arguments. 
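
    A default data ID can also be supplied via keyword arguments when all
    datasets of interest share a governor dimension value (the repository
    path and instrument name below are placeholders)::

        butler = Butler("/path/to/repo", collections=["HSC/defaults"],
                        instrument="HSC")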

204 """ 

205 

206 def __init__( 

207 self, 

208 config: Union[Config, str, None] = None, 

209 *, 

210 butler: Optional[Butler] = None, 

211 collections: Any = None, 

212 run: Optional[str] = None, 

213 searchPaths: Optional[List[str]] = None, 

214 writeable: Optional[bool] = None, 

215 inferDefaults: bool = True, 

216 **kwargs: str, 

217 ): 

218 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

219 # Load registry, datastore, etc. from config or existing butler. 

220 if butler is not None: 

221 if config is not None or searchPaths is not None or writeable is not None: 

222 raise TypeError( 

223 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

224 ) 

225 self.registry = butler.registry.copy(defaults) 

226 self.datastore = butler.datastore 

227 self.storageClasses = butler.storageClasses 

228 self._config: ButlerConfig = butler._config 

229 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

230 else: 

231 # Can only look for strings in the known repos list. 

232 if isinstance(config, str) and config in self.get_known_repos(): 

233 config = str(self.get_repo_uri(config)) 

234 try: 

235 self._config = ButlerConfig(config, searchPaths=searchPaths) 

236 except FileNotFoundError as e: 

237 if known := self.get_known_repos(): 

238 aliases = f"(known aliases: {', '.join(known)})" 

239 else: 

240 aliases = "(no known aliases)" 

241 raise FileNotFoundError(f"{e} {aliases}") from e 


243 try: 

244 if "root" in self._config: 

245 butlerRoot = self._config["root"] 

246 else: 

247 butlerRoot = self._config.configDir 

248 if writeable is None: 

249 writeable = run is not None 

250 self.registry = Registry.fromConfig( 

251 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

252 ) 

253 self.datastore = Datastore.fromConfig( 

254 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

255 ) 

256 self.storageClasses = StorageClassFactory() 

257 self.storageClasses.addFromConfig(self._config) 

258 self._allow_put_of_predefined_dataset = self._config.get( 

259 "allow_put_of_predefined_dataset", False 

260 ) 

261 except Exception: 

262 # Failures here usually mean that configuration is incomplete, 

263 # just issue an error message which includes config file URI. 

264 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

265 raise 

266 

267 # For execution butler the datastore needs a special 

268 # dependency-inversion trick. This is not used by regular butler, 

269 # but we do not have a way to distinguish regular butler from execution 

270 # butler. 

271 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

272 

273 if "run" in self._config or "collection" in self._config: 

274 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

275 

276 GENERATION: ClassVar[int] = 3 

277 """This is a Generation 3 Butler. 

278 

279 This attribute may be removed in the future, once the Generation 2 Butler 

280 interface has been fully retired; it should only be used in transitional 

281 code. 

282 """ 

283 

284 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

285 """Return DatasetType defined in registry given dataset type name.""" 

286 try: 

287 return self.registry.getDatasetType(name) 

288 except MissingDatasetTypeError: 

289 return None 

290 

291 @classmethod 

292 def get_repo_uri(cls, label: str) -> ResourcePath: 

293 """Look up the label in a butler repository index. 

294 

295 Parameters 

296 ---------- 

297 label : `str` 

298 Label of the Butler repository to look up. 

299 

300 Returns 

301 ------- 

302 uri : `lsst.resources.ResourcePath` 

303 URI to the Butler repository associated with the given label. 

304 

305 Raises 

306 ------ 

307 KeyError 

308 Raised if the label is not found in the index, or if an index 

309 can not be found at all. 

310 

311 Notes 

312 ----- 

313 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

314 information is discovered. 
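
        Examples
        --------
        A brief sketch of typical usage; ``"main"`` is a placeholder for
        whatever labels the repository index defines::

            uri = Butler.get_repo_uri("main")
            butler = Butler(str(uri))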

315 """ 

316 return ButlerRepoIndex.get_repo_uri(label) 

317 

318 @classmethod 

319 def get_known_repos(cls) -> Set[str]: 

320 """Retrieve the list of known repository labels. 

321 

322 Returns 

323 ------- 

324 repos : `set` of `str` 

325 All the known labels. Can be empty if no index can be found. 

326 

327 Notes 

328 ----- 

329 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

330 information is discovered. 

331 """ 

332 return ButlerRepoIndex.get_known_repos() 

333 

334 @staticmethod 

335 def makeRepo( 

336 root: ResourcePathExpression, 

337 config: Union[Config, str, None] = None, 

338 dimensionConfig: Union[Config, str, None] = None, 

339 standalone: bool = False, 

340 searchPaths: Optional[List[str]] = None, 

341 forceConfigRoot: bool = True, 

342 outfile: Optional[ResourcePathExpression] = None, 

343 overwrite: bool = False, 

344 ) -> Config: 

345 """Create an empty data repository by adding a butler.yaml config 

346 to a repository root directory. 

347 

348 Parameters 

349 ---------- 

350 root : `lsst.resources.ResourcePathExpression` 

351 Path or URI to the root location of the new repository. Will be 

352 created if it does not exist. 

353 config : `Config` or `str`, optional 

354 Configuration to write to the repository, after setting any 

355 root-dependent Registry or Datastore config options. Can not 

356 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

357 configuration will be used. Root-dependent config options 

358 specified in this config are overwritten if ``forceConfigRoot`` 

359 is `True`. 

360 dimensionConfig : `Config` or `str`, optional 

361 Configuration for dimensions, will be used to initialize registry 

362 database. 

363 standalone : `bool` 

364 If True, write all expanded defaults, not just customized or 

365 repository-specific settings. 

366 This (mostly) decouples the repository from the default 

367 configuration, insulating it from changes to the defaults (which 

368 may be good or bad, depending on the nature of the changes). 

369 Future *additions* to the defaults will still be picked up when 

370 initializing `Butlers` to repos created with ``standalone=True``. 

371 searchPaths : `list` of `str`, optional 

372 Directory paths to search when calculating the full butler 

373 configuration. 

374 forceConfigRoot : `bool`, optional 

375 If `False`, any values present in the supplied ``config`` that 

376 would normally be reset are not overridden and will appear 

377 directly in the output config. This allows non-standard overrides 

378 of the root directory for a datastore or registry to be given. 

379 If this parameter is `True` the values for ``root`` will be 

380 forced into the resulting config if appropriate. 

381 outfile : `lsst.resources.ResourcePathExpression`, optional 

382 If not-`None`, the output configuration will be written to this 

383 location rather than into the repository itself. Can be a URI 

384 string. Can refer to a directory that will be used to write 

385 ``butler.yaml``. 

386 overwrite : `bool`, optional 

387 Create a new configuration file even if one already exists 

388 in the specified output location. Default is to raise 

389 an exception. 

390 

391 Returns 

392 ------- 

393 config : `Config` 

394 The updated `Config` instance written to the repo. 

395 

396 Raises 

397 ------ 

398 ValueError 

399 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

400 regular Config (as these subclasses would make it impossible to 

401 support ``standalone=False``). 

402 FileExistsError 

403 Raised if the output config file already exists. 

404 os.error 

405 Raised if the directory does not exist, exists but is not a 

406 directory, or cannot be created. 

407 

408 Notes 

409 ----- 

410 Note that when ``standalone=False`` (the default), the configuration 

411 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

412 construct the repository should also be used to construct any Butlers 

413 to avoid configuration inconsistencies. 
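
        Examples
        --------
        A minimal sketch; the repository path and run name are placeholders::

            Butler.makeRepo("/path/to/new/repo")
            butler = Butler("/path/to/new/repo", run="u/alice/ingest")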

414 """ 

415 if isinstance(config, (ButlerConfig, ConfigSubset)): 

416 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

417 

418 # Ensure that the root of the repository exists or can be made 

419 root_uri = ResourcePath(root, forceDirectory=True) 

420 root_uri.mkdir() 

421 

422 config = Config(config) 

423 

424 # If we are creating a new repo from scratch with relative roots, 

425 # do not propagate an explicit root from the config file 

426 if "root" in config: 

427 del config["root"] 

428 

429 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

430 imported_class = doImportType(full["datastore", "cls"]) 

431 if not issubclass(imported_class, Datastore): 

432 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

433 datastoreClass: Type[Datastore] = imported_class 

434 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

435 

436 # if key exists in given config, parse it, otherwise parse the defaults 

437 # in the expanded config 

438 if config.get(("registry", "db")): 

439 registryConfig = RegistryConfig(config) 

440 else: 

441 registryConfig = RegistryConfig(full) 

442 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

443 if defaultDatabaseUri is not None: 

444 Config.updateParameters( 

445 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

446 ) 

447 else: 

448 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

449 

450 if standalone: 

451 config.merge(full) 

452 else: 

453 # Always expand the registry.managers section into the per-repo 

454 # config, because after the database schema is created, it's not 

455 # allowed to change anymore. Note that in the standalone=True 

456 # branch, _everything_ in the config is expanded, so there's no 

457 # need to special case this. 

458 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

459 configURI: ResourcePathExpression 

460 if outfile is not None: 

461 # When writing to a separate location we must include 

462 # the root of the butler repo in the config else it won't know 

463 # where to look. 

464 config["root"] = root_uri.geturl() 

465 configURI = outfile 

466 else: 

467 configURI = root_uri 

468 # Strip obscore configuration, if it is present, before writing config 

469 # to a file, obscore config will be stored in registry. 

470 config_to_write = config 

471 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config: 

472 config_to_write = config.copy() 

473 del config_to_write[obscore_config_key] 

474 config_to_write.dumpToUri(configURI, overwrite=overwrite) 

475 

476 # Create Registry and populate tables 

477 registryConfig = RegistryConfig(config.get("registry")) 

478 dimensionConfig = DimensionConfig(dimensionConfig) 

479 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

480 

481 log.verbose("Wrote new Butler configuration file to %s", configURI) 

482 

483 return config 

484 

485 @classmethod 

486 def _unpickle( 

487 cls, 

488 config: ButlerConfig, 

489 collections: Optional[tuple[str, ...]], 

490 run: Optional[str], 

491 defaultDataId: Dict[str, str], 

492 writeable: bool, 

493 ) -> Butler: 

494 """Callable used to unpickle a Butler. 

495 

496 We prefer not to use ``Butler.__init__`` directly so we can force some 

497 of its many arguments to be keyword-only (note that ``__reduce__`` 

498 can only invoke callables with positional arguments). 

499 

500 Parameters 

501 ---------- 

502 config : `ButlerConfig` 

503 Butler configuration, already coerced into a true `ButlerConfig` 

504 instance (and hence after any search paths for overrides have been 

505 utilized). 

506 collections : `tuple` [ `str` ] 

507 Names of the default collections to read from. 

508 run : `str`, optional 

509 Name of the default `~CollectionType.RUN` collection to write to. 

510 defaultDataId : `dict` [ `str`, `str` ] 

511 Default data ID values. 

512 writeable : `bool` 

513 Whether the Butler should support write operations. 

514 

515 Returns 

516 ------- 

517 butler : `Butler` 

518 A new `Butler` instance. 

519 """ 

520 # MyPy doesn't recognize that the kwargs below are totally valid; it 

521 # seems to think ``**defaultDataId`` is a _positional_ argument! 

522 return cls( 

523 config=config, 

524 collections=collections, 

525 run=run, 

526 writeable=writeable, 

527 **defaultDataId, # type: ignore 

528 ) 

529 

530 def __reduce__(self) -> tuple: 

531 """Support pickling.""" 

532 return ( 

533 Butler._unpickle, 

534 ( 

535 self._config, 

536 self.collections, 

537 self.run, 

538 self.registry.defaults.dataId.byName(), 

539 self.registry.isWriteable(), 

540 ), 

541 ) 

542 

543 def __str__(self) -> str: 

544 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

545 self.collections, self.run, self.datastore, self.registry 

546 ) 

547 

548 def isWriteable(self) -> bool: 

549 """Return `True` if this `Butler` supports write operations.""" 

550 return self.registry.isWriteable() 

551 

552 @contextlib.contextmanager 

553 def transaction(self) -> Iterator[None]: 

554 """Context manager supporting `Butler` transactions. 

555 

556 Transactions can be nested. 
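
        For example, to make a `put` and any related registry changes atomic
        (the dataset type and data ID below are illustrative)::

            with butler.transaction():
                butler.put(catalog, "sourceTable", visit=12345, instrument="HSC")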

557 """ 

558 with self.registry.transaction(): 

559 with self.datastore.transaction(): 

560 yield 

561 

562 def _standardizeArgs( 

563 self, 

564 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

565 dataId: Optional[DataId] = None, 

566 for_put: bool = True, 

567 **kwargs: Any, 

568 ) -> Tuple[DatasetType, Optional[DataId]]: 

569 """Standardize the arguments passed to several Butler APIs. 

570 

571 Parameters 

572 ---------- 

573 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

574 When a `DatasetRef` is provided, ``dataId`` should be `None`. 

575 Otherwise the `DatasetType` or name thereof. 

576 dataId : `dict` or `DataCoordinate` 

577 A `dict` of `Dimension` link name, value pairs that label the 

578 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

579 should be provided as the second argument. 

580 for_put : `bool`, optional 

581 If `True` this call is invoked as part of a `Butler.put()`. 

582 Otherwise it is assumed to be part of a `Butler.get()`. This 

583 parameter is only relevant if there is dataset type 

584 inconsistency. 

585 **kwargs 

586 Additional keyword arguments used to augment or construct a 

587 `DataCoordinate`. See `DataCoordinate.standardize` 

588 parameters. 

589 

590 Returns 

591 ------- 

592 datasetType : `DatasetType` 

593 A `DatasetType` instance extracted from ``datasetRefOrType``. 

594 dataId : `dict` or `DataId`, optional 

595 Argument that can be used (along with ``kwargs``) to construct a 

596 `DataId`. 

597 

598 Notes 

599 ----- 

600 Butler APIs that conceptually need a DatasetRef also allow passing a 

601 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

602 keyword arguments that can be used to construct one) separately. This 

603 method accepts those arguments and always returns a true `DatasetType` 

604 and a `DataId` or `dict`. 

605 

606 Standardization of `dict` vs `DataId` is best handled by passing the 

607 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

608 generally similarly flexible. 

609 """ 

610 externalDatasetType: Optional[DatasetType] = None 

611 internalDatasetType: Optional[DatasetType] = None 

612 if isinstance(datasetRefOrType, DatasetRef): 

613 if dataId is not None or kwargs: 

614 raise ValueError("DatasetRef given, cannot use dataId as well") 

615 externalDatasetType = datasetRefOrType.datasetType 

616 dataId = datasetRefOrType.dataId 

617 else: 

618 # Don't check whether DataId is provided, because Registry APIs 

619 # can usually construct a better error message when it wasn't. 

620 if isinstance(datasetRefOrType, DatasetType): 

621 externalDatasetType = datasetRefOrType 

622 else: 

623 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

624 

625 # Check that they are self-consistent 

626 if externalDatasetType is not None: 

627 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

628 if externalDatasetType != internalDatasetType: 

629 # We can allow differences if they are compatible, depending 

630 # on whether this is a get or a put. A get requires that 

631 # the python type associated with the datastore can be 

632 # converted to the user type. A put requires that the user 

633 # supplied python type can be converted to the internal 

634 # type expected by registry. 

635 relevantDatasetType = internalDatasetType 

636 if for_put: 

637 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

638 else: 

639 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

640 relevantDatasetType = externalDatasetType 

641 if not is_compatible: 

642 raise ValueError( 

643 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

644 f"registry definition ({internalDatasetType})" 

645 ) 

646 # Override the internal definition. 

647 internalDatasetType = relevantDatasetType 

648 

649 assert internalDatasetType is not None 

650 return internalDatasetType, dataId 

651 

652 def _rewrite_data_id( 

653 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any 

654 ) -> Tuple[Optional[DataId], Dict[str, Any]]: 

655 """Rewrite a data ID taking into account dimension records. 

656 

657 Take a Data ID and keyword args and rewrite it if necessary to 

658 allow the user to specify dimension records rather than dimension 

659 primary values. 

660 

661 This allows a user to include a dataId dict with keys of 

662 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

663 the integer exposure ID. It also allows a string to be given 

664 for a dimension value rather than the integer ID if that is more 

665 convenient. For example, rather than having to specify the 

666 detector with ``detector.full_name``, a string given for ``detector`` 

667 will be interpreted as the full name and converted to the integer 

668 value. 

669 

670 Keyword arguments can also use strings for dimensions like detector 

671 and exposure, but Python does not allow them to include ``.``, and 

672 so the ``exposure.day_obs`` syntax cannot be used in a keyword 

673 argument. 
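
        For example (dimension names and values here are illustrative), a
        data ID such as::

            {"instrument": "HSC", "exposure.day_obs": 20230401,
             "exposure.seq_num": 42}

        would be rewritten to use the single integer ``exposure`` value that
        matches those records.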

674 

675 Parameters 

676 ---------- 

677 dataId : `dict` or `DataCoordinate` 

678 A `dict` of `Dimension` link name, value pairs that will label the 

679 `DatasetRef` within a Collection. 

680 datasetType : `DatasetType` 

681 The dataset type associated with this dataId. Required to 

682 determine the relevant dimensions. 

683 **kwargs 

684 Additional keyword arguments used to augment or construct a 

685 `DataId`. See `DataId` parameters. 

686 

687 Returns 

688 ------- 

689 dataId : `dict` or `DataCoordinate` 

690 The dataId, possibly rewritten. If given a `DataCoordinate` and 

691 no keyword arguments, the original dataId will be returned 

692 unchanged. 

693 **kwargs : `dict` 

694 Any unused keyword arguments (would normally be empty dict). 

695 """ 

696 # Do nothing if we have a standalone DataCoordinate. 

697 if isinstance(dataId, DataCoordinate) and not kwargs: 

698 return dataId, kwargs 

699 

700 # Process dimension records that are using record information 

701 # rather than ids 

702 newDataId: Dict[str, DataIdValue] = {} 

703 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

704 

705 # If the entire dataId comes from keyword parameters we do not need 

706 # to do anything here, because keys of the form exposure.obs_id are 

707 # impossible: a "." is not allowed in a keyword parameter. 

708 if dataId: 

709 for k, v in dataId.items(): 

710 # If we have a Dimension we do not need to do anything 

711 # because it cannot be a compound key. 

712 if isinstance(k, str) and "." in k: 

713 # Someone is using a more human-readable dataId 

714 dimensionName, record = k.split(".", 1) 

715 byRecord[dimensionName][record] = v 

716 elif isinstance(k, Dimension): 

717 newDataId[k.name] = v 

718 else: 

719 newDataId[k] = v 

720 

721 # Go through the updated dataId and check the type in case someone is 

722 # using an alternate key. We have already filtered out keys in the 

723 # compound dimension.record format. 

724 not_dimensions = {} 

725 

726 # Will need to look in the dataId and the keyword arguments 

727 # and will remove them if they need to be fixed or are unrecognized. 

728 for dataIdDict in (newDataId, kwargs): 

729 # Use a list so we can adjust the dict safely in the loop 

730 for dimensionName in list(dataIdDict): 

731 value = dataIdDict[dimensionName] 

732 try: 

733 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

734 except KeyError: 

735 # This is not a real dimension 

736 not_dimensions[dimensionName] = value 

737 del dataIdDict[dimensionName] 

738 continue 

739 

740 # Convert an integral type to an explicit int to simplify 

741 # comparisons here 

742 if isinstance(value, numbers.Integral): 

743 value = int(value) 

744 

745 if not isinstance(value, dimension.primaryKey.getPythonType()): 

746 for alternate in dimension.alternateKeys: 

747 if isinstance(value, alternate.getPythonType()): 

748 byRecord[dimensionName][alternate.name] = value 

749 del dataIdDict[dimensionName] 

750 log.debug( 

751 "Converting dimension %s to %s.%s=%s", 

752 dimensionName, 

753 dimensionName, 

754 alternate.name, 

755 value, 

756 ) 

757 break 

758 else: 

759 log.warning( 

760 "Type mismatch found for value '%r' provided for dimension %s. " 

761 "Could not find matching alternative (primary key has type %s) " 

762 "so attempting to use as-is.", 

763 value, 

764 dimensionName, 

765 dimension.primaryKey.getPythonType(), 

766 ) 

767 

768 # By this point kwargs and newDataId should only include valid 

769 # dimensions. Merge kwargs in to the new dataId and log if there 

770 # are dimensions in both (rather than calling update). 

771 for k, v in kwargs.items(): 

772 if k in newDataId and newDataId[k] != v: 

773 log.debug( 

774 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

775 ) 

776 newDataId[k] = v 

777 # No need to retain any values in kwargs now. 

778 kwargs = {} 

779 

780 # If we have some unrecognized dimensions we have to try to connect 

781 # them to records in other dimensions. This is made more complicated 

782 # by some dimensions having records with clashing names. A mitigation 

783 # is that we can tell by this point which dimensions are missing 

784 # for the DatasetType but this does not work for calibrations 

785 # where additional dimensions can be used to constrain the temporal 

786 # axis. 

787 if not_dimensions: 

788 # Search for all dimensions even if we have been given a value 

789 # explicitly. In some cases records are given as well as the 

790 actual dimension and this should not be an error if they 

791 # match. 

792 mandatoryDimensions = datasetType.dimensions.names # - provided 

793 

794 candidateDimensions: Set[str] = set() 

795 candidateDimensions.update(mandatoryDimensions) 

796 

797 # For calibrations we may well be needing temporal dimensions 

798 # so rather than always including all dimensions in the scan 

799 # restrict things a little. It is still possible for there 

800 # to be confusion over day_obs in visit vs exposure for example. 

801 # If we are not searching calibration collections things may 

802 # fail but they are going to fail anyway because of the 

803 # ambiguousness of the dataId... 

804 if datasetType.isCalibration(): 

805 for dim in self.registry.dimensions.getStaticDimensions(): 

806 if dim.temporal: 

807 candidateDimensions.add(str(dim)) 

808 

809 # Look up table for the first association with a dimension 

810 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

811 

812 # Keep track of whether an item is associated with multiple 

813 # dimensions. 

814 counter: Counter[str] = Counter() 

815 assigned: Dict[str, Set[str]] = defaultdict(set) 

816 

817 # Go through the missing dimensions and associate the 

818 # given names with records within those dimensions 

819 matched_dims = set() 

820 for dimensionName in candidateDimensions: 

821 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

822 fields = dimension.metadata.names | dimension.uniqueKeys.names 

823 for field in not_dimensions: 

824 if field in fields: 

825 guessedAssociation[dimensionName][field] = not_dimensions[field] 

826 counter[dimensionName] += 1 

827 assigned[field].add(dimensionName) 

828 matched_dims.add(field) 

829 

830 # Calculate the fields that matched nothing. 

831 never_found = set(not_dimensions) - matched_dims 

832 

833 if never_found: 

834 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

835 

836 # There is a chance we have allocated a single dataId item 

837 # to multiple dimensions. Need to decide which should be retained. 

838 # For now assume that the most popular alternative wins. 

839 # This means that day_obs with seq_num will result in 

840 # exposure.day_obs and not visit.day_obs 

841 # Also prefer an explicitly missing dimension over an inferred 

842 # temporal dimension. 

843 for fieldName, assignedDimensions in assigned.items(): 

844 if len(assignedDimensions) > 1: 

845 # Pick the most popular (preferring mandatory dimensions) 

846 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

847 if requiredButMissing: 

848 candidateDimensions = requiredButMissing 

849 else: 

850 candidateDimensions = assignedDimensions 

851 

852 # If this is a choice between visit and exposure and 

853 # neither was a required part of the dataset type, 

854 # (hence in this branch) always prefer exposure over 

855 # visit since exposures are always defined and visits 

856 # are defined from exposures. 

857 if candidateDimensions == {"exposure", "visit"}: 

858 candidateDimensions = {"exposure"} 

859 

860 # Select the relevant items and get a new restricted 

861 # counter. 

862 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

863 duplicatesCounter: Counter[str] = Counter() 

864 duplicatesCounter.update(theseCounts) 

865 

866 # Choose the most common. If they are equally common 

867 # we will pick the one that was found first. 

868 # Returns a list of tuples 

869 selected = duplicatesCounter.most_common(1)[0][0] 

870 

871 log.debug( 

872 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

873 " Removed ambiguity by choosing dimension %s.", 

874 fieldName, 

875 ", ".join(assignedDimensions), 

876 selected, 

877 ) 

878 

879 for candidateDimension in assignedDimensions: 

880 if candidateDimension != selected: 

881 del guessedAssociation[candidateDimension][fieldName] 

882 

883 # Update the record look up dict with the new associations 

884 for dimensionName, values in guessedAssociation.items(): 

885 if values: # A dict might now be empty 

886 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

887 byRecord[dimensionName].update(values) 

888 

889 if byRecord: 

890 # Some record specifiers were found so we need to convert 

891 # them to the Id form 

892 for dimensionName, values in byRecord.items(): 

893 if dimensionName in newDataId: 

894 log.debug( 

895 "DataId specified explicit %s dimension value of %s in addition to" 

896 " general record specifiers for it of %s. Ignoring record information.", 

897 dimensionName, 

898 newDataId[dimensionName], 

899 str(values), 

900 ) 

901 # Get the actual record and compare with these values. 

902 try: 

903 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

904 except DataIdError: 

905 raise ValueError( 

906 f"Could not find dimension '{dimensionName}'" 

907 f" with dataId {newDataId} as part of comparing with" 

908 f" record values {byRecord[dimensionName]}" 

909 ) from None 

910 if len(recs) == 1: 

911 errmsg: List[str] = [] 

912 for k, v in values.items(): 

913 if (recval := getattr(recs[0], k)) != v: 

914 errmsg.append(f"{k}({recval} != {v})") 

915 if errmsg: 

916 raise ValueError( 

917 f"Dimension {dimensionName} in dataId has explicit value" 

918 " inconsistent with records: " + ", ".join(errmsg) 

919 ) 

920 else: 

921 # Multiple matches for an explicit dimension 

922 # should never happen but let downstream complain. 

923 pass 

924 continue 

925 

926 # Build up a WHERE expression 

927 bind = {k: v for k, v in values.items()} 

928 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

929 

930 # Hopefully we get a single record that matches 

931 records = set( 

932 self.registry.queryDimensionRecords( 

933 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

934 ) 

935 ) 

936 

937 if len(records) != 1: 

938 if len(records) > 1: 

939 # visit can have an ambiguous answer without involving 

940 # visit_system. The default visit_system is defined 

941 # by the instrument. 

942 if ( 

943 dimensionName == "visit" 

944 and "visit_system_membership" in self.registry.dimensions 

945 and "visit_system" in self.registry.dimensions["instrument"].metadata 

946 ): 

947 instrument_records = list( 

948 self.registry.queryDimensionRecords( 

949 "instrument", 

950 dataId=newDataId, 

951 **kwargs, 

952 ) 

953 ) 

954 if len(instrument_records) == 1: 

955 visit_system = instrument_records[0].visit_system 

956 if visit_system is None: 

957 # Set to a value that will never match. 

958 visit_system = -1 

959 

960 # Look up each visit in the 

961 # visit_system_membership records. 

962 for rec in records: 

963 membership = list( 

964 self.registry.queryDimensionRecords( 

965 # Use bind to allow zero results. 

966 # This is a fully-specified query. 

967 "visit_system_membership", 

968 where="instrument = inst AND visit_system = system AND visit = v", 

969 bind=dict( 

970 inst=instrument_records[0].name, system=visit_system, v=rec.id 

971 ), 

972 ) 

973 ) 

974 if membership: 

975 # This record is the right answer. 

976 records = set([rec]) 

977 break 

978 

979 # The ambiguity may have been resolved so check again. 

980 if len(records) > 1: 

981 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

982 for r in records: 

983 log.debug("- %s", str(r)) 

984 raise ValueError( 

985 f"DataId specification for dimension {dimensionName} is not" 

986 f" uniquely constrained to a single dataset by {values}." 

987 f" Got {len(records)} results." 

988 ) 

989 else: 

990 raise ValueError( 

991 f"DataId specification for dimension {dimensionName} matched no" 

992 f" records when constrained by {values}" 

993 ) 

994 

995 # Get the primary key from the real dimension object 

996 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

997 if not isinstance(dimension, Dimension): 

998 raise RuntimeError( 

999 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

1000 ) 

1001 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

1002 

1003 return newDataId, kwargs 

1004 

1005 def _findDatasetRef( 

1006 self, 

1007 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1008 dataId: Optional[DataId] = None, 

1009 *, 

1010 collections: Any = None, 

1011 allowUnresolved: bool = False, 

1012 **kwargs: Any, 

1013 ) -> DatasetRef: 

1014 """Shared logic for methods that start with a search for a dataset in 

1015 the registry. 

1016 

1017 Parameters 

1018 ---------- 

1019 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1020 When a `DatasetRef` is provided, ``dataId`` should be `None`. 

1021 Otherwise the `DatasetType` or name thereof. 

1022 dataId : `dict` or `DataCoordinate`, optional 

1023 A `dict` of `Dimension` link name, value pairs that label the 

1024 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1025 should be provided as the first argument. 

1026 collections : Any, optional 

1027 Collections to be searched, overriding ``self.collections``. 

1028 Can be any of the types supported by the ``collections`` argument 

1029 to butler construction. 

1030 allowUnresolved : `bool`, optional 

1031 If `True`, return an unresolved `DatasetRef` if finding a resolved 

1032 one in the `Registry` fails. Defaults to `False`. 

1033 **kwargs 

1034 Additional keyword arguments used to augment or construct a 

1035 `DataId`. See `DataId` parameters. 

1036 

1037 Returns 

1038 ------- 

1039 ref : `DatasetRef` 

1040 A reference to the dataset identified by the given arguments. 

1041 This can be the same dataset reference as given if it was 

1042 resolved. 

1043 

1044 Raises 

1045 ------ 

1046 LookupError 

1047 Raised if no matching dataset exists in the `Registry` (and 

1048 ``allowUnresolved is False``). 

1049 ValueError 

1050 Raised if a resolved `DatasetRef` was passed as an input, but it 

1051 differs from the one found in the registry. 

1052 TypeError 

1053 Raised if no collections were provided. 

1054 """ 

1055 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1056 if isinstance(datasetRefOrType, DatasetRef): 

1057 idNumber = datasetRefOrType.id 

1058 # This is a resolved ref, return it immediately. 

1059 if idNumber: 

1060 return datasetRefOrType 

1061 else: 

1062 idNumber = None 

1063 timespan: Optional[Timespan] = None 

1064 

1065 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1066 

1067 if datasetType.isCalibration(): 

1068 # Because this is a calibration dataset, first try to 

1069 # standardize the data ID without restricting the dimensions to 

1070 # those of the dataset type requested, because there may be extra 

1071 # dimensions that provide temporal information for a validity-range 

1072 # lookup. 

1073 dataId = DataCoordinate.standardize( 

1074 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1075 ) 

1076 if dataId.graph.temporal: 

1077 dataId = self.registry.expandDataId(dataId) 

1078 timespan = dataId.timespan 

1079 else: 

1080 # Standardize the data ID to just the dimensions of the dataset 

1081 # type instead of letting registry.findDataset do it, so we get the 

1082 # result even if no dataset is found. 

1083 dataId = DataCoordinate.standardize( 

1084 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1085 ) 

1086 # Always lookup the DatasetRef, even if one is given, to ensure it is 

1087 # present in the current collection. 

1088 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1089 if ref is None: 

1090 if allowUnresolved: 

1091 return DatasetRef(datasetType, dataId) 

1092 else: 

1093 if collections is None: 

1094 collections = self.registry.defaults.collections 

1095 raise LookupError( 

1096 f"Dataset {datasetType.name} with data ID {dataId} " 

1097 f"could not be found in collections {collections}." 

1098 ) 

1099 if idNumber is not None and idNumber != ref.id: 

1100 if collections is None: 

1101 collections = self.registry.defaults.collections 

1102 raise ValueError( 

1103 f"DatasetRef.id provided ({idNumber}) does not match " 

1104 f"id ({ref.id}) in registry in collections {collections}." 

1105 ) 

1106 if datasetType != ref.datasetType: 

1107 # If they differ it is because the user explicitly specified 

1108 # a compatible dataset type to this call rather than using the 

1109 # registry definition. The DatasetRef must therefore be recreated 

1110 # using the user definition such that the expected type is 

1111 # returned. 

1112 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1113 

1114 return ref 

1115 

1116 @transactional 

1117 @deprecated( 

1118 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." 

1119 " Please use Butler.put(). Be aware that you may need to adjust your usage if you" 

1120 " were relying on the run parameter to determine the run." 

1121 " Will be removed after v27.0.", 

1122 version="v26.0", 

1123 category=FutureWarning, 

1124 ) 

1125 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

1126 # Docstring inherited. 

1127 return self.put(obj, ref) 

1128 

1129 @transactional 

1130 def put( 

1131 self, 

1132 obj: Any, 

1133 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1134 /, 

1135 dataId: Optional[DataId] = None, 

1136 *, 

1137 run: Optional[str] = None, 

1138 **kwargs: Any, 

1139 ) -> DatasetRef: 

1140 """Store and register a dataset. 

1141 

1142 Parameters 

1143 ---------- 

1144 obj : `object` 

1145 The dataset. 

1146 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1147 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1148 Otherwise the `DatasetType` or name thereof. If a fully resolved 

1149 `DatasetRef` is given the run and ID are used directly. 

1150 dataId : `dict` or `DataCoordinate` 

1151 A `dict` of `Dimension` link name, value pairs that label the 

1152 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1153 should be provided as the second argument. 

1154 run : `str`, optional 

1155 The name of the run the dataset should be added to, overriding 

1156 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

1157 **kwargs 

1158 Additional keyword arguments used to augment or construct a 

1159 `DataCoordinate`. See `DataCoordinate.standardize` 

1160 parameters. Not used if a resolved `DatasetRef` is provided. 

1161 

1162 Returns 

1163 ------- 

1164 ref : `DatasetRef` 

1165 A reference to the stored dataset, updated with the correct id if 

1166 given. 

1167 

1168 Raises 

1169 ------ 

1170 TypeError 

1171 Raised if the butler is read-only or if no run has been provided. 
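
        Examples
        --------
        A sketch of a typical call; the dataset type, dimensions, and run
        name are illustrative and depend on the repository's schema::

            ref = butler.put(catalog, "sourceTable", instrument="HSC",
                             visit=12345, detector=42, run="u/alice/run1")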

1172 """ 

1173 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1174 # This is a direct put of predefined DatasetRef. 

1175 log.debug("Butler put direct: %s", datasetRefOrType) 

1176 (imported_ref,) = self.registry._importDatasets( 

1177 [datasetRefOrType], 

1178 expand=True, 

1179 ) 

1180 if imported_ref.id != datasetRefOrType.getCheckedId(): 

1181 raise RuntimeError("This registry configuration does not support direct put of ref.") 

1182 self.datastore.put(obj, datasetRefOrType) 

1183 return datasetRefOrType 

1184 

1185 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1186 if not self.isWriteable(): 

1187 raise TypeError("Butler is read-only.") 

1188 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1189 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1190 raise ValueError("DatasetRef must not be in registry, must have None id") 

1191 

1192 # Handle dimension records in dataId 

1193 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1194 

1195 # Add Registry Dataset entry. 

1196 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1197 

1198 # For an execution butler the datasets will be pre-defined. 

1199 # If the butler is configured that way datasets should only be inserted 

1200 # if they do not already exist in registry. Trying and catching 

1201 # ConflictingDefinitionError will not work because the transaction 

1202 # will be corrupted. Instead, in this mode always check first. 

1203 ref = None 

1204 ref_is_predefined = False 

1205 if self._allow_put_of_predefined_dataset: 

1206 # Get the matching ref for this run. 

1207 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId) 

1208 

1209 if ref: 

1210 # Must be expanded form for datastore templating 

1211 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

1212 ref = ref.expanded(dataId) 

1213 ref_is_predefined = True 

1214 

1215 if not ref: 

1216 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1217 

1218 # If the ref is predefined it is possible that the datastore also 

1219 # has the record. Asking datastore to put it again will result in 

1220 # the artifact being recreated, overwriting the previous one; the 

1221 # resulting failure to write the record will then cause the artifact 

1222 # to be removed. Much safer to ask first before attempting to 

1223 # overwrite. Race conditions should not be an issue for the 

1224 # execution butler environment. 

1225 if ref_is_predefined: 

1226 if self.datastore.knows(ref): 

1227 raise ConflictingDefinitionError(f"Dataset associated {ref} already exists.") 

1228 

1229 self.datastore.put(obj, ref) 

1230 

1231 return ref 

1232 

1233 @deprecated( 

1234 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

1235 " Please use Butler.get(). Will be removed after v27.0.", 

1236 version="v26.0", 

1237 category=FutureWarning, 

1238 ) 

1239 def getDirect( 

1240 self, 

1241 ref: DatasetRef, 

1242 *, 

1243 parameters: Optional[Dict[str, Any]] = None, 

1244 storageClass: Optional[Union[StorageClass, str]] = None, 

1245 ) -> Any: 

1246 """Retrieve a stored dataset. 

1247 

1248 Parameters 

1249 ---------- 

1250 ref : `DatasetRef` 

1251 Resolved reference to an already stored dataset. 

1252 parameters : `dict` 

1253 Additional StorageClass-defined options to control reading, 

1254 typically used to efficiently read only a subset of the dataset. 

1255 storageClass : `StorageClass` or `str`, optional 

1256 The storage class to be used to override the Python type 

1257 returned by this method. By default the returned type matches 

1258 the dataset type definition for this dataset. Specifying a 

1259 read `StorageClass` can force a different type to be returned. 

1260 This type must be compatible with the original type. 

1261 

1262 Returns 

1263 ------- 

1264 obj : `object` 

1265 The dataset. 

1266 """ 

1267 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1268 

1269 @deprecated( 

1270 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

1271 "Please use Butler.getDeferred(). Will be removed after v27.0.", 

1272 version="v26.0", 

1273 category=FutureWarning, 

1274 ) 

1275 def getDirectDeferred( 

1276 self, 

1277 ref: DatasetRef, 

1278 *, 

1279 parameters: Union[dict, None] = None, 

1280 storageClass: str | StorageClass | None = None, 

1281 ) -> DeferredDatasetHandle: 

1282 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1283 from a resolved `DatasetRef`. 

1284 

1285 Parameters 

1286 ---------- 

1287 ref : `DatasetRef` 

1288 Resolved reference to an already stored dataset. 

1289 parameters : `dict` 

1290 Additional StorageClass-defined options to control reading, 

1291 typically used to efficiently read only a subset of the dataset. 

1292 storageClass : `StorageClass` or `str`, optional 

1293 The storage class to be used to override the Python type 

1294 returned by this method. By default the returned type matches 

1295 the dataset type definition for this dataset. Specifying a 

1296 read `StorageClass` can force a different type to be returned. 

1297 This type must be compatible with the original type. 

1298 

1299 Returns 

1300 ------- 

1301 obj : `DeferredDatasetHandle` 

1302 A handle which can be used to retrieve a dataset at a later time. 

1303 

1304 Raises 

1305 ------ 

1306 AmbiguousDatasetError 

1307 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

1308 """ 

1309 if ref.id is None: 

1310 raise AmbiguousDatasetError( 

1311 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1312 ) 

1313 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1314 

1315 def getDeferred( 

1316 self, 

1317 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1318 /, 

1319 dataId: Optional[DataId] = None, 

1320 *, 

1321 parameters: Union[dict, None] = None, 

1322 collections: Any = None, 

1323 storageClass: str | StorageClass | None = None, 

1324 **kwargs: Any, 

1325 ) -> DeferredDatasetHandle: 

1326 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1327 after an immediate registry lookup. 

1328 

1329 Parameters 

1330 ---------- 

1331 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1332 When a `DatasetRef` is provided, ``dataId`` should be `None`. 

1333 Otherwise the `DatasetType` or name thereof. 

1334 dataId : `dict` or `DataCoordinate`, optional 

1335 A `dict` of `Dimension` link name, value pairs that label the 

1336 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1337 should be provided as the first argument. 

1338 parameters : `dict` 

1339 Additional StorageClass-defined options to control reading, 

1340 typically used to efficiently read only a subset of the dataset. 

1341 collections : Any, optional 

1342 Collections to be searched, overriding ``self.collections``. 

1343 Can be any of the types supported by the ``collections`` argument 

1344 to butler construction. 

1345 storageClass : `StorageClass` or `str`, optional 

1346 The storage class to be used to override the Python type 

1347 returned by this method. By default the returned type matches 

1348 the dataset type definition for this dataset. Specifying a 

1349 read `StorageClass` can force a different type to be returned. 

1350 This type must be compatible with the original type. 

1351 **kwargs 

1352 Additional keyword arguments used to augment or construct a 

1353 `DataId`. See `DataId` parameters. 

1354 

1355 Returns 

1356 ------- 

1357 obj : `DeferredDatasetHandle` 

1358 A handle which can be used to retrieve a dataset at a later time. 

1359 

1360 Raises 

1361 ------ 

1362 LookupError 

1363 Raised if no matching dataset exists in the `Registry` for the 

1364 given data ID and collections. 

1365 ValueError 

1366 Raised if a resolved `DatasetRef` was passed as an input, but it 

1367 differs from the one found in the registry. 

1368 TypeError 

1369 Raised if no collections were provided. 
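
        Examples
        --------
        A sketch of deferred retrieval (dataset type and data ID values are
        illustrative)::

            handle = butler.getDeferred("calexp", instrument="HSC",
                                        visit=12345, detector=42)
            exposure = handle.get()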

1370 """ 

1371 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1372 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1373 

1374 def get( 

1375 self, 

1376 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1377 /, 

1378 dataId: Optional[DataId] = None, 

1379 *, 

1380 parameters: Optional[Dict[str, Any]] = None, 

1381 collections: Any = None, 

1382 storageClass: Optional[Union[StorageClass, str]] = None, 

1383 **kwargs: Any, 

1384 ) -> Any: 

1385 """Retrieve a stored dataset. 

1386 

1387 Parameters 

1388 ---------- 

1389 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1390 When a `DatasetRef` is provided, ``dataId`` should be `None`. 

1391 Otherwise the `DatasetType` or name thereof. 

1392 If a resolved `DatasetRef`, the associated dataset 

1393 is returned directly without additional querying. 

1394 dataId : `dict` or `DataCoordinate` 

1395 A `dict` of `Dimension` link name, value pairs that label the 

1396 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1397 should be provided as the first argument. 

1398 parameters : `dict` 

1399 Additional StorageClass-defined options to control reading, 

1400 typically used to efficiently read only a subset of the dataset. 

1401 collections : Any, optional 

1402 Collections to be searched, overriding ``self.collections``. 

1403 Can be any of the types supported by the ``collections`` argument 

1404 to butler construction. 

1405 storageClass : `StorageClass` or `str`, optional 

1406 The storage class to be used to override the Python type 

1407 returned by this method. By default the returned type matches 

1408 the dataset type definition for this dataset. Specifying a 

1409 read `StorageClass` can force a different type to be returned. 

1410 This type must be compatible with the original type. 

1411 **kwargs 

1412 Additional keyword arguments used to augment or construct a 

1413 `DataCoordinate`. See `DataCoordinate.standardize` 

1414 parameters. 

1415 

1416 Returns 

1417 ------- 

1418 obj : `object` 

1419 The dataset. 

1420 

1421 Raises 

1422 ------ 

1423 LookupError 

1424 Raised if no matching dataset exists in the `Registry`. 

1425 TypeError 

1426 Raised if no collections were provided. 

1427 

1428 Notes 

1429 ----- 

1430 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1431 this method requires that the given data ID include temporal dimensions 

1432 beyond the dimensions of the dataset type itself, in order to find the 

1433 dataset with the appropriate validity range. For example, a "bias" 

1434 dataset with native dimensions ``{instrument, detector}`` could be 

1435 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1436 ``exposure`` is a temporal dimension. 
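
Examples
--------
A minimal usage sketch; the repository path, collection, dataset type
name and data ID values are illustrative only::

    # Illustrative repository path, collection and data ID.
    butler = Butler("/path/to/repo", collections=["HSC/defaults"])
    calexp = butler.get("calexp", instrument="HSC", visit=903334, detector=16)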

1437 """ 

1438 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1439 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1440 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1441 

1442 def getURIs( 

1443 self, 

1444 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1445 /, 

1446 dataId: Optional[DataId] = None, 

1447 *, 

1448 predict: bool = False, 

1449 collections: Any = None, 

1450 run: Optional[str] = None, 

1451 **kwargs: Any, 

1452 ) -> DatasetRefURIs: 

1453 """Returns the URIs associated with the dataset. 

1454 

1455 Parameters 

1456 ---------- 

1457 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1458 When `DatasetRef` the `dataId` should be `None`. 

1459 Otherwise the `DatasetType` or name thereof. 

1460 dataId : `dict` or `DataCoordinate` 

1461 A `dict` of `Dimension` link name, value pairs that label the 

1462 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1463 should be provided as the first argument. 

1464 predict : `bool` 

1465 If `True`, allow URIs to be returned for datasets that have not 

1466 been written. 

1467 collections : Any, optional 

1468 Collections to be searched, overriding ``self.collections``. 

1469 Can be any of the types supported by the ``collections`` argument 

1470 to butler construction. 

1471 run : `str`, optional 

1472 Run to use for predictions, overriding ``self.run``. 

1473 **kwargs 

1474 Additional keyword arguments used to augment or construct a 

1475 `DataCoordinate`. See `DataCoordinate.standardize` 

1476 parameters. 

1477 

1478 Returns 

1479 ------- 

1480 uris : `DatasetRefURIs` 

1481 The URI to the primary artifact associated with this dataset (if 

1482 the dataset was disassembled within the datastore this may be 

1483 `None`), and the URIs to any components associated with the dataset 

1484 artifact (this can be empty if there are no components). 
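
Examples
--------
A minimal usage sketch; the dataset type name and data ID values are
illustrative only::

    # Illustrative dataset type and data ID; ``primary`` may be `None`
    # if the dataset was disassembled into components.
    primary, components = butler.getURIs(
        "calexp", instrument="HSC", visit=903334, detector=16
    )
    for component_name, component_uri in components.items():
        print(component_name, component_uri)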

1485 """ 

1486 ref = self._findDatasetRef( 

1487 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs 

1488 ) 

1489 if ref.id is None: # only possible if predict is True 

1490 if run is None: 

1491 run = self.run 

1492 if run is None: 

1493 raise TypeError("Cannot predict location with run=None.") 

1494 # Lie about ID, because we can't guess it, and only 

1495 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1496 ref = ref.resolved(id=uuid.UUID("{00000000-0000-0000-0000-000000000000}"), run=run) 

1497 return self.datastore.getURIs(ref, predict) 

1498 

1499 def getURI( 

1500 self, 

1501 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1502 /, 

1503 dataId: Optional[DataId] = None, 

1504 *, 

1505 predict: bool = False, 

1506 collections: Any = None, 

1507 run: Optional[str] = None, 

1508 **kwargs: Any, 

1509 ) -> ResourcePath: 

1510 """Return the URI to the Dataset. 

1511 

1512 Parameters 

1513 ---------- 

1514 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1515 When `DatasetRef` the `dataId` should be `None`. 

1516 Otherwise the `DatasetType` or name thereof. 

1517 dataId : `dict` or `DataCoordinate` 

1518 A `dict` of `Dimension` link name, value pairs that label the 

1519 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1520 should be provided as the first argument. 

1521 predict : `bool` 

1522 If `True`, allow URIs to be returned for datasets that have not 

1523 been written. 

1524 collections : Any, optional 

1525 Collections to be searched, overriding ``self.collections``. 

1526 Can be any of the types supported by the ``collections`` argument 

1527 to butler construction. 

1528 run : `str`, optional 

1529 Run to use for predictions, overriding ``self.run``. 

1530 **kwargs 

1531 Additional keyword arguments used to augment or construct a 

1532 `DataCoordinate`. See `DataCoordinate.standardize` 

1533 parameters. 

1534 

1535 Returns 

1536 ------- 

1537 uri : `lsst.resources.ResourcePath` 

1538 URI pointing to the Dataset within the datastore. If the 

1539 Dataset does not exist in the datastore, and if ``predict`` is 

1540 `True`, the URI will be a prediction and will include a URI 

1541 fragment "#predicted". 

1542 If the datastore does not have entities that relate well 

1543 to the concept of a URI, the returned URI string will be 

1544 descriptive. The returned URI is not guaranteed to be obtainable. 

1545 

1546 Raises 

1547 ------ 

1548 LookupError 

1549 Raised if a URI has been requested for a dataset that does not 

1550 exist and guessing is not allowed. 

1551 ValueError 

1552 Raised if a resolved `DatasetRef` was passed as an input, but it 

1553 differs from the one found in the registry. 

1554 TypeError 

1555 Raised if no collections were provided. 

1556 RuntimeError 

1557 Raised if a URI is requested for a dataset that consists of 

1558 multiple artifacts. 
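
Examples
--------
A minimal usage sketch; the dataset type name and data ID values are
illustrative only::

    # Illustrative dataset type and data ID; the returned value is an
    # `lsst.resources.ResourcePath` and may refer to a remote store.
    uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=16)
    print(uri)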

1559 """ 

1560 primary, components = self.getURIs( 

1561 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1562 ) 

1563 

1564 if primary is None or components: 

1565 raise RuntimeError( 

1566 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1567 "Use Butler.getURIs() instead." 

1568 ) 

1569 return primary 

1570 

1571 def retrieveArtifacts( 

1572 self, 

1573 refs: Iterable[DatasetRef], 

1574 destination: ResourcePathExpression, 

1575 transfer: str = "auto", 

1576 preserve_path: bool = True, 

1577 overwrite: bool = False, 

1578 ) -> List[ResourcePath]: 

1579 """Retrieve the artifacts associated with the supplied refs. 

1580 

1581 Parameters 

1582 ---------- 

1583 refs : iterable of `DatasetRef` 

1584 The datasets for which artifacts are to be retrieved. 

1585 A single ref can result in multiple artifacts. The refs must 

1586 be resolved. 

1587 destination : `lsst.resources.ResourcePath` or `str` 

1588 Location to write the artifacts. 

1589 transfer : `str`, optional 

1590 Method to use to transfer the artifacts. Must be one of the options 

1591 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1592 "move" is not allowed. 

1593 preserve_path : `bool`, optional 

1594 If `True` the full path of the artifact within the datastore 

1595 is preserved. If `False` the final file component of the path 

1596 is used. 

1597 overwrite : `bool`, optional 

1598 If `True` allow transfers to overwrite existing files at the 

1599 destination. 

1600 

1601 Returns 

1602 ------- 

1603 targets : `list` of `lsst.resources.ResourcePath` 

1604 URIs of file artifacts in destination location. Order is not 

1605 preserved. 

1606 

1607 Notes 

1608 ----- 

1609 For non-file datastores the artifacts written to the destination 

1610 may not match the representation inside the datastore. For example 

1611 a hierarchical data structure in a NoSQL database may well be stored 

1612 as a JSON file. 
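
Examples
--------
A minimal usage sketch; the dataset type, collection and destination
directory are illustrative only::

    # Illustrative dataset type, collection and destination.
    refs = butler.registry.queryDatasets("calexp", collections="HSC/runs/example")
    paths = butler.retrieveArtifacts(refs, destination="/tmp/artifacts",
                                     transfer="copy")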

1613 """ 

1614 return self.datastore.retrieveArtifacts( 

1615 refs, 

1616 ResourcePath(destination), 

1617 transfer=transfer, 

1618 preserve_path=preserve_path, 

1619 overwrite=overwrite, 

1620 ) 

1621 

1622 def datasetExists( 

1623 self, 

1624 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1625 dataId: Optional[DataId] = None, 

1626 *, 

1627 collections: Any = None, 

1628 **kwargs: Any, 

1629 ) -> bool: 

1630 """Return True if the Dataset is actually present in the Datastore. 

1631 

1632 Parameters 

1633 ---------- 

1634 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1635 When `DatasetRef` the `dataId` should be `None`. 

1636 Otherwise the `DatasetType` or name thereof. 

1637 dataId : `dict` or `DataCoordinate` 

1638 A `dict` of `Dimension` link name, value pairs that label the 

1639 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1640 should be provided as the first argument. 

1641 collections : Any, optional 

1642 Collections to be searched, overriding ``self.collections``. 

1643 Can be any of the types supported by the ``collections`` argument 

1644 to butler construction. 

1645 **kwargs 

1646 Additional keyword arguments used to augment or construct a 

1647 `DataCoordinate`. See `DataCoordinate.standardize` 

1648 parameters. 

1649 

1650 Raises 

1651 ------ 

1652 LookupError 

1653 Raised if the dataset is not even present in the Registry. 

1654 ValueError 

1655 Raised if a resolved `DatasetRef` was passed as an input, but it 

1656 differs from the one found in the registry. 

1657 TypeError 

1658 Raised if no collections were provided. 
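
Examples
--------
A minimal usage sketch; the dataset type name and data ID values are
illustrative only::

    # Illustrative dataset type and data ID. Returns `True` only if the
    # artifact is present in the datastore; raises `LookupError` if the
    # dataset is not known to the registry at all.
    if butler.datasetExists("calexp", instrument="HSC", visit=903334, detector=16):
        calexp = butler.get("calexp", instrument="HSC", visit=903334, detector=16)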

1659 """ 

1660 # A resolved ref may be given that is not known to this butler. 

1661 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1662 ref = self.registry.getDataset(datasetRefOrType.id) 

1663 if ref is None: 

1664 raise LookupError( 

1665 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." 

1666 ) 

1667 else: 

1668 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1669 return self.datastore.exists(ref) 

1670 

1671 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1672 """Remove one or more `~CollectionType.RUN` collections and the 

1673 datasets within them. 

1674 

1675 Parameters 

1676 ---------- 

1677 names : `Iterable` [ `str` ] 

1678 The names of the collections to remove. 

1679 unstore : `bool`, optional 

1680 If `True` (default), delete datasets from all datastores in which 

1681 they are present, and attempt to rollback the registry deletions if 

1682 datastore deletions fail (which may not always be possible). If 

1683 `False`, datastore records for these datasets are still removed, 

1684 but any artifacts (e.g. files) will not be. 

1685 

1686 Raises 

1687 ------ 

1688 TypeError 

1689 Raised if one or more collections are not of type 

1690 `~CollectionType.RUN`. 
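
Examples
--------
A minimal usage sketch, assuming ``butler`` was constructed with write
access (e.g. ``Butler("/path/to/repo", writeable=True)``); the run name
is illustrative only::

    # Illustrative run name. Removes the RUN collection, its registry
    # entries and (with unstore=True) the datastore artifacts.
    butler.removeRuns(["u/someone/scratch"], unstore=True)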

1691 """ 

1692 if not self.isWriteable(): 

1693 raise TypeError("Butler is read-only.") 

1694 names = list(names) 

1695 refs: List[DatasetRef] = [] 

1696 for name in names: 

1697 collectionType = self.registry.getCollectionType(name) 

1698 if collectionType is not CollectionType.RUN: 

1699 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1700 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1701 with self.datastore.transaction(): 

1702 with self.registry.transaction(): 

1703 if unstore: 

1704 self.datastore.trash(refs) 

1705 else: 

1706 self.datastore.forget(refs) 

1707 for name in names: 

1708 self.registry.removeCollection(name) 

1709 if unstore: 

1710 # Point of no return for removing artifacts 

1711 self.datastore.emptyTrash() 

1712 

1713 def pruneDatasets( 

1714 self, 

1715 refs: Iterable[DatasetRef], 

1716 *, 

1717 disassociate: bool = True, 

1718 unstore: bool = False, 

1719 tags: Iterable[str] = (), 

1720 purge: bool = False, 

1721 ) -> None: 

1722 # docstring inherited from LimitedButler 

1723 

1724 if not self.isWriteable(): 

1725 raise TypeError("Butler is read-only.") 

1726 if purge: 

1727 if not disassociate: 

1728 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1729 if not unstore: 

1730 raise TypeError("Cannot pass purge=True without unstore=True.") 

1731 elif disassociate: 

1732 tags = tuple(tags) 

1733 if not tags: 

1734 raise TypeError("No tags provided but disassociate=True.") 

1735 for tag in tags: 

1736 collectionType = self.registry.getCollectionType(tag) 

1737 if collectionType is not CollectionType.TAGGED: 

1738 raise TypeError( 

1739 f"Cannot disassociate from collection '{tag}' " 

1740 f"of non-TAGGED type {collectionType.name}." 

1741 ) 

1742 # For an execution butler we want to keep existing UUIDs for the 

1743 # datasets, for that we need to keep them in the collections but 

1744 # remove from datastore. 

1745 if self._allow_put_of_predefined_dataset and purge: 

1746 purge = False 

1747 disassociate = False 

1748 # Transform possibly-single-pass iterable into something we can iterate 

1749 # over multiple times. 

1750 refs = list(refs) 

1751 # Pruning a component of a DatasetRef makes no sense since registry 

1752 # doesn't know about components and datastore might not store 

1753 # components in a separate file 

1754 for ref in refs: 

1755 if ref.datasetType.component(): 

1756 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1757 # We don't need an unreliable Datastore transaction for this, because 

1758 # we've been extra careful to ensure that Datastore.trash only involves 

1759 # mutating the Registry (it can _look_ at Datastore-specific things, 

1760 # but shouldn't change them), and hence all operations here are 

1761 # Registry operations. 

1762 with self.datastore.transaction(): 

1763 with self.registry.transaction(): 

1764 if unstore: 

1765 self.datastore.trash(refs) 

1766 if purge: 

1767 self.registry.removeDatasets(refs) 

1768 elif disassociate: 

1769 assert tags, "Guaranteed by earlier logic in this function." 

1770 for tag in tags: 

1771 self.registry.disassociate(tag, refs) 

1772 # We've exited the Registry transaction, and apparently committed. 

1773 # (if there was an exception, everything rolled back, and it's as if 

1774 # nothing happened - and we never get here). 

1775 # Datastore artifacts are not yet gone, but they're clearly marked 

1776 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1777 # problems we can try again later, and if manual administrative 

1778 # intervention is required, it's pretty clear what that should entail: 

1779 # deleting everything on disk and in private Datastore tables that is 

1780 # in the dataset_location_trash table. 

1781 if unstore: 

1782 # Point of no return for removing artifacts 

1783 self.datastore.emptyTrash() 

1784 

1785 @transactional 

1786 def ingest( 

1787 self, 

1788 *datasets: FileDataset, 

1789 transfer: Optional[str] = "auto", 

1790 run: Optional[str] = None, 

1791 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1792 record_validation_info: bool = True, 

1793 ) -> None: 

1794 """Store and register one or more datasets that already exist on disk. 

1795 

1796 Parameters 

1797 ---------- 

1798 datasets : `FileDataset` 

1799 Each positional argument is a struct containing information about 

1800 a file to be ingested, including its URI (either absolute or 

1801 relative to the datastore root, if applicable), a `DatasetRef`, 

1802 and optionally a formatter class or its fully-qualified string 

1803 name. If a formatter is not provided, the formatter that would be 

1804 used for `put` is assumed. On successful return, all 

1805 `FileDataset.refs` attributes will have their `DatasetRef.id` 

1806 attribute populated and all `FileDataset.formatter` attributes will 

1807 be set to the formatter class used. `FileDataset.path` attributes 

1808 may be modified to put paths in whatever the datastore considers a 

1809 standardized form. 

1810 transfer : `str`, optional 

1811 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1812 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1813 transfer the file. 

1814 run : `str`, optional 

1815 The name of the run ingested datasets should be added to, 

1816 overriding ``self.run``. 

1817 idGenerationMode : `DatasetIdGenEnum`, optional 

1818 Specifies option for generating dataset IDs. By default unique IDs 

1819 are generated for each inserted dataset. 

1820 record_validation_info : `bool`, optional 

1821 If `True`, the default, the datastore can record validation 

1822 information associated with the file. If `False` the datastore 

1823 will not attempt to track any information such as checksums 

1824 or file sizes. This can be useful if such information is tracked 

1825 in an external system or if the file is to be compressed in place. 

1826 It is up to the datastore whether this parameter is relevant. 

1827 

1828 Raises 

1829 ------ 

1830 TypeError 

1831 Raised if the butler is read-only or if no run was provided. 

1832 NotImplementedError 

1833 Raised if the `Datastore` does not support the given transfer mode. 

1834 DatasetTypeNotSupportedError 

1835 Raised if one or more files to be ingested have a dataset type that 

1836 is not supported by the `Datastore`. 

1837 FileNotFoundError 

1838 Raised if one of the given files does not exist. 

1839 FileExistsError 

1840 Raised if transfer is not `None` but the (internal) location the 

1841 file would be moved to is already occupied. 

1842 

1843 Notes 

1844 ----- 

1845 This operation is not fully exception safe: if a database operation 

1846 fails, the given `FileDataset` instances may be only partially updated. 

1847 

1848 It is atomic in terms of database operations (they will either all 

1849 succeed or all fail) providing the database engine implements 

1850 transactions correctly. It will attempt to be atomic in terms of 

1851 filesystem operations as well, but this cannot be implemented 

1852 rigorously for most datastores. 
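
Examples
--------
A minimal usage sketch; the file path, dataset type, run name and data
ID values are illustrative only::

    from lsst.daf.butler import DatasetRef, FileDataset

    # Illustrative dataset type, data ID, file path and run.
    dataset_type = butler.registry.getDatasetType("raw")
    ref = DatasetRef(dataset_type,
                     {"instrument": "HSC", "exposure": 903334, "detector": 16})
    butler.ingest(FileDataset(path="/data/HSC-903334-16.fits", refs=[ref]),
                  transfer="symlink", run="HSC/raw/example")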

1853 """ 

1854 if not self.isWriteable(): 

1855 raise TypeError("Butler is read-only.") 

1856 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1857 # Reorganize the inputs so they're grouped by DatasetType and then 

1858 # data ID. We also include a list of DatasetRefs for each FileDataset 

1859 # to hold the resolved DatasetRefs returned by the Registry, before 

1860 # it's safe to swap them into FileDataset.refs. 

1861 # Some type annotation aliases to make that clearer: 

1862 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1863 GroupedData = MutableMapping[DatasetType, GroupForType] 

1864 # The actual data structure: 

1865 groupedData: GroupedData = defaultdict(dict) 

1866 # And the nested loop that populates it: 

1867 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1868 # This list intentionally shared across the inner loop, since it's 

1869 # associated with `dataset`. 

1870 resolvedRefs: List[DatasetRef] = [] 

1871 

1872 # Somewhere to store pre-existing refs if we have an 

1873 # execution butler. 

1874 existingRefs: List[DatasetRef] = [] 

1875 

1876 for ref in dataset.refs: 

1877 if ref.dataId in groupedData[ref.datasetType]: 

1878 raise ConflictingDefinitionError( 

1879 f"Ingest conflict. Dataset {dataset.path} has same" 

1880 " DataId as other ingest dataset" 

1881 f" {groupedData[ref.datasetType][ref.dataId][0].path} " 

1882 f" ({ref.dataId})" 

1883 ) 

1884 if self._allow_put_of_predefined_dataset: 

1885 existing_ref = self.registry.findDataset( 

1886 ref.datasetType, dataId=ref.dataId, collections=run 

1887 ) 

1888 if existing_ref: 

1889 if self.datastore.knows(existing_ref): 

1890 raise ConflictingDefinitionError( 

1891 f"Dataset associated with path {dataset.path}" 

1892 f" already exists as {existing_ref}." 

1893 ) 

1894 # Store this ref elsewhere since it already exists 

1895 # and we do not want to remake it but we do want 

1896 # to store it in the datastore. 

1897 existingRefs.append(existing_ref) 

1898 

1899 # Nothing else to do until we have finished 

1900 # iterating. 

1901 continue 

1902 

1903 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1904 

1905 if existingRefs: 

1906 if len(dataset.refs) != len(existingRefs): 

1907 # Keeping track of partially pre-existing datasets is hard 

1908 # and should generally never happen. For now don't allow 

1909 # it. 

1910 raise ConflictingDefinitionError( 

1911 f"For dataset {dataset.path} some dataIds already exist" 

1912 " in registry but others do not. This is not supported." 

1913 ) 

1914 

1915 # Attach the resolved refs if we found them. 

1916 dataset.refs = existingRefs 

1917 

1918 # Now we can bulk-insert into Registry for each DatasetType. 

1919 for datasetType, groupForType in progress.iter_item_chunks( 

1920 groupedData.items(), desc="Bulk-inserting datasets by type" 

1921 ): 

1922 refs = self.registry.insertDatasets( 

1923 datasetType, 

1924 dataIds=groupForType.keys(), 

1925 run=run, 

1926 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1927 idGenerationMode=idGenerationMode, 

1928 ) 

1929 # Append those resolved DatasetRefs to the new lists we set up for 

1930 # them. 

1931 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1932 resolvedRefs.append(ref) 

1933 

1934 # Go back to the original FileDatasets to replace their refs with the 

1935 # new resolved ones. 

1936 for groupForType in progress.iter_chunks( 

1937 groupedData.values(), desc="Reassociating resolved dataset refs with files" 

1938 ): 

1939 for dataset, resolvedRefs in groupForType.values(): 

1940 dataset.refs = resolvedRefs 

1941 

1942 # Bulk-insert everything into Datastore. 

1943 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 

1944 

1945 @contextlib.contextmanager 

1946 def export( 

1947 self, 

1948 *, 

1949 directory: Optional[str] = None, 

1950 filename: Optional[str] = None, 

1951 format: Optional[str] = None, 

1952 transfer: Optional[str] = None, 

1953 ) -> Iterator[RepoExportContext]: 

1954 """Export datasets from the repository represented by this `Butler`. 

1955 

1956 This method is a context manager that returns a helper object 

1957 (`RepoExportContext`) that is used to indicate what information from 

1958 the repository should be exported. 

1959 

1960 Parameters 

1961 ---------- 

1962 directory : `str`, optional 

1963 Directory dataset files should be written to if ``transfer`` is not 

1964 `None`. 

1965 filename : `str`, optional 

1966 Name for the file that will include database information associated 

1967 with the exported datasets. If this is not an absolute path and 

1968 ``directory`` is not `None`, it will be written to ``directory`` 

1969 instead of the current working directory. Defaults to 

1970 "export.{format}". 

1971 format : `str`, optional 

1972 File format for the database information file. If `None`, the 

1973 extension of ``filename`` will be used. 

1974 transfer : `str`, optional 

1975 Transfer mode passed to `Datastore.export`. 

1976 

1977 Raises 

1978 ------ 

1979 TypeError 

1980 Raised if the set of arguments passed is inconsistent. 

1981 

1982 Examples 

1983 -------- 

1984 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1985 methods are used to provide the iterables over data IDs and/or datasets 

1986 to be exported:: 

1987 

1988 with butler.export(filename="exports.yaml") as export: 

1989 # Export all flats, but none of the dimension element rows 

1990 # (i.e. data ID information) associated with them. 

1991 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1992 elements=()) 

1993 # Export all datasets that start with "deepCoadd_" and all of 

1994 # their associated data ID information. 

1995 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

1996 """ 

1997 if directory is None and transfer is not None: 

1998 raise TypeError("Cannot transfer without providing a directory.") 

1999 if transfer == "move": 

2000 raise TypeError("Transfer may not be 'move': export is read-only") 

2001 if format is None: 

2002 if filename is None: 

2003 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2004 else: 

2005 _, format = os.path.splitext(filename) 

2006 if not format: 

2007 raise ValueError("Please specify a file extension to determine export format.") 

2008 format = format[1:] # Strip leading "." 

2009 elif filename is None: 

2010 filename = f"export.{format}" 

2011 if directory is not None: 

2012 filename = os.path.join(directory, filename) 

2013 formats = self._config["repo_transfer_formats"] 

2014 if format not in formats: 

2015 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

2016 BackendClass = get_class_of(formats[format, "export"]) 

2017 with open(filename, "w") as stream: 

2018 backend = BackendClass(stream, universe=self.registry.dimensions) 

2019 try: 

2020 helper = RepoExportContext( 

2021 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

2022 ) 

2023 yield helper 

2024 except BaseException: 

2025 raise 

2026 else: 

2027 helper._finish() 

2028 

2029 def import_( 

2030 self, 

2031 *, 

2032 directory: Optional[ResourcePathExpression] = None, 

2033 filename: Union[ResourcePathExpression, TextIO, None] = None, 

2034 format: Optional[str] = None, 

2035 transfer: Optional[str] = None, 

2036 skip_dimensions: Optional[Set] = None, 

2037 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

2038 reuseIds: bool = False, 

2039 ) -> None: 

2040 """Import datasets into this repository that were exported from a 

2041 different butler repository via `~lsst.daf.butler.Butler.export`. 

2042 

2043 Parameters 

2044 ---------- 

2045 directory : `~lsst.resources.ResourcePathExpression`, optional 

2046 Directory containing dataset files to import from. If `None`, 

2047 ``filename`` and all dataset file paths specified therein must 

2048 be absolute. 

2049 filename : `~lsst.resources.ResourcePathExpression` or `TextIO` 

2050 A stream or name of file that contains database information 

2051 associated with the exported datasets, typically generated by 

2052 `~lsst.daf.butler.Butler.export`. If this is a string (name) or 

2053 `~lsst.resources.ResourcePath` and is not an absolute path, 

2054 it will first be looked for relative to ``directory`` and if not 

2055 found there it will be looked for in the current working 

2056 directory. Defaults to "export.{format}". 

2057 format : `str`, optional 

2058 File format for ``filename``. If `None`, the extension of 

2059 ``filename`` will be used. 

2060 transfer : `str`, optional 

2061 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2062 skip_dimensions : `set`, optional 

2063 Names of dimensions that should be skipped and not imported. 

2064 idGenerationMode : `DatasetIdGenEnum`, optional 

2065 Specifies option for generating dataset IDs when IDs are not 

2066 provided or their type does not match backend type. By default 

2067 unique IDs are generated for each inserted dataset. 

2068 reuseIds : `bool`, optional 

2069 If `True` then forces re-use of imported dataset IDs for integer 

2070 IDs which are normally generated as auto-incremented; an exception 

2071 will be raised if imported IDs clash with existing ones. This 

2072 option has no effect on the use of globally-unique IDs which are 

2073 always re-used (or generated if integer IDs are being imported). 

2074 

2075 Raises 

2076 ------ 

2077 TypeError 

2078 Raised if the set of arguments passed is inconsistent, or if the 

2079 butler is read-only. 
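
Examples
--------
A minimal usage sketch, assuming a writeable butler and an export
previously created with `~lsst.daf.butler.Butler.export`; the paths are
illustrative only::

    # Illustrative directory and filename; the format is inferred from
    # the ".yaml" extension.
    butler.import_(directory="/path/to/exported/data", filename="export.yaml",
                   transfer="symlink")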

2080 """ 

2081 if not self.isWriteable(): 

2082 raise TypeError("Butler is read-only.") 

2083 if format is None: 

2084 if filename is None: 

2085 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2086 else: 

2087 _, format = os.path.splitext(filename) # type: ignore 

2088 elif filename is None: 

2089 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

2090 if directory is not None: 

2091 directory = ResourcePath(directory, forceDirectory=True) 

2092 # mypy doesn't think this will work but it does in python >= 3.10. 

2093 if isinstance(filename, ResourcePathExpression): # type: ignore 

2094 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

2095 if not filename.isabs() and directory is not None: 

2096 potential = directory.join(filename) 

2097 exists_in_cwd = filename.exists() 

2098 exists_in_dir = potential.exists() 

2099 if exists_in_cwd and exists_in_dir: 

2100 log.warning( 

2101 "A relative path for filename was specified (%s) which exists relative to cwd. " 

2102 "Additionally, the file exists relative to the given search directory (%s). " 

2103 "Using the export file in the given directory.", 

2104 filename, 

2105 potential, 

2106 ) 

2107 # Given they specified an explicit directory and that 

2108 # directory has the export file in it, assume that that 

2109 # is what was meant despite the file in cwd. 

2110 filename = potential 

2111 elif exists_in_dir: 

2112 filename = potential 

2113 elif not exists_in_cwd and not exists_in_dir: 

2114 # Raise early. 

2115 raise FileNotFoundError( 

2116 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

2117 ) 

2118 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

2119 

2120 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

2121 backend = BackendClass(importStream, self.registry) 

2122 backend.register() 

2123 with self.transaction(): 

2124 backend.load( 

2125 self.datastore, 

2126 directory=directory, 

2127 transfer=transfer, 

2128 skip_dimensions=skip_dimensions, 

2129 idGenerationMode=idGenerationMode, 

2130 reuseIds=reuseIds, 

2131 ) 

2132 

2133 if isinstance(filename, ResourcePath): 

2134 # We cannot use open() here at the moment because of 

2135 # DM-38589, since yaml does stream.read(8192) in a loop. 

2136 stream = io.StringIO(filename.read().decode()) 

2137 doImport(stream) 

2138 else: 

2139 doImport(filename) # type: ignore 

2140 

2141 def transfer_from( 

2142 self, 

2143 source_butler: LimitedButler, 

2144 source_refs: Iterable[DatasetRef], 

2145 transfer: str = "auto", 

2146 skip_missing: bool = True, 

2147 register_dataset_types: bool = False, 

2148 transfer_dimensions: bool = False, 

2149 ) -> collections.abc.Collection[DatasetRef]: 

2150 """Transfer datasets to this Butler from a run in another Butler. 

2151 

2152 Parameters 

2153 ---------- 

2154 source_butler : `LimitedButler` 

2155 Butler from which the datasets are to be transferred. If data IDs 

2156 in ``source_refs`` are not expanded then this has to be a full 

2157 `Butler` whose registry will be used to expand data IDs. 

2158 source_refs : iterable of `DatasetRef` 

2159 Datasets defined in the source butler that should be transferred to 

2160 this butler. 

2161 transfer : `str`, optional 

2162 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2163 skip_missing : `bool` 

2164 If `True`, datasets with no datastore artifact associated with 

2165 them are not transferred. If `False` a registry entry will be 

2166 created even if no datastore record is created (and so will 

2167 look equivalent to the dataset being unstored). 

2168 register_dataset_types : `bool` 

2169 If `True` any missing dataset types are registered. Otherwise 

2170 an exception is raised. 

2171 transfer_dimensions : `bool`, optional 

2172 If `True`, dimension record data associated with the new datasets 

2173 will be transferred. 

2174 

2175 Returns 

2176 ------- 

2177 refs : `list` of `DatasetRef` 

2178 The refs added to this Butler. 

2179 

2180 Notes 

2181 ----- 

2182 The datastore artifact has to exist for a transfer 

2183 to be made but non-existence is not an error. 

2184 

2185 Datasets that already exist in this run will be skipped. 

2186 

2187 The datasets are imported as part of a transaction, although 

2188 dataset types are registered before the transaction is started. 

2189 This means that it is possible for a dataset type to be registered 

2190 even though transfer has failed. 
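
Examples
--------
A minimal usage sketch; the repository path, dataset type and collection
are illustrative only::

    # Illustrative source repository, dataset type and collection.
    source = Butler("/path/to/source/repo")
    refs = source.registry.queryDatasets("calexp", collections="HSC/runs/example")
    transferred = butler.transfer_from(source, refs, transfer="copy",
                                       register_dataset_types=True)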

2191 """ 

2192 if not self.isWriteable(): 

2193 raise TypeError("Butler is read-only.") 

2194 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2195 

2196 # Will iterate through the refs multiple times so need to convert 

2197 # to a list if this isn't a collection. 

2198 if not isinstance(source_refs, collections.abc.Collection): 

2199 source_refs = list(source_refs) 

2200 

2201 original_count = len(source_refs) 

2202 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2203 

2204 # In some situations the datastore artifact may be missing 

2205 # and we do not want that registry entry to be imported. 

2206 # Asking datastore is not sufficient, the records may have been 

2207 # purged, we have to ask for the (predicted) URI and check 

2208 # existence explicitly. Execution butler is set up exactly like 

2209 # this with no datastore records. 

2210 artifact_existence: Dict[ResourcePath, bool] = {} 

2211 if skip_missing: 

2212 dataset_existence = source_butler.datastore.mexists( 

2213 source_refs, artifact_existence=artifact_existence 

2214 ) 

2215 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2216 filtered_count = len(source_refs) 

2217 n_missing = original_count - filtered_count 

2218 log.verbose( 

2219 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

2220 n_missing, 

2221 "" if n_missing == 1 else "s", 

2222 filtered_count, 

2223 ) 

2224 

2225 # Importing requires that we group the refs by dataset type and run 

2226 # before doing the import. 

2227 source_dataset_types = set() 

2228 grouped_refs = defaultdict(list) 

2229 for ref in source_refs: 

2230 grouped_refs[ref.datasetType, ref.run].append(ref) 

2231 source_dataset_types.add(ref.datasetType) 

2232 

2233 # Check to see if the dataset type in the source butler has 

2234 # the same definition in the target butler and register missing 

2235 # ones if requested. Registration must happen outside a transaction. 

2236 newly_registered_dataset_types = set() 

2237 for datasetType in source_dataset_types: 

2238 if register_dataset_types: 

2239 # Let this raise immediately if inconsistent. Continuing 

2240 # on to find additional inconsistent dataset types 

2241 # might result in additional unwanted dataset types being 

2242 # registered. 

2243 if self.registry.registerDatasetType(datasetType): 

2244 newly_registered_dataset_types.add(datasetType) 

2245 else: 

2246 # If the dataset type is missing, let it fail immediately. 

2247 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2248 if target_dataset_type != datasetType: 

2249 raise ConflictingDefinitionError( 

2250 "Source butler dataset type differs from definition" 

2251 f" in target butler: {datasetType} !=" 

2252 f" {target_dataset_type}" 

2253 ) 

2254 if newly_registered_dataset_types: 

2255 # We may have registered some even if there were inconsistencies 

2256 # but should let people know (or else remove them again). 

2257 log.log( 

2258 VERBOSE, 

2259 "Registered the following dataset types in the target Butler: %s", 

2260 ", ".join(d.name for d in newly_registered_dataset_types), 

2261 ) 

2262 else: 

2263 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2264 

2265 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2266 if transfer_dimensions: 

2267 # Collect all the dimension records for these refs. 

2268 # All dimensions are to be copied but the list of valid dimensions 

2269 # come from this butler's universe. 

2270 elements = frozenset( 

2271 element 

2272 for element in self.registry.dimensions.getStaticElements() 

2273 if element.hasTable() and element.viewOf is None 

2274 ) 

2275 dataIds = set(ref.dataId for ref in source_refs) 

2276 # This logic comes from saveDataIds. 

2277 for dataId in dataIds: 

2278 # Need an expanded record, if not expanded that we need a full 

2279 # butler with registry (allow mocks with registry too). 

2280 if not dataId.hasRecords(): 

2281 if registry := getattr(source_butler, "registry", None): 

2282 dataId = registry.expandDataId(dataId) 

2283 else: 

2284 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

2285 # If this butler doesn't know about a dimension in the source 

2286 # butler things will break later. 

2287 for record in dataId.records.values(): 

2288 if record is not None and record.definition in elements: 

2289 dimension_records[record.definition].setdefault(record.dataId, record) 

2290 

2291 handled_collections: Set[str] = set() 

2292 

2293 # Do all the importing in a single transaction. 

2294 with self.transaction(): 

2295 if dimension_records: 

2296 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2297 for element, r in dimension_records.items(): 

2298 records = [r[dataId] for dataId in r] 

2299 # Assume that if the record is already present that we can 

2300 # use it without having to check that the record metadata 

2301 # is consistent. 

2302 self.registry.insertDimensionData(element, *records, skip_existing=True) 

2303 

2304 n_imported = 0 

2305 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2306 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2307 ): 

2308 if run not in handled_collections: 

2309 # May need to create output collection. If source butler 

2310 # has a registry, ask for documentation string. 

2311 run_doc = None 

2312 if registry := getattr(source_butler, "registry", None): 

2313 run_doc = registry.getCollectionDocumentation(run) 

2314 registered = self.registry.registerRun(run, doc=run_doc) 

2315 handled_collections.add(run) 

2316 if registered: 

2317 log.log(VERBOSE, "Creating output run %s", run) 

2318 

2319 n_refs = len(refs_to_import) 

2320 log.verbose( 

2321 "Importing %d ref%s of dataset type %s into run %s", 

2322 n_refs, 

2323 "" if n_refs == 1 else "s", 

2324 datasetType.name, 

2325 run, 

2326 ) 

2327 

2328 # Assume we are using UUIDs and the source refs will match 

2329 # those imported. 

2330 imported_refs = self.registry._importDatasets(refs_to_import, expand=False) 

2331 assert set(imported_refs) == set(refs_to_import) 

2332 n_imported += len(imported_refs) 

2333 

2334 assert len(source_refs) == n_imported 

2335 log.verbose("Imported %d datasets into destination butler", n_imported) 

2336 

2337 # Ask the datastore to transfer. The datastore has to check that 

2338 # the source datastore is compatible with the target datastore. 

2339 accepted, rejected = self.datastore.transfer_from( 

2340 source_butler.datastore, 

2341 source_refs, 

2342 transfer=transfer, 

2343 artifact_existence=artifact_existence, 

2344 ) 

2345 if rejected: 

2346 # For now, accept the registry entries but not the files. 

2347 log.warning( 

2348 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2349 len(rejected), 

2350 len(accepted), 

2351 datasetType, 

2352 run, 

2353 ) 

2354 

2355 return source_refs 

2356 

2357 def validateConfiguration( 

2358 self, 

2359 logFailures: bool = False, 

2360 datasetTypeNames: Optional[Iterable[str]] = None, 

2361 ignore: Iterable[str] | None = None, 

2362 ) -> None: 

2363 """Validate butler configuration. 

2364 

2365 Checks that each `DatasetType` can be stored in the `Datastore`. 

2366 

2367 Parameters 

2368 ---------- 

2369 logFailures : `bool`, optional 

2370 If `True`, output a log message for every validation error 

2371 detected. 

2372 datasetTypeNames : iterable of `str`, optional 

2373 The `DatasetType` names that should be checked. This allows 

2374 only a subset to be selected. 

2375 ignore : iterable of `str`, optional 

2376 Names of DatasetTypes to skip over. This can be used to skip 

2377 known problems. If a named `DatasetType` corresponds to a 

2378 composite, all components of that `DatasetType` will also be 

2379 ignored. 

2380 

2381 Raises 

2382 ------ 

2383 ButlerValidationError 

2384 Raised if there is some inconsistency with how this Butler 

2385 is configured. 
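
Examples
--------
A minimal usage sketch; the ignored dataset type name is illustrative
only::

    # Log every problem found and raise if any validation errors exist.
    butler.validateConfiguration(logFailures=True, ignore=["packages"])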

2386 """ 

2387 if datasetTypeNames: 

2388 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2389 else: 

2390 datasetTypes = list(self.registry.queryDatasetTypes()) 

2391 

2392 # filter out anything from the ignore list 

2393 if ignore: 

2394 ignore = set(ignore) 

2395 datasetTypes = [ 

2396 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2397 ] 

2398 else: 

2399 ignore = set() 

2400 

2401 # Find all the registered instruments 

2402 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2403 

2404 # For each datasetType that has an instrument dimension, create 

2405 # a DatasetRef for each defined instrument 

2406 datasetRefs = [] 

2407 

2408 for datasetType in datasetTypes: 

2409 if "instrument" in datasetType.dimensions: 

2410 for instrument in instruments: 

2411 datasetRef = DatasetRef( 

2412 datasetType, {"instrument": instrument}, conform=False # type: ignore 

2413 ) 

2414 datasetRefs.append(datasetRef) 

2415 

2416 entities: List[Union[DatasetType, DatasetRef]] = [] 

2417 entities.extend(datasetTypes) 

2418 entities.extend(datasetRefs) 

2419 

2420 datastoreErrorStr = None 

2421 try: 

2422 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2423 except ValidationError as e: 

2424 datastoreErrorStr = str(e) 

2425 

2426 # Also check that the LookupKeys used by the datastores match 

2427 # registry and storage class definitions 

2428 keys = self.datastore.getLookupKeys() 

2429 

2430 failedNames = set() 

2431 failedDataId = set() 

2432 for key in keys: 

2433 if key.name is not None: 

2434 if key.name in ignore: 

2435 continue 

2436 

2437 # skip if specific datasetType names were requested and this 

2438 # name does not match 

2439 if datasetTypeNames and key.name not in datasetTypeNames: 

2440 continue 

2441 

2442 # See if it is a StorageClass or a DatasetType 

2443 if key.name in self.storageClasses: 

2444 pass 

2445 else: 

2446 try: 

2447 self.registry.getDatasetType(key.name) 

2448 except KeyError: 

2449 if logFailures: 

2450 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2451 failedNames.add(key) 

2452 else: 

2453 # Dimensions are checked for consistency when the Butler 

2454 # is created and rendezvoused with a universe. 

2455 pass 

2456 

2457 # Check that the instrument is a valid instrument 

2458 # Currently only support instrument so check for that 

2459 if key.dataId: 

2460 dataIdKeys = set(key.dataId) 

2461 if set(["instrument"]) != dataIdKeys: 

2462 if logFailures: 

2463 log.critical("Key '%s' has unsupported DataId override", key) 

2464 failedDataId.add(key) 

2465 elif key.dataId["instrument"] not in instruments: 

2466 if logFailures: 

2467 log.critical("Key '%s' has unknown instrument", key) 

2468 failedDataId.add(key) 

2469 

2470 messages = [] 

2471 

2472 if datastoreErrorStr: 

2473 messages.append(datastoreErrorStr) 

2474 

2475 for failed, msg in ( 

2476 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2477 (failedDataId, "Keys with bad DataId entries: "), 

2478 ): 

2479 if failed: 

2480 msg += ", ".join(str(k) for k in failed) 

2481 messages.append(msg) 

2482 

2483 if messages: 

2484 raise ValidationError(";\n".join(messages)) 

2485 

2486 @property 

2487 def collections(self) -> Sequence[str]: 

2488 """The collections to search by default, in order 

2489 (`Sequence` [ `str` ]). 

2490 

2491 This is an alias for ``self.registry.defaults.collections``. It cannot 

2492 be set directly in isolation, but all defaults may be changed together 

2493 by assigning a new `RegistryDefaults` instance to 

2494 ``self.registry.defaults``. 

2495 """ 

2496 return self.registry.defaults.collections 

2497 

2498 @property 

2499 def run(self) -> Optional[str]: 

2500 """Name of the run this butler writes outputs to by default (`str` or 

2501 `None`). 

2502 

2503 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2504 directly in isolation, but all defaults may be changed together by 

2505 assigning a new `RegistryDefaults` instance to 

2506 ``self.registry.defaults``. 

2507 """ 

2508 return self.registry.defaults.run 

2509 

2510 @property 

2511 def dimensions(self) -> DimensionUniverse: 

2512 # Docstring inherited. 

2513 return self.registry.dimensions 

2514 

2515 registry: Registry 

2516 """The object that manages dataset metadata and relationships (`Registry`). 

2517 

2518 Most operations that don't involve reading or writing butler datasets are 

2519 accessible only via `Registry` methods. 

2520 """ 

2521 

2522 datastore: Datastore 

2523 """The object that manages actual dataset storage (`Datastore`). 

2524 

2525 Direct user access to the datastore should rarely be necessary; the primary 

2526 exception is the case where a `Datastore` implementation provides extra 

2527 functionality beyond what the base class defines. 

2528 """ 

2529 

2530 storageClasses: StorageClassFactory 

2531 """An object that maps known storage class names to objects that fully 

2532 describe them (`StorageClassFactory`). 

2533 """ 

2534 

2535 _allow_put_of_predefined_dataset: bool 

2536 """Allow a put to succeed even if there is already a registry entry for it 

2537 but not a datastore record. (`bool`)."""