Coverage for python/lsst/daf/butler/_butler.py: 8%

699 statements  

coverage.py v6.5.0, created at 2023-03-23 02:06 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36import collections.abc 

37import contextlib 

38import logging 

39import numbers 

40import os 

41import uuid 

42from collections import defaultdict 

43from typing import ( 

44 Any, 

45 ClassVar, 

46 Counter, 

47 Dict, 

48 Iterable, 

49 Iterator, 

50 List, 

51 MutableMapping, 

52 Optional, 

53 Sequence, 

54 Set, 

55 TextIO, 

56 Tuple, 

57 Type, 

58 Union, 

59) 

60 

61from lsst.resources import ResourcePath, ResourcePathExpression 

62from lsst.utils import doImportType 

63from lsst.utils.introspection import get_class_of 

64from lsst.utils.logging import VERBOSE, getLogger 

65 

66from ._butlerConfig import ButlerConfig 

67from ._butlerRepoIndex import ButlerRepoIndex 

68from ._deferredDatasetHandle import DeferredDatasetHandle 

69from ._limited_butler import LimitedButler 

70from .core import ( 

71 AmbiguousDatasetError, 

72 Config, 

73 ConfigSubset, 

74 DataCoordinate, 

75 DataId, 

76 DataIdValue, 

77 DatasetRef, 

78 DatasetRefURIs, 

79 DatasetType, 

80 Datastore, 

81 Dimension, 

82 DimensionConfig, 

83 DimensionElement, 

84 DimensionRecord, 

85 DimensionUniverse, 

86 FileDataset, 

87 Progress, 

88 StorageClass, 

89 StorageClassFactory, 

90 Timespan, 

91 ValidationError, 

92) 

93from .core.repoRelocation import BUTLER_ROOT_TAG 

94from .core.utils import transactional 

95from .registry import ( 

96 CollectionType, 

97 ConflictingDefinitionError, 

98 DataIdError, 

99 DatasetIdGenEnum, 

100 MissingDatasetTypeError, 

101 Registry, 

102 RegistryConfig, 

103 RegistryDefaults, 

104) 

105from .transfers import RepoExportContext 

106 

107log = getLogger(__name__) 

108 

109 

110class ButlerValidationError(ValidationError): 

111 """There is a problem with the Butler configuration.""" 

112 

113 pass 

114 

115 

116class PruneCollectionsArgsError(TypeError): 

117 """Base class for errors relating to Butler.pruneCollections input 

118 arguments. 

119 """ 

120 

121 pass 

122 

123 

124class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

125 """Raised when purge and unstore are both required to be True, and 

126 purge is True but unstore is False. 

127 """ 

128 

129 def __init__(self) -> None: 

130 super().__init__("Cannot pass purge=True without unstore=True.") 

131 

132 

133class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

134 """Raised when pruning a RUN collection but purge is False.""" 

135 

136 def __init__(self, collectionType: CollectionType): 

137 self.collectionType = collectionType 

138 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

139 

140 

141class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

142 """Raised when purge is True but is not supported for the given 

143 collection.""" 

144 

145 def __init__(self, collectionType: CollectionType): 

146 self.collectionType = collectionType 

147 super().__init__( 

148 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True." 

149 ) 

150 

151 

152class Butler(LimitedButler): 

153 """Main entry point for the data access system. 

154 

155 Parameters 

156 ---------- 

157 config : `ButlerConfig`, `Config` or `str`, optional. 

158 Configuration. Anything acceptable to the 

159 `ButlerConfig` constructor. If a directory path 

160 is given the configuration will be read from a ``butler.yaml`` file in 

161 that location. If `None` is given default values will be used. 

162 butler : `Butler`, optional. 

163 If provided, construct a new Butler that uses the same registry and 

164 datastore as the given one, but with the given collection and run. 

165 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

166 arguments. 

167 collections : `str` or `Iterable` [ `str` ], optional 

168 An expression specifying the collections to be searched (in order) when 

169 reading datasets. 

170 This may be a `str` collection name or an iterable thereof. 

171 See :ref:`daf_butler_collection_expressions` for more information. 

172 These collections are not registered automatically and must be 

173 manually registered before they are used by any method, but they may be 

174 manually registered after the `Butler` is initialized. 

175 run : `str`, optional 

176 Name of the `~CollectionType.RUN` collection new datasets should be 

177 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

178 ``collections`` will be set to ``[run]``. If not `None`, this 

179 collection will automatically be registered. If this is not set (and 

180 ``writeable`` is not set either), a read-only butler will be created. 

181 searchPaths : `list` of `str`, optional 

182 Directory paths to search when calculating the full Butler 

183 configuration. Not used if the supplied config is already a 

184 `ButlerConfig`. 

185 writeable : `bool`, optional 

186 Explicitly sets whether the butler supports write operations. If not 

187 provided, a read-write butler is created if any of ``run``, ``tags``, 

188 or ``chains`` is non-empty. 

189 inferDefaults : `bool`, optional 

190 If `True` (default) infer default data ID values from the values 

191 present in the datasets in ``collections``: if all collections have the 

192 same value (or no value) for a governor dimension, that value will be 

193 the default for that dimension. Nonexistent collections are ignored. 

194 If a default value is provided explicitly for a governor dimension via 

195 ``**kwargs``, no default will be inferred for that dimension. 

196 **kwargs : `str` 

197 Default data ID key-value pairs. These may only identify "governor" 

198 dimensions like ``instrument`` and ``skymap``. 

199 

200 Examples 

201 -------- 

202 While there are many ways to control exactly how a `Butler` interacts with 

203 the collections in its `Registry`, the most common cases are still simple. 

204 

205 For a read-only `Butler` that searches one collection, do:: 

206 

207 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

208 

209 For a read-write `Butler` that writes to and reads from a 

210 `~CollectionType.RUN` collection:: 

211 

212 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

213 

214 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

215 because we want to write to one `~CollectionType.RUN` collection but read 

216 from several others (as well):: 

217 

218 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

219 collections=["u/alice/DM-50000/a", 

220 "u/bob/DM-49998", 

221 "HSC/defaults"]) 

222 

223 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

224 Datasets will be read first from that run (since it appears first in the 

225 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

226 

227 Finally, one can always create a `Butler` with no collections:: 

228 

229 butler = Butler("/path/to/repo", writeable=True) 

230 

231 This can be extremely useful when you just want to use ``butler.registry``, 

232 e.g. for inserting dimension data or managing collections, or when the 

233 collections you want to use with the butler are not consistent. 

234 Passing ``writeable`` explicitly here is only necessary if you want to be 

235 able to make changes to the repo - usually the value for ``writeable`` can 

236 be guessed from the collection arguments provided, but it defaults to 

237 `False` when there are no collection arguments.

238 """ 

239 

240 def __init__( 

241 self, 

242 config: Union[Config, str, None] = None, 

243 *, 

244 butler: Optional[Butler] = None, 

245 collections: Any = None, 

246 run: Optional[str] = None, 

247 searchPaths: Optional[List[str]] = None, 

248 writeable: Optional[bool] = None, 

249 inferDefaults: bool = True, 

250 **kwargs: str, 

251 ): 

252 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

253 # Load registry, datastore, etc. from config or existing butler. 

254 if butler is not None: 

255 if config is not None or searchPaths is not None or writeable is not None: 

256 raise TypeError( 

257 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

258 ) 

259 self.registry = butler.registry.copy(defaults) 

260 self.datastore = butler.datastore 

261 self.storageClasses = butler.storageClasses 

262 self._config: ButlerConfig = butler._config 

263 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

264 else: 

265 # Can only look for strings in the known repos list. 

266 if isinstance(config, str) and config in self.get_known_repos(): 

267 config = str(self.get_repo_uri(config)) 

268 try: 

269 self._config = ButlerConfig(config, searchPaths=searchPaths) 

270 except FileNotFoundError as e: 

271 if known := self.get_known_repos(): 

272 aliases = f"(known aliases: {', '.join(known)})" 

273 else: 

274 aliases = "(no known aliases)" 

275 raise FileNotFoundError(f"{e} {aliases}") from e 

277 try: 

278 if "root" in self._config: 

279 butlerRoot = self._config["root"] 

280 else: 

281 butlerRoot = self._config.configDir 

282 if writeable is None: 

283 writeable = run is not None 

284 self.registry = Registry.fromConfig( 

285 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

286 ) 

287 self.datastore = Datastore.fromConfig( 

288 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

289 ) 

290 self.storageClasses = StorageClassFactory() 

291 self.storageClasses.addFromConfig(self._config) 

292 self._allow_put_of_predefined_dataset = self._config.get( 

293 "allow_put_of_predefined_dataset", False 

294 ) 

295 except Exception: 

296 # Failures here usually mean that configuration is incomplete, 

297 # just issue an error message which includes config file URI. 

298 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

299 raise 

300 

301 # For execution butler the datastore needs a special 

302 # dependency-inversion trick. This is not used by regular butler, 

303 # but we do not have a way to distinguish regular butler from execution 

304 # butler. 

305 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

306 

307 if "run" in self._config or "collection" in self._config: 

308 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

309 

310 GENERATION: ClassVar[int] = 3 

311 """This is a Generation 3 Butler. 

312 

313 This attribute may be removed in the future, once the Generation 2 Butler 

314 interface has been fully retired; it should only be used in transitional 

315 code. 

316 """ 

317 

318 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

319 """Return DatasetType defined in registry given dataset type name.""" 

320 try: 

321 return self.registry.getDatasetType(name) 

322 except MissingDatasetTypeError: 

323 return None 

324 

325 @classmethod 

326 def get_repo_uri(cls, label: str) -> ResourcePath: 

327 """Look up the label in a butler repository index. 

328 

329 Parameters 

330 ---------- 

331 label : `str` 

332 Label of the Butler repository to look up. 

333 

334 Returns 

335 ------- 

336 uri : `lsst.resources.ResourcePath` 

337 URI to the Butler repository associated with the given label. 

338 

339 Raises 

340 ------ 

341 KeyError 

342 Raised if the label is not found in the index, or if an index 

343 can not be found at all. 

344 

345 Notes 

346 ----- 

347 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

348 information is discovered. 

349 """ 

350 return ButlerRepoIndex.get_repo_uri(label) 

351 

352 @classmethod 

353 def get_known_repos(cls) -> Set[str]: 

354 """Retrieve the list of known repository labels. 

355 

356 Returns 

357 ------- 

358 repos : `set` of `str` 

359 All the known labels. Can be empty if no index can be found. 

360 

361 Notes 

362 ----- 

363 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

364 information is discovered. 

365 """ 

366 return ButlerRepoIndex.get_known_repos() 
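
# Illustrative sketch of the repository-index helpers above; the "main" label
# is hypothetical and resolves only if a butler repository index is configured.
#
#     from lsst.daf.butler import Butler
#
#     labels = Butler.get_known_repos()       # empty set if no index is available
#     if "main" in labels:
#         uri = Butler.get_repo_uri("main")   # lsst.resources.ResourcePath to the repo
#         butler = Butler(str(uri))           # labels are also accepted directly by Butler()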

367 

368 @staticmethod 

369 def makeRepo( 

370 root: ResourcePathExpression, 

371 config: Union[Config, str, None] = None, 

372 dimensionConfig: Union[Config, str, None] = None, 

373 standalone: bool = False, 

374 searchPaths: Optional[List[str]] = None, 

375 forceConfigRoot: bool = True, 

376 outfile: Optional[ResourcePathExpression] = None, 

377 overwrite: bool = False, 

378 ) -> Config: 

379 """Create an empty data repository by adding a butler.yaml config 

380 to a repository root directory. 

381 

382 Parameters 

383 ---------- 

384 root : `lsst.resources.ResourcePathExpression` 

385 Path or URI to the root location of the new repository. Will be 

386 created if it does not exist. 

387 config : `Config` or `str`, optional 

388 Configuration to write to the repository, after setting any 

389 root-dependent Registry or Datastore config options. Can not 

390 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

391 configuration will be used. Root-dependent config options 

392 specified in this config are overwritten if ``forceConfigRoot`` 

393 is `True`. 

394 dimensionConfig : `Config` or `str`, optional 

395 Configuration for dimensions, will be used to initialize registry 

396 database. 

397 standalone : `bool` 

398 If True, write all expanded defaults, not just customized or 

399 repository-specific settings. 

400 This (mostly) decouples the repository from the default 

401 configuration, insulating it from changes to the defaults (which 

402 may be good or bad, depending on the nature of the changes). 

403 Future *additions* to the defaults will still be picked up when 

404 initializing `Butlers` to repos created with ``standalone=True``. 

405 searchPaths : `list` of `str`, optional 

406 Directory paths to search when calculating the full butler 

407 configuration. 

408 forceConfigRoot : `bool`, optional 

409 If `False`, any values present in the supplied ``config`` that 

410 would normally be reset are not overridden and will appear 

411 directly in the output config. This allows non-standard overrides 

412 of the root directory for a datastore or registry to be given. 

413 If this parameter is `True` the values for ``root`` will be 

414 forced into the resulting config if appropriate. 

415 outfile : `lsst.resources.ResourcePathExpression`, optional

416 If not-`None`, the output configuration will be written to this 

417 location rather than into the repository itself. Can be a URI 

418 string. Can refer to a directory that will be used to write 

419 ``butler.yaml``. 

420 overwrite : `bool`, optional 

421 Create a new configuration file even if one already exists 

422 in the specified output location. Default is to raise 

423 an exception. 

424 

425 Returns 

426 ------- 

427 config : `Config` 

428 The updated `Config` instance written to the repo. 

429 

430 Raises 

431 ------ 

432 ValueError 

433 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

434 regular Config (as these subclasses would make it impossible to 

435 support ``standalone=False``). 

436 FileExistsError 

437 Raised if the output config file already exists. 

438 os.error 

439 Raised if the directory does not exist, exists but is not a 

440 directory, or cannot be created. 

441 

442 Notes 

443 ----- 

444 Note that when ``standalone=False`` (the default), the configuration 

445 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

446 construct the repository should also be used to construct any Butlers 

447 to avoid configuration inconsistencies. 

448 """ 

449 if isinstance(config, (ButlerConfig, ConfigSubset)): 

450 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

451 

452 # Ensure that the root of the repository exists or can be made 

453 root_uri = ResourcePath(root, forceDirectory=True) 

454 root_uri.mkdir() 

455 

456 config = Config(config) 

457 

458 # If we are creating a new repo from scratch with relative roots, 

459 # do not propagate an explicit root from the config file 

460 if "root" in config: 

461 del config["root"] 

462 

463 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

464 imported_class = doImportType(full["datastore", "cls"]) 

465 if not issubclass(imported_class, Datastore): 

466 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

467 datastoreClass: Type[Datastore] = imported_class 

468 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

469 

470 # if key exists in given config, parse it, otherwise parse the defaults 

471 # in the expanded config 

472 if config.get(("registry", "db")): 

473 registryConfig = RegistryConfig(config) 

474 else: 

475 registryConfig = RegistryConfig(full) 

476 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

477 if defaultDatabaseUri is not None: 

478 Config.updateParameters( 

479 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

480 ) 

481 else: 

482 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

483 

484 if standalone: 

485 config.merge(full) 

486 else: 

487 # Always expand the registry.managers section into the per-repo 

488 # config, because after the database schema is created, it's not 

489 # allowed to change anymore. Note that in the standalone=True 

490 # branch, _everything_ in the config is expanded, so there's no 

491 # need to special case this. 

492 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

493 configURI: ResourcePathExpression 

494 if outfile is not None: 

495 # When writing to a separate location we must include 

496 # the root of the butler repo in the config else it won't know 

497 # where to look. 

498 config["root"] = root_uri.geturl() 

499 configURI = outfile 

500 else: 

501 configURI = root_uri 

502 # Strip obscore configuration, if it is present, before writing config 

503 # to a file, obscore config will be stored in registry. 

504 config_to_write = config 

505 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config: 

506 config_to_write = config.copy() 

507 del config_to_write[obscore_config_key] 

508 config_to_write.dumpToUri(configURI, overwrite=overwrite) 

509 

510 # Create Registry and populate tables 

511 registryConfig = RegistryConfig(config.get("registry")) 

512 dimensionConfig = DimensionConfig(dimensionConfig) 

513 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

514 

515 log.verbose("Wrote new Butler configuration file to %s", configURI) 

516 

517 return config 
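
# A minimal sketch of creating a repository with makeRepo and then opening it;
# the path is hypothetical and default configuration is used throughout.
#
#     from lsst.daf.butler import Butler
#
#     Butler.makeRepo("/path/to/new_repo")     # writes butler.yaml and creates the registry
#     butler = Butler("/path/to/new_repo", writeable=True)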

518 

519 @classmethod 

520 def _unpickle( 

521 cls, 

522 config: ButlerConfig, 

523 collections: Optional[tuple[str, ...]], 

524 run: Optional[str], 

525 defaultDataId: Dict[str, str], 

526 writeable: bool, 

527 ) -> Butler: 

528 """Callable used to unpickle a Butler. 

529 

530 We prefer not to use ``Butler.__init__`` directly so we can force some 

531 of its many arguments to be keyword-only (note that ``__reduce__`` 

532 can only invoke callables with positional arguments). 

533 

534 Parameters 

535 ---------- 

536 config : `ButlerConfig` 

537 Butler configuration, already coerced into a true `ButlerConfig` 

538 instance (and hence after any search paths for overrides have been 

539 utilized). 

540 collections : `tuple` [ `str` ] 

541 Names of the default collections to read from. 

542 run : `str`, optional 

543 Name of the default `~CollectionType.RUN` collection to write to. 

544 defaultDataId : `dict` [ `str`, `str` ] 

545 Default data ID values. 

546 writeable : `bool` 

547 Whether the Butler should support write operations. 

548 

549 Returns 

550 ------- 

551 butler : `Butler` 

552 A new `Butler` instance. 

553 """ 

554 # MyPy doesn't recognize that the kwargs below are totally valid; it 

555 # seems to think ``**defaultDataId`` is a _positional_ argument!

556 return cls( 

557 config=config, 

558 collections=collections, 

559 run=run, 

560 writeable=writeable, 

561 **defaultDataId, # type: ignore 

562 ) 

563 

564 def __reduce__(self) -> tuple: 

565 """Support pickling.""" 

566 return ( 

567 Butler._unpickle, 

568 ( 

569 self._config, 

570 self.collections, 

571 self.run, 

572 self.registry.defaults.dataId.byName(), 

573 self.registry.isWriteable(), 

574 ), 

575 ) 
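
# Because __reduce__ captures the config, default collections, run, default
# data ID, and writeability, a Butler survives a pickle round trip, e.g. for
# multiprocessing. Sketch (repo path and run name are hypothetical):
#
#     import pickle
#
#     butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
#     clone = pickle.loads(pickle.dumps(butler))
#     assert clone.run == butler.run and clone.collections == butler.collections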

576 

577 def __str__(self) -> str: 

578 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

579 self.collections, self.run, self.datastore, self.registry 

580 ) 

581 

582 def isWriteable(self) -> bool: 

583 """Return `True` if this `Butler` supports write operations.""" 

584 return self.registry.isWriteable() 

585 

586 @contextlib.contextmanager 

587 def transaction(self) -> Iterator[None]: 

588 """Context manager supporting `Butler` transactions. 

589 

590 Transactions can be nested. 

591 """ 

592 with self.registry.transaction(): 

593 with self.datastore.transaction(): 

594 yield 
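
# Sketch of the transaction behaviour: if anything inside the context raises,
# both registry and datastore changes are rolled back (dataset type, data ID
# values, and objects below are hypothetical).
#
#     with butler.transaction():
#         butler.put(catalog_a, "src", visit=903334, detector=20)
#         butler.put(catalog_b, "src", visit=903334, detector=21)
#     # Neither dataset is registered or stored if the second put fails.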

595 

596 def _standardizeArgs( 

597 self, 

598 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

599 dataId: Optional[DataId] = None, 

600 for_put: bool = True, 

601 **kwargs: Any, 

602 ) -> Tuple[DatasetType, Optional[DataId]]: 

603 """Standardize the arguments passed to several Butler APIs. 

604 

605 Parameters 

606 ---------- 

607 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

608 When `DatasetRef` the `dataId` should be `None`. 

609 Otherwise the `DatasetType` or name thereof. 

610 dataId : `dict` or `DataCoordinate` 

611 A `dict` of `Dimension` link name, value pairs that label the 

612 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

613 should be provided as the first argument.

614 for_put : `bool`, optional 

615 If `True` this call is invoked as part of a `Butler.put()`. 

616 Otherwise it is assumed to be part of a `Butler.get()`. This 

617 parameter is only relevant if there is dataset type 

618 inconsistency. 

619 **kwargs 

620 Additional keyword arguments used to augment or construct a 

621 `DataCoordinate`. See `DataCoordinate.standardize` 

622 parameters. 

623 

624 Returns 

625 ------- 

626 datasetType : `DatasetType` 

627 A `DatasetType` instance extracted from ``datasetRefOrType``. 

628 dataId : `dict` or `DataId`, optional 

629 Argument that can be used (along with ``kwargs``) to construct a 

630 `DataId`. 

631 

632 Notes 

633 ----- 

634 Butler APIs that conceptually need a DatasetRef also allow passing a 

635 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

636 keyword arguments that can be used to construct one) separately. This 

637 method accepts those arguments and always returns a true `DatasetType` 

638 and a `DataId` or `dict`. 

639 

640 Standardization of `dict` vs `DataId` is best handled by passing the 

641 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

642 generally similarly flexible. 

643 """ 

644 externalDatasetType: Optional[DatasetType] = None 

645 internalDatasetType: Optional[DatasetType] = None 

646 if isinstance(datasetRefOrType, DatasetRef): 

647 if dataId is not None or kwargs: 

648 raise ValueError("DatasetRef given, cannot use dataId as well") 

649 externalDatasetType = datasetRefOrType.datasetType 

650 dataId = datasetRefOrType.dataId 

651 else: 

652 # Don't check whether DataId is provided, because Registry APIs 

653 # can usually construct a better error message when it wasn't. 

654 if isinstance(datasetRefOrType, DatasetType): 

655 externalDatasetType = datasetRefOrType 

656 else: 

657 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

658 

659 # Check that they are self-consistent 

660 if externalDatasetType is not None: 

661 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

662 if externalDatasetType != internalDatasetType: 

663 # We can allow differences if they are compatible, depending 

664 # on whether this is a get or a put. A get requires that 

665 # the python type associated with the datastore can be 

666 # converted to the user type. A put requires that the user 

667 # supplied python type can be converted to the internal 

668 # type expected by registry. 

669 relevantDatasetType = internalDatasetType 

670 if for_put: 

671 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

672 else: 

673 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

674 relevantDatasetType = externalDatasetType 

675 if not is_compatible: 

676 raise ValueError( 

677 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

678 f"registry definition ({internalDatasetType})" 

679 ) 

680 # Override the internal definition. 

681 internalDatasetType = relevantDatasetType 

682 

683 assert internalDatasetType is not None 

684 return internalDatasetType, dataId 
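
# The flexibility standardized here means the following public-API calls are
# equivalent ways of identifying the same dataset (names and values are
# hypothetical):
#
#     butler.get("calexp", {"instrument": "HSC", "visit": 903334, "detector": 20})
#     butler.get("calexp", instrument="HSC", visit=903334, detector=20)
#     butler.get(ref)      # a DatasetRef previously resolved via the registry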

685 

686 def _rewrite_data_id( 

687 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any 

688 ) -> Tuple[Optional[DataId], Dict[str, Any]]: 

689 """Rewrite a data ID taking into account dimension records. 

690 

691 Take a Data ID and keyword args and rewrite it if necessary to 

692 allow the user to specify dimension records rather than dimension 

693 primary values. 

694 

695 This allows a user to include a dataId dict with keys of 

696 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

697 the integer exposure ID. It also allows a string to be given 

698 for a dimension value rather than the integer ID if that is more 

699 convenient. For example, rather than having to specify the

700 detector with ``detector.full_name``, a string given for ``detector`` 

701 will be interpreted as the full name and converted to the integer 

702 value. 

703 

704 Keyword arguments can also use strings for dimensions like detector 

705 and exposure but python does not allow them to include ``.`` and 

706 so the ``exposure.day_obs`` syntax can not be used in a keyword 

707 argument. 

708 

709 Parameters 

710 ---------- 

711 dataId : `dict` or `DataCoordinate` 

712 A `dict` of `Dimension` link name, value pairs that will label the 

713 `DatasetRef` within a Collection. 

714 datasetType : `DatasetType` 

715 The dataset type associated with this dataId. Required to 

716 determine the relevant dimensions. 

717 **kwargs 

718 Additional keyword arguments used to augment or construct a 

719 `DataId`. See `DataId` parameters. 

720 

721 Returns 

722 ------- 

723 dataId : `dict` or `DataCoordinate` 

724 The possibly rewritten dataId. If given a `DataCoordinate` and

725 no keyword arguments, the original dataId will be returned 

726 unchanged. 

727 **kwargs : `dict` 

728 Any unused keyword arguments (would normally be empty dict). 

729 """ 

730 # Do nothing if we have a standalone DataCoordinate. 

731 if isinstance(dataId, DataCoordinate) and not kwargs: 

732 return dataId, kwargs 

733 

734 # Process dimension records that are using record information 

735 # rather than ids 

736 newDataId: Dict[str, DataIdValue] = {} 

737 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

738 

739 # if all the dataId comes from keyword parameters we do not need 

740 # to do anything here because they can't be of the form 

741 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

742 if dataId: 

743 for k, v in dataId.items(): 

744 # If we have a Dimension we do not need to do anything 

745 # because it cannot be a compound key. 

746 if isinstance(k, str) and "." in k: 

747 # Someone is using a more human-readable dataId 

748 dimensionName, record = k.split(".", 1) 

749 byRecord[dimensionName][record] = v 

750 elif isinstance(k, Dimension): 

751 newDataId[k.name] = v 

752 else: 

753 newDataId[k] = v 

754 

755 # Go through the updated dataId and check the type in case someone is 

756 # using an alternate key. We have already filtered out the compound 

757 # keys in ``dimension.record`` format.

758 not_dimensions = {} 

759 

760 # Will need to look in the dataId and the keyword arguments 

761 # and will remove them if they need to be fixed or are unrecognized. 

762 for dataIdDict in (newDataId, kwargs): 

763 # Use a list so we can adjust the dict safely in the loop 

764 for dimensionName in list(dataIdDict): 

765 value = dataIdDict[dimensionName] 

766 try: 

767 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

768 except KeyError: 

769 # This is not a real dimension 

770 not_dimensions[dimensionName] = value 

771 del dataIdDict[dimensionName] 

772 continue 

773 

774 # Convert an integral type to an explicit int to simplify 

775 # comparisons here 

776 if isinstance(value, numbers.Integral): 

777 value = int(value) 

778 

779 if not isinstance(value, dimension.primaryKey.getPythonType()): 

780 for alternate in dimension.alternateKeys: 

781 if isinstance(value, alternate.getPythonType()): 

782 byRecord[dimensionName][alternate.name] = value 

783 del dataIdDict[dimensionName] 

784 log.debug( 

785 "Converting dimension %s to %s.%s=%s", 

786 dimensionName, 

787 dimensionName, 

788 alternate.name, 

789 value, 

790 ) 

791 break 

792 else: 

793 log.warning( 

794 "Type mismatch found for value '%r' provided for dimension %s. " 

795 "Could not find matching alternative (primary key has type %s) " 

796 "so attempting to use as-is.", 

797 value, 

798 dimensionName, 

799 dimension.primaryKey.getPythonType(), 

800 ) 

801 

802 # By this point kwargs and newDataId should only include valid 

803 # dimensions. Merge kwargs in to the new dataId and log if there 

804 # are dimensions in both (rather than calling update). 

805 for k, v in kwargs.items(): 

806 if k in newDataId and newDataId[k] != v: 

807 log.debug( 

808 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

809 ) 

810 newDataId[k] = v 

811 # No need to retain any values in kwargs now. 

812 kwargs = {} 

813 

814 # If we have some unrecognized dimensions we have to try to connect 

815 # them to records in other dimensions. This is made more complicated 

816 # by some dimensions having records with clashing names. A mitigation 

817 # is that we can tell by this point which dimensions are missing 

818 # for the DatasetType but this does not work for calibrations 

819 # where additional dimensions can be used to constrain the temporal 

820 # axis. 

821 if not_dimensions: 

822 # Search for all dimensions even if we have been given a value 

823 # explicitly. In some cases records are given as well as the 

824 # actual dimension and this should not be an error if they

825 # match. 

826 mandatoryDimensions = datasetType.dimensions.names # - provided 

827 

828 candidateDimensions: Set[str] = set() 

829 candidateDimensions.update(mandatoryDimensions) 

830 

831 # For calibrations we may well be needing temporal dimensions 

832 # so rather than always including all dimensions in the scan 

833 # restrict things a little. It is still possible for there 

834 # to be confusion over day_obs in visit vs exposure for example. 

835 # If we are not searching calibration collections things may 

836 # fail but they are going to fail anyway because of the 

837 # ambiguity of the dataId...

838 if datasetType.isCalibration(): 

839 for dim in self.registry.dimensions.getStaticDimensions(): 

840 if dim.temporal: 

841 candidateDimensions.add(str(dim)) 

842 

843 # Look up table for the first association with a dimension 

844 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

845 

846 # Keep track of whether an item is associated with multiple 

847 # dimensions. 

848 counter: Counter[str] = Counter() 

849 assigned: Dict[str, Set[str]] = defaultdict(set) 

850 

851 # Go through the missing dimensions and associate the 

852 # given names with records within those dimensions 

853 matched_dims = set() 

854 for dimensionName in candidateDimensions: 

855 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

856 fields = dimension.metadata.names | dimension.uniqueKeys.names 

857 for field in not_dimensions: 

858 if field in fields: 

859 guessedAssociation[dimensionName][field] = not_dimensions[field] 

860 counter[dimensionName] += 1 

861 assigned[field].add(dimensionName) 

862 matched_dims.add(field) 

863 

864 # Calculate the fields that matched nothing. 

865 never_found = set(not_dimensions) - matched_dims 

866 

867 if never_found: 

868 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

869 

870 # There is a chance we have allocated a single dataId item 

871 # to multiple dimensions. Need to decide which should be retained. 

872 # For now assume that the most popular alternative wins. 

873 # This means that day_obs with seq_num will result in 

874 # exposure.day_obs and not visit.day_obs 

875 # Also prefer an explicitly missing dimension over an inferred 

876 # temporal dimension. 

877 for fieldName, assignedDimensions in assigned.items(): 

878 if len(assignedDimensions) > 1: 

879 # Pick the most popular (preferring mandatory dimensions) 

880 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

881 if requiredButMissing: 

882 candidateDimensions = requiredButMissing 

883 else: 

884 candidateDimensions = assignedDimensions 

885 

886 # If this is a choice between visit and exposure and 

887 # neither was a required part of the dataset type, 

888 # (hence in this branch) always prefer exposure over 

889 # visit since exposures are always defined and visits 

890 # are defined from exposures. 

891 if candidateDimensions == {"exposure", "visit"}: 

892 candidateDimensions = {"exposure"} 

893 

894 # Select the relevant items and get a new restricted 

895 # counter. 

896 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

897 duplicatesCounter: Counter[str] = Counter() 

898 duplicatesCounter.update(theseCounts) 

899 

900 # Choose the most common. If they are equally common 

901 # we will pick the one that was found first. 

902 # Returns a list of tuples 

903 selected = duplicatesCounter.most_common(1)[0][0] 

904 

905 log.debug( 

906 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

907 " Removed ambiguity by choosing dimension %s.", 

908 fieldName, 

909 ", ".join(assignedDimensions), 

910 selected, 

911 ) 

912 

913 for candidateDimension in assignedDimensions: 

914 if candidateDimension != selected: 

915 del guessedAssociation[candidateDimension][fieldName] 

916 

917 # Update the record look up dict with the new associations 

918 for dimensionName, values in guessedAssociation.items(): 

919 if values: # A dict might now be empty 

920 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

921 byRecord[dimensionName].update(values) 

922 

923 if byRecord: 

924 # Some record specifiers were found so we need to convert 

925 # them to the Id form 

926 for dimensionName, values in byRecord.items(): 

927 if dimensionName in newDataId: 

928 log.debug( 

929 "DataId specified explicit %s dimension value of %s in addition to" 

930 " general record specifiers for it of %s. Ignoring record information.", 

931 dimensionName, 

932 newDataId[dimensionName], 

933 str(values), 

934 ) 

935 # Get the actual record and compare with these values. 

936 try: 

937 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

938 except DataIdError: 

939 raise ValueError( 

940 f"Could not find dimension '{dimensionName}'" 

941 f" with dataId {newDataId} as part of comparing with" 

942 f" record values {byRecord[dimensionName]}" 

943 ) from None 

944 if len(recs) == 1: 

945 errmsg: List[str] = [] 

946 for k, v in values.items(): 

947 if (recval := getattr(recs[0], k)) != v: 

948 errmsg.append(f"{k}({recval} != {v})") 

949 if errmsg: 

950 raise ValueError( 

951 f"Dimension {dimensionName} in dataId has explicit value" 

952 " inconsistent with records: " + ", ".join(errmsg) 

953 ) 

954 else: 

955 # Multiple matches for an explicit dimension 

956 # should never happen but let downstream complain. 

957 pass 

958 continue 

959 

960 # Build up a WHERE expression 

961 bind = {k: v for k, v in values.items()} 

962 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

963 

964 # Hopefully we get a single record that matches 

965 records = set( 

966 self.registry.queryDimensionRecords( 

967 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

968 ) 

969 ) 

970 

971 if len(records) != 1: 

972 if len(records) > 1: 

973 # visit can have an ambiguous answer without involving 

974 # visit_system. The default visit_system is defined 

975 # by the instrument. 

976 if ( 

977 dimensionName == "visit" 

978 and "visit_system_membership" in self.registry.dimensions 

979 and "visit_system" in self.registry.dimensions["instrument"].metadata 

980 ): 

981 instrument_records = list( 

982 self.registry.queryDimensionRecords( 

983 "instrument", 

984 dataId=newDataId, 

985 **kwargs, 

986 ) 

987 ) 

988 if len(instrument_records) == 1: 

989 visit_system = instrument_records[0].visit_system 

990 if visit_system is None: 

991 # Set to a value that will never match. 

992 visit_system = -1 

993 

994 # Look up each visit in the 

995 # visit_system_membership records. 

996 for rec in records: 

997 membership = list( 

998 self.registry.queryDimensionRecords( 

999 # Use bind to allow zero results. 

1000 # This is a fully-specified query. 

1001 "visit_system_membership", 

1002 where="instrument = inst AND visit_system = system AND visit = v", 

1003 bind=dict( 

1004 inst=instrument_records[0].name, system=visit_system, v=rec.id 

1005 ), 

1006 ) 

1007 ) 

1008 if membership: 

1009 # This record is the right answer. 

1010 records = set([rec]) 

1011 break 

1012 

1013 # The ambiguity may have been resolved so check again. 

1014 if len(records) > 1: 

1015 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

1016 for r in records: 

1017 log.debug("- %s", str(r)) 

1018 raise ValueError( 

1019 f"DataId specification for dimension {dimensionName} is not" 

1020 f" uniquely constrained to a single dataset by {values}." 

1021 f" Got {len(records)} results." 

1022 ) 

1023 else: 

1024 raise ValueError( 

1025 f"DataId specification for dimension {dimensionName} matched no" 

1026 f" records when constrained by {values}" 

1027 ) 

1028 

1029 # Get the primary key from the real dimension object 

1030 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

1031 if not isinstance(dimension, Dimension): 

1032 raise RuntimeError( 

1033 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

1034 ) 

1035 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

1036 

1037 return newDataId, kwargs 
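
# Sketch of the rewriting performed above: record values and alternate keys are
# translated to primary dimension values before any lookup (all names and
# values are hypothetical).
#
#     # An exposure obs_id and a detector full_name instead of the integer IDs:
#     butler.get("raw", {"exposure.obs_id": "HSCA90333400", "detector": "1_53"},
#                instrument="HSC")
#     # is resolved internally to something like:
#     butler.get("raw", instrument="HSC", exposure=903334, detector=20)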

1038 

1039 def _findDatasetRef( 

1040 self, 

1041 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1042 dataId: Optional[DataId] = None, 

1043 *, 

1044 collections: Any = None, 

1045 allowUnresolved: bool = False, 

1046 **kwargs: Any, 

1047 ) -> DatasetRef: 

1048 """Shared logic for methods that start with a search for a dataset in 

1049 the registry. 

1050 

1051 Parameters 

1052 ---------- 

1053 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1054 When `DatasetRef` the `dataId` should be `None`. 

1055 Otherwise the `DatasetType` or name thereof. 

1056 dataId : `dict` or `DataCoordinate`, optional 

1057 A `dict` of `Dimension` link name, value pairs that label the 

1058 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1059 should be provided as the first argument. 

1060 collections : Any, optional 

1061 Collections to be searched, overriding ``self.collections``. 

1062 Can be any of the types supported by the ``collections`` argument 

1063 to butler construction. 

1064 allowUnresolved : `bool`, optional 

1065 If `True`, return an unresolved `DatasetRef` if finding a resolved 

1066 one in the `Registry` fails. Defaults to `False`. 

1067 **kwargs 

1068 Additional keyword arguments used to augment or construct a 

1069 `DataId`. See `DataId` parameters. 

1070 

1071 Returns 

1072 ------- 

1073 ref : `DatasetRef` 

1074 A reference to the dataset identified by the given arguments. 

1075 

1076 Raises 

1077 ------ 

1078 LookupError 

1079 Raised if no matching dataset exists in the `Registry` (and 

1080 ``allowUnresolved is False``). 

1081 ValueError 

1082 Raised if a resolved `DatasetRef` was passed as an input, but it 

1083 differs from the one found in the registry. 

1084 TypeError 

1085 Raised if no collections were provided. 

1086 """ 

1087 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1088 if isinstance(datasetRefOrType, DatasetRef): 

1089 idNumber = datasetRefOrType.id 

1090 else: 

1091 idNumber = None 

1092 timespan: Optional[Timespan] = None 

1093 

1094 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1095 

1096 if datasetType.isCalibration(): 

1097 # Because this is a calibration dataset, first try to standardize the

1098 # data ID without restricting the dimensions to

1099 # those of the dataset type requested, because there may be extra 

1100 # dimensions that provide temporal information for a validity-range 

1101 # lookup. 

1102 dataId = DataCoordinate.standardize( 

1103 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1104 ) 

1105 if dataId.graph.temporal: 

1106 dataId = self.registry.expandDataId(dataId) 

1107 timespan = dataId.timespan 

1108 else: 

1109 # Standardize the data ID to just the dimensions of the dataset 

1110 # type instead of letting registry.findDataset do it, so we get the 

1111 # result even if no dataset is found. 

1112 dataId = DataCoordinate.standardize( 

1113 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1114 ) 

1115 # Always lookup the DatasetRef, even if one is given, to ensure it is 

1116 # present in the current collection. 

1117 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1118 if ref is None: 

1119 if allowUnresolved: 

1120 return DatasetRef(datasetType, dataId) 

1121 else: 

1122 if collections is None: 

1123 collections = self.registry.defaults.collections 

1124 raise LookupError( 

1125 f"Dataset {datasetType.name} with data ID {dataId} " 

1126 f"could not be found in collections {collections}." 

1127 ) 

1128 if idNumber is not None and idNumber != ref.id: 

1129 if collections is None: 

1130 collections = self.registry.defaults.collections 

1131 raise ValueError( 

1132 f"DatasetRef.id provided ({idNumber}) does not match " 

1133 f"id ({ref.id}) in registry in collections {collections}." 

1134 ) 

1135 if datasetType != ref.datasetType: 

1136 # If they differ it is because the user explicitly specified 

1137 # a compatible dataset type to this call rather than using the 

1138 # registry definition. The DatasetRef must therefore be recreated 

1139 # using the user definition such that the expected type is 

1140 # returned. 

1141 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1142 

1143 return ref 

1144 

1145 @transactional 

1146 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef: 

1147 # Docstring inherited. 

1148 (imported_ref,) = self.registry._importDatasets( 

1149 [ref], 

1150 expand=True, 

1151 ) 

1152 if imported_ref.id != ref.getCheckedId(): 

1153 raise RuntimeError("This registry configuration does not support putDirect.") 

1154 self.datastore.put(obj, ref) 

1155 return ref 

1156 

1157 @transactional 

1158 def put( 

1159 self, 

1160 obj: Any, 

1161 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1162 dataId: Optional[DataId] = None, 

1163 *, 

1164 run: Optional[str] = None, 

1165 **kwargs: Any, 

1166 ) -> DatasetRef: 

1167 """Store and register a dataset. 

1168 

1169 Parameters 

1170 ---------- 

1171 obj : `object` 

1172 The dataset. 

1173 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1174 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1175 Otherwise the `DatasetType` or name thereof. 

1176 dataId : `dict` or `DataCoordinate` 

1177 A `dict` of `Dimension` link name, value pairs that label the 

1178 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1179 should be provided as the second argument. 

1180 run : `str`, optional 

1181 The name of the run the dataset should be added to, overriding 

1182 ``self.run``. 

1183 **kwargs 

1184 Additional keyword arguments used to augment or construct a 

1185 `DataCoordinate`. See `DataCoordinate.standardize` 

1186 parameters. 

1187 

1188 Returns 

1189 ------- 

1190 ref : `DatasetRef` 

1191 A reference to the stored dataset, updated with the correct id if 

1192 given. 

1193 

1194 Raises 

1195 ------ 

1196 TypeError 

1197 Raised if the butler is read-only or if no run has been provided. 

1198 """ 

1199 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1200 if not self.isWriteable(): 

1201 raise TypeError("Butler is read-only.") 

1202 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1203 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1204 raise ValueError("DatasetRef must not be in registry, must have None id") 

1205 

1206 # Handle dimension records in dataId 

1207 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1208 

1209 # Add Registry Dataset entry. 

1210 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1211 

1212 # For an execution butler the datasets will be pre-defined. 

1213 # If the butler is configured that way datasets should only be inserted 

1214 # if they do not already exist in registry. Trying and catching 

1215 # ConflictingDefinitionError will not work because the transaction 

1216 # will be corrupted. Instead, in this mode always check first. 

1217 ref = None 

1218 ref_is_predefined = False 

1219 if self._allow_put_of_predefined_dataset: 

1220 # Get the matching ref for this run. 

1221 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId) 

1222 

1223 if ref: 

1224 # Must be expanded form for datastore templating 

1225 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

1226 ref = ref.expanded(dataId) 

1227 ref_is_predefined = True 

1228 

1229 if not ref: 

1230 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1231 

1232 # If the ref is predefined it is possible that the datastore also 

1233 # has the record. Asking datastore to put it again will result in 

1234 # the artifact being recreated, overwriting previous, then will cause 

1235 # a failure in writing the record which will cause the artifact 

1236 # to be removed. Much safer to ask first before attempting to 

1237 # overwrite. Race conditions should not be an issue for the 

1238 # execution butler environment. 

1239 if ref_is_predefined: 

1240 if self.datastore.knows(ref): 

1241 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")

1242 

1243 self.datastore.put(obj, ref) 

1244 

1245 return ref 
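
# Minimal put() sketch; the run, dataset type, and data ID are hypothetical and
# the dataset type must already be registered with the repository.
#
#     butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
#     ref = butler.put(table, "objectTable", tract=9813, patch=42, skymap="hsc_rings_v1")
#     # ``ref`` is a resolved DatasetRef in run "u/alice/DM-50000/a".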

1246 

1247 def getDirect( 

1248 self, 

1249 ref: DatasetRef, 

1250 *, 

1251 parameters: Optional[Dict[str, Any]] = None, 

1252 storageClass: Optional[Union[StorageClass, str]] = None, 

1253 ) -> Any: 

1254 """Retrieve a stored dataset. 

1255 

1256 Unlike `Butler.get`, this method allows datasets outside the Butler's 

1257 collection to be read as long as the `DatasetRef` that identifies them 

1258 can be obtained separately. 

1259 

1260 Parameters 

1261 ---------- 

1262 ref : `DatasetRef` 

1263 Resolved reference to an already stored dataset. 

1264 parameters : `dict` 

1265 Additional StorageClass-defined options to control reading, 

1266 typically used to efficiently read only a subset of the dataset. 

1267 storageClass : `StorageClass` or `str`, optional 

1268 The storage class to be used to override the Python type 

1269 returned by this method. By default the returned type matches 

1270 the dataset type definition for this dataset. Specifying a 

1271 read `StorageClass` can force a different type to be returned. 

1272 This type must be compatible with the original type. 

1273 

1274 Returns 

1275 ------- 

1276 obj : `object` 

1277 The dataset. 

1278 """ 

1279 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 
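
# getDirect() sketch: read via a DatasetRef obtained directly from the
# registry, bypassing the collection search; the ``bbox`` read parameter is a
# StorageClass-dependent, hypothetical example.
#
#     ref = butler.registry.findDataset("calexp", collections="HSC/runs/RC2",
#                                       instrument="HSC", visit=903334, detector=20)
#     cutout = butler.getDirect(ref, parameters={"bbox": bbox})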

1280 

1281 def getDirectDeferred( 

1282 self, 

1283 ref: DatasetRef, 

1284 *, 

1285 parameters: Union[dict, None] = None, 

1286 storageClass: str | StorageClass | None = None, 

1287 ) -> DeferredDatasetHandle: 

1288 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1289 from a resolved `DatasetRef`. 

1290 

1291 Parameters 

1292 ---------- 

1293 ref : `DatasetRef` 

1294 Resolved reference to an already stored dataset. 

1295 parameters : `dict` 

1296 Additional StorageClass-defined options to control reading, 

1297 typically used to efficiently read only a subset of the dataset. 

1298 storageClass : `StorageClass` or `str`, optional 

1299 The storage class to be used to override the Python type 

1300 returned by this method. By default the returned type matches 

1301 the dataset type definition for this dataset. Specifying a 

1302 read `StorageClass` can force a different type to be returned. 

1303 This type must be compatible with the original type. 

1304 

1305 Returns 

1306 ------- 

1307 obj : `DeferredDatasetHandle` 

1308 A handle which can be used to retrieve a dataset at a later time. 

1309 

1310 Raises 

1311 ------ 

1312 AmbiguousDatasetError 

1313 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

1314 """ 

1315 if ref.id is None: 

1316 raise AmbiguousDatasetError( 

1317 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1318 ) 

1319 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1320 

1321 def getDeferred( 

1322 self, 

1323 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1324 dataId: Optional[DataId] = None, 

1325 *, 

1326 parameters: Union[dict, None] = None, 

1327 collections: Any = None, 

1328 storageClass: str | StorageClass | None = None, 

1329 **kwargs: Any, 

1330 ) -> DeferredDatasetHandle: 

1331 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1332 after an immediate registry lookup. 

1333 

1334 Parameters 

1335 ---------- 

1336 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1337 When `DatasetRef` the `dataId` should be `None`. 

1338 Otherwise the `DatasetType` or name thereof. 

1339 dataId : `dict` or `DataCoordinate`, optional 

1340 A `dict` of `Dimension` link name, value pairs that label the 

1341 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1342 should be provided as the first argument. 

1343 parameters : `dict` 

1344 Additional StorageClass-defined options to control reading, 

1345 typically used to efficiently read only a subset of the dataset. 

1346 collections : Any, optional 

1347 Collections to be searched, overriding ``self.collections``. 

1348 Can be any of the types supported by the ``collections`` argument 

1349 to butler construction. 

1350 storageClass : `StorageClass` or `str`, optional 

1351 The storage class to be used to override the Python type 

1352 returned by this method. By default the returned type matches 

1353 the dataset type definition for this dataset. Specifying a 

1354 read `StorageClass` can force a different type to be returned. 

1355 This type must be compatible with the original type. 

1356 **kwargs 

1357 Additional keyword arguments used to augment or construct a 

1358 `DataId`. See `DataId` parameters. 

1359 

1360 Returns 

1361 ------- 

1362 obj : `DeferredDatasetHandle` 

1363 A handle which can be used to retrieve a dataset at a later time. 

1364 

1365 Raises 

1366 ------ 

1367 LookupError 

1368 Raised if no matching dataset exists in the `Registry` (and 

1369 ``allowUnresolved is False``). 

1370 ValueError 

1371 Raised if a resolved `DatasetRef` was passed as an input, but it 

1372 differs from the one found in the registry. 

1373 TypeError 

1374 Raised if no collections were provided. 

1375 """ 

1376 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1377 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 
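
# getDeferred() sketch: resolve the dataset now, read it later (names and
# values are hypothetical).
#
#     handle = butler.getDeferred("calexp", instrument="HSC", visit=903334, detector=20)
#     ...                      # do other work first
#     image = handle.get()     # the datastore read happens here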

1378 

1379 def get( 

1380 self, 

1381 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1382 dataId: Optional[DataId] = None, 

1383 *, 

1384 parameters: Optional[Dict[str, Any]] = None, 

1385 collections: Any = None, 

1386 storageClass: Optional[Union[StorageClass, str]] = None, 

1387 **kwargs: Any, 

1388 ) -> Any: 

1389 """Retrieve a stored dataset. 

1390 

1391 Parameters 

1392 ---------- 

1393 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1394 When `DatasetRef` the `dataId` should be `None`. 

1395 Otherwise the `DatasetType` or name thereof. 

1396 dataId : `dict` or `DataCoordinate` 

1397 A `dict` of `Dimension` link name, value pairs that label the 

1398 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1399 should be provided as the first argument. 

1400 parameters : `dict` 

1401 Additional StorageClass-defined options to control reading, 

1402 typically used to efficiently read only a subset of the dataset. 

1403 collections : Any, optional 

1404 Collections to be searched, overriding ``self.collections``. 

1405 Can be any of the types supported by the ``collections`` argument 

1406 to butler construction. 

1407 storageClass : `StorageClass` or `str`, optional 

1408 The storage class to be used to override the Python type 

1409 returned by this method. By default the returned type matches 

1410 the dataset type definition for this dataset. Specifying a 

1411 read `StorageClass` can force a different type to be returned. 

1412 This type must be compatible with the original type. 

1413 **kwargs 

1414 Additional keyword arguments used to augment or construct a 

1415 `DataCoordinate`. See `DataCoordinate.standardize` 

1416 parameters. 

1417 

1418 Returns 

1419 ------- 

1420 obj : `object` 

1421 The dataset. 

1422 

1423 Raises 

1424 ------ 

1425 ValueError 

1426 Raised if a resolved `DatasetRef` was passed as an input, but it 

1427 differs from the one found in the registry. 

1428 LookupError 

1429 Raised if no matching dataset exists in the `Registry`. 

1430 TypeError 

1431 Raised if no collections were provided. 

1432 

1433 Notes 

1434 ----- 

1435 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1436 this method requires that the given data ID include temporal dimensions 

1437 beyond the dimensions of the dataset type itself, in order to find the 

1438 dataset with the appropriate validity range. For example, a "bias" 

1439 dataset with native dimensions ``{instrument, detector}`` could be 

1440 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1441 ``exposure`` is a temporal dimension. 
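
Examples
--------
A minimal sketch (the dataset type, data ID values, and collection name
are illustrative, not taken from a real repository)::

    calexp = butler.get("calexp", instrument="HSC", detector=10,
                        visit=903334, collections="HSC/runs/RC2")

    # Equivalent form using an explicit data ID mapping.
    data_id = {"instrument": "HSC", "detector": 10, "visit": 903334}
    calexp = butler.get("calexp", data_id, collections="HSC/runs/RC2")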

1442 """ 

1443 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1444 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1445 return self.getDirect(ref, parameters=parameters, storageClass=storageClass) 

1446 

1447 def getURIs( 

1448 self, 

1449 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1450 dataId: Optional[DataId] = None, 

1451 *, 

1452 predict: bool = False, 

1453 collections: Any = None, 

1454 run: Optional[str] = None, 

1455 **kwargs: Any, 

1456 ) -> DatasetRefURIs: 

1457 """Returns the URIs associated with the dataset. 

1458 

1459 Parameters 

1460 ---------- 

1461 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1462 When `DatasetRef` the `dataId` should be `None`. 

1463 Otherwise the `DatasetType` or name thereof. 

1464 dataId : `dict` or `DataCoordinate` 

1465 A `dict` of `Dimension` link name, value pairs that label the 

1466 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1467 should be provided as the first argument. 

1468 predict : `bool` 

1469 If `True`, allow URIs to be returned for datasets that have not 

1470 yet been written. 

1471 collections : Any, optional 

1472 Collections to be searched, overriding ``self.collections``. 

1473 Can be any of the types supported by the ``collections`` argument 

1474 to butler construction. 

1475 run : `str`, optional 

1476 Run to use for predictions, overriding ``self.run``. 

1477 **kwargs 

1478 Additional keyword arguments used to augment or construct a 

1479 `DataCoordinate`. See `DataCoordinate.standardize` 

1480 parameters. 

1481 

1482 Returns 

1483 ------- 

1484 uris : `DatasetRefURIs` 

1485 The URI to the primary artifact associated with this dataset (if 

1486 the dataset was disassembled within the datastore this may be 

1487 `None`), and the URIs to any components associated with the dataset 

1488 artifact (can be empty if there are no components). 
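
Examples
--------
A minimal sketch (dataset type, data ID, and collection are illustrative;
the component URIs are assumed to behave as a mapping keyed by component
name)::

    primary, components = butler.getURIs("calexp", instrument="HSC",
                                         detector=10, visit=903334,
                                         collections="HSC/runs/RC2")
    if primary is not None:
        print("Single artifact at", primary)
    for component_name, uri in components.items():
        print(component_name, "stored at", uri)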

1489 """ 

1490 ref = self._findDatasetRef( 

1491 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs 

1492 ) 

1493 if ref.id is None: # only possible if predict is True 

1494 if run is None: 

1495 run = self.run 

1496 if run is None: 

1497 raise TypeError("Cannot predict location with run=None.") 

1498 # Lie about ID, because we can't guess it, and only 

1499 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1500 ref = ref.resolved(id=uuid.UUID("{00000000-0000-0000-0000-000000000000}"), run=run) 

1501 return self.datastore.getURIs(ref, predict) 

1502 

1503 def getURI( 

1504 self, 

1505 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1506 dataId: Optional[DataId] = None, 

1507 *, 

1508 predict: bool = False, 

1509 collections: Any = None, 

1510 run: Optional[str] = None, 

1511 **kwargs: Any, 

1512 ) -> ResourcePath: 

1513 """Return the URI to the Dataset. 

1514 

1515 Parameters 

1516 ---------- 

1517 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1518 When `DatasetRef` the `dataId` should be `None`. 

1519 Otherwise the `DatasetType` or name thereof. 

1520 dataId : `dict` or `DataCoordinate` 

1521 A `dict` of `Dimension` link name, value pairs that label the 

1522 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1523 should be provided as the first argument. 

1524 predict : `bool` 

1525 If `True`, allow URIs to be returned for datasets that have not 

1526 yet been written. 

1527 collections : Any, optional 

1528 Collections to be searched, overriding ``self.collections``. 

1529 Can be any of the types supported by the ``collections`` argument 

1530 to butler construction. 

1531 run : `str`, optional 

1532 Run to use for predictions, overriding ``self.run``. 

1533 **kwargs 

1534 Additional keyword arguments used to augment or construct a 

1535 `DataCoordinate`. See `DataCoordinate.standardize` 

1536 parameters. 

1537 

1538 Returns 

1539 ------- 

1540 uri : `lsst.resources.ResourcePath` 

1541 URI pointing to the Dataset within the datastore. If the 

1542 Dataset does not exist in the datastore, and if ``predict`` is 

1543 `True`, the URI will be a prediction and will include a URI 

1544 fragment "#predicted". 

1545 If the datastore does not have entities that relate well 

1546 to the concept of a URI the returned URI string will be 

1547 descriptive. The returned URI is not guaranteed to be obtainable. 

1548 

1549 Raises 

1550 ------ 

1551 LookupError 

1552 Raised if a URI has been requested for a dataset that does not 

1553 exist and guessing is not allowed. 

1554 ValueError 

1555 Raised if a resolved `DatasetRef` was passed as an input, but it 

1556 differs from the one found in the registry. 

1557 TypeError 

1558 Raised if no collections were provided. 

1559 RuntimeError 

1560 Raised if a URI is requested for a dataset that consists of 

1561 multiple artifacts. 
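
Examples
--------
A minimal sketch (dataset type, data ID, collection, and run names are
illustrative). ``predict=True`` combined with ``run`` returns a guessed
location for a dataset that has not been written yet::

    uri = butler.getURI("raw", instrument="HSC", detector=10,
                        exposure=903334, collections="HSC/raw/all")
    predicted = butler.getURI("calexp", instrument="HSC", detector=10,
                              visit=903334, predict=True,
                              run="u/someone/processing")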

1562 """ 

1563 primary, components = self.getURIs( 

1564 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1565 ) 

1566 

1567 if primary is None or components: 

1568 raise RuntimeError( 

1569 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1570 "Use Butler.getURIs() instead." 

1571 ) 

1572 return primary 

1573 

1574 def retrieveArtifacts( 

1575 self, 

1576 refs: Iterable[DatasetRef], 

1577 destination: ResourcePathExpression, 

1578 transfer: str = "auto", 

1579 preserve_path: bool = True, 

1580 overwrite: bool = False, 

1581 ) -> List[ResourcePath]: 

1582 """Retrieve the artifacts associated with the supplied refs. 

1583 

1584 Parameters 

1585 ---------- 

1586 refs : iterable of `DatasetRef` 

1587 The datasets for which artifacts are to be retrieved. 

1588 A single ref can result in multiple artifacts. The refs must 

1589 be resolved. 

1590 destination : `lsst.resources.ResourcePath` or `str` 

1591 Location to write the artifacts. 

1592 transfer : `str`, optional 

1593 Method to use to transfer the artifacts. Must be one of the options 

1594 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1595 "move" is not allowed. 

1596 preserve_path : `bool`, optional 

1597 If `True` the full path of the artifact within the datastore 

1598 is preserved. If `False` the final file component of the path 

1599 is used. 

1600 overwrite : `bool`, optional 

1601 If `True` allow transfers to overwrite existing files at the 

1602 destination. 

1603 

1604 Returns 

1605 ------- 

1606 targets : `list` of `lsst.resources.ResourcePath` 

1607 URIs of file artifacts in destination location. Order is not 

1608 preserved. 

1609 

1610 Notes 

1611 ----- 

1612 For non-file datastores the artifacts written to the destination 

1613 may not match the representation inside the datastore. For example 

1614 a hierarchical data structure in a NoSQL database may well be stored 

1615 as a JSON file. 
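
Examples
--------
A minimal sketch copying the file artifacts for a query result to a local
directory (the dataset type, collection, and destination are illustrative)::

    refs = butler.registry.queryDatasets("calexp",
                                         collections="HSC/runs/RC2")
    paths = butler.retrieveArtifacts(refs, destination="/tmp/calexps",
                                     transfer="copy", preserve_path=False)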

1616 """ 

1617 return self.datastore.retrieveArtifacts( 

1618 refs, 

1619 ResourcePath(destination), 

1620 transfer=transfer, 

1621 preserve_path=preserve_path, 

1622 overwrite=overwrite, 

1623 ) 

1624 

1625 def datasetExists( 

1626 self, 

1627 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1628 dataId: Optional[DataId] = None, 

1629 *, 

1630 collections: Any = None, 

1631 **kwargs: Any, 

1632 ) -> bool: 

1633 """Return True if the Dataset is actually present in the Datastore. 

1634 

1635 Parameters 

1636 ---------- 

1637 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1638 When `DatasetRef` the `dataId` should be `None`. 

1639 Otherwise the `DatasetType` or name thereof. 

1640 dataId : `dict` or `DataCoordinate` 

1641 A `dict` of `Dimension` link name, value pairs that label the 

1642 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1643 should be provided as the first argument. 

1644 collections : Any, optional 

1645 Collections to be searched, overriding ``self.collections``. 

1646 Can be any of the types supported by the ``collections`` argument 

1647 to butler construction. 

1648 **kwargs 

1649 Additional keyword arguments used to augment or construct a 

1650 `DataCoordinate`. See `DataCoordinate.standardize` 

1651 parameters. 

1652 

1653 Raises 

1654 ------ 

1655 LookupError 

1656 Raised if the dataset is not even present in the Registry. 

1657 ValueError 

1658 Raised if a resolved `DatasetRef` was passed as an input, but it 

1659 differs from the one found in the registry. 

1660 TypeError 

1661 Raised if no collections were provided. 
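
Examples
--------
A minimal sketch (dataset type, data ID, and collection are illustrative);
note that `LookupError` is raised if the dataset is not known to the
registry at all::

    exists = butler.datasetExists("calexp", instrument="HSC", detector=10,
                                  visit=903334, collections="HSC/runs/RC2")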

1662 """ 

1663 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1664 return self.datastore.exists(ref) 

1665 

1666 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1667 """Remove one or more `~CollectionType.RUN` collections and the 

1668 datasets within them. 

1669 

1670 Parameters 

1671 ---------- 

1672 names : `Iterable` [ `str` ] 

1673 The names of the collections to remove. 

1674 unstore : `bool`, optional 

1675 If `True` (default), delete datasets from all datastores in which 

1676 they are present, and attempt to rollback the registry deletions if 

1677 datastore deletions fail (which may not always be possible). If 

1678 `False`, datastore records for these datasets are still removed, 

1679 but any artifacts (e.g. files) will not be. 

1680 

1681 Raises 

1682 ------ 

1683 TypeError 

1684 Raised if one or more collections are not of type 

1685 `~CollectionType.RUN`. 
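
Examples
--------
A minimal sketch removing two illustrative RUN collections along with
their datastore artifacts::

    butler = Butler("repo", writeable=True)
    butler.removeRuns(["u/someone/scratch1", "u/someone/scratch2"],
                      unstore=True)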

1686 """ 

1687 if not self.isWriteable(): 

1688 raise TypeError("Butler is read-only.") 

1689 names = list(names) 

1690 refs: List[DatasetRef] = [] 

1691 for name in names: 

1692 collectionType = self.registry.getCollectionType(name) 

1693 if collectionType is not CollectionType.RUN: 

1694 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1695 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1696 with self.datastore.transaction(): 

1697 with self.registry.transaction(): 

1698 if unstore: 

1699 self.datastore.trash(refs) 

1700 else: 

1701 self.datastore.forget(refs) 

1702 for name in names: 

1703 self.registry.removeCollection(name) 

1704 if unstore: 

1705 # Point of no return for removing artifacts 

1706 self.datastore.emptyTrash() 

1707 

1708 def pruneCollection( 

1709 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None 

1710 ) -> None: 

1711 """Remove a collection and possibly prune datasets within it. 

1712 

1713 Parameters 

1714 ---------- 

1715 name : `str` 

1716 Name of the collection to remove. If this is a 

1717 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1718 datasets within the collection are not modified unless ``unstore`` 

1719 is `True`. If this is a `~CollectionType.RUN` collection, 

1720 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1721 are fully removed from the data repository. 

1722 purge : `bool`, optional 

1723 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1724 fully removing datasets within them. Requires ``unstore=True`` as 

1725 well as an added precaution against accidental deletion. Must be 

1726 `False` (default) if the collection is not a ``RUN``. 

1727 unstore : `bool`, optional 

1728 If `True`, remove all datasets in the collection from all 

1729 datastores in which they appear. 

1730 unlink : `list` [`str`], optional 

1731 Before removing the given collection, unlink it from these 

1732 parent collections. 

1733 

1734 Raises 

1735 ------ 

1736 TypeError 

1737 Raised if the butler is read-only or arguments are mutually 

1738 inconsistent. 
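
Examples
--------
A minimal sketch (collection names are illustrative). A RUN collection
requires both ``purge=True`` and ``unstore=True``; a TAGGED collection
can be removed without touching its datasets::

    butler.pruneCollection("u/someone/scratch-run", purge=True,
                           unstore=True)
    butler.pruneCollection("u/someone/tagged-selection")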

1739 """ 

1740 # See pruneDatasets comments for more information about the logic here; 

1741 # the cases are almost the same, but here we can rely on Registry to 

1742 # take care of everything but Datastore deletion when we remove the 

1743 # collection. 

1744 if not self.isWriteable(): 

1745 raise TypeError("Butler is read-only.") 

1746 collectionType = self.registry.getCollectionType(name) 

1747 if purge and not unstore: 

1748 raise PurgeWithoutUnstorePruneCollectionsError() 

1749 if collectionType is CollectionType.RUN and not purge: 

1750 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1751 if collectionType is not CollectionType.RUN and purge: 

1752 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1753 

1754 def remove(child: str, parent: str) -> None: 

1755 """Remove a child collection from a parent collection.""" 

1756 # Remove child from parent. 

1757 chain = list(self.registry.getCollectionChain(parent)) 

1758 try: 

1759 chain.remove(child) 

1760 except ValueError as e: 

1761 raise RuntimeError(f"{child} is not a child of {parent}") from e 

1762 self.registry.setCollectionChain(parent, chain) 

1763 

1764 with self.datastore.transaction(): 

1765 with self.registry.transaction(): 

1766 if unlink: 

1767 for parent in unlink: 

1768 remove(name, parent) 

1769 if unstore: 

1770 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1771 self.datastore.trash(refs) 

1772 self.registry.removeCollection(name) 

1773 

1774 if unstore: 

1775 # Point of no return for removing artifacts 

1776 self.datastore.emptyTrash() 

1777 

1778 def pruneDatasets( 

1779 self, 

1780 refs: Iterable[DatasetRef], 

1781 *, 

1782 disassociate: bool = True, 

1783 unstore: bool = False, 

1784 tags: Iterable[str] = (), 

1785 purge: bool = False, 

1786 ) -> None: 

1787 # docstring inherited from LimitedButler 

1788 

1789 if not self.isWriteable(): 

1790 raise TypeError("Butler is read-only.") 

1791 if purge: 

1792 if not disassociate: 

1793 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1794 if not unstore: 

1795 raise TypeError("Cannot pass purge=True without unstore=True.") 

1796 elif disassociate: 

1797 tags = tuple(tags) 

1798 if not tags: 

1799 raise TypeError("No tags provided but disassociate=True.") 

1800 for tag in tags: 

1801 collectionType = self.registry.getCollectionType(tag) 

1802 if collectionType is not CollectionType.TAGGED: 

1803 raise TypeError( 

1804 f"Cannot disassociate from collection '{tag}' " 

1805 f"of non-TAGGED type {collectionType.name}." 

1806 ) 

1807 # For an execution butler we want to keep existing UUIDs for the 

1808 # datasets, for that we need to keep them in the collections but 

1809 # remove from datastore. 

1810 if self._allow_put_of_predefined_dataset and purge: 

1811 purge = False 

1812 disassociate = False 

1813 # Transform possibly-single-pass iterable into something we can iterate 

1814 # over multiple times. 

1815 refs = list(refs) 

1816 # Pruning a component of a DatasetRef makes no sense since registry 

1817 # doesn't know about components and datastore might not store 

1818 # components in a separate file 

1819 for ref in refs: 

1820 if ref.datasetType.component(): 

1821 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1822 # We don't need an unreliable Datastore transaction for this, because 

1823 # we've been extra careful to ensure that Datastore.trash only involves 

1824 # mutating the Registry (it can _look_ at Datastore-specific things, 

1825 # but shouldn't change them), and hence all operations here are 

1826 # Registry operations. 

1827 with self.datastore.transaction(): 

1828 with self.registry.transaction(): 

1829 if unstore: 

1830 self.datastore.trash(refs) 

1831 if purge: 

1832 self.registry.removeDatasets(refs) 

1833 elif disassociate: 

1834 assert tags, "Guaranteed by earlier logic in this function." 

1835 for tag in tags: 

1836 self.registry.disassociate(tag, refs) 

1837 # We've exited the Registry transaction, and apparently committed. 

1838 # (if there was an exception, everything rolled back, and it's as if 

1839 # nothing happened - and we never get here). 

1840 # Datastore artifacts are not yet gone, but they're clearly marked 

1841 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1842 # problems we can try again later, and if manual administrative 

1843 # intervention is required, it's pretty clear what that should entail: 

1844 # deleting everything on disk and in private Datastore tables that is 

1845 # in the dataset_location_trash table. 

1846 if unstore: 

1847 # Point of no return for removing artifacts 

1848 self.datastore.emptyTrash() 

1849 

1850 @transactional 

1851 def ingest( 

1852 self, 

1853 *datasets: FileDataset, 

1854 transfer: Optional[str] = "auto", 

1855 run: Optional[str] = None, 

1856 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1857 record_validation_info: bool = True, 

1858 ) -> None: 

1859 """Store and register one or more datasets that already exist on disk. 

1860 

1861 Parameters 

1862 ---------- 

1863 datasets : `FileDataset` 

1864 Each positional argument is a struct containing information about 

1865 a file to be ingested, including its URI (either absolute or 

1866 relative to the datastore root, if applicable), a `DatasetRef`, 

1867 and optionally a formatter class or its fully-qualified string 

1868 name. If a formatter is not provided, the formatter that would be 

1869 used for `put` is assumed. On successful return, all 

1870 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1871 attribute populated and all `FileDataset.formatter` attributes will 

1872 be set to the formatter class used. `FileDataset.path` attributes 

1873 may be modified to put paths in whatever the datastore considers a 

1874 standardized form. 

1875 transfer : `str`, optional 

1876 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1877 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1878 transfer the file. 

1879 run : `str`, optional 

1880 The name of the run ingested datasets should be added to, 

1881 overriding ``self.run``. 

1882 idGenerationMode : `DatasetIdGenEnum`, optional 

1883 Specifies option for generating dataset IDs. By default unique IDs 

1884 are generated for each inserted dataset. 

1885 record_validation_info : `bool`, optional 

1886 If `True`, the default, the datastore can record validation 

1887 information associated with the file. If `False` the datastore 

1888 will not attempt to track any information such as checksums 

1889 or file sizes. This can be useful if such information is tracked 

1890 in an external system or if the file is to be compressed in place. 

1891 It is up to the datastore whether this parameter is relevant. 

1892 

1893 Raises 

1894 ------ 

1895 TypeError 

1896 Raised if the butler is read-only or if no run was provided. 

1897 NotImplementedError 

1898 Raised if the `Datastore` does not support the given transfer mode. 

1899 DatasetTypeNotSupportedError 

1900 Raised if one or more files to be ingested have a dataset type that 

1901 is not supported by the `Datastore`. 

1902 FileNotFoundError 

1903 Raised if one of the given files does not exist. 

1904 FileExistsError 

1905 Raised if transfer is not `None` but the (internal) location the 

1906 file would be moved to is already occupied. 

1907 

1908 Notes 

1909 ----- 

1910 This operation is not fully exception safe: if a database operation 

1911 fails, the given `FileDataset` instances may be only partially updated. 

1912 

1913 It is atomic in terms of database operations (they will either all 

1914 succeed or all fail) providing the database engine implements 

1915 transactions correctly. It will attempt to be atomic in terms of 

1916 filesystem operations as well, but this cannot be implemented 

1917 rigorously for most datastores. 
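
Examples
--------
A minimal sketch ingesting one externally produced file. The dataset type
name, data ID values, file path, and run are illustrative, and the dataset
type is assumed to be registered already::

    from lsst.daf.butler import DatasetRef, FileDataset

    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(datasetType, {"instrument": "HSC", "detector": 10,
                                   "exposure": 903334})
    butler.ingest(FileDataset(path="/data/HSC-903334-10.fits", refs=[ref]),
                  transfer="copy", run="HSC/raw/example")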

1918 """ 

1919 if not self.isWriteable(): 

1920 raise TypeError("Butler is read-only.") 

1921 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1922 # Reorganize the inputs so they're grouped by DatasetType and then 

1923 # data ID. We also include a list of DatasetRefs for each FileDataset 

1924 # to hold the resolved DatasetRefs returned by the Registry, before 

1925 # it's safe to swap them into FileDataset.refs. 

1926 # Some type annotation aliases to make that clearer: 

1927 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1928 GroupedData = MutableMapping[DatasetType, GroupForType] 

1929 # The actual data structure: 

1930 groupedData: GroupedData = defaultdict(dict) 

1931 # And the nested loop that populates it: 

1932 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1933 # This list intentionally shared across the inner loop, since it's 

1934 # associated with `dataset`. 

1935 resolvedRefs: List[DatasetRef] = [] 

1936 

1937 # Somewhere to store pre-existing refs if we have an 

1938 # execution butler. 

1939 existingRefs: List[DatasetRef] = [] 

1940 

1941 for ref in dataset.refs: 

1942 if ref.dataId in groupedData[ref.datasetType]: 

1943 raise ConflictingDefinitionError( 

1944 f"Ingest conflict. Dataset {dataset.path} has same" 

1945 " DataId as other ingest dataset" 

1946 f" {groupedData[ref.datasetType][ref.dataId][0].path} " 

1947 f" ({ref.dataId})" 

1948 ) 

1949 if self._allow_put_of_predefined_dataset: 

1950 existing_ref = self.registry.findDataset( 

1951 ref.datasetType, dataId=ref.dataId, collections=run 

1952 ) 

1953 if existing_ref: 

1954 if self.datastore.knows(existing_ref): 

1955 raise ConflictingDefinitionError( 

1956 f"Dataset associated with path {dataset.path}" 

1957 f" already exists as {existing_ref}." 

1958 ) 

1959 # Store this ref elsewhere since it already exists 

1960 # and we do not want to remake it but we do want 

1961 # to store it in the datastore. 

1962 existingRefs.append(existing_ref) 

1963 

1964 # Nothing else to do until we have finished 

1965 # iterating. 

1966 continue 

1967 

1968 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1969 

1970 if existingRefs: 

1971 if len(dataset.refs) != len(existingRefs): 

1972 # Keeping track of partially pre-existing datasets is hard 

1973 # and should generally never happen. For now don't allow 

1974 # it. 

1975 raise ConflictingDefinitionError( 

1976 f"For dataset {dataset.path} some dataIds already exist" 

1977 " in registry but others do not. This is not supported." 

1978 ) 

1979 

1980 # Attach the resolved refs if we found them. 

1981 dataset.refs = existingRefs 

1982 

1983 # Now we can bulk-insert into Registry for each DatasetType. 

1984 for datasetType, groupForType in progress.iter_item_chunks( 

1985 groupedData.items(), desc="Bulk-inserting datasets by type" 

1986 ): 

1987 refs = self.registry.insertDatasets( 

1988 datasetType, 

1989 dataIds=groupForType.keys(), 

1990 run=run, 

1991 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1992 idGenerationMode=idGenerationMode, 

1993 ) 

1994 # Append those resolved DatasetRefs to the new lists we set up for 

1995 # them. 

1996 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1997 resolvedRefs.append(ref) 

1998 

1999 # Go back to the original FileDatasets to replace their refs with the 

2000 # new resolved ones. 

2001 for groupForType in progress.iter_chunks( 

2002 groupedData.values(), desc="Reassociating resolved dataset refs with files" 

2003 ): 

2004 for dataset, resolvedRefs in groupForType.values(): 

2005 dataset.refs = resolvedRefs 

2006 

2007 # Bulk-insert everything into Datastore. 

2008 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 

2009 

2010 @contextlib.contextmanager 

2011 def export( 

2012 self, 

2013 *, 

2014 directory: Optional[str] = None, 

2015 filename: Optional[str] = None, 

2016 format: Optional[str] = None, 

2017 transfer: Optional[str] = None, 

2018 ) -> Iterator[RepoExportContext]: 

2019 """Export datasets from the repository represented by this `Butler`. 

2020 

2021 This method is a context manager that returns a helper object 

2022 (`RepoExportContext`) that is used to indicate what information from 

2023 the repository should be exported. 

2024 

2025 Parameters 

2026 ---------- 

2027 directory : `str`, optional 

2028 Directory dataset files should be written to if ``transfer`` is not 

2029 `None`. 

2030 filename : `str`, optional 

2031 Name for the file that will include database information associated 

2032 with the exported datasets. If this is not an absolute path and 

2033 ``directory`` is not `None`, it will be written to ``directory`` 

2034 instead of the current working directory. Defaults to 

2035 "export.{format}". 

2036 format : `str`, optional 

2037 File format for the database information file. If `None`, the 

2038 extension of ``filename`` will be used. 

2039 transfer : `str`, optional 

2040 Transfer mode passed to `Datastore.export`. 

2041 

2042 Raises 

2043 ------ 

2044 TypeError 

2045 Raised if the set of arguments passed is inconsistent. 

2046 

2047 Examples 

2048 -------- 

2049 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

2050 methods are used to provide the iterables over data IDs and/or datasets 

2051 to be exported:: 

2052 

2053 with butler.export(filename="exports.yaml") as export: 

2054 # Export all flats, but none of the dimension element rows 

2055 # (i.e. data ID information) associated with them. 

2056 export.saveDatasets(butler.registry.queryDatasets("flat"), 

2057 elements=()) 

2058 # Export all datasets that start with "deepCoadd_" and all of 

2059 # their associated data ID information. 

2060 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2061 """ 

2062 if directory is None and transfer is not None: 

2063 raise TypeError("Cannot transfer without providing a directory.") 

2064 if transfer == "move": 

2065 raise TypeError("Transfer may not be 'move': export is read-only") 

2066 if format is None: 

2067 if filename is None: 

2068 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2069 else: 

2070 _, format = os.path.splitext(filename) 

2071 if not format: 

2072 raise ValueError("Please specify a file extension to determine export format.") 

2073 format = format[1:] # Strip leading "." 

2074 elif filename is None: 

2075 filename = f"export.{format}" 

2076 if directory is not None: 

2077 filename = os.path.join(directory, filename) 

2078 formats = self._config["repo_transfer_formats"] 

2079 if format not in formats: 

2080 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

2081 BackendClass = get_class_of(formats[format, "export"]) 

2082 with open(filename, "w") as stream: 

2083 backend = BackendClass(stream, universe=self.registry.dimensions) 

2084 try: 

2085 helper = RepoExportContext( 

2086 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

2087 ) 

2088 yield helper 

2089 except BaseException: 

2090 raise 

2091 else: 

2092 helper._finish() 

2093 

2094 def import_( 

2095 self, 

2096 *, 

2097 directory: Optional[str] = None, 

2098 filename: Union[str, TextIO, None] = None, 

2099 format: Optional[str] = None, 

2100 transfer: Optional[str] = None, 

2101 skip_dimensions: Optional[Set] = None, 

2102 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

2103 reuseIds: bool = False, 

2104 ) -> None: 

2105 """Import datasets into this repository that were exported from a 

2106 different butler repository via `~lsst.daf.butler.Butler.export`. 

2107 

2108 Parameters 

2109 ---------- 

2110 directory : `str`, optional 

2111 Directory containing dataset files to import from. If `None`, 

2112 ``filename`` and all dataset file paths specified therein must 

2113 be absolute. 

2114 filename : `str` or `TextIO`, optional 

2115 A stream or name of file that contains database information 

2116 associated with the exported datasets, typically generated by 

2117 `~lsst.daf.butler.Butler.export`. If this a string (name) and 

2118 is not an absolute path, does not exist in the current working 

2119 directory, and ``directory`` is not `None`, it is assumed to be in 

2120 ``directory``. Defaults to "export.{format}". 

2121 format : `str`, optional 

2122 File format for ``filename``. If `None`, the extension of 

2123 ``filename`` will be used. 

2124 transfer : `str`, optional 

2125 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2126 skip_dimensions : `set`, optional 

2127 Names of dimensions that should be skipped and not imported. 

2128 idGenerationMode : `DatasetIdGenEnum`, optional 

2129 Specifies option for generating dataset IDs when IDs are not 

2130 provided or their type does not match backend type. By default 

2131 unique IDs are generated for each inserted dataset. 

2132 reuseIds : `bool`, optional 

2133 If `True` then forces re-use of imported dataset IDs for integer 

2134 IDs which are normally generated as auto-incremented; exception 

2135 will be raised if imported IDs clash with existing ones. This 

2136 option has no effect on the use of globally-unique IDs which are 

2137 always re-used (or generated if integer IDs are being imported). 

2138 

2139 Raises 

2140 ------ 

2141 TypeError 

2142 Raised if the set of arguments passed is inconsistent, or if the 

2143 butler is read-only. 
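
Examples
--------
A minimal sketch importing a previously exported subset (the repository
and directory paths are illustrative)::

    butler = Butler("target_repo", writeable=True)
    butler.import_(directory="/path/to/exports", filename="export.yaml",
                   transfer="copy")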

2144 """ 

2145 if not self.isWriteable(): 

2146 raise TypeError("Butler is read-only.") 

2147 if format is None: 

2148 if filename is None: 

2149 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2150 else: 

2151 _, format = os.path.splitext(filename) # type: ignore 

2152 elif filename is None: 

2153 filename = f"export.{format}" 

2154 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

2155 filename = os.path.join(directory, filename) 

2156 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

2157 

2158 def doImport(importStream: TextIO) -> None: 

2159 backend = BackendClass(importStream, self.registry) 

2160 backend.register() 

2161 with self.transaction(): 

2162 backend.load( 

2163 self.datastore, 

2164 directory=directory, 

2165 transfer=transfer, 

2166 skip_dimensions=skip_dimensions, 

2167 idGenerationMode=idGenerationMode, 

2168 reuseIds=reuseIds, 

2169 ) 

2170 

2171 if isinstance(filename, str): 

2172 with open(filename, "r") as stream: 

2173 doImport(stream) 

2174 else: 

2175 doImport(filename) 

2176 

2177 def transfer_from( 

2178 self, 

2179 source_butler: LimitedButler, 

2180 source_refs: Iterable[DatasetRef], 

2181 transfer: str = "auto", 

2182 skip_missing: bool = True, 

2183 register_dataset_types: bool = False, 

2184 transfer_dimensions: bool = False, 

2185 ) -> collections.abc.Collection[DatasetRef]: 

2186 """Transfer datasets to this Butler from a run in another Butler. 

2187 

2188 Parameters 

2189 ---------- 

2190 source_butler : `LimitedButler` 

2191 Butler from which the datasets are to be transferred. If data IDs 

2192 in ``source_refs`` are not expanded then this has to be a full 

2193 `Butler` whose registry will be used to expand data IDs. 

2194 source_refs : iterable of `DatasetRef` 

2195 Datasets defined in the source butler that should be transferred to 

2196 this butler. 

2197 transfer : `str`, optional 

2198 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2199 skip_missing : `bool` 

2200 If `True`, datasets with no datastore artifact associated with 

2201 them are not transferred. If `False` a registry entry will be 

2202 created even if no datastore record is created (and so will 

2203 look equivalent to the dataset being unstored). 

2204 register_dataset_types : `bool` 

2205 If `True` any missing dataset types are registered. Otherwise 

2206 an exception is raised. 

2207 transfer_dimensions : `bool`, optional 

2208 If `True`, dimension record data associated with the new datasets 

2209 will be transferred. 

2210 

2211 Returns 

2212 ------- 

2213 refs : `list` of `DatasetRef` 

2214 The refs added to this Butler. 

2215 

2216 Notes 

2217 ----- 

2218 The datastore artifact has to exist for a transfer 

2219 to be made but non-existence is not an error. 

2220 

2221 Datasets that already exist in this run will be skipped. 

2222 

2223 The datasets are imported as part of a transaction, although 

2224 dataset types are registered before the transaction is started. 

2225 This means that it is possible for a dataset type to be registered 

2226 even though transfer has failed. 
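
Examples
--------
A minimal sketch copying one dataset type between repositories (paths,
dataset type, and collection names are illustrative)::

    source = Butler("source_repo")
    target = Butler("target_repo", writeable=True)
    refs = source.registry.queryDatasets("calexp",
                                         collections="HSC/runs/RC2")
    target.transfer_from(source, refs, transfer="copy",
                         register_dataset_types=True,
                         transfer_dimensions=True)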

2227 """ 

2228 if not self.isWriteable(): 

2229 raise TypeError("Butler is read-only.") 

2230 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2231 

2232 # Will iterate through the refs multiple times so need to convert 

2233 # to a list if this isn't a collection. 

2234 if not isinstance(source_refs, collections.abc.Collection): 

2235 source_refs = list(source_refs) 

2236 

2237 original_count = len(source_refs) 

2238 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2239 

2240 # In some situations the datastore artifact may be missing 

2241 # and we do not want that registry entry to be imported. 

2242 # Asking datastore is not sufficient, the records may have been 

2243 # purged, we have to ask for the (predicted) URI and check 

2244 # existence explicitly. Execution butler is set up exactly like 

2245 # this with no datastore records. 

2246 artifact_existence: Dict[ResourcePath, bool] = {} 

2247 if skip_missing: 

2248 dataset_existence = source_butler.datastore.mexists( 

2249 source_refs, artifact_existence=artifact_existence 

2250 ) 

2251 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2252 filtered_count = len(source_refs) 

2253 n_missing = original_count - filtered_count 

2254 log.verbose( 

2255 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

2256 n_missing, 

2257 "" if n_missing == 1 else "s", 

2258 filtered_count, 

2259 ) 

2260 

2261 # Importing requires that we group the refs by dataset type and run 

2262 # before doing the import. 

2263 source_dataset_types = set() 

2264 grouped_refs = defaultdict(list) 

2265 for ref in source_refs: 

2266 grouped_refs[ref.datasetType, ref.run].append(ref) 

2267 source_dataset_types.add(ref.datasetType) 

2268 

2269 # Check to see if the dataset type in the source butler has 

2270 # the same definition in the target butler and register missing 

2271 # ones if requested. Registration must happen outside a transaction. 

2272 newly_registered_dataset_types = set() 

2273 for datasetType in source_dataset_types: 

2274 if register_dataset_types: 

2275 # Let this raise immediately if inconsistent. Continuing 

2276 # on to find additional inconsistent dataset types 

2277 # might result in additional unwanted dataset types being 

2278 # registered. 

2279 if self.registry.registerDatasetType(datasetType): 

2280 newly_registered_dataset_types.add(datasetType) 

2281 else: 

2282 # If the dataset type is missing, let it fail immediately. 

2283 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2284 if target_dataset_type != datasetType: 

2285 raise ConflictingDefinitionError( 

2286 "Source butler dataset type differs from definition" 

2287 f" in target butler: {datasetType} !=" 

2288 f" {target_dataset_type}" 

2289 ) 

2290 if newly_registered_dataset_types: 

2291 # We may have registered some even if there were inconsistencies 

2292 # but should let people know (or else remove them again). 

2293 log.log( 

2294 VERBOSE, 

2295 "Registered the following dataset types in the target Butler: %s", 

2296 ", ".join(d.name for d in newly_registered_dataset_types), 

2297 ) 

2298 else: 

2299 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2300 

2301 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2302 if transfer_dimensions: 

2303 # Collect all the dimension records for these refs. 

2304 # All dimensions are to be copied but the list of valid dimensions 

2305 # come from this butler's universe. 

2306 elements = frozenset( 

2307 element 

2308 for element in self.registry.dimensions.getStaticElements() 

2309 if element.hasTable() and element.viewOf is None 

2310 ) 

2311 dataIds = set(ref.dataId for ref in source_refs) 

2312 # This logic comes from saveDataIds. 

2313 for dataId in dataIds: 

2314 # Need an expanded record, if not expanded that we need a full 

2315 # butler with registry (allow mocks with registry too). 

2316 if not dataId.hasRecords(): 

2317 if registry := getattr(source_butler, "registry", None): 

2318 dataId = registry.expandDataId(dataId) 

2319 else: 

2320 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

2321 # If this butler doesn't know about a dimension in the source 

2322 # butler things will break later. 

2323 for record in dataId.records.values(): 

2324 if record is not None and record.definition in elements: 

2325 dimension_records[record.definition].setdefault(record.dataId, record) 

2326 

2327 handled_collections: Set[str] = set() 

2328 

2329 # Do all the importing in a single transaction. 

2330 with self.transaction(): 

2331 if dimension_records: 

2332 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2333 for element, r in dimension_records.items(): 

2334 records = [r[dataId] for dataId in r] 

2335 # Assume that if the record is already present that we can 

2336 # use it without having to check that the record metadata 

2337 # is consistent. 

2338 self.registry.insertDimensionData(element, *records, skip_existing=True) 

2339 

2340 n_imported = 0 

2341 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2342 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2343 ): 

2344 if run not in handled_collections: 

2345 # May need to create output collection. If source butler 

2346 # has a registry, ask for documentation string. 

2347 run_doc = None 

2348 if registry := getattr(source_butler, "registry", None): 

2349 run_doc = registry.getCollectionDocumentation(run) 

2350 registered = self.registry.registerRun(run, doc=run_doc) 

2351 handled_collections.add(run) 

2352 if registered: 

2353 log.log(VERBOSE, "Creating output run %s", run) 

2354 

2355 n_refs = len(refs_to_import) 

2356 log.verbose( 

2357 "Importing %d ref%s of dataset type %s into run %s", 

2358 n_refs, 

2359 "" if n_refs == 1 else "s", 

2360 datasetType.name, 

2361 run, 

2362 ) 

2363 

2364 # Assume we are using UUIDs and the source refs will match 

2365 # those imported. 

2366 imported_refs = self.registry._importDatasets(refs_to_import, expand=False) 

2367 assert set(imported_refs) == set(refs_to_import) 

2368 n_imported += len(imported_refs) 

2369 

2370 assert len(source_refs) == n_imported 

2371 log.verbose("Imported %d datasets into destination butler", n_imported) 

2372 

2373 # Ask the datastore to transfer. The datastore has to check that 

2374 # the source datastore is compatible with the target datastore. 

2375 accepted, rejected = self.datastore.transfer_from( 

2376 source_butler.datastore, 

2377 source_refs, 

2378 transfer=transfer, 

2379 artifact_existence=artifact_existence, 

2380 ) 

2381 if rejected: 

2382 # For now, accept the registry entries but not the files. 

2383 log.warning( 

2384 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2385 len(rejected), 

2386 len(accepted), 

2387 datasetType, 

2388 run, 

2389 ) 

2390 

2391 return source_refs 

2392 

2393 def validateConfiguration( 

2394 self, 

2395 logFailures: bool = False, 

2396 datasetTypeNames: Optional[Iterable[str]] = None, 

2397 ignore: Iterable[str] | None = None, 

2398 ) -> None: 

2399 """Validate butler configuration. 

2400 

2401 Checks that each `DatasetType` can be stored in the `Datastore`. 

2402 

2403 Parameters 

2404 ---------- 

2405 logFailures : `bool`, optional 

2406 If `True`, output a log message for every validation error 

2407 detected. 

2408 datasetTypeNames : iterable of `str`, optional 

2409 The `DatasetType` names that should be checked. This allows 

2410 only a subset to be selected. 

2411 ignore : iterable of `str`, optional 

2412 Names of DatasetTypes to skip over. This can be used to skip 

2413 known problems. If a named `DatasetType` corresponds to a 

2414 composite, all components of that `DatasetType` will also be 

2415 ignored. 

2416 

2417 Raises 

2418 ------ 

2419 ButlerValidationError 

2420 Raised if there is some inconsistency with how this Butler 

2421 is configured. 
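
Examples
--------
A minimal sketch that logs every problem found and skips a known-bad
dataset type (the ignored name is illustrative)::

    butler.validateConfiguration(logFailures=True,
                                 ignore=["unvalidated_type"])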

2422 """ 

2423 if datasetTypeNames: 

2424 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2425 else: 

2426 datasetTypes = list(self.registry.queryDatasetTypes()) 

2427 

2428 # filter out anything from the ignore list 

2429 if ignore: 

2430 ignore = set(ignore) 

2431 datasetTypes = [ 

2432 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2433 ] 

2434 else: 

2435 ignore = set() 

2436 

2437 # Find all the registered instruments 

2438 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2439 

2440 # For each datasetType that has an instrument dimension, create 

2441 # a DatasetRef for each defined instrument 

2442 datasetRefs = [] 

2443 

2444 for datasetType in datasetTypes: 

2445 if "instrument" in datasetType.dimensions: 

2446 for instrument in instruments: 

2447 datasetRef = DatasetRef( 

2448 datasetType, {"instrument": instrument}, conform=False # type: ignore 

2449 ) 

2450 datasetRefs.append(datasetRef) 

2451 

2452 entities: List[Union[DatasetType, DatasetRef]] = [] 

2453 entities.extend(datasetTypes) 

2454 entities.extend(datasetRefs) 

2455 

2456 datastoreErrorStr = None 

2457 try: 

2458 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2459 except ValidationError as e: 

2460 datastoreErrorStr = str(e) 

2461 

2462 # Also check that the LookupKeys used by the datastores match 

2463 # registry and storage class definitions 

2464 keys = self.datastore.getLookupKeys() 

2465 

2466 failedNames = set() 

2467 failedDataId = set() 

2468 for key in keys: 

2469 if key.name is not None: 

2470 if key.name in ignore: 

2471 continue 

2472 

2473 # skip if specific datasetType names were requested and this 

2474 # name does not match 

2475 if datasetTypeNames and key.name not in datasetTypeNames: 

2476 continue 

2477 

2478 # See if it is a StorageClass or a DatasetType 

2479 if key.name in self.storageClasses: 

2480 pass 

2481 else: 

2482 try: 

2483 self.registry.getDatasetType(key.name) 

2484 except KeyError: 

2485 if logFailures: 

2486 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2487 failedNames.add(key) 

2488 else: 

2489 # Dimensions are checked for consistency when the Butler 

2490 # is created and rendezvoused with a universe. 

2491 pass 

2492 

2493 # Check that the instrument is a valid instrument 

2494 # Currently only support instrument so check for that 

2495 if key.dataId: 

2496 dataIdKeys = set(key.dataId) 

2497 if set(["instrument"]) != dataIdKeys: 

2498 if logFailures: 

2499 log.critical("Key '%s' has unsupported DataId override", key) 

2500 failedDataId.add(key) 

2501 elif key.dataId["instrument"] not in instruments: 

2502 if logFailures: 

2503 log.critical("Key '%s' has unknown instrument", key) 

2504 failedDataId.add(key) 

2505 

2506 messages = [] 

2507 

2508 if datastoreErrorStr: 

2509 messages.append(datastoreErrorStr) 

2510 

2511 for failed, msg in ( 

2512 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2513 (failedDataId, "Keys with bad DataId entries: "), 

2514 ): 

2515 if failed: 

2516 msg += ", ".join(str(k) for k in failed) 

2517 messages.append(msg) 

2518 

2519 if messages: 

2520 raise ValidationError(";\n".join(messages)) 

2521 

2522 @property 

2523 def collections(self) -> Sequence[str]: 

2524 """The collections to search by default, in order 

2525 (`Sequence` [ `str` ]). 

2526 

2527 This is an alias for ``self.registry.defaults.collections``. It cannot 

2528 be set directly in isolation, but all defaults may be changed together 

2529 by assigning a new `RegistryDefaults` instance to 

2530 ``self.registry.defaults``. 
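
For example (a sketch; the collection and run names are illustrative, and
`RegistryDefaults` is assumed here to accept ``collections`` and ``run``
keyword arguments)::

    butler.registry.defaults = RegistryDefaults(
        collections=["HSC/defaults"], run="u/someone/outputs"
    )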

2531 """ 

2532 return self.registry.defaults.collections 

2533 

2534 @property 

2535 def run(self) -> Optional[str]: 

2536 """Name of the run this butler writes outputs to by default (`str` or 

2537 `None`). 

2538 

2539 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2540 directly in isolation, but all defaults may be changed together by 

2541 assigning a new `RegistryDefaults` instance to 

2542 ``self.registry.defaults``. 

2543 """ 

2544 return self.registry.defaults.run 

2545 

2546 @property 

2547 def dimensions(self) -> DimensionUniverse: 

2548 # Docstring inherited. 

2549 return self.registry.dimensions 

2550 

2551 registry: Registry 

2552 """The object that manages dataset metadata and relationships (`Registry`). 

2553 

2554 Most operations that don't involve reading or writing butler datasets are 

2555 accessible only via `Registry` methods. 

2556 """ 

2557 

2558 datastore: Datastore 

2559 """The object that manages actual dataset storage (`Datastore`). 

2560 

2561 Direct user access to the datastore should rarely be necessary; the primary 

2562 exception is the case where a `Datastore` implementation provides extra 

2563 functionality beyond what the base class defines. 

2564 """ 

2565 

2566 storageClasses: StorageClassFactory 

2567 """An object that maps known storage class names to objects that fully 

2568 describe them (`StorageClassFactory`). 

2569 """ 

2570 

2571 _allow_put_of_predefined_dataset: bool 

2572 """Allow a put to succeed even if there is already a registry entry for it 

2573 but not a datastore record. (`bool`)."""