Coverage for python/lsst/daf/butler/direct_butler/_direct_butler.py: 11%

763 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-08 02:51 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Butler top level classes. 

29""" 

30from __future__ import annotations 

31 

32__all__ = ( 

33 "DirectButler", 

34 "ButlerValidationError", 

35) 

36 

37import collections.abc 

38import contextlib 

39import io 

40import itertools 

41import logging 

42import numbers 

43import os 

44import warnings 

45from collections import Counter, defaultdict 

46from collections.abc import Iterable, Iterator, MutableMapping, Sequence 

47from typing import TYPE_CHECKING, Any, ClassVar, TextIO, cast 

48 

49from lsst.resources import ResourcePath, ResourcePathExpression 

50from lsst.utils.introspection import get_class_of 

51from lsst.utils.logging import VERBOSE, getLogger 

52from sqlalchemy.exc import IntegrityError 

53 

54from .._butler import Butler 

55from .._butler_config import ButlerConfig 

56from .._butler_instance_options import ButlerInstanceOptions 

57from .._dataset_existence import DatasetExistence 

58from .._dataset_ref import DatasetRef 

59from .._dataset_type import DatasetType 

60from .._deferredDatasetHandle import DeferredDatasetHandle 

61from .._exceptions import DatasetNotFoundError, DimensionValueError, ValidationError 

62from .._limited_butler import LimitedButler 

63from .._registry_shim import RegistryShim 

64from .._storage_class import StorageClass, StorageClassFactory 

65from .._timespan import Timespan 

66from ..datastore import Datastore, NullDatastore 

67from ..dimensions import DataCoordinate, Dimension 

68from ..direct_query_driver import DirectQueryDriver 

69from ..progress import Progress 

70from ..queries import Query 

71from ..registry import ( 

72 CollectionType, 

73 ConflictingDefinitionError, 

74 DataIdError, 

75 MissingDatasetTypeError, 

76 RegistryDefaults, 

77 _RegistryFactory, 

78) 

79from ..registry.sql_registry import SqlRegistry 

80from ..transfers import RepoExportContext 

81from ..utils import transactional 

82from ._direct_butler_collections import DirectButlerCollections 

83 

84if TYPE_CHECKING: 

85 from lsst.resources import ResourceHandleProtocol 

86 

87 from .._dataset_ref import DatasetId 

88 from .._file_dataset import FileDataset 

89 from ..datastore import DatasetRefURIs 

90 from ..dimensions import DataId, DataIdValue, DimensionElement, DimensionRecord, DimensionUniverse 

91 from ..registry import Registry 

92 from ..transfers import RepoImportBackend 

93 

94_LOG = getLogger(__name__) 

95 

96 

97class ButlerValidationError(ValidationError): 

98 """There is a problem with the Butler configuration.""" 

99 

100 pass 

101 

102 

103class DirectButler(Butler): # numpydoc ignore=PR02 

104 """Main entry point for the data access system. 

105 

106 Parameters 

107 ---------- 

108 config : `ButlerConfig` 

109 The configuration for this Butler instance. 

110 registry : `SqlRegistry` 

111 The object that manages dataset metadata and relationships. 

112 datastore : Datastore 

113 The object that manages actual dataset storage. 

114 storageClasses : StorageClassFactory 

115 An object that maps known storage class names to objects that fully 

116 describe them. 

117 

118 Notes 

119 ----- 

120 Most users should call the top-level `Butler`.``from_config`` instead of 

121 using this constructor directly. 

122 """ 

123 

124 # This is __new__ instead of __init__ because we have to support 

125 # instantiation via the legacy constructor Butler.__new__(), which 

126 # reads the configuration and selects which subclass to instantiate. The 

127 # interaction between __new__ and __init__ is kind of wacky in Python. If 

128 # we were using __init__ here, __init__ would be called twice (once when 

129 # the DirectButler instance is constructed inside Butler.from_config(), and 

130 # a second time with the original arguments to Butler() when the instance 

131 # is returned from Butler.__new__() 

132 def __new__( 

133 cls, 

134 *, 

135 config: ButlerConfig, 

136 registry: SqlRegistry, 

137 datastore: Datastore, 

138 storageClasses: StorageClassFactory, 

139 ) -> DirectButler: 

140 self = cast(DirectButler, super().__new__(cls)) 

141 self._config = config 

142 self._registry = registry 

143 self._datastore = datastore 

144 self.storageClasses = storageClasses 

145 

146 # For execution butler the datastore needs a special 

147 # dependency-inversion trick. This is not used by regular butler, 

148 # but we do not have a way to distinguish regular butler from execution 

149 # butler. 

150 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

151 

152 self._registry_shim = RegistryShim(self) 

153 

154 return self 

155 

156 @classmethod 

157 def create_from_config( 

158 cls, 

159 config: ButlerConfig, 

160 *, 

161 options: ButlerInstanceOptions, 

162 without_datastore: bool = False, 

163 ) -> DirectButler: 

164 """Construct a Butler instance from a configuration file. 

165 

166 Parameters 

167 ---------- 

168 config : `ButlerConfig` 

169 The configuration for this Butler instance. 

170 options : `ButlerInstanceOptions` 

171 Default values and other settings for the Butler instance. 

172 without_datastore : `bool`, optional 

173 If `True` do not attach a datastore to this butler. Any attempts 

174 to use a datastore will fail. 

175 

176 Notes 

177 ----- 

178 Most users should call the top-level `Butler`.``from_config`` 

179 instead of using this function directly. 

180 """ 

181 if "run" in config or "collection" in config: 

182 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

183 

184 defaults = RegistryDefaults( 

185 collections=options.collections, run=options.run, infer=options.inferDefaults, **options.kwargs 

186 ) 

187 try: 

188 butlerRoot = config.get("root", config.configDir) 

189 writeable = options.writeable 

190 if writeable is None: 

191 writeable = options.run is not None 

192 registry = _RegistryFactory(config).from_config( 

193 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

194 ) 

195 if without_datastore: 

196 datastore: Datastore = NullDatastore(None, None) 

197 else: 

198 datastore = Datastore.fromConfig( 

199 config, registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

200 ) 

201 # TODO: Once datastore drops dependency on registry we can 

202 # construct datastore first and pass opaque tables to registry 

203 # constructor. 

204 registry.make_datastore_tables(datastore.get_opaque_table_definitions()) 

205 storageClasses = StorageClassFactory() 

206 storageClasses.addFromConfig(config) 

207 

208 return DirectButler( 

209 config=config, registry=registry, datastore=datastore, storageClasses=storageClasses 

210 ) 

211 except Exception: 

212 # Failures here usually mean that configuration is incomplete, 

213 # just issue an error message which includes config file URI. 

214 _LOG.error(f"Failed to instantiate Butler from config {config.configFile}.") 

215 raise 

216 

217 def _clone( 

218 self, 

219 *, 

220 collections: Any = None, 

221 run: str | None = None, 

222 inferDefaults: bool = True, 

223 **kwargs: Any, 

224 ) -> DirectButler: 

225 # Docstring inherited 

226 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

227 registry = self._registry.copy(defaults) 

228 

229 return DirectButler( 

230 registry=registry, 

231 config=self._config, 

232 datastore=self._datastore.clone(registry.getDatastoreBridgeManager()), 

233 storageClasses=self.storageClasses, 

234 ) 

235 

236 GENERATION: ClassVar[int] = 3 

237 """This is a Generation 3 Butler. 

238 

239 This attribute may be removed in the future, once the Generation 2 Butler 

240 interface has been fully retired; it should only be used in transitional 

241 code. 

242 """ 

243 

244 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

245 """Return DatasetType defined in registry given dataset type name.""" 

246 try: 

247 return self.get_dataset_type(name) 

248 except MissingDatasetTypeError: 

249 return None 

250 

251 @classmethod 

252 def _unpickle( 

253 cls, 

254 config: ButlerConfig, 

255 collections: tuple[str, ...] | None, 

256 run: str | None, 

257 defaultDataId: dict[str, str], 

258 writeable: bool, 

259 ) -> DirectButler: 

260 """Callable used to unpickle a Butler. 

261 

262 We prefer not to use ``Butler.__init__`` directly so we can force some 

263 of its many arguments to be keyword-only (note that ``__reduce__`` 

264 can only invoke callables with positional arguments). 

265 

266 Parameters 

267 ---------- 

268 config : `ButlerConfig` 

269 Butler configuration, already coerced into a true `ButlerConfig` 

270 instance (and hence after any search paths for overrides have been 

271 utilized). 

272 collections : `tuple` [ `str` ] 

273 Names of the default collections to read from. 

274 run : `str`, optional 

275 Name of the default `~CollectionType.RUN` collection to write to. 

276 defaultDataId : `dict` [ `str`, `str` ] 

277 Default data ID values. 

278 writeable : `bool` 

279 Whether the Butler should support write operations. 

280 

281 Returns 

282 ------- 

283 butler : `Butler` 

284 A new `Butler` instance. 

285 """ 

286 return cls.create_from_config( 

287 config=config, 

288 options=ButlerInstanceOptions( 

289 collections=collections, run=run, writeable=writeable, kwargs=defaultDataId 

290 ), 

291 ) 

292 

293 def __reduce__(self) -> tuple: 

294 """Support pickling.""" 

295 return ( 

296 DirectButler._unpickle, 

297 ( 

298 self._config, 

299 self.collections, 

300 self.run, 

301 dict(self._registry.defaults.dataId.required), 

302 self._registry.isWriteable(), 

303 ), 

304 ) 

305 

306 def __str__(self) -> str: 

307 return ( 

308 f"Butler(collections={self.collections}, run={self.run}, " 

309 f"datastore='{self._datastore}', registry='{self._registry}')" 

310 ) 

311 

312 def isWriteable(self) -> bool: 

313 # Docstring inherited. 

314 return self._registry.isWriteable() 

315 

316 def _caching_context(self) -> contextlib.AbstractContextManager[None]: 

317 """Context manager that enables caching.""" 

318 return self._registry.caching_context() 

319 

320 @contextlib.contextmanager 

321 def transaction(self) -> Iterator[None]: 

322 """Context manager supporting `Butler` transactions. 

323 

324 Transactions can be nested. 

325 """ 

326 with self._registry.transaction(), self._datastore.transaction(): 

327 yield 

328 

329 def _standardizeArgs( 

330 self, 

331 datasetRefOrType: DatasetRef | DatasetType | str, 

332 dataId: DataId | None = None, 

333 for_put: bool = True, 

334 **kwargs: Any, 

335 ) -> tuple[DatasetType, DataId | None]: 

336 """Standardize the arguments passed to several Butler APIs. 

337 

338 Parameters 

339 ---------- 

340 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

341 When `DatasetRef` the `dataId` should be `None`. 

342 Otherwise the `DatasetType` or name thereof. 

343 dataId : `dict` or `DataCoordinate` 

344 A `dict` of `Dimension` link name, value pairs that label the 

345 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

346 should be provided as the second argument. 

347 for_put : `bool`, optional 

348 If `True` this call is invoked as part of a `Butler.put()`. 

349 Otherwise it is assumed to be part of a `Butler.get()`. This 

350 parameter is only relevant if there is dataset type 

351 inconsistency. 

352 **kwargs 

353 Additional keyword arguments used to augment or construct a 

354 `DataCoordinate`. See `DataCoordinate.standardize` 

355 parameters. 

356 

357 Returns 

358 ------- 

359 datasetType : `DatasetType` 

360 A `DatasetType` instance extracted from ``datasetRefOrType``. 

361 dataId : `dict` or `DataId`, optional 

362 Argument that can be used (along with ``kwargs``) to construct a 

363 `DataId`. 

364 

365 Notes 

366 ----- 

367 Butler APIs that conceptually need a DatasetRef also allow passing a 

368 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

369 keyword arguments that can be used to construct one) separately. This 

370 method accepts those arguments and always returns a true `DatasetType` 

371 and a `DataId` or `dict`. 

372 

373 Standardization of `dict` vs `DataId` is best handled by passing the 

374 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

375 generally similarly flexible. 

376 """ 

377 externalDatasetType: DatasetType | None = None 

378 internalDatasetType: DatasetType | None = None 

379 if isinstance(datasetRefOrType, DatasetRef): 

380 if dataId is not None or kwargs: 

381 raise ValueError("DatasetRef given, cannot use dataId as well") 

382 externalDatasetType = datasetRefOrType.datasetType 

383 dataId = datasetRefOrType.dataId 

384 else: 

385 # Don't check whether DataId is provided, because Registry APIs 

386 # can usually construct a better error message when it wasn't. 

387 if isinstance(datasetRefOrType, DatasetType): 

388 externalDatasetType = datasetRefOrType 

389 else: 

390 internalDatasetType = self.get_dataset_type(datasetRefOrType) 

391 

392 # Check that they are self-consistent 

393 if externalDatasetType is not None: 

394 internalDatasetType = self.get_dataset_type(externalDatasetType.name) 

395 if externalDatasetType != internalDatasetType: 

396 # We can allow differences if they are compatible, depending 

397 # on whether this is a get or a put. A get requires that 

398 # the python type associated with the datastore can be 

399 # converted to the user type. A put requires that the user 

400 # supplied python type can be converted to the internal 

401 # type expected by registry. 

402 relevantDatasetType = internalDatasetType 

403 if for_put: 

404 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

405 else: 

406 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

407 relevantDatasetType = externalDatasetType 

408 if not is_compatible: 

409 raise ValueError( 

410 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

411 f"registry definition ({internalDatasetType})" 

412 ) 

413 # Override the internal definition. 

414 internalDatasetType = relevantDatasetType 

415 

416 assert internalDatasetType is not None 

417 return internalDatasetType, dataId 

418 

419 def _rewrite_data_id( 

420 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any 

421 ) -> tuple[DataId | None, dict[str, Any]]: 

422 """Rewrite a data ID taking into account dimension records. 

423 

424 Take a Data ID and keyword args and rewrite it if necessary to 

425 allow the user to specify dimension records rather than dimension 

426 primary values. 

427 

428 This allows a user to include a dataId dict with keys of 

429 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

430 the integer exposure ID. It also allows a string to be given 

431 for a dimension value rather than the integer ID if that is more 

432 convenient. For example, rather than having to specifying the 

433 detector with ``detector.full_name``, a string given for ``detector`` 

434 will be interpreted as the full name and converted to the integer 

435 value. 

436 

437 Keyword arguments can also use strings for dimensions like detector 

438 and exposure but python does not allow them to include ``.`` and 

439 so the ``exposure.day_obs`` syntax can not be used in a keyword 

440 argument. 

441 

442 Parameters 

443 ---------- 

444 dataId : `dict` or `DataCoordinate` 

445 A `dict` of `Dimension` link name, value pairs that will label the 

446 `DatasetRef` within a Collection. 

447 datasetType : `DatasetType` 

448 The dataset type associated with this dataId. Required to 

449 determine the relevant dimensions. 

450 **kwargs 

451 Additional keyword arguments used to augment or construct a 

452 `DataId`. See `DataId` parameters. 

453 

454 Returns 

455 ------- 

456 dataId : `dict` or `DataCoordinate` 

457 The, possibly rewritten, dataId. If given a `DataCoordinate` and 

458 no keyword arguments, the original dataId will be returned 

459 unchanged. 

460 **kwargs : `dict` 

461 Any unused keyword arguments (would normally be empty dict). 

462 """ 

463 # Do nothing if we have a standalone DataCoordinate. 

464 if isinstance(dataId, DataCoordinate) and not kwargs: 

465 return dataId, kwargs 

466 

467 # Process dimension records that are using record information 

468 # rather than ids 

469 newDataId: dict[str, DataIdValue] = {} 

470 byRecord: dict[str, dict[str, Any]] = defaultdict(dict) 

471 

472 # if all the dataId comes from keyword parameters we do not need 

473 # to do anything here because they can't be of the form 

474 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

475 if dataId: 

476 for k, v in dataId.items(): 

477 # If we have a Dimension we do not need to do anything 

478 # because it cannot be a compound key. 

479 if isinstance(k, str) and "." in k: 

480 # Someone is using a more human-readable dataId 

481 dimensionName, record = k.split(".", 1) 

482 byRecord[dimensionName][record] = v 

483 elif isinstance(k, Dimension): 

484 newDataId[k.name] = v 

485 else: 

486 newDataId[k] = v 

487 

488 # Go through the updated dataId and check the type in case someone is 

489 # using an alternate key. We have already filtered out the compound 

490 # keys dimensions.record format. 

491 not_dimensions = {} 

492 

493 # Will need to look in the dataId and the keyword arguments 

494 # and will remove them if they need to be fixed or are unrecognized. 

495 for dataIdDict in (newDataId, kwargs): 

496 # Use a list so we can adjust the dict safely in the loop 

497 for dimensionName in list(dataIdDict): 

498 value = dataIdDict[dimensionName] 

499 try: 

500 dimension = self.dimensions.dimensions[dimensionName] 

501 except KeyError: 

502 # This is not a real dimension 

503 not_dimensions[dimensionName] = value 

504 del dataIdDict[dimensionName] 

505 continue 

506 

507 # Convert an integral type to an explicit int to simplify 

508 # comparisons here 

509 if isinstance(value, numbers.Integral): 

510 value = int(value) 

511 

512 if not isinstance(value, dimension.primaryKey.getPythonType()): 

513 for alternate in dimension.alternateKeys: 

514 if isinstance(value, alternate.getPythonType()): 

515 byRecord[dimensionName][alternate.name] = value 

516 del dataIdDict[dimensionName] 

517 _LOG.debug( 

518 "Converting dimension %s to %s.%s=%s", 

519 dimensionName, 

520 dimensionName, 

521 alternate.name, 

522 value, 

523 ) 

524 break 

525 else: 

526 _LOG.warning( 

527 "Type mismatch found for value '%r' provided for dimension %s. " 

528 "Could not find matching alternative (primary key has type %s) " 

529 "so attempting to use as-is.", 

530 value, 

531 dimensionName, 

532 dimension.primaryKey.getPythonType(), 

533 ) 

534 

535 # By this point kwargs and newDataId should only include valid 

536 # dimensions. Merge kwargs in to the new dataId and log if there 

537 # are dimensions in both (rather than calling update). 

538 for k, v in kwargs.items(): 

539 if k in newDataId and newDataId[k] != v: 

540 _LOG.debug( 

541 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

542 ) 

543 newDataId[k] = v 

544 # No need to retain any values in kwargs now. 

545 kwargs = {} 

546 

547 # If we have some unrecognized dimensions we have to try to connect 

548 # them to records in other dimensions. This is made more complicated 

549 # by some dimensions having records with clashing names. A mitigation 

550 # is that we can tell by this point which dimensions are missing 

551 # for the DatasetType but this does not work for calibrations 

552 # where additional dimensions can be used to constrain the temporal 

553 # axis. 

554 if not_dimensions: 

555 # Search for all dimensions even if we have been given a value 

556 # explicitly. In some cases records are given as well as the 

557 # actually dimension and this should not be an error if they 

558 # match. 

559 mandatoryDimensions = datasetType.dimensions.names # - provided 

560 

561 candidateDimensions: set[str] = set() 

562 candidateDimensions.update(mandatoryDimensions) 

563 

564 # For calibrations we may well be needing temporal dimensions 

565 # so rather than always including all dimensions in the scan 

566 # restrict things a little. It is still possible for there 

567 # to be confusion over day_obs in visit vs exposure for example. 

568 # If we are not searching calibration collections things may 

569 # fail but they are going to fail anyway because of the 

570 # ambiguousness of the dataId... 

571 if datasetType.isCalibration(): 

572 for dim in self.dimensions.dimensions: 

573 if dim.temporal: 

574 candidateDimensions.add(str(dim)) 

575 

576 # Look up table for the first association with a dimension 

577 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) 

578 

579 # Keep track of whether an item is associated with multiple 

580 # dimensions. 

581 counter: Counter[str] = Counter() 

582 assigned: dict[str, set[str]] = defaultdict(set) 

583 

584 # Go through the missing dimensions and associate the 

585 # given names with records within those dimensions 

586 matched_dims = set() 

587 for dimensionName in candidateDimensions: 

588 dimension = self.dimensions.dimensions[dimensionName] 

589 fields = dimension.metadata.names | dimension.uniqueKeys.names 

590 for field in not_dimensions: 

591 if field in fields: 

592 guessedAssociation[dimensionName][field] = not_dimensions[field] 

593 counter[dimensionName] += 1 

594 assigned[field].add(dimensionName) 

595 matched_dims.add(field) 

596 

597 # Calculate the fields that matched nothing. 

598 never_found = set(not_dimensions) - matched_dims 

599 

600 if never_found: 

601 raise DimensionValueError(f"Unrecognized keyword args given: {never_found}") 

602 

603 # There is a chance we have allocated a single dataId item 

604 # to multiple dimensions. Need to decide which should be retained. 

605 # For now assume that the most popular alternative wins. 

606 # This means that day_obs with seq_num will result in 

607 # exposure.day_obs and not visit.day_obs 

608 # Also prefer an explicitly missing dimension over an inferred 

609 # temporal dimension. 

610 for fieldName, assignedDimensions in assigned.items(): 

611 if len(assignedDimensions) > 1: 

612 # Pick the most popular (preferring mandatory dimensions) 

613 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

614 if requiredButMissing: 

615 candidateDimensions = requiredButMissing 

616 else: 

617 candidateDimensions = assignedDimensions 

618 

619 # If this is a choice between visit and exposure and 

620 # neither was a required part of the dataset type, 

621 # (hence in this branch) always prefer exposure over 

622 # visit since exposures are always defined and visits 

623 # are defined from exposures. 

624 if candidateDimensions == {"exposure", "visit"}: 

625 candidateDimensions = {"exposure"} 

626 

627 # Select the relevant items and get a new restricted 

628 # counter. 

629 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

630 duplicatesCounter: Counter[str] = Counter() 

631 duplicatesCounter.update(theseCounts) 

632 

633 # Choose the most common. If they are equally common 

634 # we will pick the one that was found first. 

635 # Returns a list of tuples 

636 selected = duplicatesCounter.most_common(1)[0][0] 

637 

638 _LOG.debug( 

639 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

640 " Removed ambiguity by choosing dimension %s.", 

641 fieldName, 

642 ", ".join(assignedDimensions), 

643 selected, 

644 ) 

645 

646 for candidateDimension in assignedDimensions: 

647 if candidateDimension != selected: 

648 del guessedAssociation[candidateDimension][fieldName] 

649 

650 # Update the record look up dict with the new associations 

651 for dimensionName, values in guessedAssociation.items(): 

652 if values: # A dict might now be empty 

653 _LOG.debug( 

654 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values 

655 ) 

656 byRecord[dimensionName].update(values) 

657 

658 if byRecord: 

659 # Some record specifiers were found so we need to convert 

660 # them to the Id form 

661 for dimensionName, values in byRecord.items(): 

662 if dimensionName in newDataId: 

663 _LOG.debug( 

664 "DataId specified explicit %s dimension value of %s in addition to" 

665 " general record specifiers for it of %s. Ignoring record information.", 

666 dimensionName, 

667 newDataId[dimensionName], 

668 str(values), 

669 ) 

670 # Get the actual record and compare with these values. 

671 try: 

672 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

673 except DataIdError: 

674 raise DimensionValueError( 

675 f"Could not find dimension '{dimensionName}'" 

676 f" with dataId {newDataId} as part of comparing with" 

677 f" record values {byRecord[dimensionName]}" 

678 ) from None 

679 if len(recs) == 1: 

680 errmsg: list[str] = [] 

681 for k, v in values.items(): 

682 if (recval := getattr(recs[0], k)) != v: 

683 errmsg.append(f"{k}({recval} != {v})") 

684 if errmsg: 

685 raise DimensionValueError( 

686 f"Dimension {dimensionName} in dataId has explicit value" 

687 " inconsistent with records: " + ", ".join(errmsg) 

688 ) 

689 else: 

690 # Multiple matches for an explicit dimension 

691 # should never happen but let downstream complain. 

692 pass 

693 continue 

694 

695 # Build up a WHERE expression 

696 bind = dict(values.items()) 

697 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

698 

699 # Hopefully we get a single record that matches 

700 records = set( 

701 self._registry.queryDimensionRecords( 

702 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

703 ) 

704 ) 

705 

706 if len(records) != 1: 

707 if len(records) > 1: 

708 # visit can have an ambiguous answer without involving 

709 # visit_system. The default visit_system is defined 

710 # by the instrument. 

711 if ( 

712 dimensionName == "visit" 

713 and "visit_system_membership" in self.dimensions 

714 and "visit_system" in self.dimensions["instrument"].metadata 

715 ): 

716 instrument_records = list( 

717 self._registry.queryDimensionRecords( 

718 "instrument", 

719 dataId=newDataId, 

720 **kwargs, 

721 ) 

722 ) 

723 if len(instrument_records) == 1: 

724 visit_system = instrument_records[0].visit_system 

725 if visit_system is None: 

726 # Set to a value that will never match. 

727 visit_system = -1 

728 

729 # Look up each visit in the 

730 # visit_system_membership records. 

731 for rec in records: 

732 membership = list( 

733 self._registry.queryDimensionRecords( 

734 # Use bind to allow zero results. 

735 # This is a fully-specified query. 

736 "visit_system_membership", 

737 where="instrument = inst AND visit_system = system AND visit = v", 

738 bind=dict( 

739 inst=instrument_records[0].name, system=visit_system, v=rec.id 

740 ), 

741 ) 

742 ) 

743 if membership: 

744 # This record is the right answer. 

745 records = {rec} 

746 break 

747 

748 # The ambiguity may have been resolved so check again. 

749 if len(records) > 1: 

750 _LOG.debug( 

751 "Received %d records from constraints of %s", len(records), str(values) 

752 ) 

753 for r in records: 

754 _LOG.debug("- %s", str(r)) 

755 raise DimensionValueError( 

756 f"DataId specification for dimension {dimensionName} is not" 

757 f" uniquely constrained to a single dataset by {values}." 

758 f" Got {len(records)} results." 

759 ) 

760 else: 

761 raise DimensionValueError( 

762 f"DataId specification for dimension {dimensionName} matched no" 

763 f" records when constrained by {values}" 

764 ) 

765 

766 # Get the primary key from the real dimension object 

767 dimension = self.dimensions.dimensions[dimensionName] 

768 if not isinstance(dimension, Dimension): 

769 raise RuntimeError( 

770 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

771 ) 

772 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

773 

774 return newDataId, kwargs 

775 

776 def _findDatasetRef( 

777 self, 

778 datasetRefOrType: DatasetRef | DatasetType | str, 

779 dataId: DataId | None = None, 

780 *, 

781 collections: Any = None, 

782 predict: bool = False, 

783 run: str | None = None, 

784 datastore_records: bool = False, 

785 timespan: Timespan | None = None, 

786 **kwargs: Any, 

787 ) -> DatasetRef: 

788 """Shared logic for methods that start with a search for a dataset in 

789 the registry. 

790 

791 Parameters 

792 ---------- 

793 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

794 When `DatasetRef` the `dataId` should be `None`. 

795 Otherwise the `DatasetType` or name thereof. 

796 dataId : `dict` or `DataCoordinate`, optional 

797 A `dict` of `Dimension` link name, value pairs that label the 

798 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

799 should be provided as the first argument. 

800 collections : Any, optional 

801 Collections to be searched, overriding ``self.collections``. 

802 Can be any of the types supported by the ``collections`` argument 

803 to butler construction. 

804 predict : `bool`, optional 

805 If `True`, return a newly created `DatasetRef` with a unique 

806 dataset ID if finding a reference in the `Registry` fails. 

807 Defaults to `False`. 

808 run : `str`, optional 

809 Run collection name to use for creating `DatasetRef` for predicted 

810 datasets. Only used if ``predict`` is `True`. 

811 datastore_records : `bool`, optional 

812 If `True` add datastore records to returned `DatasetRef`. 

813 timespan : `Timespan` or `None`, optional 

814 A timespan that the validity range of the dataset must overlap. 

815 If not provided and this is a calibration dataset type, an attempt 

816 will be made to find the timespan from any temporal coordinate 

817 in the data ID. 

818 **kwargs 

819 Additional keyword arguments used to augment or construct a 

820 `DataId`. See `DataId` parameters. 

821 

822 Returns 

823 ------- 

824 ref : `DatasetRef` 

825 A reference to the dataset identified by the given arguments. 

826 This can be the same dataset reference as given if it was 

827 resolved. 

828 

829 Raises 

830 ------ 

831 LookupError 

832 Raised if no matching dataset exists in the `Registry` (and 

833 ``predict`` is `False`). 

834 ValueError 

835 Raised if a resolved `DatasetRef` was passed as an input, but it 

836 differs from the one found in the registry. 

837 TypeError 

838 Raised if no collections were provided. 

839 """ 

840 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

841 if isinstance(datasetRefOrType, DatasetRef): 

842 if collections is not None: 

843 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) 

844 # May need to retrieve datastore records if requested. 

845 if datastore_records and datasetRefOrType._datastore_records is None: 

846 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType) 

847 return datasetRefOrType 

848 

849 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

850 

851 if datasetType.isCalibration(): 

852 # Because this is a calibration dataset, first try to make a 

853 # standardize the data ID without restricting the dimensions to 

854 # those of the dataset type requested, because there may be extra 

855 # dimensions that provide temporal information for a validity-range 

856 # lookup. 

857 dataId = DataCoordinate.standardize( 

858 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

859 ) 

860 if timespan is None: 

861 if dataId.dimensions.temporal: 

862 dataId = self._registry.expandDataId(dataId) 

863 # Use the timespan from the data ID to constrain the 

864 # calibration lookup, but only if the caller has not 

865 # specified an explicit timespan. 

866 timespan = dataId.timespan 

867 else: 

868 # Try an arbitrary timespan. Downstream will fail if this 

869 # results in more than one matching dataset. 

870 timespan = Timespan(None, None) 

871 else: 

872 # Standardize the data ID to just the dimensions of the dataset 

873 # type instead of letting registry.findDataset do it, so we get the 

874 # result even if no dataset is found. 

875 dataId = DataCoordinate.standardize( 

876 dataId, 

877 dimensions=datasetType.dimensions, 

878 defaults=self._registry.defaults.dataId, 

879 **kwargs, 

880 ) 

881 # Always lookup the DatasetRef, even if one is given, to ensure it is 

882 # present in the current collection. 

883 ref = self.find_dataset( 

884 datasetType, 

885 dataId, 

886 collections=collections, 

887 timespan=timespan, 

888 datastore_records=datastore_records, 

889 ) 

890 if ref is None: 

891 if predict: 

892 if run is None: 

893 run = self.run 

894 if run is None: 

895 raise TypeError("Cannot predict dataset ID/location with run=None.") 

896 return DatasetRef(datasetType, dataId, run=run) 

897 else: 

898 if collections is None: 

899 collections = self._registry.defaults.collections 

900 raise DatasetNotFoundError( 

901 f"Dataset {datasetType.name} with data ID {dataId} " 

902 f"could not be found in collections {collections}." 

903 ) 

904 if datasetType != ref.datasetType: 

905 # If they differ it is because the user explicitly specified 

906 # a compatible dataset type to this call rather than using the 

907 # registry definition. The DatasetRef must therefore be recreated 

908 # using the user definition such that the expected type is 

909 # returned. 

910 ref = DatasetRef( 

911 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records 

912 ) 

913 

914 return ref 

915 

916 @transactional 

917 def put( 

918 self, 

919 obj: Any, 

920 datasetRefOrType: DatasetRef | DatasetType | str, 

921 /, 

922 dataId: DataId | None = None, 

923 *, 

924 run: str | None = None, 

925 **kwargs: Any, 

926 ) -> DatasetRef: 

927 """Store and register a dataset. 

928 

929 Parameters 

930 ---------- 

931 obj : `object` 

932 The dataset. 

933 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

934 When `DatasetRef` is provided, ``dataId`` should be `None`. 

935 Otherwise the `DatasetType` or name thereof. If a fully resolved 

936 `DatasetRef` is given the run and ID are used directly. 

937 dataId : `dict` or `DataCoordinate` 

938 A `dict` of `Dimension` link name, value pairs that label the 

939 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

940 should be provided as the second argument. 

941 run : `str`, optional 

942 The name of the run the dataset should be added to, overriding 

943 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

944 **kwargs 

945 Additional keyword arguments used to augment or construct a 

946 `DataCoordinate`. See `DataCoordinate.standardize` 

947 parameters. Not used if a resolve `DatasetRef` is provided. 

948 

949 Returns 

950 ------- 

951 ref : `DatasetRef` 

952 A reference to the stored dataset, updated with the correct id if 

953 given. 

954 

955 Raises 

956 ------ 

957 TypeError 

958 Raised if the butler is read-only or if no run has been provided. 

959 """ 

960 if isinstance(datasetRefOrType, DatasetRef): 

961 # This is a direct put of predefined DatasetRef. 

962 _LOG.debug("Butler put direct: %s", datasetRefOrType) 

963 if run is not None: 

964 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) 

965 # If registry already has a dataset with the same dataset ID, 

966 # dataset type and DataId, then _importDatasets will do nothing and 

967 # just return an original ref. We have to raise in this case, there 

968 # is a datastore check below for that. 

969 self._registry._importDatasets([datasetRefOrType], expand=True) 

970 # Before trying to write to the datastore check that it does not 

971 # know this dataset. This is prone to races, of course. 

972 if self._datastore.knows(datasetRefOrType): 

973 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

974 # Try to write dataset to the datastore, if it fails due to a race 

975 # with another write, the content of stored data may be 

976 # unpredictable. 

977 try: 

978 self._datastore.put(obj, datasetRefOrType) 

979 except IntegrityError as e: 

980 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e 

981 return datasetRefOrType 

982 

983 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

984 if not self.isWriteable(): 

985 raise TypeError("Butler is read-only.") 

986 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

987 

988 # Handle dimension records in dataId 

989 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

990 

991 # Add Registry Dataset entry. 

992 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs) 

993 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

994 self._datastore.put(obj, ref) 

995 

996 return ref 

997 

998 def getDeferred( 

999 self, 

1000 datasetRefOrType: DatasetRef | DatasetType | str, 

1001 /, 

1002 dataId: DataId | None = None, 

1003 *, 

1004 parameters: dict | None = None, 

1005 collections: Any = None, 

1006 storageClass: str | StorageClass | None = None, 

1007 timespan: Timespan | None = None, 

1008 **kwargs: Any, 

1009 ) -> DeferredDatasetHandle: 

1010 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1011 after an immediate registry lookup. 

1012 

1013 Parameters 

1014 ---------- 

1015 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1016 When `DatasetRef` the `dataId` should be `None`. 

1017 Otherwise the `DatasetType` or name thereof. 

1018 dataId : `dict` or `DataCoordinate`, optional 

1019 A `dict` of `Dimension` link name, value pairs that label the 

1020 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1021 should be provided as the first argument. 

1022 parameters : `dict` 

1023 Additional StorageClass-defined options to control reading, 

1024 typically used to efficiently read only a subset of the dataset. 

1025 collections : Any, optional 

1026 Collections to be searched, overriding ``self.collections``. 

1027 Can be any of the types supported by the ``collections`` argument 

1028 to butler construction. 

1029 storageClass : `StorageClass` or `str`, optional 

1030 The storage class to be used to override the Python type 

1031 returned by this method. By default the returned type matches 

1032 the dataset type definition for this dataset. Specifying a 

1033 read `StorageClass` can force a different type to be returned. 

1034 This type must be compatible with the original type. 

1035 timespan : `Timespan` or `None`, optional 

1036 A timespan that the validity range of the dataset must overlap. 

1037 If not provided and this is a calibration dataset type, an attempt 

1038 will be made to find the timespan from any temporal coordinate 

1039 in the data ID. 

1040 **kwargs 

1041 Additional keyword arguments used to augment or construct a 

1042 `DataId`. See `DataId` parameters. 

1043 

1044 Returns 

1045 ------- 

1046 obj : `DeferredDatasetHandle` 

1047 A handle which can be used to retrieve a dataset at a later time. 

1048 

1049 Raises 

1050 ------ 

1051 LookupError 

1052 Raised if no matching dataset exists in the `Registry` or 

1053 datastore. 

1054 ValueError 

1055 Raised if a resolved `DatasetRef` was passed as an input, but it 

1056 differs from the one found in the registry. 

1057 TypeError 

1058 Raised if no collections were provided. 

1059 """ 

1060 if isinstance(datasetRefOrType, DatasetRef): 

1061 # Do the quick check first and if that fails, check for artifact 

1062 # existence. This is necessary for datastores that are configured 

1063 # in trust mode where there won't be a record but there will be 

1064 # a file. 

1065 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType): 

1066 ref = datasetRefOrType 

1067 else: 

1068 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1069 else: 

1070 ref = self._findDatasetRef( 

1071 datasetRefOrType, dataId, collections=collections, timespan=timespan, **kwargs 

1072 ) 

1073 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1074 

1075 def get( 

1076 self, 

1077 datasetRefOrType: DatasetRef | DatasetType | str, 

1078 /, 

1079 dataId: DataId | None = None, 

1080 *, 

1081 parameters: dict[str, Any] | None = None, 

1082 collections: Any = None, 

1083 storageClass: StorageClass | str | None = None, 

1084 timespan: Timespan | None = None, 

1085 **kwargs: Any, 

1086 ) -> Any: 

1087 """Retrieve a stored dataset. 

1088 

1089 Parameters 

1090 ---------- 

1091 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1092 When `DatasetRef` the `dataId` should be `None`. 

1093 Otherwise the `DatasetType` or name thereof. 

1094 If a resolved `DatasetRef`, the associated dataset 

1095 is returned directly without additional querying. 

1096 dataId : `dict` or `DataCoordinate` 

1097 A `dict` of `Dimension` link name, value pairs that label the 

1098 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1099 should be provided as the first argument. 

1100 parameters : `dict` 

1101 Additional StorageClass-defined options to control reading, 

1102 typically used to efficiently read only a subset of the dataset. 

1103 collections : Any, optional 

1104 Collections to be searched, overriding ``self.collections``. 

1105 Can be any of the types supported by the ``collections`` argument 

1106 to butler construction. 

1107 storageClass : `StorageClass` or `str`, optional 

1108 The storage class to be used to override the Python type 

1109 returned by this method. By default the returned type matches 

1110 the dataset type definition for this dataset. Specifying a 

1111 read `StorageClass` can force a different type to be returned. 

1112 This type must be compatible with the original type. 

1113 timespan : `Timespan` or `None`, optional 

1114 A timespan that the validity range of the dataset must overlap. 

1115 If not provided and this is a calibration dataset type, an attempt 

1116 will be made to find the timespan from any temporal coordinate 

1117 in the data ID. 

1118 **kwargs 

1119 Additional keyword arguments used to augment or construct a 

1120 `DataCoordinate`. See `DataCoordinate.standardize` 

1121 parameters. 

1122 

1123 Returns 

1124 ------- 

1125 obj : `object` 

1126 The dataset. 

1127 

1128 Raises 

1129 ------ 

1130 LookupError 

1131 Raised if no matching dataset exists in the `Registry`. 

1132 TypeError 

1133 Raised if no collections were provided. 

1134 

1135 Notes 

1136 ----- 

1137 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1138 this method requires that the given data ID include temporal dimensions 

1139 beyond the dimensions of the dataset type itself, in order to find the 

1140 dataset with the appropriate validity range. For example, a "bias" 

1141 dataset with native dimensions ``{instrument, detector}`` could be 

1142 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1143 ``exposure`` is a temporal dimension. 

1144 """ 

1145 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1146 ref = self._findDatasetRef( 

1147 datasetRefOrType, 

1148 dataId, 

1149 collections=collections, 

1150 datastore_records=True, 

1151 timespan=timespan, 

1152 **kwargs, 

1153 ) 

1154 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1155 

1156 def getURIs( 

1157 self, 

1158 datasetRefOrType: DatasetRef | DatasetType | str, 

1159 /, 

1160 dataId: DataId | None = None, 

1161 *, 

1162 predict: bool = False, 

1163 collections: Any = None, 

1164 run: str | None = None, 

1165 **kwargs: Any, 

1166 ) -> DatasetRefURIs: 

1167 """Return the URIs associated with the dataset. 

1168 

1169 Parameters 

1170 ---------- 

1171 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1172 When `DatasetRef` the `dataId` should be `None`. 

1173 Otherwise the `DatasetType` or name thereof. 

1174 dataId : `dict` or `DataCoordinate` 

1175 A `dict` of `Dimension` link name, value pairs that label the 

1176 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1177 should be provided as the first argument. 

1178 predict : `bool` 

1179 If `True`, allow URIs to be returned of datasets that have not 

1180 been written. 

1181 collections : Any, optional 

1182 Collections to be searched, overriding ``self.collections``. 

1183 Can be any of the types supported by the ``collections`` argument 

1184 to butler construction. 

1185 run : `str`, optional 

1186 Run to use for predictions, overriding ``self.run``. 

1187 **kwargs 

1188 Additional keyword arguments used to augment or construct a 

1189 `DataCoordinate`. See `DataCoordinate.standardize` 

1190 parameters. 

1191 

1192 Returns 

1193 ------- 

1194 uris : `DatasetRefURIs` 

1195 The URI to the primary artifact associated with this dataset (if 

1196 the dataset was disassembled within the datastore this may be 

1197 `None`), and the URIs to any components associated with the dataset 

1198 artifact. (can be empty if there are no components). 

1199 """ 

1200 ref = self._findDatasetRef( 

1201 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1202 ) 

1203 return self._datastore.getURIs(ref, predict) 

1204 

1205 def get_dataset_type(self, name: str) -> DatasetType: 

1206 return self._registry.getDatasetType(name) 

1207 

1208 def get_dataset( 

1209 self, 

1210 id: DatasetId, 

1211 *, 

1212 storage_class: str | StorageClass | None = None, 

1213 dimension_records: bool = False, 

1214 datastore_records: bool = False, 

1215 ) -> DatasetRef | None: 

1216 ref = self._registry.getDataset(id) 

1217 if ref is not None: 

1218 if dimension_records: 

1219 ref = ref.expanded( 

1220 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions) 

1221 ) 

1222 if storage_class: 

1223 ref = ref.overrideStorageClass(storage_class) 

1224 if datastore_records: 

1225 ref = self._registry.get_datastore_records(ref) 

1226 return ref 

1227 

1228 def find_dataset( 

1229 self, 

1230 dataset_type: DatasetType | str, 

1231 data_id: DataId | None = None, 

1232 *, 

1233 collections: str | Sequence[str] | None = None, 

1234 timespan: Timespan | None = None, 

1235 storage_class: str | StorageClass | None = None, 

1236 dimension_records: bool = False, 

1237 datastore_records: bool = False, 

1238 **kwargs: Any, 

1239 ) -> DatasetRef | None: 

1240 # Handle any parts of the dataID that are not using primary dimension 

1241 # keys. 

1242 if isinstance(dataset_type, str): 

1243 actual_type = self.get_dataset_type(dataset_type) 

1244 else: 

1245 actual_type = dataset_type 

1246 

1247 # Store the component for later. 

1248 component_name = actual_type.component() 

1249 if actual_type.isComponent(): 

1250 parent_type = actual_type.makeCompositeDatasetType() 

1251 else: 

1252 parent_type = actual_type 

1253 

1254 data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs) 

1255 

1256 ref = self._registry.findDataset( 

1257 parent_type, 

1258 data_id, 

1259 collections=collections, 

1260 timespan=timespan, 

1261 datastore_records=datastore_records, 

1262 **kwargs, 

1263 ) 

1264 if ref is not None and dimension_records: 

1265 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)) 

1266 if ref is not None and component_name: 

1267 ref = ref.makeComponentRef(component_name) 

1268 if ref is not None and storage_class is not None: 

1269 ref = ref.overrideStorageClass(storage_class) 

1270 

1271 return ref 

1272 

1273 def retrieveArtifacts( 

1274 self, 

1275 refs: Iterable[DatasetRef], 

1276 destination: ResourcePathExpression, 

1277 transfer: str = "auto", 

1278 preserve_path: bool = True, 

1279 overwrite: bool = False, 

1280 ) -> list[ResourcePath]: 

1281 # Docstring inherited. 

1282 return self._datastore.retrieveArtifacts( 

1283 refs, 

1284 ResourcePath(destination), 

1285 transfer=transfer, 

1286 preserve_path=preserve_path, 

1287 overwrite=overwrite, 

1288 ) 

1289 

1290 def exists( 

1291 self, 

1292 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1293 /, 

1294 data_id: DataId | None = None, 

1295 *, 

1296 full_check: bool = True, 

1297 collections: Any = None, 

1298 **kwargs: Any, 

1299 ) -> DatasetExistence: 

1300 # Docstring inherited. 

1301 existence = DatasetExistence.UNRECOGNIZED 

1302 

1303 if isinstance(dataset_ref_or_type, DatasetRef): 

1304 if collections is not None: 

1305 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1306 if data_id is not None: 

1307 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1308 ref = dataset_ref_or_type 

1309 registry_ref = self._registry.getDataset(dataset_ref_or_type.id) 

1310 if registry_ref is not None: 

1311 existence |= DatasetExistence.RECORDED 

1312 

1313 if dataset_ref_or_type != registry_ref: 

1314 # This could mean that storage classes differ, so we should 

1315 # check for that but use the registry ref for the rest of 

1316 # the method. 

1317 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1318 # Use the registry version from now on. 

1319 ref = registry_ref 

1320 else: 

1321 raise ValueError( 

1322 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1323 f"in registry but has different incompatible values ({registry_ref})." 

1324 ) 

1325 else: 

1326 try: 

1327 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1328 except (LookupError, TypeError): 

1329 return existence 

1330 existence |= DatasetExistence.RECORDED 

1331 

1332 if self._datastore.knows(ref): 

1333 existence |= DatasetExistence.DATASTORE 

1334 

1335 if full_check: 

1336 if self._datastore.exists(ref): 

1337 existence |= DatasetExistence._ARTIFACT 

1338 elif existence.value != DatasetExistence.UNRECOGNIZED.value: 

1339 # Do not add this flag if we have no other idea about a dataset. 

1340 existence |= DatasetExistence(DatasetExistence._ASSUMED) 

1341 

1342 return existence 

1343 

1344 def _exists_many( 

1345 self, 

1346 refs: Iterable[DatasetRef], 

1347 /, 

1348 *, 

1349 full_check: bool = True, 

1350 ) -> dict[DatasetRef, DatasetExistence]: 

1351 # Docstring inherited. 

1352 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1353 

1354 # Registry does not have a bulk API to check for a ref. 

1355 for ref in refs: 

1356 registry_ref = self._registry.getDataset(ref.id) 

1357 if registry_ref is not None: 

1358 # It is possible, albeit unlikely, that the given ref does 

1359 # not match the one in registry even though the UUID matches. 

1360 # When checking a single ref we raise, but it's impolite to 

1361 # do that when potentially hundreds of refs are being checked. 

1362 # We could change the API to only accept UUIDs and that would 

1363 # remove the ability to even check and remove the worry 

1364 # about differing storage classes. Given the ongoing discussion 

1365 # on refs vs UUIDs and whether to raise or have a new 

1366 # private flag, treat this as a private API for now. 

1367 existence[ref] |= DatasetExistence.RECORDED 

1368 

1369 # Ask datastore if it knows about these refs. 

1370 knows = self._datastore.knows_these(refs) 

1371 for ref, known in knows.items(): 

1372 if known: 

1373 existence[ref] |= DatasetExistence.DATASTORE 

1374 

1375 if full_check: 

1376 mexists = self._datastore.mexists(refs) 

1377 for ref, exists in mexists.items(): 

1378 if exists: 

1379 existence[ref] |= DatasetExistence._ARTIFACT 

1380 else: 

1381 # Do not set this flag if nothing is known about the dataset. 

1382 for ref in existence: 

1383 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1384 existence[ref] |= DatasetExistence._ASSUMED 

1385 

1386 return existence 

1387 

1388 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1389 # Docstring inherited. 

1390 if not self.isWriteable(): 

1391 raise TypeError("Butler is read-only.") 

1392 names = list(names) 

1393 refs: list[DatasetRef] = [] 

1394 for name in names: 

1395 collectionType = self._registry.getCollectionType(name) 

1396 if collectionType is not CollectionType.RUN: 

1397 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1398 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True)) 

1399 with self._datastore.transaction(), self._registry.transaction(): 

1400 if unstore: 

1401 self._datastore.trash(refs) 

1402 else: 

1403 self._datastore.forget(refs) 

1404 for name in names: 

1405 self._registry.removeCollection(name) 

1406 if unstore: 

1407 # Point of no return for removing artifacts 

1408 self._datastore.emptyTrash() 

1409 

1410 def pruneDatasets( 

1411 self, 

1412 refs: Iterable[DatasetRef], 

1413 *, 

1414 disassociate: bool = True, 

1415 unstore: bool = False, 

1416 tags: Iterable[str] = (), 

1417 purge: bool = False, 

1418 ) -> None: 

1419 # docstring inherited from LimitedButler 

1420 

1421 if not self.isWriteable(): 

1422 raise TypeError("Butler is read-only.") 

1423 if purge: 

1424 if not disassociate: 

1425 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1426 if not unstore: 

1427 raise TypeError("Cannot pass purge=True without unstore=True.") 

1428 elif disassociate: 

1429 tags = tuple(tags) 

1430 if not tags: 

1431 raise TypeError("No tags provided but disassociate=True.") 

1432 for tag in tags: 

1433 collectionType = self._registry.getCollectionType(tag) 

1434 if collectionType is not CollectionType.TAGGED: 

1435 raise TypeError( 

1436 f"Cannot disassociate from collection '{tag}' " 

1437 f"of non-TAGGED type {collectionType.name}." 

1438 ) 

1439 # Transform possibly-single-pass iterable into something we can iterate 

1440 # over multiple times. 

1441 refs = list(refs) 

1442 # Pruning a component of a DatasetRef makes no sense since registry 

1443 # doesn't know about components and datastore might not store 

1444 # components in a separate file 

1445 for ref in refs: 

1446 if ref.datasetType.component(): 

1447 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1448 # We don't need an unreliable Datastore transaction for this, because 

1449 # we've been extra careful to ensure that Datastore.trash only involves 

1450 # mutating the Registry (it can _look_ at Datastore-specific things, 

1451 # but shouldn't change them), and hence all operations here are 

1452 # Registry operations. 

1453 with self._datastore.transaction(), self._registry.transaction(): 

1454 if unstore: 

1455 self._datastore.trash(refs) 

1456 if purge: 

1457 self._registry.removeDatasets(refs) 

1458 elif disassociate: 

1459 assert tags, "Guaranteed by earlier logic in this function." 

1460 for tag in tags: 

1461 self._registry.disassociate(tag, refs) 

1462 # We've exited the Registry transaction, and apparently committed. 

1463 # (if there was an exception, everything rolled back, and it's as if 

1464 # nothing happened - and we never get here). 

1465 # Datastore artifacts are not yet gone, but they're clearly marked 

1466 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1467 # problems we can try again later, and if manual administrative 

1468 # intervention is required, it's pretty clear what that should entail: 

1469 # deleting everything on disk and in private Datastore tables that is 

1470 # in the dataset_location_trash table. 

1471 if unstore: 

1472 # Point of no return for removing artifacts 

1473 self._datastore.emptyTrash() 

1474 

1475 @transactional 

1476 def ingest( 

1477 self, 

1478 *datasets: FileDataset, 

1479 transfer: str | None = "auto", 

1480 record_validation_info: bool = True, 

1481 ) -> None: 

1482 # Docstring inherited. 

1483 if not self.isWriteable(): 

1484 raise TypeError("Butler is read-only.") 

1485 

1486 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1487 if not datasets: 

1488 return 

1489 

1490 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1491 

1492 # We need to reorganize all the inputs so that they are grouped 

1493 # by dataset type and run. Multiple refs in a single FileDataset 

1494 # are required to share the run and dataset type. 

1495 groupedData: MutableMapping[tuple[DatasetType, str], list[FileDataset]] = defaultdict(list) 

1496 

1497 # Track DataIDs that are being ingested so we can spot issues early 

1498 # with duplication. Retain previous FileDataset so we can report it. 

1499 groupedDataIds: MutableMapping[tuple[DatasetType, str], dict[DataCoordinate, FileDataset]] = ( 

1500 defaultdict(dict) 

1501 ) 

1502 

1503 # And the nested loop that populates it: 

1504 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1505 # Somewhere to store pre-existing refs if we have an 

1506 # execution butler. 

1507 existingRefs: list[DatasetRef] = [] 

1508 

1509 for ref in dataset.refs: 

1510 group_key = (ref.datasetType, ref.run) 

1511 

1512 if ref.dataId in groupedDataIds[group_key]: 

1513 raise ConflictingDefinitionError( 

1514 f"Ingest conflict. Dataset {dataset.path} has same" 

1515 " DataId as other ingest dataset" 

1516 f" {groupedDataIds[group_key][ref.dataId].path} " 

1517 f" ({ref.dataId})" 

1518 ) 

1519 

1520 groupedDataIds[group_key][ref.dataId] = dataset 

1521 

1522 if existingRefs: 

1523 if len(dataset.refs) != len(existingRefs): 

1524 # Keeping track of partially pre-existing datasets is hard 

1525 # and should generally never happen. For now don't allow 

1526 # it. 

1527 raise ConflictingDefinitionError( 

1528 f"For dataset {dataset.path} some dataIds already exist" 

1529 " in registry but others do not. This is not supported." 

1530 ) 

1531 

1532 # Store expanded form in the original FileDataset. 

1533 dataset.refs = existingRefs 

1534 else: 

1535 groupedData[group_key].append(dataset) 

1536 

1537 # Now we can bulk-insert into Registry for each DatasetType. 

1538 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

1539 groupedData.items(), desc="Bulk-inserting datasets by type" 

1540 ): 

1541 refs_to_import = [] 

1542 for dataset in grouped_datasets: 

1543 refs_to_import.extend(dataset.refs) 

1544 

1545 n_refs = len(refs_to_import) 

1546 _LOG.verbose( 

1547 "Importing %d ref%s of dataset type %r into run %r", 

1548 n_refs, 

1549 "" if n_refs == 1 else "s", 

1550 datasetType.name, 

1551 this_run, 

1552 ) 

1553 

1554 # Import the refs and expand the DataCoordinates since we can't 

1555 # guarantee that they are expanded and Datastore will need 

1556 # the records. 

1557 imported_refs = self._registry._importDatasets(refs_to_import, expand=True) 

1558 assert set(imported_refs) == set(refs_to_import) 

1559 

1560 # Replace all the refs in the FileDataset with expanded versions. 

1561 # Pull them off in the order we put them on the list. 

1562 for dataset in grouped_datasets: 

1563 n_dataset_refs = len(dataset.refs) 

1564 dataset.refs = imported_refs[:n_dataset_refs] 

1565 del imported_refs[:n_dataset_refs] 

1566 

1567 # Bulk-insert everything into Datastore. 

1568 # We do not know if any of the registry entries already existed 

1569 # (_importDatasets only complains if they exist but differ) so 

1570 # we have to catch IntegrityError explicitly. 

1571 try: 

1572 self._datastore.ingest( 

1573 *datasets, transfer=transfer, record_validation_info=record_validation_info 

1574 ) 

1575 except IntegrityError as e: 

1576 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e 

1577 

1578 @contextlib.contextmanager 

1579 def export( 

1580 self, 

1581 *, 

1582 directory: str | None = None, 

1583 filename: str | None = None, 

1584 format: str | None = None, 

1585 transfer: str | None = None, 

1586 ) -> Iterator[RepoExportContext]: 

1587 # Docstring inherited. 

1588 if directory is None and transfer is not None: 

1589 raise TypeError("Cannot transfer without providing a directory.") 

1590 if transfer == "move": 

1591 raise TypeError("Transfer may not be 'move': export is read-only") 

1592 if format is None: 

1593 if filename is None: 

1594 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1595 else: 

1596 _, format = os.path.splitext(filename) 

1597 if not format: 

1598 raise ValueError("Please specify a file extension to determine export format.") 

1599 format = format[1:] # Strip leading "."" 

1600 elif filename is None: 

1601 filename = f"export.{format}" 

1602 if directory is not None: 

1603 filename = os.path.join(directory, filename) 

1604 formats = self._config["repo_transfer_formats"] 

1605 if format not in formats: 

1606 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

1607 BackendClass = get_class_of(formats[format, "export"]) 

1608 with open(filename, "w") as stream: 

1609 backend = BackendClass(stream, universe=self.dimensions) 

1610 try: 

1611 helper = RepoExportContext( 

1612 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer 

1613 ) 

1614 with self._caching_context(): 

1615 yield helper 

1616 except BaseException: 

1617 raise 

1618 else: 

1619 helper._finish() 

1620 

1621 def import_( 

1622 self, 

1623 *, 

1624 directory: ResourcePathExpression | None = None, 

1625 filename: ResourcePathExpression | TextIO | None = None, 

1626 format: str | None = None, 

1627 transfer: str | None = None, 

1628 skip_dimensions: set | None = None, 

1629 ) -> None: 

1630 # Docstring inherited. 

1631 if not self.isWriteable(): 

1632 raise TypeError("Butler is read-only.") 

1633 if format is None: 

1634 if filename is None: 

1635 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1636 else: 

1637 _, format = os.path.splitext(filename) # type: ignore 

1638 elif filename is None: 

1639 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

1640 if directory is not None: 

1641 directory = ResourcePath(directory, forceDirectory=True) 

1642 # mypy doesn't think this will work but it does in python >= 3.10. 

1643 if isinstance(filename, ResourcePathExpression): # type: ignore 

1644 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

1645 if not filename.isabs() and directory is not None: 

1646 potential = directory.join(filename) 

1647 exists_in_cwd = filename.exists() 

1648 exists_in_dir = potential.exists() 

1649 if exists_in_cwd and exists_in_dir: 

1650 _LOG.warning( 

1651 "A relative path for filename was specified (%s) which exists relative to cwd. " 

1652 "Additionally, the file exists relative to the given search directory (%s). " 

1653 "Using the export file in the given directory.", 

1654 filename, 

1655 potential, 

1656 ) 

1657 # Given they specified an explicit directory and that 

1658 # directory has the export file in it, assume that that 

1659 # is what was meant despite the file in cwd. 

1660 filename = potential 

1661 elif exists_in_dir: 

1662 filename = potential 

1663 elif not exists_in_cwd and not exists_in_dir: 

1664 # Raise early. 

1665 raise FileNotFoundError( 

1666 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

1667 ) 

1668 BackendClass: type[RepoImportBackend] = get_class_of( 

1669 self._config["repo_transfer_formats"][format]["import"] 

1670 ) 

1671 

1672 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

1673 with self._caching_context(): 

1674 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg] 

1675 backend.register() 

1676 with self.transaction(): 

1677 backend.load( 

1678 self._datastore, 

1679 directory=directory, 

1680 transfer=transfer, 

1681 skip_dimensions=skip_dimensions, 

1682 ) 

1683 

1684 if isinstance(filename, ResourcePath): 

1685 # We can not use open() here at the moment because of 

1686 # DM-38589 since yaml does stream.read(8192) in a loop. 

1687 stream = io.StringIO(filename.read().decode()) 

1688 doImport(stream) 

1689 else: 

1690 doImport(filename) # type: ignore 

1691 

1692 def transfer_dimension_records_from( 

1693 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef] 

1694 ) -> None: 

1695 # Allowed dimensions in the target butler. 

1696 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table) 

1697 

1698 data_ids = {ref.dataId for ref in source_refs} 

1699 

1700 dimension_records = self._extract_all_dimension_records_from_data_ids( 

1701 source_butler, data_ids, elements 

1702 ) 

1703 

1704 # Insert order is important. 

1705 for element in self.dimensions.sorted(dimension_records.keys()): 

1706 records = [r for r in dimension_records[element].values()] 

1707 # Assume that if the record is already present that we can 

1708 # use it without having to check that the record metadata 

1709 # is consistent. 

1710 self._registry.insertDimensionData(element, *records, skip_existing=True) 

1711 _LOG.debug("Dimension '%s' -- number of records transferred: %d", element.name, len(records)) 

1712 

1713 def _extract_all_dimension_records_from_data_ids( 

1714 self, 

1715 source_butler: LimitedButler | Butler, 

1716 data_ids: set[DataCoordinate], 

1717 allowed_elements: frozenset[DimensionElement], 

1718 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]: 

1719 primary_records = self._extract_dimension_records_from_data_ids( 

1720 source_butler, data_ids, allowed_elements 

1721 ) 

1722 

1723 can_query = True if isinstance(source_butler, Butler) else False 

1724 

1725 additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1726 for original_element, record_mapping in primary_records.items(): 

1727 # Get dimensions that depend on this dimension. 

1728 populated_by = self.dimensions.get_elements_populated_by( 

1729 self.dimensions[original_element.name] # type: ignore 

1730 ) 

1731 

1732 for data_id in record_mapping.keys(): 

1733 for element in populated_by: 

1734 if element not in allowed_elements: 

1735 continue 

1736 if element.name == original_element.name: 

1737 continue 

1738 

1739 if element.name in primary_records: 

1740 # If this element has already been stored avoid 

1741 # re-finding records since that may lead to additional 

1742 # spurious records. e.g. visit is populated_by 

1743 # visit_detector_region but querying 

1744 # visit_detector_region by visit will return all the 

1745 # detectors for this visit -- the visit dataId does not 

1746 # constrain this. 

1747 # To constrain the query the original dataIds would 

1748 # have to be scanned. 

1749 continue 

1750 

1751 if not can_query: 

1752 raise RuntimeError( 

1753 f"Transferring populated_by records like {element.name} requires a full Butler." 

1754 ) 

1755 

1756 records = source_butler.registry.queryDimensionRecords( # type: ignore 

1757 element.name, 

1758 **data_id.mapping, # type: ignore 

1759 ) 

1760 for record in records: 

1761 additional_records[record.definition].setdefault(record.dataId, record) 

1762 

1763 # The next step is to walk back through the additional records to 

1764 # pick up any missing content (such as visit_definition needing to 

1765 # know the exposure). Want to ensure we do not request records we 

1766 # already have. 

1767 missing_data_ids = set() 

1768 for name, record_mapping in additional_records.items(): 

1769 for data_id in record_mapping.keys(): 

1770 if data_id not in primary_records[name]: 

1771 missing_data_ids.add(data_id) 

1772 

1773 # Fill out the new records. Assume that these new records do not 

1774 # also need to carry over additional populated_by records. 

1775 secondary_records = self._extract_dimension_records_from_data_ids( 

1776 source_butler, missing_data_ids, allowed_elements 

1777 ) 

1778 

1779 # Merge the extra sets of records in with the original. 

1780 for name, record_mapping in itertools.chain(additional_records.items(), secondary_records.items()): 

1781 primary_records[name].update(record_mapping) 

1782 

1783 return primary_records 

1784 

1785 def _extract_dimension_records_from_data_ids( 

1786 self, 

1787 source_butler: LimitedButler | Butler, 

1788 data_ids: set[DataCoordinate], 

1789 allowed_elements: frozenset[DimensionElement], 

1790 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]: 

1791 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1792 

1793 for data_id in data_ids: 

1794 # Need an expanded record, if not expanded that we need a full 

1795 # butler with registry (allow mocks with registry too). 

1796 if not data_id.hasRecords(): 

1797 if registry := getattr(source_butler, "registry", None): 

1798 data_id = registry.expandDataId(data_id) 

1799 else: 

1800 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

1801 # If this butler doesn't know about a dimension in the source 

1802 # butler things will break later. 

1803 for element_name in data_id.dimensions.elements: 

1804 record = data_id.records[element_name] 

1805 if record is not None and record.definition in allowed_elements: 

1806 dimension_records[record.definition].setdefault(record.dataId, record) 

1807 

1808 return dimension_records 

1809 

1810 def transfer_from( 

1811 self, 

1812 source_butler: LimitedButler, 

1813 source_refs: Iterable[DatasetRef], 

1814 transfer: str = "auto", 

1815 skip_missing: bool = True, 

1816 register_dataset_types: bool = False, 

1817 transfer_dimensions: bool = False, 

1818 dry_run: bool = False, 

1819 ) -> collections.abc.Collection[DatasetRef]: 

1820 # Docstring inherited. 

1821 if not self.isWriteable(): 

1822 raise TypeError("Butler is read-only.") 

1823 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

1824 

1825 # Will iterate through the refs multiple times so need to convert 

1826 # to a list if this isn't a collection. 

1827 if not isinstance(source_refs, collections.abc.Collection): 

1828 source_refs = list(source_refs) 

1829 

1830 original_count = len(source_refs) 

1831 _LOG.info("Transferring %d datasets into %s", original_count, str(self)) 

1832 

1833 # In some situations the datastore artifact may be missing 

1834 # and we do not want that registry entry to be imported. 

1835 # Asking datastore is not sufficient, the records may have been 

1836 # purged, we have to ask for the (predicted) URI and check 

1837 # existence explicitly. Execution butler is set up exactly like 

1838 # this with no datastore records. 

1839 artifact_existence: dict[ResourcePath, bool] = {} 

1840 if skip_missing: 

1841 dataset_existence = source_butler._datastore.mexists( 

1842 source_refs, artifact_existence=artifact_existence 

1843 ) 

1844 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

1845 filtered_count = len(source_refs) 

1846 n_missing = original_count - filtered_count 

1847 _LOG.verbose( 

1848 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

1849 n_missing, 

1850 "" if n_missing == 1 else "s", 

1851 filtered_count, 

1852 ) 

1853 

1854 # Importing requires that we group the refs by dataset type and run 

1855 # before doing the import. 

1856 source_dataset_types = set() 

1857 grouped_refs = defaultdict(list) 

1858 for ref in source_refs: 

1859 grouped_refs[ref.datasetType, ref.run].append(ref) 

1860 source_dataset_types.add(ref.datasetType) 

1861 

1862 # Check to see if the dataset type in the source butler has 

1863 # the same definition in the target butler and register missing 

1864 # ones if requested. Registration must happen outside a transaction. 

1865 newly_registered_dataset_types = set() 

1866 for datasetType in source_dataset_types: 

1867 if register_dataset_types: 

1868 # Let this raise immediately if inconsistent. Continuing 

1869 # on to find additional inconsistent dataset types 

1870 # might result in additional unwanted dataset types being 

1871 # registered. 

1872 if self._registry.registerDatasetType(datasetType): 

1873 newly_registered_dataset_types.add(datasetType) 

1874 else: 

1875 # If the dataset type is missing, let it fail immediately. 

1876 target_dataset_type = self.get_dataset_type(datasetType.name) 

1877 if target_dataset_type != datasetType: 

1878 raise ConflictingDefinitionError( 

1879 "Source butler dataset type differs from definition" 

1880 f" in target butler: {datasetType} !=" 

1881 f" {target_dataset_type}" 

1882 ) 

1883 if newly_registered_dataset_types: 

1884 # We may have registered some even if there were inconsistencies 

1885 # but should let people know (or else remove them again). 

1886 _LOG.verbose( 

1887 "Registered the following dataset types in the target Butler: %s", 

1888 ", ".join(d.name for d in newly_registered_dataset_types), 

1889 ) 

1890 else: 

1891 _LOG.verbose("All required dataset types are known to the target Butler") 

1892 

1893 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1894 if transfer_dimensions: 

1895 # Collect all the dimension records for these refs. 

1896 # All dimensions are to be copied but the list of valid dimensions 

1897 # come from this butler's universe. 

1898 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table) 

1899 dataIds = {ref.dataId for ref in source_refs} 

1900 dimension_records = self._extract_all_dimension_records_from_data_ids( 

1901 source_butler, dataIds, elements 

1902 ) 

1903 

1904 handled_collections: set[str] = set() 

1905 

1906 # Do all the importing in a single transaction. 

1907 with self.transaction(): 

1908 if dimension_records and not dry_run: 

1909 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.") 

1910 # Order matters. 

1911 for element in self.dimensions.sorted(dimension_records.keys()): 

1912 records = [r for r in dimension_records[element].values()] 

1913 # Assume that if the record is already present that we can 

1914 # use it without having to check that the record metadata 

1915 # is consistent. 

1916 self._registry.insertDimensionData(element, *records, skip_existing=True) 

1917 

1918 n_imported = 0 

1919 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

1920 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

1921 ): 

1922 if run not in handled_collections: 

1923 # May need to create output collection. If source butler 

1924 # has a registry, ask for documentation string. 

1925 run_doc = None 

1926 if registry := getattr(source_butler, "registry", None): 

1927 run_doc = registry.getCollectionDocumentation(run) 

1928 if not dry_run: 

1929 registered = self._registry.registerRun(run, doc=run_doc) 

1930 else: 

1931 registered = True 

1932 handled_collections.add(run) 

1933 if registered: 

1934 _LOG.verbose("Creating output run %s", run) 

1935 

1936 n_refs = len(refs_to_import) 

1937 _LOG.verbose( 

1938 "Importing %d ref%s of dataset type %s into run %s", 

1939 n_refs, 

1940 "" if n_refs == 1 else "s", 

1941 datasetType.name, 

1942 run, 

1943 ) 

1944 

1945 # Assume we are using UUIDs and the source refs will match 

1946 # those imported. 

1947 if not dry_run: 

1948 imported_refs = self._registry._importDatasets(refs_to_import) 

1949 else: 

1950 imported_refs = refs_to_import 

1951 assert set(imported_refs) == set(refs_to_import) 

1952 n_imported += len(imported_refs) 

1953 

1954 assert len(source_refs) == n_imported 

1955 _LOG.verbose("Imported %d datasets into destination butler", n_imported) 

1956 

1957 # Ask the datastore to transfer. The datastore has to check that 

1958 # the source datastore is compatible with the target datastore. 

1959 accepted, rejected = self._datastore.transfer_from( 

1960 source_butler._datastore, 

1961 source_refs, 

1962 transfer=transfer, 

1963 artifact_existence=artifact_existence, 

1964 dry_run=dry_run, 

1965 ) 

1966 if rejected: 

1967 # For now, accept the registry entries but not the files. 

1968 _LOG.warning( 

1969 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

1970 len(rejected), 

1971 len(accepted), 

1972 datasetType, 

1973 run, 

1974 ) 

1975 

1976 return source_refs 

1977 

1978 def validateConfiguration( 

1979 self, 

1980 logFailures: bool = False, 

1981 datasetTypeNames: Iterable[str] | None = None, 

1982 ignore: Iterable[str] | None = None, 

1983 ) -> None: 

1984 # Docstring inherited. 

1985 if datasetTypeNames: 

1986 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames] 

1987 else: 

1988 datasetTypes = list(self._registry.queryDatasetTypes()) 

1989 

1990 # filter out anything from the ignore list 

1991 if ignore: 

1992 ignore = set(ignore) 

1993 datasetTypes = [ 

1994 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

1995 ] 

1996 else: 

1997 ignore = set() 

1998 

1999 # For each datasetType that has an instrument dimension, create 

2000 # a DatasetRef for each defined instrument 

2001 datasetRefs = [] 

2002 

2003 # Find all the registered instruments (if "instrument" is in the 

2004 # universe). 

2005 if "instrument" in self.dimensions: 

2006 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")} 

2007 

2008 for datasetType in datasetTypes: 

2009 if "instrument" in datasetType.dimensions: 

2010 # In order to create a conforming dataset ref, create 

2011 # fake DataCoordinate values for the non-instrument 

2012 # dimensions. The type of the value does not matter here. 

2013 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"} 

2014 

2015 for instrument in instruments: 

2016 datasetRef = DatasetRef( 

2017 datasetType, 

2018 DataCoordinate.standardize( 

2019 dataId, instrument=instrument, dimensions=datasetType.dimensions 

2020 ), 

2021 run="validate", 

2022 ) 

2023 datasetRefs.append(datasetRef) 

2024 

2025 entities: list[DatasetType | DatasetRef] = [] 

2026 entities.extend(datasetTypes) 

2027 entities.extend(datasetRefs) 

2028 

2029 datastoreErrorStr = None 

2030 try: 

2031 self._datastore.validateConfiguration(entities, logFailures=logFailures) 

2032 except ValidationError as e: 

2033 datastoreErrorStr = str(e) 

2034 

2035 # Also check that the LookupKeys used by the datastores match 

2036 # registry and storage class definitions 

2037 keys = self._datastore.getLookupKeys() 

2038 

2039 failedNames = set() 

2040 failedDataId = set() 

2041 for key in keys: 

2042 if key.name is not None: 

2043 if key.name in ignore: 

2044 continue 

2045 

2046 # skip if specific datasetType names were requested and this 

2047 # name does not match 

2048 if datasetTypeNames and key.name not in datasetTypeNames: 

2049 continue 

2050 

2051 # See if it is a StorageClass or a DatasetType 

2052 if key.name in self.storageClasses: 

2053 pass 

2054 else: 

2055 try: 

2056 self.get_dataset_type(key.name) 

2057 except KeyError: 

2058 if logFailures: 

2059 _LOG.critical( 

2060 "Key '%s' does not correspond to a DatasetType or StorageClass", key 

2061 ) 

2062 failedNames.add(key) 

2063 else: 

2064 # Dimensions are checked for consistency when the Butler 

2065 # is created and rendezvoused with a universe. 

2066 pass 

2067 

2068 # Check that the instrument is a valid instrument 

2069 # Currently only support instrument so check for that 

2070 if key.dataId: 

2071 dataIdKeys = set(key.dataId) 

2072 if {"instrument"} != dataIdKeys: 

2073 if logFailures: 

2074 _LOG.critical("Key '%s' has unsupported DataId override", key) 

2075 failedDataId.add(key) 

2076 elif key.dataId["instrument"] not in instruments: 

2077 if logFailures: 

2078 _LOG.critical("Key '%s' has unknown instrument", key) 

2079 failedDataId.add(key) 

2080 

2081 messages = [] 

2082 

2083 if datastoreErrorStr: 

2084 messages.append(datastoreErrorStr) 

2085 

2086 for failed, msg in ( 

2087 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2088 (failedDataId, "Keys with bad DataId entries: "), 

2089 ): 

2090 if failed: 

2091 msg += ", ".join(str(k) for k in failed) 

2092 messages.append(msg) 

2093 

2094 if messages: 

2095 raise ValidationError(";\n".join(messages)) 

2096 

2097 @property 

2098 def collection_chains(self) -> DirectButlerCollections: 

2099 """Object with methods for modifying collection chains.""" 

2100 return DirectButlerCollections(self._registry) 

2101 

2102 @property 

2103 def collections(self) -> Sequence[str]: 

2104 """The collections to search by default, in order 

2105 (`~collections.abc.Sequence` [ `str` ]). 

2106 

2107 This is an alias for ``self.registry.defaults.collections``. It cannot 

2108 be set directly in isolation, but all defaults may be changed together 

2109 by assigning a new `RegistryDefaults` instance to 

2110 ``self.registry.defaults``. 

2111 """ 

2112 return self._registry.defaults.collections 

2113 

2114 @property 

2115 def run(self) -> str | None: 

2116 """Name of the run this butler writes outputs to by default (`str` or 

2117 `None`). 

2118 

2119 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2120 directly in isolation, but all defaults may be changed together by 

2121 assigning a new `RegistryDefaults` instance to 

2122 ``self.registry.defaults``. 

2123 """ 

2124 return self._registry.defaults.run 

2125 

2126 @property 

2127 def registry(self) -> Registry: 

2128 """The object that manages dataset metadata and relationships 

2129 (`Registry`). 

2130 

2131 Many operations that don't involve reading or writing butler datasets 

2132 are accessible only via `Registry` methods. Eventually these methods 

2133 will be replaced by equivalent `Butler` methods. 

2134 """ 

2135 return self._registry_shim 

2136 

2137 @property 

2138 def dimensions(self) -> DimensionUniverse: 

2139 # Docstring inherited. 

2140 return self._registry.dimensions 

2141 

2142 @contextlib.contextmanager 

2143 def _query(self) -> Iterator[Query]: 

2144 # Docstring inherited. 

2145 with self._query_driver(self._registry.defaults.collections, self.registry.defaults.dataId) as driver: 

2146 yield Query(driver) 

2147 

2148 @contextlib.contextmanager 

2149 def _query_driver( 

2150 self, 

2151 default_collections: Iterable[str], 

2152 default_data_id: DataCoordinate, 

2153 ) -> Iterator[DirectQueryDriver]: 

2154 """Set up a QueryDriver instance for use with this Butler. Although 

2155 this is marked as a private method, it is also used by Butler server. 

2156 """ 

2157 with self._caching_context(): 

2158 driver = DirectQueryDriver( 

2159 self._registry._db, 

2160 self.dimensions, 

2161 self._registry._managers, 

2162 default_collections=default_collections, 

2163 default_data_id=default_data_id, 

2164 ) 

2165 with driver: 

2166 yield driver 

2167 

2168 def _preload_cache(self) -> None: 

2169 """Immediately load caches that are used for common operations.""" 

2170 self._registry.preload_cache() 

2171 

2172 _config: ButlerConfig 

2173 """Configuration for this Butler instance.""" 

2174 

2175 _registry: SqlRegistry 

2176 """The object that manages dataset metadata and relationships 

2177 (`SqlRegistry`). 

2178 

2179 Most operations that don't involve reading or writing butler datasets are 

2180 accessible only via `SqlRegistry` methods. 

2181 """ 

2182 

2183 datastore: Datastore 

2184 """The object that manages actual dataset storage (`Datastore`). 

2185 

2186 Direct user access to the datastore should rarely be necessary; the primary 

2187 exception is the case where a `Datastore` implementation provides extra 

2188 functionality beyond what the base class defines. 

2189 """ 

2190 

2191 storageClasses: StorageClassFactory 

2192 """An object that maps known storage class names to objects that fully 

2193 describe them (`StorageClassFactory`). 

2194 """ 

2195 

2196 _registry_shim: RegistryShim 

2197 """Shim object to provide a legacy public interface for querying via the 

2198 the ``registry`` property. 

2199 """