Coverage for python/lsst/daf/butler/direct_butler.py: 10%

782 statements  

coverage.py v7.4.0, created at 2024-01-16 10:44 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Butler top level classes. 

29""" 

30from __future__ import annotations 

31 

32__all__ = ( 

33 "DirectButler", 

34 "ButlerValidationError", 

35) 

36 

37import collections.abc 

38import contextlib 

39import io 

40import itertools 

41import logging 

42import numbers 

43import os 

44import warnings 

45from collections import Counter, defaultdict 

46from collections.abc import Iterable, Iterator, Mapping, MutableMapping, Sequence 

47from typing import TYPE_CHECKING, Any, ClassVar, TextIO, cast 

48 

49from lsst.resources import ResourcePath, ResourcePathExpression 

50from lsst.utils.introspection import get_class_of 

51from lsst.utils.iteration import ensure_iterable 

52from lsst.utils.logging import VERBOSE, getLogger 

53from sqlalchemy.exc import IntegrityError 

54 

55from ._butler import Butler 

56from ._butler_config import ButlerConfig 

57from ._butler_instance_options import ButlerInstanceOptions 

58from ._dataset_existence import DatasetExistence 

59from ._dataset_ref import DatasetRef 

60from ._dataset_type import DatasetType 

61from ._deferredDatasetHandle import DeferredDatasetHandle 

62from ._exceptions import EmptyQueryResultError, ValidationError 

63from ._limited_butler import LimitedButler 

64from ._registry_shim import RegistryShim 

65from ._storage_class import StorageClass, StorageClassFactory 

66from ._timespan import Timespan 

67from .datastore import Datastore, NullDatastore 

68from .dimensions import DataCoordinate, Dimension 

69from .direct_query import DirectQuery 

70from .progress import Progress 

71from .registry import ( 

72 CollectionType, 

73 ConflictingDefinitionError, 

74 DataIdError, 

75 MissingDatasetTypeError, 

76 NoDefaultCollectionError, 

77 RegistryDefaults, 

78 _RegistryFactory, 

79) 

80from .registry.sql_registry import SqlRegistry 

81from .transfers import RepoExportContext 

82from .utils import transactional 

83 

84if TYPE_CHECKING: 

85 from lsst.resources import ResourceHandleProtocol 

86 

87 from ._dataset_ref import DatasetId 

88 from ._file_dataset import FileDataset 

89 from ._query import Query 

90 from .datastore import DatasetRefURIs 

91 from .dimensions import ( 

92 DataId, 

93 DataIdValue, 

94 DimensionElement, 

95 DimensionGroup, 

96 DimensionRecord, 

97 DimensionUniverse, 

98 ) 

99 from .registry import CollectionArgType, Registry 

100 from .transfers import RepoImportBackend 

101 

102_LOG = getLogger(__name__) 

103 

104 

105class ButlerValidationError(ValidationError): 

106 """There is a problem with the Butler configuration.""" 

107 

108 pass 

109 

110 

111class DirectButler(Butler): # numpydoc ignore=PR02 

112 """Main entry point for the data access system. 

113 

114 Parameters 

115 ---------- 

116 config : `ButlerConfig` 

117 The configuration for this Butler instance. 

118 registry : `SqlRegistry` 

119 The object that manages dataset metadata and relationships. 

120 datastore : `Datastore`

121 The object that manages actual dataset storage.

122 storageClasses : `StorageClassFactory`

123 An object that maps known storage class names to objects that fully 

124 describe them. 

125 

126 Notes 

127 ----- 

128 Most users should call the top-level `Butler`.``from_config`` instead of 

129 using this constructor directly. 
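
Examples
--------
A minimal construction sketch. The repository path and run name are
placeholders, and the exact ``Butler.from_config`` call signature is
assumed rather than defined in this module::

    from lsst.daf.butler import Butler

    butler = Butler.from_config("/path/to/repo", run="u/someone/example")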

130 """ 

131 

132 # This is __new__ instead of __init__ because we have to support 

133 # instantiation via the legacy constructor Butler.__new__(), which 

134 # reads the configuration and selects which subclass to instantiate. The 

135 # interaction between __new__ and __init__ is kind of wacky in Python. If 

136 # we were using __init__ here, __init__ would be called twice (once when 

137 # the DirectButler instance is constructed inside Butler.from_config(), and 

138 # a second time with the original arguments to Butler() when the instance 

139 # is returned from Butler.__new__()).

140 def __new__( 

141 cls, 

142 *, 

143 config: ButlerConfig, 

144 registry: SqlRegistry, 

145 datastore: Datastore, 

146 storageClasses: StorageClassFactory, 

147 ) -> DirectButler: 

148 self = cast(DirectButler, super().__new__(cls)) 

149 self._config = config 

150 self._registry = registry 

151 self._datastore = datastore 

152 self.storageClasses = storageClasses 

153 

154 # For an execution butler the datastore needs a special

155 # dependency-inversion trick. This is not used by a regular butler,

156 # but we have no way to distinguish a regular butler from an

157 # execution butler.

158 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

159 

160 self._registry_shim = RegistryShim(self) 

161 

162 return self 

163 

164 @classmethod 

165 def create_from_config( 

166 cls, 

167 config: ButlerConfig, 

168 *, 

169 options: ButlerInstanceOptions, 

170 without_datastore: bool = False, 

171 ) -> DirectButler: 

172 """Construct a Butler instance from a configuration file. 

173 

174 Parameters 

175 ---------- 

176 config : `ButlerConfig` 

177 The configuration for this Butler instance. 

178 options : `ButlerInstanceOptions` 

179 Default values and other settings for the Butler instance. 

180 without_datastore : `bool`, optional 

181 If `True` do not attach a datastore to this butler. Any attempts 

182 to use a datastore will fail. 

183 

184 Notes 

185 ----- 

186 Most users should call the top-level `Butler`.``from_config`` 

187 instead of using this function directly. 

188 """ 

189 if "run" in config or "collection" in config: 

190 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

191 

192 defaults = RegistryDefaults( 

193 collections=options.collections, run=options.run, infer=options.inferDefaults, **options.kwargs 

194 ) 

195 try: 

196 butlerRoot = config.get("root", config.configDir) 

197 writeable = options.writeable 

198 if writeable is None: 

199 writeable = options.run is not None 

200 registry = _RegistryFactory(config).from_config( 

201 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

202 ) 

203 if without_datastore: 

204 datastore: Datastore = NullDatastore(None, None) 

205 else: 

206 datastore = Datastore.fromConfig( 

207 config, registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

208 ) 

209 # TODO: Once datastore drops dependency on registry we can 

210 # construct datastore first and pass opaque tables to registry 

211 # constructor. 

212 registry.make_datastore_tables(datastore.get_opaque_table_definitions()) 

213 storageClasses = StorageClassFactory() 

214 storageClasses.addFromConfig(config) 

215 

216 return DirectButler( 

217 config=config, registry=registry, datastore=datastore, storageClasses=storageClasses 

218 ) 

219 except Exception: 

220 # Failures here usually mean that the configuration is incomplete,

221 # so just issue an error message that includes the config file URI.

222 _LOG.error(f"Failed to instantiate Butler from config {config.configFile}.") 

223 raise 

224 

225 def _clone( 

226 self, 

227 *, 

228 collections: Any = None, 

229 run: str | None = None, 

230 inferDefaults: bool = True, 

231 **kwargs: Any, 

232 ) -> DirectButler: 

233 # Docstring inherited 

234 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

235 

236 return DirectButler( 

237 registry=self._registry.copy(defaults), 

238 config=self._config, 

239 datastore=self._datastore, 

240 storageClasses=self.storageClasses, 

241 ) 

242 

243 GENERATION: ClassVar[int] = 3 

244 """This is a Generation 3 Butler. 

245 

246 This attribute may be removed in the future, once the Generation 2 Butler 

247 interface has been fully retired; it should only be used in transitional 

248 code. 

249 """ 

250 

251 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

252 """Return DatasetType defined in registry given dataset type name.""" 

253 try: 

254 return self.get_dataset_type(name) 

255 except MissingDatasetTypeError: 

256 return None 

257 

258 @classmethod 

259 def _unpickle( 

260 cls, 

261 config: ButlerConfig, 

262 collections: tuple[str, ...] | None, 

263 run: str | None, 

264 defaultDataId: dict[str, str], 

265 writeable: bool, 

266 ) -> DirectButler: 

267 """Callable used to unpickle a Butler. 

268 

269 We prefer not to use ``Butler.__init__`` directly so we can force some 

270 of its many arguments to be keyword-only (note that ``__reduce__`` 

271 can only invoke callables with positional arguments). 

272 

273 Parameters 

274 ---------- 

275 config : `ButlerConfig` 

276 Butler configuration, already coerced into a true `ButlerConfig` 

277 instance (and hence after any search paths for overrides have been 

278 utilized). 

279 collections : `tuple` [ `str` ] 

280 Names of the default collections to read from. 

281 run : `str`, optional 

282 Name of the default `~CollectionType.RUN` collection to write to. 

283 defaultDataId : `dict` [ `str`, `str` ] 

284 Default data ID values. 

285 writeable : `bool` 

286 Whether the Butler should support write operations. 

287 

288 Returns 

289 ------- 

290 butler : `Butler` 

291 A new `Butler` instance. 

292 """ 

293 return cls.create_from_config( 

294 config=config, 

295 options=ButlerInstanceOptions( 

296 collections=collections, run=run, writeable=writeable, kwargs=defaultDataId 

297 ), 

298 ) 

299 

300 def __reduce__(self) -> tuple: 

301 """Support pickling.""" 

302 return ( 

303 DirectButler._unpickle, 

304 ( 

305 self._config, 

306 self.collections, 

307 self.run, 

308 dict(self._registry.defaults.dataId.required), 

309 self._registry.isWriteable(), 

310 ), 

311 ) 

312 

313 def __str__(self) -> str: 

314 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

315 self.collections, self.run, self._datastore, self._registry 

316 ) 

317 

318 def isWriteable(self) -> bool: 

319 # Docstring inherited. 

320 return self._registry.isWriteable() 

321 

322 def _caching_context(self) -> contextlib.AbstractContextManager[None]: 

323 """Context manager that enables caching.""" 

324 return self._registry.caching_context() 

325 

326 @contextlib.contextmanager 

327 def transaction(self) -> Iterator[None]: 

328 """Context manager supporting `Butler` transactions. 

329 

330 Transactions can be nested. 
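
Examples
--------
A sketch of nested use, assuming a writeable butler with a default
run; the dataset types and data ID values are illustrative::

    with butler.transaction():
        butler.put(catalog, "sourceTable", visit=903334, instrument="HSC")
        with butler.transaction():
            # Registry and datastore changes roll back together if
            # either context exits with an exception.
            butler.put(summary, "visitSummary", visit=903334, instrument="HSC")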

331 """ 

332 with self._registry.transaction(), self._datastore.transaction(): 

333 yield 

334 

335 def _standardizeArgs( 

336 self, 

337 datasetRefOrType: DatasetRef | DatasetType | str, 

338 dataId: DataId | None = None, 

339 for_put: bool = True, 

340 **kwargs: Any, 

341 ) -> tuple[DatasetType, DataId | None]: 

342 """Standardize the arguments passed to several Butler APIs. 

343 

344 Parameters 

345 ---------- 

346 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

347 When `DatasetRef` the `dataId` should be `None`. 

348 Otherwise the `DatasetType` or name thereof. 

349 dataId : `dict` or `DataCoordinate` 

350 A `dict` of `Dimension` link name, value pairs that label the 

351 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

352 should be provided as the second argument. 

353 for_put : `bool`, optional 

354 If `True` this call is invoked as part of a `Butler.put()`. 

355 Otherwise it is assumed to be part of a `Butler.get()`. This 

356 parameter is only relevant if there is dataset type 

357 inconsistency. 

358 **kwargs 

359 Additional keyword arguments used to augment or construct a 

360 `DataCoordinate`. See `DataCoordinate.standardize` 

361 parameters. 

362 

363 Returns 

364 ------- 

365 datasetType : `DatasetType` 

366 A `DatasetType` instance extracted from ``datasetRefOrType``. 

367 dataId : `dict` or `DataId`, optional 

368 Argument that can be used (along with ``kwargs``) to construct a 

369 `DataId`. 

370 

371 Notes 

372 ----- 

373 Butler APIs that conceptually need a DatasetRef also allow passing a 

374 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

375 keyword arguments that can be used to construct one) separately. This 

376 method accepts those arguments and always returns a true `DatasetType` 

377 and a `DataId` or `dict`. 

378 

379 Standardization of `dict` vs `DataId` is best handled by passing the 

380 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

381 generally similarly flexible. 
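
Examples
--------
Equivalent user-facing spellings that this standardization supports,
assuming default collections are configured; the dataset type name,
data ID values, and ``raw_dataset_type`` (a `DatasetType` instance)
are illustrative::

    butler.get("raw", {"instrument": "HSC", "exposure": 903334}, detector=10)
    butler.get("raw", instrument="HSC", exposure=903334, detector=10)
    butler.get(raw_dataset_type, {"instrument": "HSC"}, exposure=903334, detector=10)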

382 """ 

383 externalDatasetType: DatasetType | None = None 

384 internalDatasetType: DatasetType | None = None 

385 if isinstance(datasetRefOrType, DatasetRef): 

386 if dataId is not None or kwargs: 

387 raise ValueError("DatasetRef given, cannot use dataId as well") 

388 externalDatasetType = datasetRefOrType.datasetType 

389 dataId = datasetRefOrType.dataId 

390 else: 

391 # Don't check whether DataId is provided, because Registry APIs 

392 # can usually construct a better error message when it wasn't. 

393 if isinstance(datasetRefOrType, DatasetType): 

394 externalDatasetType = datasetRefOrType 

395 else: 

396 internalDatasetType = self.get_dataset_type(datasetRefOrType) 

397 

398 # Check that they are self-consistent 

399 if externalDatasetType is not None: 

400 internalDatasetType = self.get_dataset_type(externalDatasetType.name) 

401 if externalDatasetType != internalDatasetType: 

402 # We can allow differences if they are compatible, depending 

403 # on whether this is a get or a put. A get requires that 

404 # the python type associated with the datastore can be 

405 # converted to the user type. A put requires that the user 

406 # supplied python type can be converted to the internal 

407 # type expected by registry. 

408 relevantDatasetType = internalDatasetType 

409 if for_put: 

410 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

411 else: 

412 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

413 relevantDatasetType = externalDatasetType 

414 if not is_compatible: 

415 raise ValueError( 

416 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

417 f"registry definition ({internalDatasetType})" 

418 ) 

419 # Override the internal definition. 

420 internalDatasetType = relevantDatasetType 

421 

422 assert internalDatasetType is not None 

423 return internalDatasetType, dataId 

424 

425 def _rewrite_data_id( 

426 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any 

427 ) -> tuple[DataId | None, dict[str, Any]]: 

428 """Rewrite a data ID taking into account dimension records. 

429 

430 Take a data ID and keyword args and rewrite them if necessary to

431 allow the user to specify dimension records rather than dimension

432 primary key values.

433 

434 This allows a user to include a dataId dict with keys of 

435 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

436 the integer exposure ID. It also allows a string to be given 

437 for a dimension value rather than the integer ID if that is more 

438 convenient. For example, rather than having to specify the

439 detector with ``detector.full_name``, a string given for ``detector`` 

440 will be interpreted as the full name and converted to the integer 

441 value. 

442 

443 Keyword arguments can also use strings for dimensions like detector

444 and exposure, but Python does not allow them to include ``.``, so

445 the ``exposure.day_obs`` syntax cannot be used in a keyword

446 argument.

447 

448 Parameters 

449 ---------- 

450 dataId : `dict` or `DataCoordinate` 

451 A `dict` of `Dimension` link name, value pairs that will label the 

452 `DatasetRef` within a Collection. 

453 datasetType : `DatasetType` 

454 The dataset type associated with this dataId. Required to 

455 determine the relevant dimensions. 

456 **kwargs 

457 Additional keyword arguments used to augment or construct a 

458 `DataId`. See `DataId` parameters. 

459 

460 Returns 

461 ------- 

462 dataId : `dict` or `DataCoordinate` 

463 The possibly rewritten dataId. If given a `DataCoordinate` and

464 no keyword arguments, the original dataId will be returned

465 unchanged.

466 **kwargs : `dict`

467 Any unused keyword arguments (normally an empty dict).
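
Examples
--------
The kind of user-facing call this rewriting supports; the dataset
type name and all record values are illustrative::

    # Record-based keys instead of the integer exposure ID.
    butler.get(
        "raw",
        dataId={"exposure.day_obs": 20240115, "exposure.seq_num": 45},
        instrument="LATISS",
        detector=0,
    )
    # A string for detector is interpreted as ``detector.full_name``
    # and converted to the integer detector ID.
    butler.get("raw", instrument="HSC", exposure=903334, detector="1_36")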

468 """ 

469 # Do nothing if we have a standalone DataCoordinate. 

470 if isinstance(dataId, DataCoordinate) and not kwargs: 

471 return dataId, kwargs 

472 

473 # Process dimension records that are using record information 

474 # rather than ids 

475 newDataId: dict[str, DataIdValue] = {} 

476 byRecord: dict[str, dict[str, Any]] = defaultdict(dict) 

477 

478 # If all of the dataId comes from keyword parameters we do not need

479 # to do anything here, because they cannot be of the form

480 # exposure.obs_id: a "." is not allowed in a keyword parameter.

481 if dataId: 

482 for k, v in dataId.items(): 

483 # If we have a Dimension we do not need to do anything 

484 # because it cannot be a compound key. 

485 if isinstance(k, str) and "." in k: 

486 # Someone is using a more human-readable dataId 

487 dimensionName, record = k.split(".", 1) 

488 byRecord[dimensionName][record] = v 

489 elif isinstance(k, Dimension): 

490 newDataId[k.name] = v 

491 else: 

492 newDataId[k] = v 

493 

494 # Go through the updated dataId and check the type in case someone is

495 # using an alternate key. We have already filtered out the compound

496 # dimension.record keys.

497 not_dimensions = {} 

498 

499 # Will need to look in the dataId and the keyword arguments 

500 # and will remove them if they need to be fixed or are unrecognized. 

501 for dataIdDict in (newDataId, kwargs): 

502 # Use a list so we can adjust the dict safely in the loop 

503 for dimensionName in list(dataIdDict): 

504 value = dataIdDict[dimensionName] 

505 try: 

506 dimension = self.dimensions.dimensions[dimensionName] 

507 except KeyError: 

508 # This is not a real dimension 

509 not_dimensions[dimensionName] = value 

510 del dataIdDict[dimensionName] 

511 continue 

512 

513 # Convert an integral type to an explicit int to simplify 

514 # comparisons here 

515 if isinstance(value, numbers.Integral): 

516 value = int(value) 

517 

518 if not isinstance(value, dimension.primaryKey.getPythonType()): 

519 for alternate in dimension.alternateKeys: 

520 if isinstance(value, alternate.getPythonType()): 

521 byRecord[dimensionName][alternate.name] = value 

522 del dataIdDict[dimensionName] 

523 _LOG.debug( 

524 "Converting dimension %s to %s.%s=%s", 

525 dimensionName, 

526 dimensionName, 

527 alternate.name, 

528 value, 

529 ) 

530 break 

531 else: 

532 _LOG.warning( 

533 "Type mismatch found for value '%r' provided for dimension %s. " 

534 "Could not find matching alternative (primary key has type %s) " 

535 "so attempting to use as-is.", 

536 value, 

537 dimensionName, 

538 dimension.primaryKey.getPythonType(), 

539 ) 

540 

541 # By this point kwargs and newDataId should only include valid 

542 # dimensions. Merge kwargs into the new dataId and log if there

543 # are dimensions in both (rather than calling update). 

544 for k, v in kwargs.items(): 

545 if k in newDataId and newDataId[k] != v: 

546 _LOG.debug( 

547 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

548 ) 

549 newDataId[k] = v 

550 # No need to retain any values in kwargs now. 

551 kwargs = {} 

552 

553 # If we have some unrecognized dimensions we have to try to connect 

554 # them to records in other dimensions. This is made more complicated 

555 # by some dimensions having records with clashing names. A mitigation 

556 # is that we can tell by this point which dimensions are missing 

557 # for the DatasetType but this does not work for calibrations 

558 # where additional dimensions can be used to constrain the temporal 

559 # axis. 

560 if not_dimensions: 

561 # Search for all dimensions even if we have been given a value 

562 # explicitly. In some cases records are given as well as the 

563 # actual dimension and this should not be an error if they

564 # match. 

565 mandatoryDimensions = datasetType.dimensions.names # - provided 

566 

567 candidateDimensions: set[str] = set() 

568 candidateDimensions.update(mandatoryDimensions) 

569 

570 # For calibrations we may well need temporal dimensions, so

571 # rather than always including all dimensions in the scan,

572 # restrict things a little. It is still possible for there

573 # to be confusion over day_obs in visit vs exposure, for example.

574 # If we are not searching calibration collections things may

575 # fail, but they are going to fail anyway because of the

576 # ambiguity of the dataId...

577 if datasetType.isCalibration(): 

578 for dim in self.dimensions.dimensions: 

579 if dim.temporal: 

580 candidateDimensions.add(str(dim)) 

581 

582 # Look up table for the first association with a dimension 

583 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) 

584 

585 # Keep track of whether an item is associated with multiple 

586 # dimensions. 

587 counter: Counter[str] = Counter() 

588 assigned: dict[str, set[str]] = defaultdict(set) 

589 

590 # Go through the missing dimensions and associate the 

591 # given names with records within those dimensions 

592 matched_dims = set() 

593 for dimensionName in candidateDimensions: 

594 dimension = self.dimensions.dimensions[dimensionName] 

595 fields = dimension.metadata.names | dimension.uniqueKeys.names 

596 for field in not_dimensions: 

597 if field in fields: 

598 guessedAssociation[dimensionName][field] = not_dimensions[field] 

599 counter[dimensionName] += 1 

600 assigned[field].add(dimensionName) 

601 matched_dims.add(field) 

602 

603 # Calculate the fields that matched nothing. 

604 never_found = set(not_dimensions) - matched_dims 

605 

606 if never_found: 

607 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

608 

609 # There is a chance we have allocated a single dataId item 

610 # to multiple dimensions. Need to decide which should be retained. 

611 # For now assume that the most popular alternative wins. 

612 # This means that day_obs with seq_num will result in 

613 # exposure.day_obs and not visit.day_obs 

614 # Also prefer an explicitly missing dimension over an inferred 

615 # temporal dimension. 

616 for fieldName, assignedDimensions in assigned.items(): 

617 if len(assignedDimensions) > 1: 

618 # Pick the most popular (preferring mandatory dimensions) 

619 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

620 if requiredButMissing: 

621 candidateDimensions = requiredButMissing 

622 else: 

623 candidateDimensions = assignedDimensions 

624 

625 # If this is a choice between visit and exposure and

626 # neither was a required part of the dataset type

627 # (hence this branch), always prefer exposure over

628 # visit, since exposures are always defined and visits

629 # are defined from exposures.

630 if candidateDimensions == {"exposure", "visit"}: 

631 candidateDimensions = {"exposure"} 

632 

633 # Select the relevant items and get a new restricted 

634 # counter. 

635 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

636 duplicatesCounter: Counter[str] = Counter() 

637 duplicatesCounter.update(theseCounts) 

638 

639 # Choose the most common. If they are equally common 

640 # we will pick the one that was found first. 

641 # Returns a list of tuples 

642 selected = duplicatesCounter.most_common(1)[0][0] 

643 

644 _LOG.debug( 

645 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

646 " Removed ambiguity by choosing dimension %s.", 

647 fieldName, 

648 ", ".join(assignedDimensions), 

649 selected, 

650 ) 

651 

652 for candidateDimension in assignedDimensions: 

653 if candidateDimension != selected: 

654 del guessedAssociation[candidateDimension][fieldName] 

655 

656 # Update the record look up dict with the new associations 

657 for dimensionName, values in guessedAssociation.items(): 

658 if values: # A dict might now be empty 

659 _LOG.debug( 

660 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values 

661 ) 

662 byRecord[dimensionName].update(values) 

663 

664 if byRecord: 

665 # Some record specifiers were found so we need to convert 

666 # them to the Id form 

667 for dimensionName, values in byRecord.items(): 

668 if dimensionName in newDataId: 

669 _LOG.debug( 

670 "DataId specified explicit %s dimension value of %s in addition to" 

671 " general record specifiers for it of %s. Ignoring record information.", 

672 dimensionName, 

673 newDataId[dimensionName], 

674 str(values), 

675 ) 

676 # Get the actual record and compare with these values. 

677 try: 

678 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

679 except DataIdError: 

680 raise ValueError( 

681 f"Could not find dimension '{dimensionName}'" 

682 f" with dataId {newDataId} as part of comparing with" 

683 f" record values {byRecord[dimensionName]}" 

684 ) from None 

685 if len(recs) == 1: 

686 errmsg: list[str] = [] 

687 for k, v in values.items(): 

688 if (recval := getattr(recs[0], k)) != v: 

689 errmsg.append(f"{k}({recval} != {v})") 

690 if errmsg: 

691 raise ValueError( 

692 f"Dimension {dimensionName} in dataId has explicit value" 

693 " inconsistent with records: " + ", ".join(errmsg) 

694 ) 

695 else: 

696 # Multiple matches for an explicit dimension 

697 # should never happen but let downstream complain. 

698 pass 

699 continue 

700 

701 # Build up a WHERE expression 

702 bind = dict(values.items()) 

703 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

704 

705 # Hopefully we get a single record that matches 

706 records = set( 

707 self._registry.queryDimensionRecords( 

708 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

709 ) 

710 ) 

711 

712 if len(records) != 1: 

713 if len(records) > 1: 

714 # visit can have an ambiguous answer without involving 

715 # visit_system. The default visit_system is defined 

716 # by the instrument. 

717 if ( 

718 dimensionName == "visit" 

719 and "visit_system_membership" in self.dimensions 

720 and "visit_system" in self.dimensions["instrument"].metadata 

721 ): 

722 instrument_records = list( 

723 self._registry.queryDimensionRecords( 

724 "instrument", 

725 dataId=newDataId, 

726 **kwargs, 

727 ) 

728 ) 

729 if len(instrument_records) == 1: 

730 visit_system = instrument_records[0].visit_system 

731 if visit_system is None: 

732 # Set to a value that will never match. 

733 visit_system = -1 

734 

735 # Look up each visit in the 

736 # visit_system_membership records. 

737 for rec in records: 

738 membership = list( 

739 self._registry.queryDimensionRecords( 

740 # Use bind to allow zero results. 

741 # This is a fully-specified query. 

742 "visit_system_membership", 

743 where="instrument = inst AND visit_system = system AND visit = v", 

744 bind=dict( 

745 inst=instrument_records[0].name, system=visit_system, v=rec.id 

746 ), 

747 ) 

748 ) 

749 if membership: 

750 # This record is the right answer. 

751 records = {rec} 

752 break 

753 

754 # The ambiguity may have been resolved so check again. 

755 if len(records) > 1: 

756 _LOG.debug( 

757 "Received %d records from constraints of %s", len(records), str(values) 

758 ) 

759 for r in records: 

760 _LOG.debug("- %s", str(r)) 

761 raise ValueError( 

762 f"DataId specification for dimension {dimensionName} is not" 

763 f" uniquely constrained to a single dataset by {values}." 

764 f" Got {len(records)} results." 

765 ) 

766 else: 

767 raise ValueError( 

768 f"DataId specification for dimension {dimensionName} matched no" 

769 f" records when constrained by {values}" 

770 ) 

771 

772 # Get the primary key from the real dimension object 

773 dimension = self.dimensions.dimensions[dimensionName] 

774 if not isinstance(dimension, Dimension): 

775 raise RuntimeError( 

776 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

777 ) 

778 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

779 

780 return newDataId, kwargs 

781 

782 def _findDatasetRef( 

783 self, 

784 datasetRefOrType: DatasetRef | DatasetType | str, 

785 dataId: DataId | None = None, 

786 *, 

787 collections: Any = None, 

788 predict: bool = False, 

789 run: str | None = None, 

790 datastore_records: bool = False, 

791 **kwargs: Any, 

792 ) -> DatasetRef: 

793 """Shared logic for methods that start with a search for a dataset in 

794 the registry. 

795 

796 Parameters 

797 ---------- 

798 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

799 When `DatasetRef` the `dataId` should be `None`. 

800 Otherwise the `DatasetType` or name thereof. 

801 dataId : `dict` or `DataCoordinate`, optional 

802 A `dict` of `Dimension` link name, value pairs that label the 

803 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

804 should be provided as the first argument. 

805 collections : Any, optional 

806 Collections to be searched, overriding ``self.collections``. 

807 Can be any of the types supported by the ``collections`` argument 

808 to butler construction. 

809 predict : `bool`, optional 

810 If `True`, return a newly created `DatasetRef` with a unique 

811 dataset ID if finding a reference in the `Registry` fails. 

812 Defaults to `False`. 

813 run : `str`, optional 

814 Run collection name to use for creating `DatasetRef` for predicted 

815 datasets. Only used if ``predict`` is `True`. 

816 datastore_records : `bool`, optional 

817 If `True` add datastore records to returned `DatasetRef`. 

818 **kwargs 

819 Additional keyword arguments used to augment or construct a 

820 `DataId`. See `DataId` parameters. 

821 

822 Returns 

823 ------- 

824 ref : `DatasetRef` 

825 A reference to the dataset identified by the given arguments. 

826 This can be the same dataset reference as given if it was 

827 resolved. 

828 

829 Raises 

830 ------ 

831 LookupError 

832 Raised if no matching dataset exists in the `Registry` (and 

833 ``predict`` is `False`). 

834 ValueError 

835 Raised if a resolved `DatasetRef` was passed as an input, but it 

836 differs from the one found in the registry. 

837 TypeError 

838 Raised if no collections were provided. 

839 """ 

840 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

841 if isinstance(datasetRefOrType, DatasetRef): 

842 if collections is not None: 

843 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) 

844 # May need to retrieve datastore records if requested. 

845 if datastore_records and datasetRefOrType._datastore_records is None: 

846 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType) 

847 return datasetRefOrType 

848 timespan: Timespan | None = None 

849 

850 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

851 

852 if datasetType.isCalibration(): 

853 # Because this is a calibration dataset, first try to

854 # standardize the data ID without restricting the dimensions to

855 # those of the dataset type requested, because there may be extra 

856 # dimensions that provide temporal information for a validity-range 

857 # lookup. 

858 dataId = DataCoordinate.standardize( 

859 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

860 ) 

861 if dataId.dimensions.temporal: 

862 dataId = self._registry.expandDataId(dataId) 

863 timespan = dataId.timespan 

864 else: 

865 # Standardize the data ID to just the dimensions of the dataset 

866 # type instead of letting registry.findDataset do it, so we get the 

867 # result even if no dataset is found. 

868 dataId = DataCoordinate.standardize( 

869 dataId, 

870 dimensions=datasetType.dimensions, 

871 defaults=self._registry.defaults.dataId, 

872 **kwargs, 

873 ) 

874 # Always look up the DatasetRef, even if one is given, to ensure it is

875 # present in the current collection. 

876 ref = self.find_dataset( 

877 datasetType, 

878 dataId, 

879 collections=collections, 

880 timespan=timespan, 

881 datastore_records=datastore_records, 

882 ) 

883 if ref is None: 

884 if predict: 

885 if run is None: 

886 run = self.run 

887 if run is None: 

888 raise TypeError("Cannot predict dataset ID/location with run=None.") 

889 return DatasetRef(datasetType, dataId, run=run) 

890 else: 

891 if collections is None: 

892 collections = self._registry.defaults.collections 

893 raise LookupError( 

894 f"Dataset {datasetType.name} with data ID {dataId} " 

895 f"could not be found in collections {collections}." 

896 ) 

897 if datasetType != ref.datasetType: 

898 # If they differ it is because the user explicitly specified 

899 # a compatible dataset type to this call rather than using the 

900 # registry definition. The DatasetRef must therefore be recreated 

901 # using the user definition such that the expected type is 

902 # returned. 

903 ref = DatasetRef( 

904 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records 

905 ) 

906 

907 return ref 

908 

909 @transactional 

910 def put( 

911 self, 

912 obj: Any, 

913 datasetRefOrType: DatasetRef | DatasetType | str, 

914 /, 

915 dataId: DataId | None = None, 

916 *, 

917 run: str | None = None, 

918 **kwargs: Any, 

919 ) -> DatasetRef: 

920 """Store and register a dataset. 

921 

922 Parameters 

923 ---------- 

924 obj : `object` 

925 The dataset. 

926 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

927 When `DatasetRef` is provided, ``dataId`` should be `None`. 

928 Otherwise the `DatasetType` or name thereof. If a fully resolved 

929 `DatasetRef` is given the run and ID are used directly. 

930 dataId : `dict` or `DataCoordinate` 

931 A `dict` of `Dimension` link name, value pairs that label the 

932 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

933 should be provided as the second argument. 

934 run : `str`, optional 

935 The name of the run the dataset should be added to, overriding 

936 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

937 **kwargs 

938 Additional keyword arguments used to augment or construct a 

939 `DataCoordinate`. See `DataCoordinate.standardize` 

940 parameters. Not used if a resolved `DatasetRef` is provided.

941 

942 Returns 

943 ------- 

944 ref : `DatasetRef` 

945 A reference to the stored dataset, updated with the correct id if 

946 given. 

947 

948 Raises 

949 ------ 

950 TypeError 

951 Raised if the butler is read-only or if no run has been provided. 
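
Examples
--------
Illustrative calls; the dataset type, data ID values, run name and
``predicted_ref`` are assumptions, not defined in this module::

    # Dataset type name plus data ID keywords, with an explicit run.
    ref = butler.put(catalog, "sourceTable", visit=903334,
                     instrument="HSC", run="u/someone/tables")

    # With a fully resolved DatasetRef (e.g. predicted elsewhere),
    # the run and dataset ID are taken from the ref itself.
    butler.put(catalog, predicted_ref)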

952 """ 

953 if isinstance(datasetRefOrType, DatasetRef): 

954 # This is a direct put of predefined DatasetRef. 

955 _LOG.debug("Butler put direct: %s", datasetRefOrType) 

956 if run is not None: 

957 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) 

958 # If the registry already has a dataset with the same dataset ID,

959 # dataset type and DataId, then _importDatasets will do nothing and

960 # just return the original ref. We have to raise in this case; the

961 # datastore check below handles that.

962 self._registry._importDatasets([datasetRefOrType], expand=True) 

963 # Before trying to write to the datastore check that it does not 

964 # know this dataset. This is prone to races, of course. 

965 if self._datastore.knows(datasetRefOrType): 

966 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

967 # Try to write the dataset to the datastore; if this fails due to a

968 # race with another write, the content of the stored data may be

969 # unpredictable.

970 try: 

971 self._datastore.put(obj, datasetRefOrType) 

972 except IntegrityError as e: 

973 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e 

974 return datasetRefOrType 

975 

976 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

977 if not self.isWriteable(): 

978 raise TypeError("Butler is read-only.") 

979 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

980 

981 # Handle dimension records in dataId 

982 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

983 

984 # Add Registry Dataset entry. 

985 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs) 

986 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

987 self._datastore.put(obj, ref) 

988 

989 return ref 

990 

991 def getDeferred( 

992 self, 

993 datasetRefOrType: DatasetRef | DatasetType | str, 

994 /, 

995 dataId: DataId | None = None, 

996 *, 

997 parameters: dict | None = None, 

998 collections: Any = None, 

999 storageClass: str | StorageClass | None = None, 

1000 **kwargs: Any, 

1001 ) -> DeferredDatasetHandle: 

1002 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1003 after an immediate registry lookup. 

1004 

1005 Parameters 

1006 ---------- 

1007 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1008 When `DatasetRef` the `dataId` should be `None`. 

1009 Otherwise the `DatasetType` or name thereof. 

1010 dataId : `dict` or `DataCoordinate`, optional 

1011 A `dict` of `Dimension` link name, value pairs that label the 

1012 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1013 should be provided as the first argument. 

1014 parameters : `dict` 

1015 Additional StorageClass-defined options to control reading, 

1016 typically used to efficiently read only a subset of the dataset. 

1017 collections : Any, optional 

1018 Collections to be searched, overriding ``self.collections``. 

1019 Can be any of the types supported by the ``collections`` argument 

1020 to butler construction. 

1021 storageClass : `StorageClass` or `str`, optional 

1022 The storage class to be used to override the Python type 

1023 returned by this method. By default the returned type matches 

1024 the dataset type definition for this dataset. Specifying a 

1025 read `StorageClass` can force a different type to be returned. 

1026 This type must be compatible with the original type. 

1027 **kwargs 

1028 Additional keyword arguments used to augment or construct a 

1029 `DataId`. See `DataId` parameters. 

1030 

1031 Returns 

1032 ------- 

1033 obj : `DeferredDatasetHandle` 

1034 A handle which can be used to retrieve a dataset at a later time. 

1035 

1036 Raises 

1037 ------ 

1038 LookupError 

1039 Raised if no matching dataset exists in the `Registry` or 

1040 datastore. 

1041 ValueError 

1042 Raised if a resolved `DatasetRef` was passed as an input, but it 

1043 differs from the one found in the registry. 

1044 TypeError 

1045 Raised if no collections were provided. 
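
Examples
--------
A sketch of deferred retrieval, assuming default collections; the
dataset type and data ID values are illustrative, and the handle's
``get()`` call comes from the `DeferredDatasetHandle` API rather than
from this module::

    handle = butler.getDeferred("raw", instrument="HSC",
                                exposure=903334, detector=10)
    # The registry lookup has already happened; the datastore read is
    # deferred until the handle is used.
    raw = handle.get()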

1046 """ 

1047 if isinstance(datasetRefOrType, DatasetRef): 

1048 # Do the quick check first and if that fails, check for artifact 

1049 # existence. This is necessary for datastores that are configured 

1050 # in trust mode where there won't be a record but there will be 

1051 # a file. 

1052 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType): 

1053 ref = datasetRefOrType 

1054 else: 

1055 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1056 else: 

1057 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1058 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1059 

1060 def get( 

1061 self, 

1062 datasetRefOrType: DatasetRef | DatasetType | str, 

1063 /, 

1064 dataId: DataId | None = None, 

1065 *, 

1066 parameters: dict[str, Any] | None = None, 

1067 collections: Any = None, 

1068 storageClass: StorageClass | str | None = None, 

1069 **kwargs: Any, 

1070 ) -> Any: 

1071 """Retrieve a stored dataset. 

1072 

1073 Parameters 

1074 ---------- 

1075 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1076 When `DatasetRef` the `dataId` should be `None`. 

1077 Otherwise the `DatasetType` or name thereof. 

1078 If a resolved `DatasetRef`, the associated dataset 

1079 is returned directly without additional querying. 

1080 dataId : `dict` or `DataCoordinate` 

1081 A `dict` of `Dimension` link name, value pairs that label the 

1082 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1083 should be provided as the first argument. 

1084 parameters : `dict` 

1085 Additional StorageClass-defined options to control reading, 

1086 typically used to efficiently read only a subset of the dataset. 

1087 collections : Any, optional 

1088 Collections to be searched, overriding ``self.collections``. 

1089 Can be any of the types supported by the ``collections`` argument 

1090 to butler construction. 

1091 storageClass : `StorageClass` or `str`, optional 

1092 The storage class to be used to override the Python type 

1093 returned by this method. By default the returned type matches 

1094 the dataset type definition for this dataset. Specifying a 

1095 read `StorageClass` can force a different type to be returned. 

1096 This type must be compatible with the original type. 

1097 **kwargs 

1098 Additional keyword arguments used to augment or construct a 

1099 `DataCoordinate`. See `DataCoordinate.standardize` 

1100 parameters. 

1101 

1102 Returns 

1103 ------- 

1104 obj : `object` 

1105 The dataset. 

1106 

1107 Raises 

1108 ------ 

1109 LookupError 

1110 Raised if no matching dataset exists in the `Registry`. 

1111 TypeError 

1112 Raised if no collections were provided. 

1113 

1114 Notes 

1115 ----- 

1116 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1117 this method requires that the given data ID include temporal dimensions 

1118 beyond the dimensions of the dataset type itself, in order to find the 

1119 dataset with the appropriate validity range. For example, a "bias" 

1120 dataset with native dimensions ``{instrument, detector}`` could be 

1121 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1122 ``exposure`` is a temporal dimension. 
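
Examples
--------
The calibration lookup described above, as a sketch; the collection
name and data ID values are illustrative::

    bias = butler.get(
        "bias",
        instrument="HSC",
        detector=10,
        exposure=903334,
        collections="HSC/calib",
    )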

1123 """ 

1124 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1125 ref = self._findDatasetRef( 

1126 datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs 

1127 ) 

1128 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1129 

1130 def getURIs( 

1131 self, 

1132 datasetRefOrType: DatasetRef | DatasetType | str, 

1133 /, 

1134 dataId: DataId | None = None, 

1135 *, 

1136 predict: bool = False, 

1137 collections: Any = None, 

1138 run: str | None = None, 

1139 **kwargs: Any, 

1140 ) -> DatasetRefURIs: 

1141 """Return the URIs associated with the dataset. 

1142 

1143 Parameters 

1144 ---------- 

1145 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1146 When `DatasetRef` the `dataId` should be `None`. 

1147 Otherwise the `DatasetType` or name thereof. 

1148 dataId : `dict` or `DataCoordinate` 

1149 A `dict` of `Dimension` link name, value pairs that label the 

1150 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1151 should be provided as the first argument. 

1152 predict : `bool` 

1153 If `True`, allow URIs to be returned of datasets that have not 

1154 been written. 

1155 collections : Any, optional 

1156 Collections to be searched, overriding ``self.collections``. 

1157 Can be any of the types supported by the ``collections`` argument 

1158 to butler construction. 

1159 run : `str`, optional 

1160 Run to use for predictions, overriding ``self.run``. 

1161 **kwargs 

1162 Additional keyword arguments used to augment or construct a 

1163 `DataCoordinate`. See `DataCoordinate.standardize` 

1164 parameters. 

1165 

1166 Returns 

1167 ------- 

1168 uris : `DatasetRefURIs` 

1169 The URI to the primary artifact associated with this dataset (if 

1170 the dataset was disassembled within the datastore this may be 

1171 `None`), and the URIs to any components associated with the dataset 

1172 artifact (can be empty if there are no components).
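
Examples
--------
Illustrative calls; the dataset types, data ID values, collection and
run names are assumptions::

    uris = butler.getURIs("bias", instrument="HSC", detector=10,
                          exposure=903334, collections="HSC/calib")

    # With predict=True a URI can be returned before the dataset has
    # been written; ``run`` overrides ``self.run`` for the prediction.
    future = butler.getURIs("sourceTable", visit=903334, instrument="HSC",
                            predict=True, run="u/someone/tables")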

1173 """ 

1174 ref = self._findDatasetRef( 

1175 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1176 ) 

1177 return self._datastore.getURIs(ref, predict) 

1178 

1179 def get_dataset_type(self, name: str) -> DatasetType: 

1180 return self._registry.getDatasetType(name) 

1181 

1182 def get_dataset( 

1183 self, 

1184 id: DatasetId, 

1185 *, 

1186 storage_class: str | StorageClass | None = None, 

1187 dimension_records: bool = False, 

1188 datastore_records: bool = False, 

1189 ) -> DatasetRef | None: 

1190 ref = self._registry.getDataset(id) 

1191 if ref is not None: 

1192 if dimension_records: 

1193 ref = ref.expanded( 

1194 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions) 

1195 ) 

1196 if storage_class: 

1197 ref = ref.overrideStorageClass(storage_class) 

1198 if datastore_records: 

1199 ref = self._registry.get_datastore_records(ref) 

1200 return ref 

1201 

1202 def find_dataset( 

1203 self, 

1204 dataset_type: DatasetType | str, 

1205 data_id: DataId | None = None, 

1206 *, 

1207 collections: str | Sequence[str] | None = None, 

1208 timespan: Timespan | None = None, 

1209 storage_class: str | StorageClass | None = None, 

1210 dimension_records: bool = False, 

1211 datastore_records: bool = False, 

1212 **kwargs: Any, 

1213 ) -> DatasetRef | None: 

1214 # Handle any parts of the dataID that are not using primary dimension 

1215 # keys. 

1216 if isinstance(dataset_type, str): 

1217 actual_type = self.get_dataset_type(dataset_type) 

1218 else: 

1219 actual_type = dataset_type 

1220 

1221 # Store the component for later. 

1222 component_name = actual_type.component() 

1223 if actual_type.isComponent(): 

1224 parent_type = actual_type.makeCompositeDatasetType() 

1225 else: 

1226 parent_type = actual_type 

1227 

1228 data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs) 

1229 

1230 ref = self._registry.findDataset( 

1231 parent_type, 

1232 data_id, 

1233 collections=collections, 

1234 timespan=timespan, 

1235 datastore_records=datastore_records, 

1236 **kwargs, 

1237 ) 

1238 if ref is not None and dimension_records: 

1239 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)) 

1240 if ref is not None and component_name: 

1241 ref = ref.makeComponentRef(component_name) 

1242 if ref is not None and storage_class is not None: 

1243 ref = ref.overrideStorageClass(storage_class) 

1244 

1245 return ref 

1246 

1247 def retrieveArtifacts( 

1248 self, 

1249 refs: Iterable[DatasetRef], 

1250 destination: ResourcePathExpression, 

1251 transfer: str = "auto", 

1252 preserve_path: bool = True, 

1253 overwrite: bool = False, 

1254 ) -> list[ResourcePath]: 

1255 # Docstring inherited. 

1256 return self._datastore.retrieveArtifacts( 

1257 refs, 

1258 ResourcePath(destination), 

1259 transfer=transfer, 

1260 preserve_path=preserve_path, 

1261 overwrite=overwrite, 

1262 ) 

1263 

1264 def exists( 

1265 self, 

1266 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1267 /, 

1268 data_id: DataId | None = None, 

1269 *, 

1270 full_check: bool = True, 

1271 collections: Any = None, 

1272 **kwargs: Any, 

1273 ) -> DatasetExistence: 

1274 # Docstring inherited. 

1275 existence = DatasetExistence.UNRECOGNIZED 

1276 

1277 if isinstance(dataset_ref_or_type, DatasetRef): 

1278 if collections is not None: 

1279 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1280 if data_id is not None: 

1281 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1282 ref = dataset_ref_or_type 

1283 registry_ref = self._registry.getDataset(dataset_ref_or_type.id) 

1284 if registry_ref is not None: 

1285 existence |= DatasetExistence.RECORDED 

1286 

1287 if dataset_ref_or_type != registry_ref: 

1288 # This could mean that storage classes differ, so we should 

1289 # check for that but use the registry ref for the rest of 

1290 # the method. 

1291 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1292 # Use the registry version from now on. 

1293 ref = registry_ref 

1294 else: 

1295 raise ValueError( 

1296 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1297 f"in registry but has different incompatible values ({registry_ref})." 

1298 ) 

1299 else: 

1300 try: 

1301 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1302 except (LookupError, TypeError, NoDefaultCollectionError): 

1303 return existence 

1304 existence |= DatasetExistence.RECORDED 

1305 

1306 if self._datastore.knows(ref): 

1307 existence |= DatasetExistence.DATASTORE 

1308 

1309 if full_check: 

1310 if self._datastore.exists(ref): 

1311 existence |= DatasetExistence._ARTIFACT 

1312 elif existence.value != DatasetExistence.UNRECOGNIZED.value: 

1313 # Do not add this flag if we have no other idea about a dataset. 

1314 existence |= DatasetExistence(DatasetExistence._ASSUMED) 

1315 

1316 return existence 

1317 

1318 def _exists_many( 

1319 self, 

1320 refs: Iterable[DatasetRef], 

1321 /, 

1322 *, 

1323 full_check: bool = True, 

1324 ) -> dict[DatasetRef, DatasetExistence]: 

1325 # Docstring inherited. 

1326 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1327 

1328 # Registry does not have a bulk API to check for a ref. 

1329 for ref in refs: 

1330 registry_ref = self._registry.getDataset(ref.id) 

1331 if registry_ref is not None: 

1332 # It is possible, albeit unlikely, that the given ref does 

1333 # not match the one in registry even though the UUID matches. 

1334 # When checking a single ref we raise, but it's impolite to 

1335 # do that when potentially hundreds of refs are being checked. 

1336 # We could change the API to only accept UUIDs and that would 

1337 # remove the ability to even check and remove the worry 

1338 # about differing storage classes. Given the ongoing discussion 

1339 # on refs vs UUIDs and whether to raise or have a new 

1340 # private flag, treat this as a private API for now. 

1341 existence[ref] |= DatasetExistence.RECORDED 

1342 

1343 # Ask datastore if it knows about these refs. 

1344 knows = self._datastore.knows_these(refs) 

1345 for ref, known in knows.items(): 

1346 if known: 

1347 existence[ref] |= DatasetExistence.DATASTORE 

1348 

1349 if full_check: 

1350 mexists = self._datastore.mexists(refs) 

1351 for ref, exists in mexists.items(): 

1352 if exists: 

1353 existence[ref] |= DatasetExistence._ARTIFACT 

1354 else: 

1355 # Do not set this flag if nothing is known about the dataset. 

1356 for ref in existence: 

1357 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1358 existence[ref] |= DatasetExistence._ASSUMED 

1359 

1360 return existence 

1361 

1362 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1363 # Docstring inherited. 

1364 if not self.isWriteable(): 

1365 raise TypeError("Butler is read-only.") 

1366 names = list(names) 

1367 refs: list[DatasetRef] = [] 

1368 for name in names: 

1369 collectionType = self._registry.getCollectionType(name) 

1370 if collectionType is not CollectionType.RUN: 

1371 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1372 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True)) 

1373 with self._datastore.transaction(), self._registry.transaction(): 

1374 if unstore: 

1375 self._datastore.trash(refs) 

1376 else: 

1377 self._datastore.forget(refs) 

1378 for name in names: 

1379 self._registry.removeCollection(name) 

1380 if unstore: 

1381 # Point of no return for removing artifacts 

1382 self._datastore.emptyTrash() 

1383 

1384 def pruneDatasets( 

1385 self, 

1386 refs: Iterable[DatasetRef], 

1387 *, 

1388 disassociate: bool = True, 

1389 unstore: bool = False, 

1390 tags: Iterable[str] = (), 

1391 purge: bool = False, 

1392 ) -> None: 

1393 # docstring inherited from LimitedButler 

1394 

1395 if not self.isWriteable(): 

1396 raise TypeError("Butler is read-only.") 

1397 if purge: 

1398 if not disassociate: 

1399 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1400 if not unstore: 

1401 raise TypeError("Cannot pass purge=True without unstore=True.") 

1402 elif disassociate: 

1403 tags = tuple(tags) 

1404 if not tags: 

1405 raise TypeError("No tags provided but disassociate=True.") 

1406 for tag in tags: 

1407 collectionType = self._registry.getCollectionType(tag) 

1408 if collectionType is not CollectionType.TAGGED: 

1409 raise TypeError( 

1410 f"Cannot disassociate from collection '{tag}' " 

1411 f"of non-TAGGED type {collectionType.name}." 

1412 ) 

1413 # Transform possibly-single-pass iterable into something we can iterate 

1414 # over multiple times. 

1415 refs = list(refs) 

1416 # Pruning a component of a DatasetRef makes no sense since registry 

1417 # doesn't know about components and datastore might not store 

1418 # components in a separate file 

1419 for ref in refs: 

1420 if ref.datasetType.component(): 

1421 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1422 # We don't need an unreliable Datastore transaction for this, because 

1423 # we've been extra careful to ensure that Datastore.trash only involves 

1424 # mutating the Registry (it can _look_ at Datastore-specific things, 

1425 # but shouldn't change them), and hence all operations here are 

1426 # Registry operations. 

1427 with self._datastore.transaction(), self._registry.transaction(): 

1428 if unstore: 

1429 self._datastore.trash(refs) 

1430 if purge: 

1431 self._registry.removeDatasets(refs) 

1432 elif disassociate: 

1433 assert tags, "Guaranteed by earlier logic in this function." 

1434 for tag in tags: 

1435 self._registry.disassociate(tag, refs) 

1436 # We've exited the Registry transaction, and apparently committed. 

1437 # (if there was an exception, everything rolled back, and it's as if 

1438 # nothing happened - and we never get here). 

1439 # Datastore artifacts are not yet gone, but they're clearly marked 

1440 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1441 # problems we can try again later, and if manual administrative 

1442 # intervention is required, it's pretty clear what that should entail: 

1443 # deleting everything on disk and in private Datastore tables that is 

1444 # in the dataset_location_trash table. 

1445 if unstore: 

1446 # Point of no return for removing artifacts 

1447 self._datastore.emptyTrash() 

1448 
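A minimal sketch of the two pruning modes enforced by the argument checks above; the repository path, collection names, and dataset type are hypothetical.

from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", writeable=True)  # hypothetical repository
refs = list(butler.registry.queryDatasets("calexp", collections="u/example/run"))

# Full removal: purge=True requires both disassociate=True and unstore=True.
butler.pruneDatasets(refs, disassociate=True, unstore=True, purge=True)

# Alternative: only remove associations from a TAGGED collection.
# butler.pruneDatasets(refs, disassociate=True, tags=["u/example/tagged"])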

1449 @transactional 

1450 def ingest( 

1451 self, 

1452 *datasets: FileDataset, 

1453 transfer: str | None = "auto", 

1454 record_validation_info: bool = True, 

1455 ) -> None: 

1456 # Docstring inherited. 

1457 if not self.isWriteable(): 

1458 raise TypeError("Butler is read-only.") 

1459 

1460 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1461 if not datasets: 

1462 return 

1463 

1464 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1465 

1466 # We need to reorganize all the inputs so that they are grouped 

1467 # by dataset type and run. Multiple refs in a single FileDataset 

1468 # are required to share the run and dataset type. 

1469 groupedData: MutableMapping[tuple[DatasetType, str], list[FileDataset]] = defaultdict(list) 

1470 

1471 # Track DataIDs that are being ingested so we can spot issues early 

1472 # with duplication. Retain previous FileDataset so we can report it. 

1473 groupedDataIds: MutableMapping[ 

1474 tuple[DatasetType, str], dict[DataCoordinate, FileDataset] 

1475 ] = defaultdict(dict) 

1476 

1477 # And the nested loop that populates it: 

1478 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1479 # Somewhere to store pre-existing refs if we have an 

1480 # execution butler. 

1481 existingRefs: list[DatasetRef] = [] 

1482 

1483 for ref in dataset.refs: 

1484 group_key = (ref.datasetType, ref.run) 

1485 

1486 if ref.dataId in groupedDataIds[group_key]: 

1487 raise ConflictingDefinitionError( 

1488 f"Ingest conflict. Dataset {dataset.path} has same" 

1489 " DataId as other ingest dataset" 

1490 f" {groupedDataIds[group_key][ref.dataId].path} " 

1491 f" ({ref.dataId})" 

1492 ) 

1493 

1494 groupedDataIds[group_key][ref.dataId] = dataset 

1495 

1496 if existingRefs: 

1497 if len(dataset.refs) != len(existingRefs): 

1498 # Keeping track of partially pre-existing datasets is hard 

1499 # and should generally never happen. For now don't allow 

1500 # it. 

1501 raise ConflictingDefinitionError( 

1502 f"For dataset {dataset.path} some dataIds already exist" 

1503 " in registry but others do not. This is not supported." 

1504 ) 

1505 

1506 # Store expanded form in the original FileDataset. 

1507 dataset.refs = existingRefs 

1508 else: 

1509 groupedData[group_key].append(dataset) 

1510 

1511 # Now we can bulk-insert into Registry for each DatasetType. 

1512 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

1513 groupedData.items(), desc="Bulk-inserting datasets by type" 

1514 ): 

1515 refs_to_import = [] 

1516 for dataset in grouped_datasets: 

1517 refs_to_import.extend(dataset.refs) 

1518 

1519 n_refs = len(refs_to_import) 

1520 _LOG.verbose( 

1521 "Importing %d ref%s of dataset type %r into run %r", 

1522 n_refs, 

1523 "" if n_refs == 1 else "s", 

1524 datasetType.name, 

1525 this_run, 

1526 ) 

1527 

1528 # Import the refs and expand the DataCoordinates since we can't 

1529 # guarantee that they are expanded and Datastore will need 

1530 # the records. 

1531 imported_refs = self._registry._importDatasets(refs_to_import, expand=True) 

1532 assert set(imported_refs) == set(refs_to_import) 

1533 

1534 # Replace all the refs in the FileDataset with expanded versions. 

1535 # Pull them off in the order we put them on the list. 

1536 for dataset in grouped_datasets: 

1537 n_dataset_refs = len(dataset.refs) 

1538 dataset.refs = imported_refs[:n_dataset_refs] 

1539 del imported_refs[:n_dataset_refs] 

1540 

1541 # Bulk-insert everything into Datastore. 

1542 # We do not know if any of the registry entries already existed 

1543 # (_importDatasets only complains if they exist but differ) so 

1544 # we have to catch IntegrityError explicitly. 

1545 try: 

1546 self._datastore.ingest( 

1547 *datasets, transfer=transfer, record_validation_info=record_validation_info 

1548 ) 

1549 except IntegrityError as e: 

1550 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e 

1551 
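A sketch of driving ingest() from user code, assuming a hypothetical repository, a registered "raw" dataset type, and illustrative data ID values; all refs inside one FileDataset must share the same run and dataset type, matching the grouping above.

from lsst.daf.butler import Butler, DataCoordinate, DatasetRef, FileDataset

butler = Butler("/path/to/repo", writeable=True, run="u/example/ingest")  # hypothetical
dataset_type = butler.get_dataset_type("raw")  # assumes "raw" is registered

data_id = DataCoordinate.standardize(
    {"instrument": "HSC", "exposure": 903334, "detector": 10},  # illustrative values
    universe=butler.dimensions,
)
ref = DatasetRef(dataset_type, data_id, run="u/example/ingest")
butler.ingest(FileDataset(path="/data/raw/file.fits", refs=[ref]), transfer="copy")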

1552 @contextlib.contextmanager 

1553 def export( 

1554 self, 

1555 *, 

1556 directory: str | None = None, 

1557 filename: str | None = None, 

1558 format: str | None = None, 

1559 transfer: str | None = None, 

1560 ) -> Iterator[RepoExportContext]: 

1561 # Docstring inherited. 

1562 if directory is None and transfer is not None: 

1563 raise TypeError("Cannot transfer without providing a directory.") 

1564 if transfer == "move": 

1565 raise TypeError("Transfer may not be 'move': export is read-only") 

1566 if format is None: 

1567 if filename is None: 

1568 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1569 else: 

1570 _, format = os.path.splitext(filename) 

1571 if not format: 

1572 raise ValueError("Please specify a file extension to determine export format.") 

1573 format = format[1:] # Strip leading "."

1574 elif filename is None: 

1575 filename = f"export.{format}" 

1576 if directory is not None: 

1577 filename = os.path.join(directory, filename) 

1578 formats = self._config["repo_transfer_formats"] 

1579 if format not in formats: 

1580 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

1581 BackendClass = get_class_of(formats[format, "export"]) 

1582 with open(filename, "w") as stream: 

1583 backend = BackendClass(stream, universe=self.dimensions) 

1584 try: 

1585 helper = RepoExportContext( 

1586 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer 

1587 ) 

1588 with self._caching_context(): 

1589 yield helper 

1590 except BaseException: 

1591 raise 

1592 else: 

1593 helper._finish() 

1594 
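A sketch of the export context manager defined above, assuming the default YAML export backend and hypothetical repository, directory, and collection names; the format is inferred here from the "export.yaml" extension.

from lsst.daf.butler import Butler

butler = Butler("/path/to/repo")  # hypothetical; export does not require write access
refs = butler.registry.queryDatasets("calexp", collections="u/example/run")

with butler.export(directory="/tmp/export", filename="export.yaml", transfer="copy") as export:
    export.saveDatasets(refs)               # assumed RepoExportContext API
    export.saveCollection("u/example/run")  # assumed RepoExportContext API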

1595 def import_( 

1596 self, 

1597 *, 

1598 directory: ResourcePathExpression | None = None, 

1599 filename: ResourcePathExpression | TextIO | None = None, 

1600 format: str | None = None, 

1601 transfer: str | None = None, 

1602 skip_dimensions: set | None = None, 

1603 ) -> None: 

1604 # Docstring inherited. 

1605 if not self.isWriteable(): 

1606 raise TypeError("Butler is read-only.") 

1607 if format is None: 

1608 if filename is None: 

1609 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1610 else: 

1611 _, format = os.path.splitext(filename) # type: ignore 

1612 elif filename is None: 

1613 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

1614 if directory is not None: 

1615 directory = ResourcePath(directory, forceDirectory=True) 

1616 # mypy doesn't think this will work but it does in python >= 3.10. 

1617 if isinstance(filename, ResourcePathExpression): # type: ignore 

1618 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

1619 if not filename.isabs() and directory is not None: 

1620 potential = directory.join(filename) 

1621 exists_in_cwd = filename.exists() 

1622 exists_in_dir = potential.exists() 

1623 if exists_in_cwd and exists_in_dir: 

1624 _LOG.warning( 

1625 "A relative path for filename was specified (%s) which exists relative to cwd. " 

1626 "Additionally, the file exists relative to the given search directory (%s). " 

1627 "Using the export file in the given directory.", 

1628 filename, 

1629 potential, 

1630 ) 

1631 # Given they specified an explicit directory and that 

1632 # directory has the export file in it, assume that that 

1633 # is what was meant despite the file in cwd. 

1634 filename = potential 

1635 elif exists_in_dir: 

1636 filename = potential 

1637 elif not exists_in_cwd and not exists_in_dir: 

1638 # Raise early. 

1639 raise FileNotFoundError( 

1640 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

1641 ) 

1642 BackendClass: type[RepoImportBackend] = get_class_of( 

1643 self._config["repo_transfer_formats"][format]["import"] 

1644 ) 

1645 

1646 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

1647 with self._caching_context(): 

1648 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg] 

1649 backend.register() 

1650 with self.transaction(): 

1651 backend.load( 

1652 self._datastore, 

1653 directory=directory, 

1654 transfer=transfer, 

1655 skip_dimensions=skip_dimensions, 

1656 ) 

1657 

1658 if isinstance(filename, ResourcePath): 

1659 # We cannot use open() here at the moment because of DM-38589:

1660 # yaml calls stream.read(8192) in a loop.

1661 stream = io.StringIO(filename.read().decode()) 

1662 doImport(stream) 

1663 else: 

1664 doImport(filename) # type: ignore 

1665 
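The matching import sketch, assuming a hypothetical writeable target repository and an export directory produced by Butler.export() as above.

from lsst.daf.butler import Butler

butler = Butler("/path/to/target_repo", writeable=True)  # hypothetical
# The directory is searched for the export file as well as the artifacts.
butler.import_(directory="/tmp/export", filename="export.yaml", transfer="copy")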

1666 def transfer_dimension_records_from( 

1667 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef] 

1668 ) -> None: 

1669 # Allowed dimensions in the target butler. 

1670 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table) 

1671 

1672 data_ids = {ref.dataId for ref in source_refs} 

1673 

1674 dimension_records = self._extract_all_dimension_records_from_data_ids( 

1675 source_butler, data_ids, elements 

1676 ) 

1677 

1678 # Insert order is important. 

1679 for element in self.dimensions.sorted(dimension_records.keys()): 

1680 records = list(dimension_records[element].values())

1681 # Assume that if the record is already present that we can 

1682 # use it without having to check that the record metadata 

1683 # is consistent. 

1684 self._registry.insertDimensionData(element, *records, skip_existing=True) 

1685 _LOG.debug("Dimension '%s' -- number of records transferred: %d", element.name, len(records)) 

1686 
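A short sketch of calling this method directly, assuming hypothetical source and target repositories; only the dimension records needed by the given refs are copied, in dependency order.

from lsst.daf.butler import Butler

source = Butler("/path/to/source_repo")                  # hypothetical
target = Butler("/path/to/target_repo", writeable=True)  # hypothetical
refs = list(source.registry.queryDatasets("calexp", collections="u/example/run"))

target.transfer_dimension_records_from(source, refs)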

1687 def _extract_all_dimension_records_from_data_ids( 

1688 self, 

1689 source_butler: LimitedButler | Butler, 

1690 data_ids: set[DataCoordinate], 

1691 allowed_elements: frozenset[DimensionElement], 

1692 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]: 

1693 primary_records = self._extract_dimension_records_from_data_ids( 

1694 source_butler, data_ids, allowed_elements 

1695 ) 

1696 

1697 can_query = isinstance(source_butler, Butler)

1698 

1699 additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1700 for original_element, record_mapping in primary_records.items(): 

1701 # Get dimensions that depend on this dimension. 

1702 populated_by = self.dimensions.get_elements_populated_by( 

1703 self.dimensions[original_element.name] # type: ignore 

1704 ) 

1705 

1706 for data_id in record_mapping.keys(): 

1707 for element in populated_by: 

1708 if element not in allowed_elements: 

1709 continue 

1710 if element.name == original_element.name: 

1711 continue 

1712 

1713 if element.name in primary_records: 

1714 # If this element has already been stored avoid 

1715 # re-finding records since that may lead to additional 

1716 # spurious records. e.g. visit is populated_by 

1717 # visit_detector_region but querying 

1718 # visit_detector_region by visit will return all the 

1719 # detectors for this visit -- the visit dataId does not 

1720 # constrain this. 

1721 # To constrain the query the original dataIds would 

1722 # have to be scanned. 

1723 continue 

1724 

1725 if not can_query: 

1726 raise RuntimeError( 

1727 f"Transferring populated_by records like {element.name} requires a full Butler." 

1728 ) 

1729 

1730 records = source_butler.registry.queryDimensionRecords( # type: ignore 

1731 element.name, **data_id.mapping # type: ignore 

1732 ) 

1733 for record in records: 

1734 additional_records[record.definition].setdefault(record.dataId, record) 

1735 

1736 # The next step is to walk back through the additional records to 

1737 # pick up any missing content (such as visit_definition needing to 

1738 # know the exposure). Want to ensure we do not request records we 

1739 # already have. 

1740 missing_data_ids = set() 

1741 for name, record_mapping in additional_records.items(): 

1742 for data_id in record_mapping.keys(): 

1743 if data_id not in primary_records[name]: 

1744 missing_data_ids.add(data_id) 

1745 

1746 # Fill out the new records. Assume that these new records do not 

1747 # also need to carry over additional populated_by records. 

1748 secondary_records = self._extract_dimension_records_from_data_ids( 

1749 source_butler, missing_data_ids, allowed_elements 

1750 ) 

1751 

1752 # Merge the extra sets of records in with the original. 

1753 for name, record_mapping in itertools.chain(additional_records.items(), secondary_records.items()): 

1754 primary_records[name].update(record_mapping) 

1755 

1756 return primary_records 

1757 

1758 def _extract_dimension_records_from_data_ids( 

1759 self, 

1760 source_butler: LimitedButler | Butler, 

1761 data_ids: set[DataCoordinate], 

1762 allowed_elements: frozenset[DimensionElement], 

1763 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]: 

1764 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1765 

1766 for data_id in data_ids: 

1767 # Need an expanded record; if it is not expanded we need a full

1768 # butler with a registry (mocks with a registry are allowed too).

1769 if not data_id.hasRecords(): 

1770 if registry := getattr(source_butler, "registry", None): 

1771 data_id = registry.expandDataId(data_id) 

1772 else: 

1773 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

1774 # If this butler doesn't know about a dimension in the source

1775 # butler, things will break later.

1776 for element_name in data_id.dimensions.elements: 

1777 record = data_id.records[element_name] 

1778 if record is not None and record.definition in allowed_elements: 

1779 dimension_records[record.definition].setdefault(record.dataId, record) 

1780 

1781 return dimension_records 

1782 

1783 def transfer_from( 

1784 self, 

1785 source_butler: LimitedButler, 

1786 source_refs: Iterable[DatasetRef], 

1787 transfer: str = "auto", 

1788 skip_missing: bool = True, 

1789 register_dataset_types: bool = False, 

1790 transfer_dimensions: bool = False, 

1791 ) -> collections.abc.Collection[DatasetRef]: 

1792 # Docstring inherited. 

1793 if not self.isWriteable(): 

1794 raise TypeError("Butler is read-only.") 

1795 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

1796 

1797 # Will iterate through the refs multiple times so need to convert 

1798 # to a list if this isn't a collection. 

1799 if not isinstance(source_refs, collections.abc.Collection): 

1800 source_refs = list(source_refs) 

1801 

1802 original_count = len(source_refs) 

1803 _LOG.info("Transferring %d datasets into %s", original_count, str(self)) 

1804 

1805 # In some situations the datastore artifact may be missing 

1806 # and we do not want that registry entry to be imported. 

1807 # Asking the datastore is not sufficient: the records may have been

1808 # purged, so we have to ask for the (predicted) URI and check

1809 # existence explicitly. Execution butler is set up exactly like

1810 # this, with no datastore records.

1811 artifact_existence: dict[ResourcePath, bool] = {} 

1812 if skip_missing: 

1813 dataset_existence = source_butler._datastore.mexists( 

1814 source_refs, artifact_existence=artifact_existence 

1815 ) 

1816 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

1817 filtered_count = len(source_refs) 

1818 n_missing = original_count - filtered_count 

1819 _LOG.verbose( 

1820 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

1821 n_missing, 

1822 "" if n_missing == 1 else "s", 

1823 filtered_count, 

1824 ) 

1825 

1826 # Importing requires that we group the refs by dataset type and run 

1827 # before doing the import. 

1828 source_dataset_types = set() 

1829 grouped_refs = defaultdict(list) 

1830 for ref in source_refs: 

1831 grouped_refs[ref.datasetType, ref.run].append(ref) 

1832 source_dataset_types.add(ref.datasetType) 

1833 

1834 # Check to see if the dataset type in the source butler has 

1835 # the same definition in the target butler and register missing 

1836 # ones if requested. Registration must happen outside a transaction. 

1837 newly_registered_dataset_types = set() 

1838 for datasetType in source_dataset_types: 

1839 if register_dataset_types: 

1840 # Let this raise immediately if inconsistent. Continuing 

1841 # on to find additional inconsistent dataset types 

1842 # might result in additional unwanted dataset types being 

1843 # registered. 

1844 if self._registry.registerDatasetType(datasetType): 

1845 newly_registered_dataset_types.add(datasetType) 

1846 else: 

1847 # If the dataset type is missing, let it fail immediately. 

1848 target_dataset_type = self.get_dataset_type(datasetType.name) 

1849 if target_dataset_type != datasetType: 

1850 raise ConflictingDefinitionError( 

1851 "Source butler dataset type differs from definition" 

1852 f" in target butler: {datasetType} !=" 

1853 f" {target_dataset_type}" 

1854 ) 

1855 if newly_registered_dataset_types: 

1856 # We may have registered some even if there were inconsistencies 

1857 # but should let people know (or else remove them again). 

1858 _LOG.verbose( 

1859 "Registered the following dataset types in the target Butler: %s", 

1860 ", ".join(d.name for d in newly_registered_dataset_types), 

1861 ) 

1862 else: 

1863 _LOG.verbose("All required dataset types are known to the target Butler") 

1864 

1865 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1866 if transfer_dimensions: 

1867 # Collect all the dimension records for these refs. 

1868 # All dimensions are to be copied but the list of valid dimensions 

1869 # comes from this butler's universe.

1870 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table) 

1871 dataIds = {ref.dataId for ref in source_refs} 

1872 dimension_records = self._extract_all_dimension_records_from_data_ids( 

1873 source_butler, dataIds, elements 

1874 ) 

1875 

1876 handled_collections: set[str] = set() 

1877 

1878 # Do all the importing in a single transaction. 

1879 with self.transaction(): 

1880 if dimension_records: 

1881 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.") 

1882 # Order matters. 

1883 for element in self.dimensions.sorted(dimension_records.keys()): 

1884 records = list(dimension_records[element].values())

1885 # Assume that if the record is already present that we can 

1886 # use it without having to check that the record metadata 

1887 # is consistent. 

1888 self._registry.insertDimensionData(element, *records, skip_existing=True) 

1889 

1890 n_imported = 0 

1891 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

1892 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

1893 ): 

1894 if run not in handled_collections: 

1895 # May need to create output collection. If source butler 

1896 # has a registry, ask for documentation string. 

1897 run_doc = None 

1898 if registry := getattr(source_butler, "registry", None): 

1899 run_doc = registry.getCollectionDocumentation(run) 

1900 registered = self._registry.registerRun(run, doc=run_doc) 

1901 handled_collections.add(run) 

1902 if registered: 

1903 _LOG.verbose("Creating output run %s", run) 

1904 

1905 n_refs = len(refs_to_import) 

1906 _LOG.verbose( 

1907 "Importing %d ref%s of dataset type %s into run %s", 

1908 n_refs, 

1909 "" if n_refs == 1 else "s", 

1910 datasetType.name, 

1911 run, 

1912 ) 

1913 

1914 # Assume we are using UUIDs and the source refs will match 

1915 # those imported. 

1916 imported_refs = self._registry._importDatasets(refs_to_import) 

1917 assert set(imported_refs) == set(refs_to_import) 

1918 n_imported += len(imported_refs) 

1919 

1920 assert len(source_refs) == n_imported 

1921 _LOG.verbose("Imported %d datasets into destination butler", n_imported) 

1922 

1923 # Ask the datastore to transfer. The datastore has to check that 

1924 # the source datastore is compatible with the target datastore. 

1925 accepted, rejected = self._datastore.transfer_from( 

1926 source_butler._datastore, 

1927 source_refs, 

1928 transfer=transfer, 

1929 artifact_existence=artifact_existence, 

1930 ) 

1931 if rejected: 

1932 # For now, accept the registry entries but not the files. 

1933 _LOG.warning( 

1934 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

1935 len(rejected), 

1936 len(accepted), 

1937 datasetType, 

1938 run, 

1939 ) 

1940 

1941 return source_refs 

1942 
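A usage sketch for transfer_from(), with hypothetical repositories and collection names; register_dataset_types and transfer_dimensions enable the registration and dimension-record steps described above.

from lsst.daf.butler import Butler

source = Butler("/path/to/source_repo")                  # hypothetical
target = Butler("/path/to/target_repo", writeable=True)  # hypothetical
refs = source.registry.queryDatasets("calexp", collections="u/example/run")

transferred = target.transfer_from(
    source,
    refs,
    transfer="copy",
    register_dataset_types=True,
    transfer_dimensions=True,
)
print(f"Transferred {len(transferred)} datasets.")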

1943 def validateConfiguration( 

1944 self, 

1945 logFailures: bool = False, 

1946 datasetTypeNames: Iterable[str] | None = None, 

1947 ignore: Iterable[str] | None = None, 

1948 ) -> None: 

1949 # Docstring inherited. 

1950 if datasetTypeNames: 

1951 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames] 

1952 else: 

1953 datasetTypes = list(self._registry.queryDatasetTypes()) 

1954 

1955 # filter out anything from the ignore list 

1956 if ignore: 

1957 ignore = set(ignore) 

1958 datasetTypes = [ 

1959 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

1960 ] 

1961 else: 

1962 ignore = set() 

1963 

1964 # For each datasetType that has an instrument dimension, create 

1965 # a DatasetRef for each defined instrument 

1966 datasetRefs = [] 

1967 

1968 # Find all the registered instruments (if "instrument" is in the 

1969 # universe). 

1970 if "instrument" in self.dimensions: 

1971 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")} 

1972 

1973 for datasetType in datasetTypes: 

1974 if "instrument" in datasetType.dimensions: 

1975 # In order to create a conforming dataset ref, create 

1976 # fake DataCoordinate values for the non-instrument 

1977 # dimensions. The type of the value does not matter here. 

1978 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"} 

1979 

1980 for instrument in instruments: 

1981 datasetRef = DatasetRef( 

1982 datasetType, 

1983 DataCoordinate.standardize( 

1984 dataId, instrument=instrument, dimensions=datasetType.dimensions 

1985 ), 

1986 run="validate", 

1987 ) 

1988 datasetRefs.append(datasetRef) 

1989 

1990 entities: list[DatasetType | DatasetRef] = [] 

1991 entities.extend(datasetTypes) 

1992 entities.extend(datasetRefs) 

1993 

1994 datastoreErrorStr = None 

1995 try: 

1996 self._datastore.validateConfiguration(entities, logFailures=logFailures) 

1997 except ValidationError as e: 

1998 datastoreErrorStr = str(e) 

1999 

2000 # Also check that the LookupKeys used by the datastores match 

2001 # registry and storage class definitions 

2002 keys = self._datastore.getLookupKeys() 

2003 

2004 failedNames = set() 

2005 failedDataId = set() 

2006 for key in keys: 

2007 if key.name is not None: 

2008 if key.name in ignore: 

2009 continue 

2010 

2011 # skip if specific datasetType names were requested and this 

2012 # name does not match 

2013 if datasetTypeNames and key.name not in datasetTypeNames: 

2014 continue 

2015 

2016 # See if it is a StorageClass or a DatasetType 

2017 if key.name in self.storageClasses: 

2018 pass 

2019 else: 

2020 try: 

2021 self.get_dataset_type(key.name) 

2022 except KeyError: 

2023 if logFailures: 

2024 _LOG.critical( 

2025 "Key '%s' does not correspond to a DatasetType or StorageClass", key 

2026 ) 

2027 failedNames.add(key) 

2028 else: 

2029 # Dimensions are checked for consistency when the Butler 

2030 # is created and rendezvoused with a universe. 

2031 pass 

2032 

2033 # Check that the instrument is a valid instrument.

2034 # Currently only an instrument override is supported, so check for that.

2035 if key.dataId: 

2036 dataIdKeys = set(key.dataId) 

2037 if {"instrument"} != dataIdKeys: 

2038 if logFailures: 

2039 _LOG.critical("Key '%s' has unsupported DataId override", key) 

2040 failedDataId.add(key) 

2041 elif key.dataId["instrument"] not in instruments: 

2042 if logFailures: 

2043 _LOG.critical("Key '%s' has unknown instrument", key) 

2044 failedDataId.add(key) 

2045 

2046 messages = [] 

2047 

2048 if datastoreErrorStr: 

2049 messages.append(datastoreErrorStr) 

2050 

2051 for failed, msg in ( 

2052 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2053 (failedDataId, "Keys with bad DataId entries: "), 

2054 ): 

2055 if failed: 

2056 msg += ", ".join(str(k) for k in failed) 

2057 messages.append(msg) 

2058 

2059 if messages: 

2060 raise ValidationError(";\n".join(messages)) 

2061 
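A sketch of running the validation above against a hypothetical repository; failures are raised as a single ValidationError whose message aggregates the problems found.

from lsst.daf.butler import Butler, ValidationError

butler = Butler("/path/to/repo")  # hypothetical
try:
    butler.validateConfiguration(logFailures=True, ignore=["raw"])  # "raw" is illustrative
except ValidationError as err:
    print(f"Configuration problems found:\n{err}")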

2062 @property 

2063 def collections(self) -> Sequence[str]: 

2064 """The collections to search by default, in order 

2065 (`~collections.abc.Sequence` [ `str` ]). 

2066 

2067 This is an alias for ``self.registry.defaults.collections``. It cannot 

2068 be set directly in isolation, but all defaults may be changed together 

2069 by assigning a new `RegistryDefaults` instance to 

2070 ``self.registry.defaults``. 

2071 """ 

2072 return self._registry.defaults.collections 

2073 

2074 @property 

2075 def run(self) -> str | None: 

2076 """Name of the run this butler writes outputs to by default (`str` or 

2077 `None`). 

2078 

2079 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2080 directly in isolation, but all defaults may be changed together by 

2081 assigning a new `RegistryDefaults` instance to 

2082 ``self.registry.defaults``. 

2083 """ 

2084 return self._registry.defaults.run 

2085 
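As the docstrings above note, the default collections and run can only be replaced together; a minimal sketch with hypothetical names.

from lsst.daf.butler import Butler
from lsst.daf.butler.registry import RegistryDefaults

butler = Butler("/path/to/repo", writeable=True)  # hypothetical
print(butler.collections, butler.run)

# Replace all defaults at once by assigning a new RegistryDefaults instance.
butler.registry.defaults = RegistryDefaults(
    collections=["u/example/run", "refcats"], run="u/example/run"
)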

2086 @property 

2087 def registry(self) -> Registry: 

2088 """The object that manages dataset metadata and relationships 

2089 (`Registry`). 

2090 

2091 Many operations that don't involve reading or writing butler datasets 

2092 are accessible only via `Registry` methods. Eventually these methods 

2093 will be replaced by equivalent `Butler` methods. 

2094 """ 

2095 return self._registry_shim 

2096 

2097 @property 

2098 def dimensions(self) -> DimensionUniverse: 

2099 # Docstring inherited. 

2100 return self._registry.dimensions 

2101 

2102 @contextlib.contextmanager 

2103 def _query(self) -> Iterator[Query]: 

2104 # Docstring inherited. 

2105 with self._caching_context(): 

2106 yield DirectQuery(self._registry) 

2107 

2108 def _query_data_ids( 

2109 self, 

2110 dimensions: DimensionGroup | Iterable[str] | str, 

2111 *, 

2112 data_id: DataId | None = None, 

2113 where: str = "", 

2114 bind: Mapping[str, Any] | None = None, 

2115 expanded: bool = False, 

2116 order_by: Iterable[str] | str | None = None, 

2117 limit: int | None = None, 

2118 offset: int | None = None, 

2119 explain: bool = True, 

2120 **kwargs: Any, 

2121 ) -> list[DataCoordinate]: 

2122 # Docstring inherited. 

2123 query = DirectQuery(self._registry) 

2124 result = query.data_ids(dimensions, data_id=data_id, where=where, bind=bind, **kwargs) 

2125 if expanded: 

2126 result = result.expanded() 

2127 if order_by: 

2128 result = result.order_by(*ensure_iterable(order_by)) 

2129 if limit is not None: 

2130 result = result.limit(limit, offset) 

2131 else: 

2132 if offset is not None: 

2133 raise TypeError("offset is specified without limit") 

2134 data_ids = list(result) 

2135 if explain and not data_ids: 

2136 raise EmptyQueryResultError(list(result.explain_no_results())) 

2137 return data_ids 

2138 

2139 def _query_datasets( 

2140 self, 

2141 dataset_type: Any, 

2142 collections: CollectionArgType | None = None, 

2143 *, 

2144 find_first: bool = True, 

2145 data_id: DataId | None = None, 

2146 where: str = "", 

2147 bind: Mapping[str, Any] | None = None, 

2148 expanded: bool = False, 

2149 explain: bool = True, 

2150 **kwargs: Any, 

2151 ) -> list[DatasetRef]: 

2152 # Docstring inherited. 

2153 query = DirectQuery(self._registry) 

2154 result = query.datasets( 

2155 dataset_type, 

2156 collections, 

2157 find_first=find_first, 

2158 data_id=data_id, 

2159 where=where, 

2160 bind=bind, 

2161 **kwargs, 

2162 ) 

2163 if expanded: 

2164 result = result.expanded() 

2165 refs = list(result) 

2166 if explain and not refs: 

2167 raise EmptyQueryResultError(list(result.explain_no_results())) 

2168 return refs 

2169 

2170 def _query_dimension_records( 

2171 self, 

2172 element: str, 

2173 *, 

2174 data_id: DataId | None = None, 

2175 where: str = "", 

2176 bind: Mapping[str, Any] | None = None, 

2177 order_by: Iterable[str] | str | None = None, 

2178 limit: int | None = None, 

2179 offset: int | None = None, 

2180 explain: bool = True, 

2181 **kwargs: Any, 

2182 ) -> list[DimensionRecord]: 

2183 # Docstring inherited. 

2184 query = DirectQuery(self._registry) 

2185 result = query.dimension_records(element, data_id=data_id, where=where, bind=bind, **kwargs) 

2186 if order_by: 

2187 result = result.order_by(*ensure_iterable(order_by)) 

2188 if limit is not None: 

2189 result = result.limit(limit, offset) 

2190 else: 

2191 if offset is not None: 

2192 raise TypeError("offset is specified without limit") 

2193 data_ids = list(result) 

2194 if explain and not data_ids: 

2195 raise EmptyQueryResultError(list(result.explain_no_results())) 

2196 return data_ids 

2197 
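These private helpers back the butler query interface by layering expansion, ordering, limits, and the explain-on-empty behaviour on top of registry queries; a rough public-registry equivalent is sketched below, with hypothetical dimension values and assumed result-object methods.

from lsst.daf.butler import Butler

butler = Butler("/path/to/repo")  # hypothetical
data_ids = list(
    butler.registry.queryDataIds(["visit", "detector"], instrument="HSC")
    .order_by("visit")
    .limit(10)
)
records = list(butler.registry.queryDimensionRecords("detector", instrument="HSC"))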

2198 _config: ButlerConfig 

2199 """Configuration for this Butler instance.""" 

2200 

2201 _registry: SqlRegistry 

2202 """The object that manages dataset metadata and relationships 

2203 (`SqlRegistry`). 

2204 

2205 Most operations that don't involve reading or writing butler datasets are 

2206 accessible only via `SqlRegistry` methods. 

2207 """ 

2208 

2209 datastore: Datastore 

2210 """The object that manages actual dataset storage (`Datastore`). 

2211 

2212 Direct user access to the datastore should rarely be necessary; the primary 

2213 exception is the case where a `Datastore` implementation provides extra 

2214 functionality beyond what the base class defines. 

2215 """ 

2216 

2217 storageClasses: StorageClassFactory 

2218 """An object that maps known storage class names to objects that fully 

2219 describe them (`StorageClassFactory`). 

2220 """ 

2221 

2222 _registry_shim: RegistryShim 

2223 """Shim object to provide a legacy public interface for querying via the 

2224 the ``registry`` property. 

2225 """