Coverage for python/lsst/daf/butler/direct_butler.py: 10%

750 statements  

coverage.py v7.4.4, created at 2024-03-30 02:51 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Butler top level classes. 

29""" 

30from __future__ import annotations 

31 

32__all__ = ( 

33 "DirectButler", 

34 "ButlerValidationError", 

35) 

36 

37import collections.abc 

38import contextlib 

39import io 

40import itertools 

41import logging 

42import numbers 

43import os 

44import warnings 

45from collections import Counter, defaultdict 

46from collections.abc import Iterable, Iterator, MutableMapping, Sequence 

47from typing import TYPE_CHECKING, Any, ClassVar, TextIO, cast 

48 

49from lsst.resources import ResourcePath, ResourcePathExpression 

50from lsst.utils.introspection import get_class_of 

51from lsst.utils.logging import VERBOSE, getLogger 

52from sqlalchemy.exc import IntegrityError 

53 

54from ._butler import Butler 

55from ._butler_config import ButlerConfig 

56from ._butler_instance_options import ButlerInstanceOptions 

57from ._dataset_existence import DatasetExistence 

58from ._dataset_ref import DatasetRef 

59from ._dataset_type import DatasetType 

60from ._deferredDatasetHandle import DeferredDatasetHandle 

61from ._exceptions import DatasetNotFoundError, DimensionValueError, ValidationError 

62from ._limited_butler import LimitedButler 

63from ._registry_shim import RegistryShim 

64from ._storage_class import StorageClass, StorageClassFactory 

65from ._timespan import Timespan 

66from .datastore import Datastore, NullDatastore 

67from .dimensions import DataCoordinate, Dimension 

68from .progress import Progress 

69from .queries import Query 

70from .registry import ( 

71 CollectionType, 

72 ConflictingDefinitionError, 

73 DataIdError, 

74 MissingDatasetTypeError, 

75 RegistryDefaults, 

76 _RegistryFactory, 

77) 

78from .registry.sql_registry import SqlRegistry 

79from .transfers import RepoExportContext 

80from .utils import transactional 

81 

82if TYPE_CHECKING: 

83 from lsst.resources import ResourceHandleProtocol 

84 

85 from ._dataset_ref import DatasetId 

86 from ._file_dataset import FileDataset 

87 from .datastore import DatasetRefURIs 

88 from .dimensions import DataId, DataIdValue, DimensionElement, DimensionRecord, DimensionUniverse 

89 from .registry import Registry 

90 from .transfers import RepoImportBackend 

91 

92_LOG = getLogger(__name__) 

93 

94 

95class ButlerValidationError(ValidationError): 

96 """There is a problem with the Butler configuration.""" 

97 

98 pass 

99 

100 

101class DirectButler(Butler): # numpydoc ignore=PR02 

102 """Main entry point for the data access system. 

103 

104 Parameters 

105 ---------- 

106 config : `ButlerConfig` 

107 The configuration for this Butler instance. 

108 registry : `SqlRegistry` 

109 The object that manages dataset metadata and relationships. 

110 datastore : `Datastore` 

111 The object that manages actual dataset storage. 

112 storageClasses : `StorageClassFactory` 

113 An object that maps known storage class names to objects that fully 

114 describe them. 

115 

116 Notes 

117 ----- 

118 Most users should call the top-level `Butler`.``from_config`` instead of 

119 using this constructor directly. 

120 """ 

121 

122 # This is __new__ instead of __init__ because we have to support 

123 # instantiation via the legacy constructor Butler.__new__(), which 

124 # reads the configuration and selects which subclass to instantiate. The 

125 # interaction between __new__ and __init__ is kind of wacky in Python. If 

126 # we were using __init__ here, __init__ would be called twice (once when 

127 # the DirectButler instance is constructed inside Butler.from_config(), and 

128 # a second time with the original arguments to Butler() when the instance 

129 # is returned from Butler.__new__()). 

130 def __new__( 

131 cls, 

132 *, 

133 config: ButlerConfig, 

134 registry: SqlRegistry, 

135 datastore: Datastore, 

136 storageClasses: StorageClassFactory, 

137 ) -> DirectButler: 

138 self = cast(DirectButler, super().__new__(cls)) 

139 self._config = config 

140 self._registry = registry 

141 self._datastore = datastore 

142 self.storageClasses = storageClasses 

143 

144 # For execution butler the datastore needs a special 

145 # dependency-inversion trick. This is not used by regular butler, 

146 # but we do not have a way to distinguish regular butler from execution 

147 # butler. 

148 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

149 

150 self._registry_shim = RegistryShim(self) 

151 

152 return self 

153 

154 @classmethod 

155 def create_from_config( 

156 cls, 

157 config: ButlerConfig, 

158 *, 

159 options: ButlerInstanceOptions, 

160 without_datastore: bool = False, 

161 ) -> DirectButler: 

162 """Construct a Butler instance from a configuration file. 

163 

164 Parameters 

165 ---------- 

166 config : `ButlerConfig` 

167 The configuration for this Butler instance. 

168 options : `ButlerInstanceOptions` 

169 Default values and other settings for the Butler instance. 

170 without_datastore : `bool`, optional 

171 If `True` do not attach a datastore to this butler. Any attempts 

172 to use a datastore will fail. 

173 

174 Notes 

175 ----- 

176 Most users should call the top-level `Butler`.``from_config`` 

177 instead of using this function directly. 

178 """ 

179 if "run" in config or "collection" in config: 

180 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

181 

182 defaults = RegistryDefaults( 

183 collections=options.collections, run=options.run, infer=options.inferDefaults, **options.kwargs 

184 ) 

185 try: 

186 butlerRoot = config.get("root", config.configDir) 

187 writeable = options.writeable 

188 if writeable is None: 

189 writeable = options.run is not None 

190 registry = _RegistryFactory(config).from_config( 

191 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

192 ) 

193 if without_datastore: 

194 datastore: Datastore = NullDatastore(None, None) 

195 else: 

196 datastore = Datastore.fromConfig( 

197 config, registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

198 ) 

199 # TODO: Once datastore drops dependency on registry we can 

200 # construct datastore first and pass opaque tables to registry 

201 # constructor. 

202 registry.make_datastore_tables(datastore.get_opaque_table_definitions()) 

203 storageClasses = StorageClassFactory() 

204 storageClasses.addFromConfig(config) 

205 

206 return DirectButler( 

207 config=config, registry=registry, datastore=datastore, storageClasses=storageClasses 

208 ) 

209 except Exception: 

210 # Failures here usually mean that configuration is incomplete, 

211 # so just issue an error message that includes the config file URI. 

212 _LOG.error(f"Failed to instantiate Butler from config {config.configFile}.") 

213 raise 

214 
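As the docstring notes, most users reach this code through the top-level ``Butler.from_config`` rather than calling ``create_from_config`` directly. A minimal sketch of that entry point, with a placeholder repository path and collection names::

    from lsst.daf.butler import Butler

    # Read-only client with default input collections; from_config reads the
    # repository configuration and dispatches to DirectButler internally.
    butler = Butler.from_config("/repo", collections="HSC/defaults")

    # Naming a run makes the client writeable and sets the default output
    # RUN collection for put().
    writeable_butler = Butler.from_config("/repo", run="u/someone/demo")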

215 def _clone( 

216 self, 

217 *, 

218 collections: Any = None, 

219 run: str | None = None, 

220 inferDefaults: bool = True, 

221 **kwargs: Any, 

222 ) -> DirectButler: 

223 # Docstring inherited 

224 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

225 registry = self._registry.copy(defaults) 

226 

227 return DirectButler( 

228 registry=registry, 

229 config=self._config, 

230 datastore=self._datastore.clone(registry.getDatastoreBridgeManager()), 

231 storageClasses=self.storageClasses, 

232 ) 

233 

234 GENERATION: ClassVar[int] = 3 

235 """This is a Generation 3 Butler. 

236 

237 This attribute may be removed in the future, once the Generation 2 Butler 

238 interface has been fully retired; it should only be used in transitional 

239 code. 

240 """ 

241 

242 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

243 """Return the `DatasetType` defined in the registry for the given name.""" 

244 try: 

245 return self.get_dataset_type(name) 

246 except MissingDatasetTypeError: 

247 return None 

248 

249 @classmethod 

250 def _unpickle( 

251 cls, 

252 config: ButlerConfig, 

253 collections: tuple[str, ...] | None, 

254 run: str | None, 

255 defaultDataId: dict[str, str], 

256 writeable: bool, 

257 ) -> DirectButler: 

258 """Callable used to unpickle a Butler. 

259 

260 We prefer not to use ``Butler.__init__`` directly so we can force some 

261 of its many arguments to be keyword-only (note that ``__reduce__`` 

262 can only invoke callables with positional arguments). 

263 

264 Parameters 

265 ---------- 

266 config : `ButlerConfig` 

267 Butler configuration, already coerced into a true `ButlerConfig` 

268 instance (and hence after any search paths for overrides have been 

269 utilized). 

270 collections : `tuple` [ `str` ] 

271 Names of the default collections to read from. 

272 run : `str`, optional 

273 Name of the default `~CollectionType.RUN` collection to write to. 

274 defaultDataId : `dict` [ `str`, `str` ] 

275 Default data ID values. 

276 writeable : `bool` 

277 Whether the Butler should support write operations. 

278 

279 Returns 

280 ------- 

281 butler : `Butler` 

282 A new `Butler` instance. 

283 """ 

284 return cls.create_from_config( 

285 config=config, 

286 options=ButlerInstanceOptions( 

287 collections=collections, run=run, writeable=writeable, kwargs=defaultDataId 

288 ), 

289 ) 

290 

291 def __reduce__(self) -> tuple: 

292 """Support pickling.""" 

293 return ( 

294 DirectButler._unpickle, 

295 ( 

296 self._config, 

297 self.collections, 

298 self.run, 

299 dict(self._registry.defaults.dataId.required), 

300 self._registry.isWriteable(), 

301 ), 

302 ) 

303 

304 def __str__(self) -> str: 

305 return ( 

306 f"Butler(collections={self.collections}, run={self.run}, " 

307 f"datastore='{self._datastore}', registry='{self._registry}')" 

308 ) 

309 

310 def isWriteable(self) -> bool: 

311 # Docstring inherited. 

312 return self._registry.isWriteable() 

313 

314 def _caching_context(self) -> contextlib.AbstractContextManager[None]: 

315 """Context manager that enables caching.""" 

316 return self._registry.caching_context() 

317 

318 @contextlib.contextmanager 

319 def transaction(self) -> Iterator[None]: 

320 """Context manager supporting `Butler` transactions. 

321 

322 Transactions can be nested. 

323 """ 

324 with self._registry.transaction(), self._datastore.transaction(): 

325 yield 

326 
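The sketch below illustrates, with placeholder dataset type and data IDs, how nested transactions can be used so that a group of writes either all commit or all roll back; ``butler`` is assumed to be a writeable `Butler` as in the earlier sketch::

    # "images" is assumed to be a list of in-memory datasets to store.
    data_ids = [{"instrument": "HSC", "exposure": 903334, "detector": det} for det in (10, 11)]
    with butler.transaction():
        for data_id, image in zip(data_ids, images):
            # Transactions nest: an exception anywhere inside the outer
            # context rolls back both registry and datastore changes.
            with butler.transaction():
                butler.put(image, "postISRCCD", data_id)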

327 def _standardizeArgs( 

328 self, 

329 datasetRefOrType: DatasetRef | DatasetType | str, 

330 dataId: DataId | None = None, 

331 for_put: bool = True, 

332 **kwargs: Any, 

333 ) -> tuple[DatasetType, DataId | None]: 

334 """Standardize the arguments passed to several Butler APIs. 

335 

336 Parameters 

337 ---------- 

338 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

339 When a `DatasetRef` is provided, the `dataId` should be `None`. 

340 Otherwise the `DatasetType` or name thereof. 

341 dataId : `dict` or `DataCoordinate` 

342 A `dict` of `Dimension` link name, value pairs that label the 

343 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

344 should be provided as the second argument. 

345 for_put : `bool`, optional 

346 If `True` this call is invoked as part of a `Butler.put()`. 

347 Otherwise it is assumed to be part of a `Butler.get()`. This 

348 parameter is only relevant if there is dataset type 

349 inconsistency. 

350 **kwargs 

351 Additional keyword arguments used to augment or construct a 

352 `DataCoordinate`. See `DataCoordinate.standardize` 

353 parameters. 

354 

355 Returns 

356 ------- 

357 datasetType : `DatasetType` 

358 A `DatasetType` instance extracted from ``datasetRefOrType``. 

359 dataId : `dict` or `DataId`, optional 

360 Argument that can be used (along with ``kwargs``) to construct a 

361 `DataId`. 

362 

363 Notes 

364 ----- 

365 Butler APIs that conceptually need a DatasetRef also allow passing a 

366 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

367 keyword arguments that can be used to construct one) separately. This 

368 method accepts those arguments and always returns a true `DatasetType` 

369 and a `DataId` or `dict`. 

370 

371 Standardization of `dict` vs `DataId` is best handled by passing the 

372 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

373 generally similarly flexible. 

374 """ 

375 externalDatasetType: DatasetType | None = None 

376 internalDatasetType: DatasetType | None = None 

377 if isinstance(datasetRefOrType, DatasetRef): 

378 if dataId is not None or kwargs: 

379 raise ValueError("DatasetRef given, cannot use dataId as well") 

380 externalDatasetType = datasetRefOrType.datasetType 

381 dataId = datasetRefOrType.dataId 

382 else: 

383 # Don't check whether DataId is provided, because Registry APIs 

384 # can usually construct a better error message when it wasn't. 

385 if isinstance(datasetRefOrType, DatasetType): 

386 externalDatasetType = datasetRefOrType 

387 else: 

388 internalDatasetType = self.get_dataset_type(datasetRefOrType) 

389 

390 # Check that they are self-consistent 

391 if externalDatasetType is not None: 

392 internalDatasetType = self.get_dataset_type(externalDatasetType.name) 

393 if externalDatasetType != internalDatasetType: 

394 # We can allow differences if they are compatible, depending 

395 # on whether this is a get or a put. A get requires that 

396 # the python type associated with the datastore can be 

397 # converted to the user type. A put requires that the user 

398 # supplied python type can be converted to the internal 

399 # type expected by registry. 

400 relevantDatasetType = internalDatasetType 

401 if for_put: 

402 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

403 else: 

404 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

405 relevantDatasetType = externalDatasetType 

406 if not is_compatible: 

407 raise ValueError( 

408 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

409 f"registry definition ({internalDatasetType})" 

410 ) 

411 # Override the internal definition. 

412 internalDatasetType = relevantDatasetType 

413 

414 assert internalDatasetType is not None 

415 return internalDatasetType, dataId 

416 
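This standardization is what lets the data-access methods accept either a resolved `DatasetRef` or a dataset type (or name) plus a data ID. A sketch of the two equivalent calling styles, with placeholder values::

    # Style 1: dataset type name plus a data ID given as keyword arguments.
    calexp = butler.get("calexp", instrument="HSC", visit=903334, detector=10)

    # Style 2: a resolved DatasetRef, for example from a registry lookup;
    # no separate data ID may be supplied alongside it.
    ref = butler.find_dataset("calexp", {"instrument": "HSC", "visit": 903334, "detector": 10})
    same_calexp = butler.get(ref)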

417 def _rewrite_data_id( 

418 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any 

419 ) -> tuple[DataId | None, dict[str, Any]]: 

420 """Rewrite a data ID taking into account dimension records. 

421 

422 Take a Data ID and keyword args and rewrite it if necessary to 

423 allow the user to specify dimension records rather than dimension 

424 primary values. 

425 

426 This allows a user to include a dataId dict with keys of 

427 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

428 the integer exposure ID. It also allows a string to be given 

429 for a dimension value rather than the integer ID if that is more 

430 convenient. For example, rather than having to specify the 

431 detector with ``detector.full_name``, a string given for ``detector`` 

432 will be interpreted as the full name and converted to the integer 

433 value. 

434 

435 Keyword arguments can also use strings for dimensions like detector 

436 and exposure, but Python does not allow them to include ``.``, so 

437 the ``exposure.day_obs`` syntax cannot be used in a keyword 

438 argument. 

439 

440 Parameters 

441 ---------- 

442 dataId : `dict` or `DataCoordinate` 

443 A `dict` of `Dimension` link name, value pairs that will label the 

444 `DatasetRef` within a Collection. 

445 datasetType : `DatasetType` 

446 The dataset type associated with this dataId. Required to 

447 determine the relevant dimensions. 

448 **kwargs 

449 Additional keyword arguments used to augment or construct a 

450 `DataId`. See `DataId` parameters. 

451 

452 Returns 

453 ------- 

454 dataId : `dict` or `DataCoordinate` 

455 The possibly rewritten dataId. If given a `DataCoordinate` and 

456 no keyword arguments, the original dataId will be returned 

457 unchanged. 

458 **kwargs : `dict` 

459 Any unused keyword arguments (normally an empty `dict`). 

460 """ 

461 # Do nothing if we have a standalone DataCoordinate. 

462 if isinstance(dataId, DataCoordinate) and not kwargs: 

463 return dataId, kwargs 

464 

465 # Process dimension records that are using record information 

466 # rather than ids 

467 newDataId: dict[str, DataIdValue] = {} 

468 byRecord: dict[str, dict[str, Any]] = defaultdict(dict) 

469 

470 # If the entire dataId comes from keyword parameters we do not need 

471 # to do anything here: the keys cannot be of the form 

472 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

473 if dataId: 

474 for k, v in dataId.items(): 

475 # If we have a Dimension we do not need to do anything 

476 # because it cannot be a compound key. 

477 if isinstance(k, str) and "." in k: 

478 # Someone is using a more human-readable dataId 

479 dimensionName, record = k.split(".", 1) 

480 byRecord[dimensionName][record] = v 

481 elif isinstance(k, Dimension): 

482 newDataId[k.name] = v 

483 else: 

484 newDataId[k] = v 

485 

486 # Go through the updated dataId and check the type in case someone is 

487 # using an alternate key. We have already filtered out the compound 

488 # ``dimension.record`` key format. 

489 not_dimensions = {} 

490 

491 # Will need to look in the dataId and the keyword arguments 

492 # and will remove them if they need to be fixed or are unrecognized. 

493 for dataIdDict in (newDataId, kwargs): 

494 # Use a list so we can adjust the dict safely in the loop 

495 for dimensionName in list(dataIdDict): 

496 value = dataIdDict[dimensionName] 

497 try: 

498 dimension = self.dimensions.dimensions[dimensionName] 

499 except KeyError: 

500 # This is not a real dimension 

501 not_dimensions[dimensionName] = value 

502 del dataIdDict[dimensionName] 

503 continue 

504 

505 # Convert an integral type to an explicit int to simplify 

506 # comparisons here 

507 if isinstance(value, numbers.Integral): 

508 value = int(value) 

509 

510 if not isinstance(value, dimension.primaryKey.getPythonType()): 

511 for alternate in dimension.alternateKeys: 

512 if isinstance(value, alternate.getPythonType()): 

513 byRecord[dimensionName][alternate.name] = value 

514 del dataIdDict[dimensionName] 

515 _LOG.debug( 

516 "Converting dimension %s to %s.%s=%s", 

517 dimensionName, 

518 dimensionName, 

519 alternate.name, 

520 value, 

521 ) 

522 break 

523 else: 

524 _LOG.warning( 

525 "Type mismatch found for value '%r' provided for dimension %s. " 

526 "Could not find matching alternative (primary key has type %s) " 

527 "so attempting to use as-is.", 

528 value, 

529 dimensionName, 

530 dimension.primaryKey.getPythonType(), 

531 ) 

532 

533 # By this point kwargs and newDataId should only include valid 

534 # dimensions. Merge kwargs in to the new dataId and log if there 

535 # are dimensions in both (rather than calling update). 

536 for k, v in kwargs.items(): 

537 if k in newDataId and newDataId[k] != v: 

538 _LOG.debug( 

539 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

540 ) 

541 newDataId[k] = v 

542 # No need to retain any values in kwargs now. 

543 kwargs = {} 

544 

545 # If we have some unrecognized dimensions we have to try to connect 

546 # them to records in other dimensions. This is made more complicated 

547 # by some dimensions having records with clashing names. A mitigation 

548 # is that we can tell by this point which dimensions are missing 

549 # for the DatasetType but this does not work for calibrations 

550 # where additional dimensions can be used to constrain the temporal 

551 # axis. 

552 if not_dimensions: 

553 # Search for all dimensions even if we have been given a value 

554 # explicitly. In some cases records are given as well as the 

555 # actual dimension and this should not be an error if they 

556 # match. 

557 mandatoryDimensions = datasetType.dimensions.names # - provided 

558 

559 candidateDimensions: set[str] = set() 

560 candidateDimensions.update(mandatoryDimensions) 

561 

562 # For calibrations we may well be needing temporal dimensions 

563 # so rather than always including all dimensions in the scan 

564 # restrict things a little. It is still possible for there 

565 # to be confusion over day_obs in visit vs exposure for example. 

566 # If we are not searching calibration collections things may 

567 # fail but they are going to fail anyway because of the 

568 # ambiguity of the dataId... 

569 if datasetType.isCalibration(): 

570 for dim in self.dimensions.dimensions: 

571 if dim.temporal: 

572 candidateDimensions.add(str(dim)) 

573 

574 # Look up table for the first association with a dimension 

575 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) 

576 

577 # Keep track of whether an item is associated with multiple 

578 # dimensions. 

579 counter: Counter[str] = Counter() 

580 assigned: dict[str, set[str]] = defaultdict(set) 

581 

582 # Go through the missing dimensions and associate the 

583 # given names with records within those dimensions 

584 matched_dims = set() 

585 for dimensionName in candidateDimensions: 

586 dimension = self.dimensions.dimensions[dimensionName] 

587 fields = dimension.metadata.names | dimension.uniqueKeys.names 

588 for field in not_dimensions: 

589 if field in fields: 

590 guessedAssociation[dimensionName][field] = not_dimensions[field] 

591 counter[dimensionName] += 1 

592 assigned[field].add(dimensionName) 

593 matched_dims.add(field) 

594 

595 # Calculate the fields that matched nothing. 

596 never_found = set(not_dimensions) - matched_dims 

597 

598 if never_found: 

599 raise DimensionValueError(f"Unrecognized keyword args given: {never_found}") 

600 

601 # There is a chance we have allocated a single dataId item 

602 # to multiple dimensions. Need to decide which should be retained. 

603 # For now assume that the most popular alternative wins. 

604 # This means that day_obs with seq_num will result in 

605 # exposure.day_obs and not visit.day_obs 

606 # Also prefer an explicitly missing dimension over an inferred 

607 # temporal dimension. 

608 for fieldName, assignedDimensions in assigned.items(): 

609 if len(assignedDimensions) > 1: 

610 # Pick the most popular (preferring mandatory dimensions) 

611 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

612 if requiredButMissing: 

613 candidateDimensions = requiredButMissing 

614 else: 

615 candidateDimensions = assignedDimensions 

616 

617 # If this is a choice between visit and exposure and 

618 # neither was a required part of the dataset type, 

619 # (hence in this branch) always prefer exposure over 

620 # visit since exposures are always defined and visits 

621 # are defined from exposures. 

622 if candidateDimensions == {"exposure", "visit"}: 

623 candidateDimensions = {"exposure"} 

624 

625 # Select the relevant items and get a new restricted 

626 # counter. 

627 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

628 duplicatesCounter: Counter[str] = Counter() 

629 duplicatesCounter.update(theseCounts) 

630 

631 # Choose the most common. If they are equally common 

632 # we will pick the one that was found first. 

633 # Returns a list of tuples 

634 selected = duplicatesCounter.most_common(1)[0][0] 

635 

636 _LOG.debug( 

637 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

638 " Removed ambiguity by choosing dimension %s.", 

639 fieldName, 

640 ", ".join(assignedDimensions), 

641 selected, 

642 ) 

643 

644 for candidateDimension in assignedDimensions: 

645 if candidateDimension != selected: 

646 del guessedAssociation[candidateDimension][fieldName] 

647 

648 # Update the record look up dict with the new associations 

649 for dimensionName, values in guessedAssociation.items(): 

650 if values: # A dict might now be empty 

651 _LOG.debug( 

652 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values 

653 ) 

654 byRecord[dimensionName].update(values) 

655 

656 if byRecord: 

657 # Some record specifiers were found so we need to convert 

658 # them to the Id form 

659 for dimensionName, values in byRecord.items(): 

660 if dimensionName in newDataId: 

661 _LOG.debug( 

662 "DataId specified explicit %s dimension value of %s in addition to" 

663 " general record specifiers for it of %s. Ignoring record information.", 

664 dimensionName, 

665 newDataId[dimensionName], 

666 str(values), 

667 ) 

668 # Get the actual record and compare with these values. 

669 try: 

670 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

671 except DataIdError: 

672 raise DimensionValueError( 

673 f"Could not find dimension '{dimensionName}'" 

674 f" with dataId {newDataId} as part of comparing with" 

675 f" record values {byRecord[dimensionName]}" 

676 ) from None 

677 if len(recs) == 1: 

678 errmsg: list[str] = [] 

679 for k, v in values.items(): 

680 if (recval := getattr(recs[0], k)) != v: 

681 errmsg.append(f"{k}({recval} != {v})") 

682 if errmsg: 

683 raise DimensionValueError( 

684 f"Dimension {dimensionName} in dataId has explicit value" 

685 " inconsistent with records: " + ", ".join(errmsg) 

686 ) 

687 else: 

688 # Multiple matches for an explicit dimension 

689 # should never happen but let downstream complain. 

690 pass 

691 continue 

692 

693 # Build up a WHERE expression 

694 bind = dict(values.items()) 

695 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

696 

697 # Hopefully we get a single record that matches 

698 records = set( 

699 self._registry.queryDimensionRecords( 

700 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

701 ) 

702 ) 

703 

704 if len(records) != 1: 

705 if len(records) > 1: 

706 # visit can have an ambiguous answer without involving 

707 # visit_system. The default visit_system is defined 

708 # by the instrument. 

709 if ( 

710 dimensionName == "visit" 

711 and "visit_system_membership" in self.dimensions 

712 and "visit_system" in self.dimensions["instrument"].metadata 

713 ): 

714 instrument_records = list( 

715 self._registry.queryDimensionRecords( 

716 "instrument", 

717 dataId=newDataId, 

718 **kwargs, 

719 ) 

720 ) 

721 if len(instrument_records) == 1: 

722 visit_system = instrument_records[0].visit_system 

723 if visit_system is None: 

724 # Set to a value that will never match. 

725 visit_system = -1 

726 

727 # Look up each visit in the 

728 # visit_system_membership records. 

729 for rec in records: 

730 membership = list( 

731 self._registry.queryDimensionRecords( 

732 # Use bind to allow zero results. 

733 # This is a fully-specified query. 

734 "visit_system_membership", 

735 where="instrument = inst AND visit_system = system AND visit = v", 

736 bind=dict( 

737 inst=instrument_records[0].name, system=visit_system, v=rec.id 

738 ), 

739 ) 

740 ) 

741 if membership: 

742 # This record is the right answer. 

743 records = {rec} 

744 break 

745 

746 # The ambiguity may have been resolved so check again. 

747 if len(records) > 1: 

748 _LOG.debug( 

749 "Received %d records from constraints of %s", len(records), str(values) 

750 ) 

751 for r in records: 

752 _LOG.debug("- %s", str(r)) 

753 raise DimensionValueError( 

754 f"DataId specification for dimension {dimensionName} is not" 

755 f" uniquely constrained to a single dataset by {values}." 

756 f" Got {len(records)} results." 

757 ) 

758 else: 

759 raise DimensionValueError( 

760 f"DataId specification for dimension {dimensionName} matched no" 

761 f" records when constrained by {values}" 

762 ) 

763 

764 # Get the primary key from the real dimension object 

765 dimension = self.dimensions.dimensions[dimensionName] 

766 if not isinstance(dimension, Dimension): 

767 raise RuntimeError( 

768 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

769 ) 

770 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

771 

772 return newDataId, kwargs 

773 
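A sketch of the record-based data ID forms this rewriting supports, with placeholder values: the exposure is identified by ``exposure.day_obs`` and ``exposure.seq_num`` rather than its integer ID, and the detector by a string interpreted as its full name::

    raw = butler.get(
        "raw",
        {
            "instrument": "HSC",
            "exposure.day_obs": 20240325,
            "exposure.seq_num": 42,
            "detector": "1_53",  # placeholder full_name for a detector
        },
    )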

774 def _findDatasetRef( 

775 self, 

776 datasetRefOrType: DatasetRef | DatasetType | str, 

777 dataId: DataId | None = None, 

778 *, 

779 collections: Any = None, 

780 predict: bool = False, 

781 run: str | None = None, 

782 datastore_records: bool = False, 

783 timespan: Timespan | None = None, 

784 **kwargs: Any, 

785 ) -> DatasetRef: 

786 """Shared logic for methods that start with a search for a dataset in 

787 the registry. 

788 

789 Parameters 

790 ---------- 

791 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

792 When a `DatasetRef` is provided, the `dataId` should be `None`. 

793 Otherwise the `DatasetType` or name thereof. 

794 dataId : `dict` or `DataCoordinate`, optional 

795 A `dict` of `Dimension` link name, value pairs that label the 

796 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

797 should be provided as the first argument. 

798 collections : Any, optional 

799 Collections to be searched, overriding ``self.collections``. 

800 Can be any of the types supported by the ``collections`` argument 

801 to butler construction. 

802 predict : `bool`, optional 

803 If `True`, return a newly created `DatasetRef` with a unique 

804 dataset ID if finding a reference in the `Registry` fails. 

805 Defaults to `False`. 

806 run : `str`, optional 

807 Run collection name to use for creating `DatasetRef` for predicted 

808 datasets. Only used if ``predict`` is `True`. 

809 datastore_records : `bool`, optional 

810 If `True` add datastore records to returned `DatasetRef`. 

811 timespan : `Timespan` or `None`, optional 

812 A timespan that the validity range of the dataset must overlap. 

813 If not provided and this is a calibration dataset type, an attempt 

814 will be made to find the timespan from any temporal coordinate 

815 in the data ID. 

816 **kwargs 

817 Additional keyword arguments used to augment or construct a 

818 `DataId`. See `DataId` parameters. 

819 

820 Returns 

821 ------- 

822 ref : `DatasetRef` 

823 A reference to the dataset identified by the given arguments. 

824 This can be the same dataset reference as given if it was 

825 resolved. 

826 

827 Raises 

828 ------ 

829 LookupError 

830 Raised if no matching dataset exists in the `Registry` (and 

831 ``predict`` is `False`). 

832 ValueError 

833 Raised if a resolved `DatasetRef` was passed as an input, but it 

834 differs from the one found in the registry. 

835 TypeError 

836 Raised if no collections were provided. 

837 """ 

838 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

839 if isinstance(datasetRefOrType, DatasetRef): 

840 if collections is not None: 

841 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) 

842 # May need to retrieve datastore records if requested. 

843 if datastore_records and datasetRefOrType._datastore_records is None: 

844 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType) 

845 return datasetRefOrType 

846 

847 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

848 

849 if datasetType.isCalibration(): 

850 # Because this is a calibration dataset, first try to 

851 # standardize the data ID without restricting the dimensions to 

852 # those of the dataset type requested, because there may be extra 

853 # dimensions that provide temporal information for a validity-range 

854 # lookup. 

855 dataId = DataCoordinate.standardize( 

856 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

857 ) 

858 if timespan is None: 

859 if dataId.dimensions.temporal: 

860 dataId = self._registry.expandDataId(dataId) 

861 # Use the timespan from the data ID to constrain the 

862 # calibration lookup, but only if the caller has not 

863 # specified an explicit timespan. 

864 timespan = dataId.timespan 

865 else: 

866 # Try an arbitrary timespan. Downstream will fail if this 

867 # results in more than one matching dataset. 

868 timespan = Timespan(None, None) 

869 else: 

870 # Standardize the data ID to just the dimensions of the dataset 

871 # type instead of letting registry.findDataset do it, so we get the 

872 # result even if no dataset is found. 

873 dataId = DataCoordinate.standardize( 

874 dataId, 

875 dimensions=datasetType.dimensions, 

876 defaults=self._registry.defaults.dataId, 

877 **kwargs, 

878 ) 

879 # Always lookup the DatasetRef, even if one is given, to ensure it is 

880 # present in the current collection. 

881 ref = self.find_dataset( 

882 datasetType, 

883 dataId, 

884 collections=collections, 

885 timespan=timespan, 

886 datastore_records=datastore_records, 

887 ) 

888 if ref is None: 

889 if predict: 

890 if run is None: 

891 run = self.run 

892 if run is None: 

893 raise TypeError("Cannot predict dataset ID/location with run=None.") 

894 return DatasetRef(datasetType, dataId, run=run) 

895 else: 

896 if collections is None: 

897 collections = self._registry.defaults.collections 

898 raise DatasetNotFoundError( 

899 f"Dataset {datasetType.name} with data ID {dataId} " 

900 f"could not be found in collections {collections}." 

901 ) 

902 if datasetType != ref.datasetType: 

903 # If they differ it is because the user explicitly specified 

904 # a compatible dataset type to this call rather than using the 

905 # registry definition. The DatasetRef must therefore be recreated 

906 # using the user definition such that the expected type is 

907 # returned. 

908 ref = DatasetRef( 

909 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records 

910 ) 

911 

912 return ref 

913 

914 @transactional 

915 def put( 

916 self, 

917 obj: Any, 

918 datasetRefOrType: DatasetRef | DatasetType | str, 

919 /, 

920 dataId: DataId | None = None, 

921 *, 

922 run: str | None = None, 

923 **kwargs: Any, 

924 ) -> DatasetRef: 

925 """Store and register a dataset. 

926 

927 Parameters 

928 ---------- 

929 obj : `object` 

930 The dataset. 

931 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

932 When `DatasetRef` is provided, ``dataId`` should be `None`. 

933 Otherwise the `DatasetType` or name thereof. If a fully resolved 

934 `DatasetRef` is given the run and ID are used directly. 

935 dataId : `dict` or `DataCoordinate` 

936 A `dict` of `Dimension` link name, value pairs that label the 

937 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

938 should be provided as the second argument. 

939 run : `str`, optional 

940 The name of the run the dataset should be added to, overriding 

941 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

942 **kwargs 

943 Additional keyword arguments used to augment or construct a 

944 `DataCoordinate`. See `DataCoordinate.standardize` 

945 parameters. Not used if a resolved `DatasetRef` is provided. 

946 

947 Returns 

948 ------- 

949 ref : `DatasetRef` 

950 A reference to the stored dataset, updated with the correct id if 

951 given. 

952 

953 Raises 

954 ------ 

955 TypeError 

956 Raised if the butler is read-only or if no run has been provided. 

957 """ 

958 if isinstance(datasetRefOrType, DatasetRef): 

959 # This is a direct put of predefined DatasetRef. 

960 _LOG.debug("Butler put direct: %s", datasetRefOrType) 

961 if run is not None: 

962 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) 

963 # If registry already has a dataset with the same dataset ID, 

964 # dataset type and DataId, then _importDatasets will do nothing and 

965 # just return the original ref. We have to raise in this case; the 

966 # datastore check below handles that. 

967 self._registry._importDatasets([datasetRefOrType], expand=True) 

968 # Before trying to write to the datastore check that it does not 

969 # know this dataset. This is prone to races, of course. 

970 if self._datastore.knows(datasetRefOrType): 

971 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

972 # Try to write the dataset to the datastore; if it fails due to a race 

973 # with another write, the content of stored data may be 

974 # unpredictable. 

975 try: 

976 self._datastore.put(obj, datasetRefOrType) 

977 except IntegrityError as e: 

978 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e 

979 return datasetRefOrType 

980 

981 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

982 if not self.isWriteable(): 

983 raise TypeError("Butler is read-only.") 

984 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

985 

986 # Handle dimension records in dataId 

987 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

988 

989 # Add Registry Dataset entry. 

990 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs) 

991 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

992 self._datastore.put(obj, ref) 

993 

994 return ref 

995 
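A sketch of a typical put, assuming a writeable ``butler``; the object, dataset type, data ID values, and run name are placeholders::

    # Store an in-memory catalog and register it in the named RUN collection.
    ref = butler.put(
        catalog,
        "src",
        {"instrument": "HSC", "visit": 903334, "detector": 10},
        run="u/someone/reprocessing",
    )
    print(ref.id, ref.run)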

996 def getDeferred( 

997 self, 

998 datasetRefOrType: DatasetRef | DatasetType | str, 

999 /, 

1000 dataId: DataId | None = None, 

1001 *, 

1002 parameters: dict | None = None, 

1003 collections: Any = None, 

1004 storageClass: str | StorageClass | None = None, 

1005 timespan: Timespan | None = None, 

1006 **kwargs: Any, 

1007 ) -> DeferredDatasetHandle: 

1008 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1009 after an immediate registry lookup. 

1010 

1011 Parameters 

1012 ---------- 

1013 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1014 When a `DatasetRef` is provided, the `dataId` should be `None`. 

1015 Otherwise the `DatasetType` or name thereof. 

1016 dataId : `dict` or `DataCoordinate`, optional 

1017 A `dict` of `Dimension` link name, value pairs that label the 

1018 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1019 should be provided as the first argument. 

1020 parameters : `dict` 

1021 Additional StorageClass-defined options to control reading, 

1022 typically used to efficiently read only a subset of the dataset. 

1023 collections : Any, optional 

1024 Collections to be searched, overriding ``self.collections``. 

1025 Can be any of the types supported by the ``collections`` argument 

1026 to butler construction. 

1027 storageClass : `StorageClass` or `str`, optional 

1028 The storage class to be used to override the Python type 

1029 returned by this method. By default the returned type matches 

1030 the dataset type definition for this dataset. Specifying a 

1031 read `StorageClass` can force a different type to be returned. 

1032 This type must be compatible with the original type. 

1033 timespan : `Timespan` or `None`, optional 

1034 A timespan that the validity range of the dataset must overlap. 

1035 If not provided and this is a calibration dataset type, an attempt 

1036 will be made to find the timespan from any temporal coordinate 

1037 in the data ID. 

1038 **kwargs 

1039 Additional keyword arguments used to augment or construct a 

1040 `DataId`. See `DataId` parameters. 

1041 

1042 Returns 

1043 ------- 

1044 obj : `DeferredDatasetHandle` 

1045 A handle which can be used to retrieve a dataset at a later time. 

1046 

1047 Raises 

1048 ------ 

1049 LookupError 

1050 Raised if no matching dataset exists in the `Registry` or 

1051 datastore. 

1052 ValueError 

1053 Raised if a resolved `DatasetRef` was passed as an input, but it 

1054 differs from the one found in the registry. 

1055 TypeError 

1056 Raised if no collections were provided. 

1057 """ 

1058 if isinstance(datasetRefOrType, DatasetRef): 

1059 # Do the quick check first and if that fails, check for artifact 

1060 # existence. This is necessary for datastores that are configured 

1061 # in trust mode where there won't be a record but there will be 

1062 # a file. 

1063 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType): 

1064 ref = datasetRefOrType 

1065 else: 

1066 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1067 else: 

1068 ref = self._findDatasetRef( 

1069 datasetRefOrType, dataId, collections=collections, timespan=timespan, **kwargs 

1070 ) 

1071 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1072 
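A sketch of deferred retrieval: the registry lookup happens immediately, but the (possibly large) dataset is only read when the handle's ``get`` is called; values are placeholders::

    handle = butler.getDeferred("calexp", instrument="HSC", visit=903334, detector=10)
    # ... later, after deciding this dataset is actually needed ...
    calexp = handle.get()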

1073 def get( 

1074 self, 

1075 datasetRefOrType: DatasetRef | DatasetType | str, 

1076 /, 

1077 dataId: DataId | None = None, 

1078 *, 

1079 parameters: dict[str, Any] | None = None, 

1080 collections: Any = None, 

1081 storageClass: StorageClass | str | None = None, 

1082 timespan: Timespan | None = None, 

1083 **kwargs: Any, 

1084 ) -> Any: 

1085 """Retrieve a stored dataset. 

1086 

1087 Parameters 

1088 ---------- 

1089 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1090 When a `DatasetRef` is provided, the `dataId` should be `None`. 

1091 Otherwise the `DatasetType` or name thereof. 

1092 If a resolved `DatasetRef`, the associated dataset 

1093 is returned directly without additional querying. 

1094 dataId : `dict` or `DataCoordinate` 

1095 A `dict` of `Dimension` link name, value pairs that label the 

1096 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1097 should be provided as the first argument. 

1098 parameters : `dict` 

1099 Additional StorageClass-defined options to control reading, 

1100 typically used to efficiently read only a subset of the dataset. 

1101 collections : Any, optional 

1102 Collections to be searched, overriding ``self.collections``. 

1103 Can be any of the types supported by the ``collections`` argument 

1104 to butler construction. 

1105 storageClass : `StorageClass` or `str`, optional 

1106 The storage class to be used to override the Python type 

1107 returned by this method. By default the returned type matches 

1108 the dataset type definition for this dataset. Specifying a 

1109 read `StorageClass` can force a different type to be returned. 

1110 This type must be compatible with the original type. 

1111 timespan : `Timespan` or `None`, optional 

1112 A timespan that the validity range of the dataset must overlap. 

1113 If not provided and this is a calibration dataset type, an attempt 

1114 will be made to find the timespan from any temporal coordinate 

1115 in the data ID. 

1116 **kwargs 

1117 Additional keyword arguments used to augment or construct a 

1118 `DataCoordinate`. See `DataCoordinate.standardize` 

1119 parameters. 

1120 

1121 Returns 

1122 ------- 

1123 obj : `object` 

1124 The dataset. 

1125 

1126 Raises 

1127 ------ 

1128 LookupError 

1129 Raised if no matching dataset exists in the `Registry`. 

1130 TypeError 

1131 Raised if no collections were provided. 

1132 

1133 Notes 

1134 ----- 

1135 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1136 this method requires that the given data ID include temporal dimensions 

1137 beyond the dimensions of the dataset type itself, in order to find the 

1138 dataset with the appropriate validity range. For example, a "bias" 

1139 dataset with native dimensions ``{instrument, detector}`` could be 

1140 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1141 ``exposure`` is a temporal dimension. 

1142 """ 

1143 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1144 ref = self._findDatasetRef( 

1145 datasetRefOrType, 

1146 dataId, 

1147 collections=collections, 

1148 datastore_records=True, 

1149 timespan=timespan, 

1150 **kwargs, 

1151 ) 

1152 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1153 
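Following the calibration note above, a sketch of fetching a ``bias`` (native dimensions ``{instrument, detector}``) with an ``exposure`` value in the data ID to select the matching validity range; the collection name and values are placeholders::

    bias = butler.get(
        "bias",
        instrument="HSC",
        detector=10,
        exposure=903334,
        collections="HSC/calib",
    )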

1154 def getURIs( 

1155 self, 

1156 datasetRefOrType: DatasetRef | DatasetType | str, 

1157 /, 

1158 dataId: DataId | None = None, 

1159 *, 

1160 predict: bool = False, 

1161 collections: Any = None, 

1162 run: str | None = None, 

1163 **kwargs: Any, 

1164 ) -> DatasetRefURIs: 

1165 """Return the URIs associated with the dataset. 

1166 

1167 Parameters 

1168 ---------- 

1169 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1170 When a `DatasetRef` is provided, the `dataId` should be `None`. 

1171 Otherwise the `DatasetType` or name thereof. 

1172 dataId : `dict` or `DataCoordinate` 

1173 A `dict` of `Dimension` link name, value pairs that label the 

1174 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1175 should be provided as the first argument. 

1176 predict : `bool` 

1177 If `True`, allow URIs to be returned of datasets that have not 

1178 been written. 

1179 collections : Any, optional 

1180 Collections to be searched, overriding ``self.collections``. 

1181 Can be any of the types supported by the ``collections`` argument 

1182 to butler construction. 

1183 run : `str`, optional 

1184 Run to use for predictions, overriding ``self.run``. 

1185 **kwargs 

1186 Additional keyword arguments used to augment or construct a 

1187 `DataCoordinate`. See `DataCoordinate.standardize` 

1188 parameters. 

1189 

1190 Returns 

1191 ------- 

1192 uris : `DatasetRefURIs` 

1193 The URI to the primary artifact associated with this dataset (if 

1194 the dataset was disassembled within the datastore this may be 

1195 `None`), and the URIs to any components associated with the dataset 

1196 artifact (this can be empty if there are no components). 

1197 """ 

1198 ref = self._findDatasetRef( 

1199 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1200 ) 

1201 return self._datastore.getURIs(ref, predict) 

1202 
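A sketch of inspecting where a dataset's artifacts live; the ``primaryURI`` and ``componentURIs`` attribute names on the returned `DatasetRefURIs` are assumed here, and the data ID values are placeholders::

    uris = butler.getURIs("calexp", instrument="HSC", visit=903334, detector=10)
    if uris.primaryURI is not None:
        print("single artifact at", uris.primaryURI)
    for component, uri in uris.componentURIs.items():
        print(f"component {component} at {uri}")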

1203 def get_dataset_type(self, name: str) -> DatasetType: 

1204 return self._registry.getDatasetType(name) 

1205 

1206 def get_dataset( 

1207 self, 

1208 id: DatasetId, 

1209 *, 

1210 storage_class: str | StorageClass | None = None, 

1211 dimension_records: bool = False, 

1212 datastore_records: bool = False, 

1213 ) -> DatasetRef | None: 

1214 ref = self._registry.getDataset(id) 

1215 if ref is not None: 

1216 if dimension_records: 

1217 ref = ref.expanded( 

1218 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions) 

1219 ) 

1220 if storage_class: 

1221 ref = ref.overrideStorageClass(storage_class) 

1222 if datastore_records: 

1223 ref = self._registry.get_datastore_records(ref) 

1224 return ref 

1225 

1226 def find_dataset( 

1227 self, 

1228 dataset_type: DatasetType | str, 

1229 data_id: DataId | None = None, 

1230 *, 

1231 collections: str | Sequence[str] | None = None, 

1232 timespan: Timespan | None = None, 

1233 storage_class: str | StorageClass | None = None, 

1234 dimension_records: bool = False, 

1235 datastore_records: bool = False, 

1236 **kwargs: Any, 

1237 ) -> DatasetRef | None: 

1238 # Handle any parts of the dataID that are not using primary dimension 

1239 # keys. 

1240 if isinstance(dataset_type, str): 

1241 actual_type = self.get_dataset_type(dataset_type) 

1242 else: 

1243 actual_type = dataset_type 

1244 

1245 # Store the component for later. 

1246 component_name = actual_type.component() 

1247 if actual_type.isComponent(): 

1248 parent_type = actual_type.makeCompositeDatasetType() 

1249 else: 

1250 parent_type = actual_type 

1251 

1252 data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs) 

1253 

1254 ref = self._registry.findDataset( 

1255 parent_type, 

1256 data_id, 

1257 collections=collections, 

1258 timespan=timespan, 

1259 datastore_records=datastore_records, 

1260 **kwargs, 

1261 ) 

1262 if ref is not None and dimension_records: 

1263 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)) 

1264 if ref is not None and component_name: 

1265 ref = ref.makeComponentRef(component_name) 

1266 if ref is not None and storage_class is not None: 

1267 ref = ref.overrideStorageClass(storage_class) 

1268 

1269 return ref 

1270 
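A sketch of a registry-only lookup that returns a `DatasetRef` (or `None`) without reading any data; the collection name and values are placeholders::

    ref = butler.find_dataset(
        "calexp",
        {"instrument": "HSC", "visit": 903334, "detector": 10},
        collections="HSC/runs/RC2",
        dimension_records=True,
    )
    if ref is None:
        print("not found in the searched collections")
    else:
        print(ref.run, ref.dataId)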

1271 def retrieveArtifacts( 

1272 self, 

1273 refs: Iterable[DatasetRef], 

1274 destination: ResourcePathExpression, 

1275 transfer: str = "auto", 

1276 preserve_path: bool = True, 

1277 overwrite: bool = False, 

1278 ) -> list[ResourcePath]: 

1279 # Docstring inherited. 

1280 return self._datastore.retrieveArtifacts( 

1281 refs, 

1282 ResourcePath(destination), 

1283 transfer=transfer, 

1284 preserve_path=preserve_path, 

1285 overwrite=overwrite, 

1286 ) 

1287 

1288 def exists( 

1289 self, 

1290 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1291 /, 

1292 data_id: DataId | None = None, 

1293 *, 

1294 full_check: bool = True, 

1295 collections: Any = None, 

1296 **kwargs: Any, 

1297 ) -> DatasetExistence: 

1298 # Docstring inherited. 

1299 existence = DatasetExistence.UNRECOGNIZED 

1300 

1301 if isinstance(dataset_ref_or_type, DatasetRef): 

1302 if collections is not None: 

1303 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1304 if data_id is not None: 

1305 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1306 ref = dataset_ref_or_type 

1307 registry_ref = self._registry.getDataset(dataset_ref_or_type.id) 

1308 if registry_ref is not None: 

1309 existence |= DatasetExistence.RECORDED 

1310 

1311 if dataset_ref_or_type != registry_ref: 

1312 # This could mean that storage classes differ, so we should 

1313 # check for that but use the registry ref for the rest of 

1314 # the method. 

1315 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1316 # Use the registry version from now on. 

1317 ref = registry_ref 

1318 else: 

1319 raise ValueError( 

1320 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1321 f"in registry but has different incompatible values ({registry_ref})." 

1322 ) 

1323 else: 

1324 try: 

1325 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1326 except (LookupError, TypeError): 

1327 return existence 

1328 existence |= DatasetExistence.RECORDED 

1329 

1330 if self._datastore.knows(ref): 

1331 existence |= DatasetExistence.DATASTORE 

1332 

1333 if full_check: 

1334 if self._datastore.exists(ref): 

1335 existence |= DatasetExistence._ARTIFACT 

1336 elif existence.value != DatasetExistence.UNRECOGNIZED.value: 

1337 # Do not add this flag if we have no other idea about a dataset. 

1338 existence |= DatasetExistence(DatasetExistence._ASSUMED) 

1339 

1340 return existence 

1341 
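A sketch of interpreting the returned flags, mirroring the checks used above; `DatasetExistence` is assumed to be importable from ``lsst.daf.butler`` and the data ID values are placeholders::

    from lsst.daf.butler import DatasetExistence

    existence = butler.exists("calexp", instrument="HSC", visit=903334, detector=10)
    if existence & DatasetExistence.RECORDED and existence & DatasetExistence.DATASTORE:
        print("registry and datastore both know this dataset")
    if existence.value == DatasetExistence.UNRECOGNIZED.value:
        print("nothing is known about this dataset")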

1342 def _exists_many( 

1343 self, 

1344 refs: Iterable[DatasetRef], 

1345 /, 

1346 *, 

1347 full_check: bool = True, 

1348 ) -> dict[DatasetRef, DatasetExistence]: 

1349 # Docstring inherited. 

1350 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1351 

1352 # Registry does not have a bulk API to check for a ref. 

1353 for ref in refs: 

1354 registry_ref = self._registry.getDataset(ref.id) 

1355 if registry_ref is not None: 

1356 # It is possible, albeit unlikely, that the given ref does 

1357 # not match the one in registry even though the UUID matches. 

1358 # When checking a single ref we raise, but it's impolite to 

1359 # do that when potentially hundreds of refs are being checked. 

1360 # We could change the API to only accept UUIDs and that would 

1361 # remove the ability to even check and remove the worry 

1362 # about differing storage classes. Given the ongoing discussion 

1363 # on refs vs UUIDs and whether to raise or have a new 

1364 # private flag, treat this as a private API for now. 

1365 existence[ref] |= DatasetExistence.RECORDED 

1366 

1367 # Ask datastore if it knows about these refs. 

1368 knows = self._datastore.knows_these(refs) 

1369 for ref, known in knows.items(): 

1370 if known: 

1371 existence[ref] |= DatasetExistence.DATASTORE 

1372 

1373 if full_check: 

1374 mexists = self._datastore.mexists(refs) 

1375 for ref, exists in mexists.items(): 

1376 if exists: 

1377 existence[ref] |= DatasetExistence._ARTIFACT 

1378 else: 

1379 # Do not set this flag if nothing is known about the dataset. 

1380 for ref in existence: 

1381 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1382 existence[ref] |= DatasetExistence._ASSUMED 

1383 

1384 return existence 

1385 

1386 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1387 # Docstring inherited. 

1388 if not self.isWriteable(): 

1389 raise TypeError("Butler is read-only.") 

1390 names = list(names) 

1391 refs: list[DatasetRef] = [] 

1392 for name in names: 

1393 collectionType = self._registry.getCollectionType(name) 

1394 if collectionType is not CollectionType.RUN: 

1395 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1396 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True)) 

1397 with self._datastore.transaction(), self._registry.transaction(): 

1398 if unstore: 

1399 self._datastore.trash(refs) 

1400 else: 

1401 self._datastore.forget(refs) 

1402 for name in names: 

1403 self._registry.removeCollection(name) 

1404 if unstore: 

1405 # Point of no return for removing artifacts 

1406 self._datastore.emptyTrash() 

1407 

1408 def pruneDatasets( 

1409 self, 

1410 refs: Iterable[DatasetRef], 

1411 *, 

1412 disassociate: bool = True, 

1413 unstore: bool = False, 

1414 tags: Iterable[str] = (), 

1415 purge: bool = False, 

1416 ) -> None: 

1417 # docstring inherited from LimitedButler 

1418 

1419 if not self.isWriteable(): 

1420 raise TypeError("Butler is read-only.") 

1421 if purge: 

1422 if not disassociate: 

1423 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1424 if not unstore: 

1425 raise TypeError("Cannot pass purge=True without unstore=True.") 

1426 elif disassociate: 

1427 tags = tuple(tags) 

1428 if not tags: 

1429 raise TypeError("No tags provided but disassociate=True.") 

1430 for tag in tags: 

1431 collectionType = self._registry.getCollectionType(tag) 

1432 if collectionType is not CollectionType.TAGGED: 

1433 raise TypeError( 

1434 f"Cannot disassociate from collection '{tag}' " 

1435 f"of non-TAGGED type {collectionType.name}." 

1436 ) 

1437 # Transform possibly-single-pass iterable into something we can iterate 

1438 # over multiple times. 

1439 refs = list(refs) 

1440 # Pruning a component of a DatasetRef makes no sense since registry 

1441 # doesn't know about components and datastore might not store 

1442 # components in a separate file. 

1443 for ref in refs: 

1444 if ref.datasetType.component(): 

1445 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1446 # We don't need an unreliable Datastore transaction for this, because 

1447 # we've been extra careful to ensure that Datastore.trash only involves 

1448 # mutating the Registry (it can _look_ at Datastore-specific things, 

1449 # but shouldn't change them), and hence all operations here are 

1450 # Registry operations. 

1451 with self._datastore.transaction(), self._registry.transaction(): 

1452 if unstore: 

1453 self._datastore.trash(refs) 

1454 if purge: 

1455 self._registry.removeDatasets(refs) 

1456 elif disassociate: 

1457 assert tags, "Guaranteed by earlier logic in this function." 

1458 for tag in tags: 

1459 self._registry.disassociate(tag, refs) 

1460 # We've exited the Registry transaction, and apparently committed. 

1461 # (if there was an exception, everything rolled back, and it's as if 

1462 # nothing happened - and we never get here). 

1463 # Datastore artifacts are not yet gone, but they're clearly marked 

1464 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1465 # problems we can try again later, and if manual administrative 

1466 # intervention is required, it's pretty clear what that should entail: 

1467 # deleting everything on disk and in private Datastore tables that is 

1468 # in the dataset_location_trash table. 

1469 if unstore: 

1470 # Point of no return for removing artifacts 

1471 self._datastore.emptyTrash() 

1472 
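
# A minimal usage sketch for pruneDatasets() above. Fully deleting datasets
# requires purge=True together with disassociate=True and unstore=True,
# matching the argument checks at the top of the method; the repo path,
# collection and dataset type names are hypothetical.
from lsst.daf.butler import Butler

butler = Butler("repo", writeable=True)
refs = list(
    butler.registry.queryDatasets("raw", collections="u/someone/scratch", findFirst=True)
)
butler.pruneDatasets(refs, disassociate=True, unstore=True, purge=True)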

1473 @transactional 

1474 def ingest( 

1475 self, 

1476 *datasets: FileDataset, 

1477 transfer: str | None = "auto", 

1478 record_validation_info: bool = True, 

1479 ) -> None: 

1480 # Docstring inherited. 

1481 if not self.isWriteable(): 

1482 raise TypeError("Butler is read-only.") 

1483 

1484 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1485 if not datasets: 

1486 return 

1487 

1488 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1489 

1490 # We need to reorganize all the inputs so that they are grouped 

1491 # by dataset type and run. Multiple refs in a single FileDataset 

1492 # are required to share the run and dataset type. 

1493 groupedData: MutableMapping[tuple[DatasetType, str], list[FileDataset]] = defaultdict(list) 

1494 

1495 # Track DataIDs that are being ingested so we can spot issues early 

1496 # with duplication. Retain previous FileDataset so we can report it. 

1497 groupedDataIds: MutableMapping[tuple[DatasetType, str], dict[DataCoordinate, FileDataset]] = ( 

1498 defaultdict(dict) 

1499 ) 

1500 

1501 # And the nested loop that populates it: 

1502 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1503 # Somewhere to store pre-existing refs if we have an 

1504 # execution butler. 

1505 existingRefs: list[DatasetRef] = [] 

1506 

1507 for ref in dataset.refs: 

1508 group_key = (ref.datasetType, ref.run) 

1509 

1510 if ref.dataId in groupedDataIds[group_key]: 

1511 raise ConflictingDefinitionError( 

1512 f"Ingest conflict. Dataset {dataset.path} has the same"

1513 " DataId as other ingest dataset"

1514 f" {groupedDataIds[group_key][ref.dataId].path}"

1515 f" ({ref.dataId})"

1516 ) 

1517 

1518 groupedDataIds[group_key][ref.dataId] = dataset 

1519 

1520 if existingRefs: 

1521 if len(dataset.refs) != len(existingRefs): 

1522 # Keeping track of partially pre-existing datasets is hard 

1523 # and should generally never happen. For now don't allow 

1524 # it. 

1525 raise ConflictingDefinitionError( 

1526 f"For dataset {dataset.path} some dataIds already exist" 

1527 " in registry but others do not. This is not supported." 

1528 ) 

1529 

1530 # Store expanded form in the original FileDataset. 

1531 dataset.refs = existingRefs 

1532 else: 

1533 groupedData[group_key].append(dataset) 

1534 

1535 # Now we can bulk-insert into Registry for each DatasetType. 

1536 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

1537 groupedData.items(), desc="Bulk-inserting datasets by type" 

1538 ): 

1539 refs_to_import = [] 

1540 for dataset in grouped_datasets: 

1541 refs_to_import.extend(dataset.refs) 

1542 

1543 n_refs = len(refs_to_import) 

1544 _LOG.verbose( 

1545 "Importing %d ref%s of dataset type %r into run %r", 

1546 n_refs, 

1547 "" if n_refs == 1 else "s", 

1548 datasetType.name, 

1549 this_run, 

1550 ) 

1551 

1552 # Import the refs and expand the DataCoordinates since we can't 

1553 # guarantee that they are expanded and Datastore will need 

1554 # the records. 

1555 imported_refs = self._registry._importDatasets(refs_to_import, expand=True) 

1556 assert set(imported_refs) == set(refs_to_import) 

1557 

1558 # Replace all the refs in the FileDataset with expanded versions. 

1559 # Pull them off in the order we put them on the list. 

1560 for dataset in grouped_datasets: 

1561 n_dataset_refs = len(dataset.refs) 

1562 dataset.refs = imported_refs[:n_dataset_refs] 

1563 del imported_refs[:n_dataset_refs] 

1564 

1565 # Bulk-insert everything into Datastore. 

1566 # We do not know if any of the registry entries already existed 

1567 # (_importDatasets only complains if they exist but differ) so 

1568 # we have to catch IntegrityError explicitly. 

1569 try: 

1570 self._datastore.ingest( 

1571 *datasets, transfer=transfer, record_validation_info=record_validation_info 

1572 ) 

1573 except IntegrityError as e: 

1574 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e 

1575 
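
# A minimal usage sketch for ingest() above, assuming a writeable repo at the
# hypothetical path "repo" with a registered "raw" dataset type (dimensions
# instrument, detector, exposure) whose dimension records already exist; the
# file path, data ID values and run name are likewise hypothetical.
from lsst.daf.butler import Butler, DataCoordinate, DatasetRef, FileDataset

run = "u/someone/ingest-run"
butler = Butler("repo", writeable=True)
butler.registry.registerRun(run)
dataset_type = butler.get_dataset_type("raw")
data_id = DataCoordinate.standardize(
    {"instrument": "HSC", "detector": 10, "exposure": 903334},
    dimensions=dataset_type.dimensions,
)
ref = DatasetRef(dataset_type, data_id, run=run)
butler.ingest(FileDataset(path="/data/HSC-903334-010.fits", refs=[ref]), transfer="copy")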

1576 @contextlib.contextmanager 

1577 def export( 

1578 self, 

1579 *, 

1580 directory: str | None = None, 

1581 filename: str | None = None, 

1582 format: str | None = None, 

1583 transfer: str | None = None, 

1584 ) -> Iterator[RepoExportContext]: 

1585 # Docstring inherited. 

1586 if directory is None and transfer is not None: 

1587 raise TypeError("Cannot transfer without providing a directory.") 

1588 if transfer == "move": 

1589 raise TypeError("Transfer may not be 'move': export is read-only") 

1590 if format is None: 

1591 if filename is None: 

1592 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1593 else: 

1594 _, format = os.path.splitext(filename) 

1595 if not format: 

1596 raise ValueError("Please specify a file extension to determine export format.") 

1597 format = format[1:]  # Strip leading "."

1598 elif filename is None: 

1599 filename = f"export.{format}" 

1600 if directory is not None: 

1601 filename = os.path.join(directory, filename) 

1602 formats = self._config["repo_transfer_formats"] 

1603 if format not in formats: 

1604 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

1605 BackendClass = get_class_of(formats[format, "export"]) 

1606 with open(filename, "w") as stream: 

1607 backend = BackendClass(stream, universe=self.dimensions) 

1608 try: 

1609 helper = RepoExportContext( 

1610 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer 

1611 ) 

1612 with self._caching_context(): 

1613 yield helper 

1614 except BaseException: 

1615 raise 

1616 else: 

1617 helper._finish() 

1618 
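
# A minimal usage sketch for export() above. The helper yielded by the context
# manager is the RepoExportContext constructed in the body; saveDatasets() is
# assumed here to queue the given refs (plus their dimension records) for the
# YAML file written when the context exits. Paths and names are hypothetical.
from lsst.daf.butler import Butler

butler = Butler("repo")
refs = butler.registry.queryDatasets("calexp", collections="HSC/runs/example")
with butler.export(directory="export_dir", filename="export.yaml", transfer="copy") as contents:
    contents.saveDatasets(refs)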

1619 def import_( 

1620 self, 

1621 *, 

1622 directory: ResourcePathExpression | None = None, 

1623 filename: ResourcePathExpression | TextIO | None = None, 

1624 format: str | None = None, 

1625 transfer: str | None = None, 

1626 skip_dimensions: set | None = None, 

1627 ) -> None: 

1628 # Docstring inherited. 

1629 if not self.isWriteable(): 

1630 raise TypeError("Butler is read-only.") 

1631 if format is None: 

1632 if filename is None: 

1633 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1634 else: 

1635 _, format = os.path.splitext(filename) # type: ignore 

1636 elif filename is None: 

1637 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

1638 if directory is not None: 

1639 directory = ResourcePath(directory, forceDirectory=True) 

1640 # mypy doesn't think this will work but it does in python >= 3.10. 

1641 if isinstance(filename, ResourcePathExpression): # type: ignore 

1642 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

1643 if not filename.isabs() and directory is not None: 

1644 potential = directory.join(filename) 

1645 exists_in_cwd = filename.exists() 

1646 exists_in_dir = potential.exists() 

1647 if exists_in_cwd and exists_in_dir: 

1648 _LOG.warning( 

1649 "A relative path for filename was specified (%s) which exists relative to cwd. " 

1650 "Additionally, the file exists relative to the given search directory (%s). " 

1651 "Using the export file in the given directory.", 

1652 filename, 

1653 potential, 

1654 ) 

1655 # Given they specified an explicit directory and that 

1656 # directory has the export file in it, assume that that 

1657 # is what was meant despite the file in cwd. 

1658 filename = potential 

1659 elif exists_in_dir: 

1660 filename = potential 

1661 elif not exists_in_cwd and not exists_in_dir: 

1662 # Raise early. 

1663 raise FileNotFoundError( 

1664 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

1665 ) 

1666 BackendClass: type[RepoImportBackend] = get_class_of( 

1667 self._config["repo_transfer_formats"][format]["import"] 

1668 ) 

1669 

1670 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

1671 with self._caching_context(): 

1672 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg] 

1673 backend.register() 

1674 with self.transaction(): 

1675 backend.load( 

1676 self._datastore, 

1677 directory=directory, 

1678 transfer=transfer, 

1679 skip_dimensions=skip_dimensions, 

1680 ) 

1681 

1682 if isinstance(filename, ResourcePath): 

1683 # We cannot use open() here at the moment because of

1684 # DM-38589, since yaml does stream.read(8192) in a loop.

1685 stream = io.StringIO(filename.read().decode()) 

1686 doImport(stream) 

1687 else: 

1688 doImport(filename) # type: ignore 

1689 
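
# A minimal usage sketch for import_() above, loading the export file produced
# by an earlier export() call into a second repo; paths are hypothetical.
from lsst.daf.butler import Butler

target = Butler("other_repo", writeable=True)
target.import_(directory="export_dir", filename="export.yaml", transfer="copy")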

1690 def transfer_dimension_records_from( 

1691 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef] 

1692 ) -> None: 

1693 # Allowed dimensions in the target butler. 

1694 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table) 

1695 

1696 data_ids = {ref.dataId for ref in source_refs} 

1697 

1698 dimension_records = self._extract_all_dimension_records_from_data_ids( 

1699 source_butler, data_ids, elements 

1700 ) 

1701 

1702 # Insert order is important. 

1703 for element in self.dimensions.sorted(dimension_records.keys()): 

1704 records = list(dimension_records[element].values())

1705 # Assume that if the record is already present that we can 

1706 # use it without having to check that the record metadata 

1707 # is consistent. 

1708 self._registry.insertDimensionData(element, *records, skip_existing=True) 

1709 _LOG.debug("Dimension '%s' -- number of records transferred: %d", element.name, len(records)) 

1710 
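
# A minimal usage sketch for transfer_dimension_records_from() above: copy the
# dimension records backing some refs from a source repo into this one, for
# example before transferring the datasets themselves; the repo paths and
# collection name are hypothetical.
from lsst.daf.butler import Butler

source = Butler("source_repo")
target = Butler("target_repo", writeable=True)
refs = list(source.registry.queryDatasets("calexp", collections="HSC/runs/example"))
target.transfer_dimension_records_from(source, refs)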

1711 def _extract_all_dimension_records_from_data_ids( 

1712 self, 

1713 source_butler: LimitedButler | Butler, 

1714 data_ids: set[DataCoordinate], 

1715 allowed_elements: frozenset[DimensionElement], 

1716 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]: 

1717 primary_records = self._extract_dimension_records_from_data_ids( 

1718 source_butler, data_ids, allowed_elements 

1719 ) 

1720 

1721 can_query = isinstance(source_butler, Butler)

1722 

1723 additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1724 for original_element, record_mapping in primary_records.items(): 

1725 # Get dimensions that depend on this dimension. 

1726 populated_by = self.dimensions.get_elements_populated_by( 

1727 self.dimensions[original_element.name] # type: ignore 

1728 ) 

1729 

1730 for data_id in record_mapping.keys(): 

1731 for element in populated_by: 

1732 if element not in allowed_elements: 

1733 continue 

1734 if element.name == original_element.name: 

1735 continue 

1736 

1737 if element.name in primary_records: 

1738 # If this element has already been stored avoid 

1739 # re-finding records since that may lead to additional 

1740 # spurious records. e.g. visit is populated_by 

1741 # visit_detector_region but querying 

1742 # visit_detector_region by visit will return all the 

1743 # detectors for this visit -- the visit dataId does not 

1744 # constrain this. 

1745 # To constrain the query the original dataIds would 

1746 # have to be scanned. 

1747 continue 

1748 

1749 if not can_query: 

1750 raise RuntimeError( 

1751 f"Transferring populated_by records like {element.name} requires a full Butler." 

1752 ) 

1753 

1754 records = source_butler.registry.queryDimensionRecords( # type: ignore 

1755 element.name, 

1756 **data_id.mapping, # type: ignore 

1757 ) 

1758 for record in records: 

1759 additional_records[record.definition].setdefault(record.dataId, record) 

1760 

1761 # The next step is to walk back through the additional records to

1762 # pick up any missing content (such as visit_definition needing to

1763 # know the exposure). We want to ensure we do not request records

1764 # we already have.

1765 missing_data_ids = set() 

1766 for name, record_mapping in additional_records.items(): 

1767 for data_id in record_mapping.keys(): 

1768 if data_id not in primary_records[name]: 

1769 missing_data_ids.add(data_id) 

1770 

1771 # Fill out the new records. Assume that these new records do not 

1772 # also need to carry over additional populated_by records. 

1773 secondary_records = self._extract_dimension_records_from_data_ids( 

1774 source_butler, missing_data_ids, allowed_elements 

1775 ) 

1776 

1777 # Merge the extra sets of records in with the original. 

1778 for name, record_mapping in itertools.chain(additional_records.items(), secondary_records.items()): 

1779 primary_records[name].update(record_mapping) 

1780 

1781 return primary_records 

1782 
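
# A small sketch of the populated_by relationship walked above, assuming the
# default dimension universe, where "visit" is typically populated by elements
# such as visit_definition and visit_detector_region.
from lsst.daf.butler import DimensionUniverse

universe = DimensionUniverse()
for element in universe.get_elements_populated_by(universe["visit"]):
    print(element.name)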

1783 def _extract_dimension_records_from_data_ids( 

1784 self, 

1785 source_butler: LimitedButler | Butler, 

1786 data_ids: set[DataCoordinate], 

1787 allowed_elements: frozenset[DimensionElement], 

1788 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]: 

1789 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1790 

1791 for data_id in data_ids: 

1792 # Need an expanded record; if it is not expanded we need a full

1793 # butler with a registry (allow mocks with registry too).

1794 if not data_id.hasRecords(): 

1795 if registry := getattr(source_butler, "registry", None): 

1796 data_id = registry.expandDataId(data_id) 

1797 else: 

1798 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

1799 # If this butler doesn't know about a dimension in the source

1800 # butler, things will break later.

1801 for element_name in data_id.dimensions.elements: 

1802 record = data_id.records[element_name] 

1803 if record is not None and record.definition in allowed_elements: 

1804 dimension_records[record.definition].setdefault(record.dataId, record) 

1805 

1806 return dimension_records 

1807 

1808 def transfer_from( 

1809 self, 

1810 source_butler: LimitedButler, 

1811 source_refs: Iterable[DatasetRef], 

1812 transfer: str = "auto", 

1813 skip_missing: bool = True, 

1814 register_dataset_types: bool = False, 

1815 transfer_dimensions: bool = False, 

1816 dry_run: bool = False, 

1817 ) -> collections.abc.Collection[DatasetRef]: 

1818 # Docstring inherited. 

1819 if not self.isWriteable(): 

1820 raise TypeError("Butler is read-only.") 

1821 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

1822 

1823 # Will iterate through the refs multiple times so need to convert 

1824 # to a list if this isn't a collection. 

1825 if not isinstance(source_refs, collections.abc.Collection): 

1826 source_refs = list(source_refs) 

1827 

1828 original_count = len(source_refs) 

1829 _LOG.info("Transferring %d datasets into %s", original_count, str(self)) 

1830 

1831 # In some situations the datastore artifact may be missing

1832 # and we do not want that registry entry to be imported.

1833 # Asking the datastore is not sufficient: the records may have been

1834 # purged, so we have to ask for the (predicted) URI and check

1835 # existence explicitly. Execution butler is set up exactly like

1836 # this, with no datastore records.

1837 artifact_existence: dict[ResourcePath, bool] = {} 

1838 if skip_missing: 

1839 dataset_existence = source_butler._datastore.mexists( 

1840 source_refs, artifact_existence=artifact_existence 

1841 ) 

1842 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

1843 filtered_count = len(source_refs) 

1844 n_missing = original_count - filtered_count 

1845 _LOG.verbose( 

1846 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

1847 n_missing, 

1848 "" if n_missing == 1 else "s", 

1849 filtered_count, 

1850 ) 

1851 

1852 # Importing requires that we group the refs by dataset type and run 

1853 # before doing the import. 

1854 source_dataset_types = set() 

1855 grouped_refs = defaultdict(list) 

1856 for ref in source_refs: 

1857 grouped_refs[ref.datasetType, ref.run].append(ref) 

1858 source_dataset_types.add(ref.datasetType) 

1859 

1860 # Check to see if the dataset type in the source butler has 

1861 # the same definition in the target butler and register missing 

1862 # ones if requested. Registration must happen outside a transaction. 

1863 newly_registered_dataset_types = set() 

1864 for datasetType in source_dataset_types: 

1865 if register_dataset_types: 

1866 # Let this raise immediately if inconsistent. Continuing 

1867 # on to find additional inconsistent dataset types 

1868 # might result in additional unwanted dataset types being 

1869 # registered. 

1870 if self._registry.registerDatasetType(datasetType): 

1871 newly_registered_dataset_types.add(datasetType) 

1872 else: 

1873 # If the dataset type is missing, let it fail immediately. 

1874 target_dataset_type = self.get_dataset_type(datasetType.name) 

1875 if target_dataset_type != datasetType: 

1876 raise ConflictingDefinitionError( 

1877 "Source butler dataset type differs from definition" 

1878 f" in target butler: {datasetType} !=" 

1879 f" {target_dataset_type}" 

1880 ) 

1881 if newly_registered_dataset_types: 

1882 # We may have registered some even if there were inconsistencies 

1883 # but should let people know (or else remove them again). 

1884 _LOG.verbose( 

1885 "Registered the following dataset types in the target Butler: %s", 

1886 ", ".join(d.name for d in newly_registered_dataset_types), 

1887 ) 

1888 else: 

1889 _LOG.verbose("All required dataset types are known to the target Butler") 

1890 

1891 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1892 if transfer_dimensions: 

1893 # Collect all the dimension records for these refs. 

1894 # All dimensions are to be copied, but the list of valid

1895 # dimensions comes from this butler's universe.

1896 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table) 

1897 dataIds = {ref.dataId for ref in source_refs} 

1898 dimension_records = self._extract_all_dimension_records_from_data_ids( 

1899 source_butler, dataIds, elements 

1900 ) 

1901 

1902 handled_collections: set[str] = set() 

1903 

1904 # Do all the importing in a single transaction. 

1905 with self.transaction(): 

1906 if dimension_records and not dry_run: 

1907 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.") 

1908 # Order matters. 

1909 for element in self.dimensions.sorted(dimension_records.keys()): 

1910 records = list(dimension_records[element].values())

1911 # Assume that if the record is already present that we can 

1912 # use it without having to check that the record metadata 

1913 # is consistent. 

1914 self._registry.insertDimensionData(element, *records, skip_existing=True) 

1915 

1916 n_imported = 0 

1917 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

1918 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

1919 ): 

1920 if run not in handled_collections: 

1921 # May need to create output collection. If source butler 

1922 # has a registry, ask for documentation string. 

1923 run_doc = None 

1924 if registry := getattr(source_butler, "registry", None): 

1925 run_doc = registry.getCollectionDocumentation(run) 

1926 if not dry_run: 

1927 registered = self._registry.registerRun(run, doc=run_doc) 

1928 else: 

1929 registered = True 

1930 handled_collections.add(run) 

1931 if registered: 

1932 _LOG.verbose("Creating output run %s", run) 

1933 

1934 n_refs = len(refs_to_import) 

1935 _LOG.verbose( 

1936 "Importing %d ref%s of dataset type %s into run %s", 

1937 n_refs, 

1938 "" if n_refs == 1 else "s", 

1939 datasetType.name, 

1940 run, 

1941 ) 

1942 

1943 # Assume we are using UUIDs and the source refs will match 

1944 # those imported. 

1945 if not dry_run: 

1946 imported_refs = self._registry._importDatasets(refs_to_import) 

1947 else: 

1948 imported_refs = refs_to_import 

1949 assert set(imported_refs) == set(refs_to_import) 

1950 n_imported += len(imported_refs) 

1951 

1952 assert len(source_refs) == n_imported 

1953 _LOG.verbose("Imported %d datasets into destination butler", n_imported) 

1954 

1955 # Ask the datastore to transfer. The datastore has to check that 

1956 # the source datastore is compatible with the target datastore. 

1957 accepted, rejected = self._datastore.transfer_from( 

1958 source_butler._datastore, 

1959 source_refs, 

1960 transfer=transfer, 

1961 artifact_existence=artifact_existence, 

1962 dry_run=dry_run, 

1963 ) 

1964 if rejected: 

1965 # For now, accept the registry entries but not the files. 

1966 _LOG.warning( 

1967 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

1968 len(rejected), 

1969 len(accepted), 

1970 datasetType, 

1971 run, 

1972 ) 

1973 

1974 return source_refs 

1975 
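
# A minimal usage sketch for transfer_from() above, registering any missing
# dataset types and copying dimension records along with the artifacts; the
# repo paths and collection name are hypothetical.
from lsst.daf.butler import Butler

source = Butler("source_repo")
target = Butler("target_repo", writeable=True)
refs = list(source.registry.queryDatasets("calexp", collections="HSC/runs/example"))
transferred = target.transfer_from(
    source,
    refs,
    transfer="copy",
    register_dataset_types=True,
    transfer_dimensions=True,
)
print(f"Transferred {len(transferred)} datasets.")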

1976 def validateConfiguration( 

1977 self, 

1978 logFailures: bool = False, 

1979 datasetTypeNames: Iterable[str] | None = None, 

1980 ignore: Iterable[str] | None = None, 

1981 ) -> None: 

1982 # Docstring inherited. 

1983 if datasetTypeNames: 

1984 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames] 

1985 else: 

1986 datasetTypes = list(self._registry.queryDatasetTypes()) 

1987 

1988 # filter out anything from the ignore list 

1989 if ignore: 

1990 ignore = set(ignore) 

1991 datasetTypes = [ 

1992 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

1993 ] 

1994 else: 

1995 ignore = set() 

1996 

1997 # For each datasetType that has an instrument dimension, create 

1998 # a DatasetRef for each defined instrument 

1999 datasetRefs = [] 

2000 

2001 # Find all the registered instruments (if "instrument" is in the 

2002 # universe). 

2003 if "instrument" in self.dimensions: 

2004 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")} 

2005 

2006 for datasetType in datasetTypes: 

2007 if "instrument" in datasetType.dimensions: 

2008 # In order to create a conforming dataset ref, create 

2009 # fake DataCoordinate values for the non-instrument 

2010 # dimensions. The type of the value does not matter here. 

2011 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"} 

2012 

2013 for instrument in instruments: 

2014 datasetRef = DatasetRef( 

2015 datasetType, 

2016 DataCoordinate.standardize( 

2017 dataId, instrument=instrument, dimensions=datasetType.dimensions 

2018 ), 

2019 run="validate", 

2020 ) 

2021 datasetRefs.append(datasetRef) 

2022 

2023 entities: list[DatasetType | DatasetRef] = [] 

2024 entities.extend(datasetTypes) 

2025 entities.extend(datasetRefs) 

2026 

2027 datastoreErrorStr = None 

2028 try: 

2029 self._datastore.validateConfiguration(entities, logFailures=logFailures) 

2030 except ValidationError as e: 

2031 datastoreErrorStr = str(e) 

2032 

2033 # Also check that the LookupKeys used by the datastores match 

2034 # registry and storage class definitions 

2035 keys = self._datastore.getLookupKeys() 

2036 

2037 failedNames = set() 

2038 failedDataId = set() 

2039 for key in keys: 

2040 if key.name is not None: 

2041 if key.name in ignore: 

2042 continue 

2043 

2044 # skip if specific datasetType names were requested and this 

2045 # name does not match 

2046 if datasetTypeNames and key.name not in datasetTypeNames: 

2047 continue 

2048 

2049 # See if it is a StorageClass or a DatasetType 

2050 if key.name in self.storageClasses: 

2051 pass 

2052 else: 

2053 try: 

2054 self.get_dataset_type(key.name) 

2055 except KeyError: 

2056 if logFailures: 

2057 _LOG.critical( 

2058 "Key '%s' does not correspond to a DatasetType or StorageClass", key 

2059 ) 

2060 failedNames.add(key) 

2061 else: 

2062 # Dimensions are checked for consistency when the Butler 

2063 # is created and rendezvoused with a universe. 

2064 pass 

2065 

2066 # Check that the instrument is a valid instrument.

2067 # Currently only instrument overrides are supported, so check for that.

2068 if key.dataId: 

2069 dataIdKeys = set(key.dataId) 

2070 if {"instrument"} != dataIdKeys: 

2071 if logFailures: 

2072 _LOG.critical("Key '%s' has unsupported DataId override", key) 

2073 failedDataId.add(key) 

2074 elif key.dataId["instrument"] not in instruments: 

2075 if logFailures: 

2076 _LOG.critical("Key '%s' has unknown instrument", key) 

2077 failedDataId.add(key) 

2078 

2079 messages = [] 

2080 

2081 if datastoreErrorStr: 

2082 messages.append(datastoreErrorStr) 

2083 

2084 for failed, msg in ( 

2085 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2086 (failedDataId, "Keys with bad DataId entries: "), 

2087 ): 

2088 if failed: 

2089 msg += ", ".join(str(k) for k in failed) 

2090 messages.append(msg) 

2091 

2092 if messages: 

2093 raise ValidationError(";\n".join(messages)) 

2094 
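
# A minimal usage sketch for validateConfiguration() above: validate a couple
# of dataset types and log individual failures before the summary
# ValidationError is raised; the repo path and dataset type names are
# hypothetical.
from lsst.daf.butler import Butler, ValidationError

butler = Butler("repo")
try:
    butler.validateConfiguration(logFailures=True, datasetTypeNames=["raw", "calexp"])
except ValidationError as err:
    print(f"Configuration problems found: {err}")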

2095 @property 

2096 def collections(self) -> Sequence[str]: 

2097 """The collections to search by default, in order 

2098 (`~collections.abc.Sequence` [ `str` ]). 

2099 

2100 This is an alias for ``self.registry.defaults.collections``. It cannot 

2101 be set directly in isolation, but all defaults may be changed together 

2102 by assigning a new `RegistryDefaults` instance to 

2103 ``self.registry.defaults``. 

2104 """ 

2105 return self._registry.defaults.collections 

2106 

2107 @property 

2108 def run(self) -> str | None: 

2109 """Name of the run this butler writes outputs to by default (`str` or 

2110 `None`). 

2111 

2112 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2113 directly in isolation, but all defaults may be changed together by 

2114 assigning a new `RegistryDefaults` instance to 

2115 ``self.registry.defaults``. 

2116 """ 

2117 return self._registry.defaults.run 

2118 

2119 @property 

2120 def registry(self) -> Registry: 

2121 """The object that manages dataset metadata and relationships 

2122 (`Registry`). 

2123 

2124 Many operations that don't involve reading or writing butler datasets 

2125 are accessible only via `Registry` methods. Eventually these methods 

2126 will be replaced by equivalent `Butler` methods. 

2127 """ 

2128 return self._registry_shim 

2129 

2130 @property 

2131 def dimensions(self) -> DimensionUniverse: 

2132 # Docstring inherited. 

2133 return self._registry.dimensions 

2134 

2135 @contextlib.contextmanager 

2136 def _query(self) -> Iterator[Query]: 

2137 # Docstring inherited. 

2138 raise NotImplementedError("TODO DM-41159") 

2139 

2140 def _preload_cache(self) -> None: 

2141 """Immediately load caches that are used for common operations.""" 

2142 self._registry.preload_cache() 

2143 

2144 _config: ButlerConfig 

2145 """Configuration for this Butler instance.""" 

2146 

2147 _registry: SqlRegistry 

2148 """The object that manages dataset metadata and relationships 

2149 (`SqlRegistry`). 

2150 

2151 Most operations that don't involve reading or writing butler datasets are 

2152 accessible only via `SqlRegistry` methods. 

2153 """ 

2154 

2155 datastore: Datastore 

2156 """The object that manages actual dataset storage (`Datastore`). 

2157 

2158 Direct user access to the datastore should rarely be necessary; the primary 

2159 exception is the case where a `Datastore` implementation provides extra 

2160 functionality beyond what the base class defines. 

2161 """ 

2162 

2163 storageClasses: StorageClassFactory 

2164 """An object that maps known storage class names to objects that fully 

2165 describe them (`StorageClassFactory`). 

2166 """ 

2167 

2168 _registry_shim: RegistryShim 

2169 """Shim object to provide a legacy public interface for querying via

2170 the ``registry`` property.

2171 """