Coverage for python/lsst/daf/butler/direct_butler.py: 11%

795 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-08 10:56 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Butler top level classes. 

29""" 

30from __future__ import annotations 

31 

32__all__ = ( 

33 "DirectButler", 

34 "ButlerValidationError", 

35) 

36 

37import collections.abc 

38import contextlib 

39import io 

40import itertools 

41import logging 

42import numbers 

43import os 

44import warnings 

45from collections import Counter, defaultdict 

46from collections.abc import Iterable, Iterator, Mapping, MutableMapping, Sequence 

47from typing import TYPE_CHECKING, Any, ClassVar, TextIO 

48 

49from deprecated.sphinx import deprecated 

50from lsst.resources import ResourcePath, ResourcePathExpression 

51from lsst.utils.introspection import get_class_of 

52from lsst.utils.iteration import ensure_iterable 

53from lsst.utils.logging import VERBOSE, getLogger 

54from sqlalchemy.exc import IntegrityError 

55 

56from ._butler import Butler 

57from ._butler_config import ButlerConfig 

58from ._dataset_existence import DatasetExistence 

59from ._dataset_ref import DatasetRef 

60from ._dataset_type import DatasetType 

61from ._deferredDatasetHandle import DeferredDatasetHandle 

62from ._exceptions import EmptyQueryResultError, ValidationError 

63from ._limited_butler import LimitedButler 

64from ._registry_shim import RegistryShim 

65from ._storage_class import StorageClass, StorageClassFactory 

66from ._timespan import Timespan 

67from .datastore import Datastore, NullDatastore 

68from .dimensions import DataCoordinate, Dimension 

69from .direct_query import DirectQuery 

70from .progress import Progress 

71from .registry import ( 

72 CollectionType, 

73 ConflictingDefinitionError, 

74 DataIdError, 

75 MissingDatasetTypeError, 

76 NoDefaultCollectionError, 

77 RegistryDefaults, 

78 _RegistryFactory, 

79) 

80from .registry.sql_registry import SqlRegistry 

81from .transfers import RepoExportContext 

82from .utils import transactional 

83 

84if TYPE_CHECKING: 

85 from lsst.resources import ResourceHandleProtocol 

86 

87 from ._config import Config 

88 from ._dataset_ref import DatasetId, DatasetIdGenEnum 

89 from ._file_dataset import FileDataset 

90 from ._query import Query 

91 from .datastore import DatasetRefURIs 

92 from .dimensions import ( 

93 DataId, 

94 DataIdValue, 

95 DimensionElement, 

96 DimensionGroup, 

97 DimensionRecord, 

98 DimensionUniverse, 

99 ) 

100 from .registry import CollectionArgType, Registry 

101 from .transfers import RepoImportBackend 

102 

103_LOG = getLogger(__name__) 

104 

105 

106class ButlerValidationError(ValidationError): 

107 """There is a problem with the Butler configuration.""" 

108 

109 pass 

110 

111 

112class DirectButler(Butler): 

113 """Main entry point for the data access system. 

114 

115 Parameters 

116 ---------- 

117 config : `ButlerConfig`, `Config` or `str`, optional. 

118 Configuration. Anything acceptable to the 

119 `ButlerConfig` constructor. If a directory path 

120 is given the configuration will be read from a ``butler.yaml`` file in 

121 that location. If `None` is given default values will be used. 

122 butler : `DirectButler`, optional. 

123 If provided, construct a new Butler that uses the same registry and 

124 datastore as the given one, but with the given collection and run. 

125 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

126 arguments. 

127 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional 

128 An expression specifying the collections to be searched (in order) when 

129 reading datasets. 

130 This may be a `str` collection name or an iterable thereof. 

131 See :ref:`daf_butler_collection_expressions` for more information. 

132 These collections are not registered automatically and must be 

133 manually registered before they are used by any method, but they may be 

134 manually registered after the `Butler` is initialized. 

135 run : `str`, optional 

136 Name of the `~CollectionType.RUN` collection new datasets should be 

137 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

138 ``collections`` will be set to ``[run]``. If not `None`, this 

139 collection will automatically be registered. If this is not set (and 

140 ``writeable`` is not set either), a read-only butler will be created. 

141 searchPaths : `list` of `str`, optional 

142 Directory paths to search when calculating the full Butler 

143 configuration. Not used if the supplied config is already a 

144 `ButlerConfig`. 

145 writeable : `bool`, optional 

146 Explicitly sets whether the butler supports write operations. If not 

147 provided, a read-write butler is created if any of ``run``, ``tags``, 

148 or ``chains`` is non-empty. 

149 inferDefaults : `bool`, optional 

150 If `True` (default) infer default data ID values from the values 

151 present in the datasets in ``collections``: if all collections have the 

152 same value (or no value) for a governor dimension, that value will be 

153 the default for that dimension. Nonexistent collections are ignored. 

154 If a default value is provided explicitly for a governor dimension via 

155 ``**kwargs``, no default will be inferred for that dimension. 

156 without_datastore : `bool`, optional 

157 If `True` do not attach a datastore to this butler. Any attempts 

158 to use a datastore will fail. 

159 **kwargs : `str` 

160 Default data ID key-value pairs. These may only identify "governor" 

161 dimensions like ``instrument`` and ``skymap``. 

162 """ 

163 

164 def __init__( 

165 self, 

166 config: Config | ResourcePathExpression | None = None, 

167 *, 

168 butler: DirectButler | None = None, 

169 collections: Any = None, 

170 run: str | None = None, 

171 searchPaths: Sequence[ResourcePathExpression] | None = None, 

172 writeable: bool | None = None, 

173 inferDefaults: bool = True, 

174 without_datastore: bool = False, 

175 **kwargs: str, 

176 ): 

177 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

178 # Load registry, datastore, etc. from config or existing butler. 

179 if butler is not None: 

180 if config is not None or searchPaths is not None or writeable is not None: 

181 raise TypeError( 

182 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

183 ) 

184 self._registry = butler._registry.copy(defaults) 

185 self._datastore = butler._datastore 

186 self.storageClasses = butler.storageClasses 

187 self._config: ButlerConfig = butler._config 

188 else: 

189 self._config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore) 

190 try: 

191 butlerRoot = self._config.get("root", self._config.configDir) 

192 if writeable is None: 

193 writeable = run is not None 

194 self._registry = _RegistryFactory(self._config).from_config( 

195 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

196 ) 

197 if without_datastore: 

198 self._datastore = NullDatastore(None, None) 

199 else: 

200 self._datastore = Datastore.fromConfig( 

201 self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

202 ) 

203 # TODO: Once datastore drops dependency on registry we can 

204 # construct datastore first and pass opaque tables to registry 

205 # constructor. 

206 self._registry.make_datastore_tables(self._datastore.get_opaque_table_definitions()) 

207 self.storageClasses = StorageClassFactory() 

208 self.storageClasses.addFromConfig(self._config) 

209 except Exception: 

210 # Failures here usually mean that configuration is incomplete, 

211 # just issue an error message which includes config file URI. 

212 _LOG.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

213 raise 

214 

215 # For execution butler the datastore needs a special 

216 # dependency-inversion trick. This is not used by regular butler, 

217 # but we do not have a way to distinguish regular butler from execution 

218 # butler. 

219 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

220 

221 if "run" in self._config or "collection" in self._config: 

222 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

223 

224 self._registry_shim = RegistryShim(self) 

225 

226 GENERATION: ClassVar[int] = 3 

227 """This is a Generation 3 Butler. 

228 

229 This attribute may be removed in the future, once the Generation 2 Butler 

230 interface has been fully retired; it should only be used in transitional 

231 code. 

232 """ 

233 

234 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

235 """Return DatasetType defined in registry given dataset type name.""" 

236 try: 

237 return self.get_dataset_type(name) 

238 except MissingDatasetTypeError: 

239 return None 

240 

241 @classmethod 

242 def _unpickle( 

243 cls, 

244 config: ButlerConfig, 

245 collections: tuple[str, ...] | None, 

246 run: str | None, 

247 defaultDataId: dict[str, str], 

248 writeable: bool, 

249 ) -> DirectButler: 

250 """Callable used to unpickle a Butler. 

251 

252 We prefer not to use ``Butler.__init__`` directly so we can force some 

253 of its many arguments to be keyword-only (note that ``__reduce__`` 

254 can only invoke callables with positional arguments). 

255 

256 Parameters 

257 ---------- 

258 config : `ButlerConfig` 

259 Butler configuration, already coerced into a true `ButlerConfig` 

260 instance (and hence after any search paths for overrides have been 

261 utilized). 

262 collections : `tuple` [ `str` ] 

263 Names of the default collections to read from. 

264 run : `str`, optional 

265 Name of the default `~CollectionType.RUN` collection to write to. 

266 defaultDataId : `dict` [ `str`, `str` ] 

267 Default data ID values. 

268 writeable : `bool` 

269 Whether the Butler should support write operations. 

270 

271 Returns 

272 ------- 

273 butler : `Butler` 

274 A new `Butler` instance. 

275 """ 

276 # MyPy doesn't recognize that the kwargs below are totally valid; it 

277 # seems to think '**defaultDataId* is a _positional_ argument! 

278 return cls( 

279 config=config, 

280 collections=collections, 

281 run=run, 

282 writeable=writeable, 

283 **defaultDataId, # type: ignore 

284 ) 

285 

286 def __reduce__(self) -> tuple: 

287 """Support pickling.""" 

288 return ( 

289 DirectButler._unpickle, 

290 ( 

291 self._config, 

292 self.collections, 

293 self.run, 

294 dict(self._registry.defaults.dataId.required), 

295 self._registry.isWriteable(), 

296 ), 

297 ) 

298 

299 def __str__(self) -> str: 

300 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

301 self.collections, self.run, self._datastore, self._registry 

302 ) 

303 

304 def isWriteable(self) -> bool: 

305 # Docstring inherited. 

306 return self._registry.isWriteable() 

307 

308 def _caching_context(self) -> contextlib.AbstractContextManager[None]: 

309 """Context manager that enables caching.""" 

310 return self._registry.caching_context() 

311 

312 @contextlib.contextmanager 

313 def transaction(self) -> Iterator[None]: 

314 """Context manager supporting `Butler` transactions. 

315 

316 Transactions can be nested. 

317 """ 

318 with self._registry.transaction(), self._datastore.transaction(): 

319 yield 

320 

321 def _standardizeArgs( 

322 self, 

323 datasetRefOrType: DatasetRef | DatasetType | str, 

324 dataId: DataId | None = None, 

325 for_put: bool = True, 

326 **kwargs: Any, 

327 ) -> tuple[DatasetType, DataId | None]: 

328 """Standardize the arguments passed to several Butler APIs. 

329 

330 Parameters 

331 ---------- 

332 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

333 When `DatasetRef` the `dataId` should be `None`. 

334 Otherwise the `DatasetType` or name thereof. 

335 dataId : `dict` or `DataCoordinate` 

336 A `dict` of `Dimension` link name, value pairs that label the 

337 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

338 should be provided as the second argument. 

339 for_put : `bool`, optional 

340 If `True` this call is invoked as part of a `Butler.put()`. 

341 Otherwise it is assumed to be part of a `Butler.get()`. This 

342 parameter is only relevant if there is dataset type 

343 inconsistency. 

344 **kwargs 

345 Additional keyword arguments used to augment or construct a 

346 `DataCoordinate`. See `DataCoordinate.standardize` 

347 parameters. 

348 

349 Returns 

350 ------- 

351 datasetType : `DatasetType` 

352 A `DatasetType` instance extracted from ``datasetRefOrType``. 

353 dataId : `dict` or `DataId`, optional 

354 Argument that can be used (along with ``kwargs``) to construct a 

355 `DataId`. 

356 

357 Notes 

358 ----- 

359 Butler APIs that conceptually need a DatasetRef also allow passing a 

360 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

361 keyword arguments that can be used to construct one) separately. This 

362 method accepts those arguments and always returns a true `DatasetType` 

363 and a `DataId` or `dict`. 

364 

365 Standardization of `dict` vs `DataId` is best handled by passing the 

366 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

367 generally similarly flexible. 

368 """ 

369 externalDatasetType: DatasetType | None = None 

370 internalDatasetType: DatasetType | None = None 

371 if isinstance(datasetRefOrType, DatasetRef): 

372 if dataId is not None or kwargs: 

373 raise ValueError("DatasetRef given, cannot use dataId as well") 

374 externalDatasetType = datasetRefOrType.datasetType 

375 dataId = datasetRefOrType.dataId 

376 else: 

377 # Don't check whether DataId is provided, because Registry APIs 

378 # can usually construct a better error message when it wasn't. 

379 if isinstance(datasetRefOrType, DatasetType): 

380 externalDatasetType = datasetRefOrType 

381 else: 

382 internalDatasetType = self.get_dataset_type(datasetRefOrType) 

383 

384 # Check that they are self-consistent 

385 if externalDatasetType is not None: 

386 internalDatasetType = self.get_dataset_type(externalDatasetType.name) 

387 if externalDatasetType != internalDatasetType: 

388 # We can allow differences if they are compatible, depending 

389 # on whether this is a get or a put. A get requires that 

390 # the python type associated with the datastore can be 

391 # converted to the user type. A put requires that the user 

392 # supplied python type can be converted to the internal 

393 # type expected by registry. 

394 relevantDatasetType = internalDatasetType 

395 if for_put: 

396 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

397 else: 

398 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

399 relevantDatasetType = externalDatasetType 

400 if not is_compatible: 

401 raise ValueError( 

402 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

403 f"registry definition ({internalDatasetType})" 

404 ) 

405 # Override the internal definition. 

406 internalDatasetType = relevantDatasetType 

407 

408 assert internalDatasetType is not None 

409 return internalDatasetType, dataId 

410 

411 def _rewrite_data_id( 

412 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any 

413 ) -> tuple[DataId | None, dict[str, Any]]: 

414 """Rewrite a data ID taking into account dimension records. 

415 

416 Take a Data ID and keyword args and rewrite it if necessary to 

417 allow the user to specify dimension records rather than dimension 

418 primary values. 

419 

420 This allows a user to include a dataId dict with keys of 

421 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

422 the integer exposure ID. It also allows a string to be given 

423 for a dimension value rather than the integer ID if that is more 

424 convenient. For example, rather than having to specifying the 

425 detector with ``detector.full_name``, a string given for ``detector`` 

426 will be interpreted as the full name and converted to the integer 

427 value. 

428 

429 Keyword arguments can also use strings for dimensions like detector 

430 and exposure but python does not allow them to include ``.`` and 

431 so the ``exposure.day_obs`` syntax can not be used in a keyword 

432 argument. 

433 

434 Parameters 

435 ---------- 

436 dataId : `dict` or `DataCoordinate` 

437 A `dict` of `Dimension` link name, value pairs that will label the 

438 `DatasetRef` within a Collection. 

439 datasetType : `DatasetType` 

440 The dataset type associated with this dataId. Required to 

441 determine the relevant dimensions. 

442 **kwargs 

443 Additional keyword arguments used to augment or construct a 

444 `DataId`. See `DataId` parameters. 

445 

446 Returns 

447 ------- 

448 dataId : `dict` or `DataCoordinate` 

449 The, possibly rewritten, dataId. If given a `DataCoordinate` and 

450 no keyword arguments, the original dataId will be returned 

451 unchanged. 

452 **kwargs : `dict` 

453 Any unused keyword arguments (would normally be empty dict). 

454 """ 

455 # Do nothing if we have a standalone DataCoordinate. 

456 if isinstance(dataId, DataCoordinate) and not kwargs: 

457 return dataId, kwargs 

458 

459 # Process dimension records that are using record information 

460 # rather than ids 

461 newDataId: dict[str, DataIdValue] = {} 

462 byRecord: dict[str, dict[str, Any]] = defaultdict(dict) 

463 

464 # if all the dataId comes from keyword parameters we do not need 

465 # to do anything here because they can't be of the form 

466 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

467 if dataId: 

468 for k, v in dataId.items(): 

469 # If we have a Dimension we do not need to do anything 

470 # because it cannot be a compound key. 

471 if isinstance(k, str) and "." in k: 

472 # Someone is using a more human-readable dataId 

473 dimensionName, record = k.split(".", 1) 

474 byRecord[dimensionName][record] = v 

475 elif isinstance(k, Dimension): 

476 newDataId[k.name] = v 

477 else: 

478 newDataId[k] = v 

479 

480 # Go through the updated dataId and check the type in case someone is 

481 # using an alternate key. We have already filtered out the compound 

482 # keys dimensions.record format. 

483 not_dimensions = {} 

484 

485 # Will need to look in the dataId and the keyword arguments 

486 # and will remove them if they need to be fixed or are unrecognized. 

487 for dataIdDict in (newDataId, kwargs): 

488 # Use a list so we can adjust the dict safely in the loop 

489 for dimensionName in list(dataIdDict): 

490 value = dataIdDict[dimensionName] 

491 try: 

492 dimension = self.dimensions.dimensions[dimensionName] 

493 except KeyError: 

494 # This is not a real dimension 

495 not_dimensions[dimensionName] = value 

496 del dataIdDict[dimensionName] 

497 continue 

498 

499 # Convert an integral type to an explicit int to simplify 

500 # comparisons here 

501 if isinstance(value, numbers.Integral): 

502 value = int(value) 

503 

504 if not isinstance(value, dimension.primaryKey.getPythonType()): 

505 for alternate in dimension.alternateKeys: 

506 if isinstance(value, alternate.getPythonType()): 

507 byRecord[dimensionName][alternate.name] = value 

508 del dataIdDict[dimensionName] 

509 _LOG.debug( 

510 "Converting dimension %s to %s.%s=%s", 

511 dimensionName, 

512 dimensionName, 

513 alternate.name, 

514 value, 

515 ) 

516 break 

517 else: 

518 _LOG.warning( 

519 "Type mismatch found for value '%r' provided for dimension %s. " 

520 "Could not find matching alternative (primary key has type %s) " 

521 "so attempting to use as-is.", 

522 value, 

523 dimensionName, 

524 dimension.primaryKey.getPythonType(), 

525 ) 

526 

527 # By this point kwargs and newDataId should only include valid 

528 # dimensions. Merge kwargs in to the new dataId and log if there 

529 # are dimensions in both (rather than calling update). 

530 for k, v in kwargs.items(): 

531 if k in newDataId and newDataId[k] != v: 

532 _LOG.debug( 

533 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

534 ) 

535 newDataId[k] = v 

536 # No need to retain any values in kwargs now. 

537 kwargs = {} 

538 

539 # If we have some unrecognized dimensions we have to try to connect 

540 # them to records in other dimensions. This is made more complicated 

541 # by some dimensions having records with clashing names. A mitigation 

542 # is that we can tell by this point which dimensions are missing 

543 # for the DatasetType but this does not work for calibrations 

544 # where additional dimensions can be used to constrain the temporal 

545 # axis. 

546 if not_dimensions: 

547 # Search for all dimensions even if we have been given a value 

548 # explicitly. In some cases records are given as well as the 

549 # actually dimension and this should not be an error if they 

550 # match. 

551 mandatoryDimensions = datasetType.dimensions.names # - provided 

552 

553 candidateDimensions: set[str] = set() 

554 candidateDimensions.update(mandatoryDimensions) 

555 

556 # For calibrations we may well be needing temporal dimensions 

557 # so rather than always including all dimensions in the scan 

558 # restrict things a little. It is still possible for there 

559 # to be confusion over day_obs in visit vs exposure for example. 

560 # If we are not searching calibration collections things may 

561 # fail but they are going to fail anyway because of the 

562 # ambiguousness of the dataId... 

563 if datasetType.isCalibration(): 

564 for dim in self.dimensions.dimensions: 

565 if dim.temporal: 

566 candidateDimensions.add(str(dim)) 

567 

568 # Look up table for the first association with a dimension 

569 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) 

570 

571 # Keep track of whether an item is associated with multiple 

572 # dimensions. 

573 counter: Counter[str] = Counter() 

574 assigned: dict[str, set[str]] = defaultdict(set) 

575 

576 # Go through the missing dimensions and associate the 

577 # given names with records within those dimensions 

578 matched_dims = set() 

579 for dimensionName in candidateDimensions: 

580 dimension = self.dimensions.dimensions[dimensionName] 

581 fields = dimension.metadata.names | dimension.uniqueKeys.names 

582 for field in not_dimensions: 

583 if field in fields: 

584 guessedAssociation[dimensionName][field] = not_dimensions[field] 

585 counter[dimensionName] += 1 

586 assigned[field].add(dimensionName) 

587 matched_dims.add(field) 

588 

589 # Calculate the fields that matched nothing. 

590 never_found = set(not_dimensions) - matched_dims 

591 

592 if never_found: 

593 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

594 

595 # There is a chance we have allocated a single dataId item 

596 # to multiple dimensions. Need to decide which should be retained. 

597 # For now assume that the most popular alternative wins. 

598 # This means that day_obs with seq_num will result in 

599 # exposure.day_obs and not visit.day_obs 

600 # Also prefer an explicitly missing dimension over an inferred 

601 # temporal dimension. 

602 for fieldName, assignedDimensions in assigned.items(): 

603 if len(assignedDimensions) > 1: 

604 # Pick the most popular (preferring mandatory dimensions) 

605 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

606 if requiredButMissing: 

607 candidateDimensions = requiredButMissing 

608 else: 

609 candidateDimensions = assignedDimensions 

610 

611 # If this is a choice between visit and exposure and 

612 # neither was a required part of the dataset type, 

613 # (hence in this branch) always prefer exposure over 

614 # visit since exposures are always defined and visits 

615 # are defined from exposures. 

616 if candidateDimensions == {"exposure", "visit"}: 

617 candidateDimensions = {"exposure"} 

618 

619 # Select the relevant items and get a new restricted 

620 # counter. 

621 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

622 duplicatesCounter: Counter[str] = Counter() 

623 duplicatesCounter.update(theseCounts) 

624 

625 # Choose the most common. If they are equally common 

626 # we will pick the one that was found first. 

627 # Returns a list of tuples 

628 selected = duplicatesCounter.most_common(1)[0][0] 

629 

630 _LOG.debug( 

631 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

632 " Removed ambiguity by choosing dimension %s.", 

633 fieldName, 

634 ", ".join(assignedDimensions), 

635 selected, 

636 ) 

637 

638 for candidateDimension in assignedDimensions: 

639 if candidateDimension != selected: 

640 del guessedAssociation[candidateDimension][fieldName] 

641 

642 # Update the record look up dict with the new associations 

643 for dimensionName, values in guessedAssociation.items(): 

644 if values: # A dict might now be empty 

645 _LOG.debug( 

646 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values 

647 ) 

648 byRecord[dimensionName].update(values) 

649 

650 if byRecord: 

651 # Some record specifiers were found so we need to convert 

652 # them to the Id form 

653 for dimensionName, values in byRecord.items(): 

654 if dimensionName in newDataId: 

655 _LOG.debug( 

656 "DataId specified explicit %s dimension value of %s in addition to" 

657 " general record specifiers for it of %s. Ignoring record information.", 

658 dimensionName, 

659 newDataId[dimensionName], 

660 str(values), 

661 ) 

662 # Get the actual record and compare with these values. 

663 try: 

664 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

665 except DataIdError: 

666 raise ValueError( 

667 f"Could not find dimension '{dimensionName}'" 

668 f" with dataId {newDataId} as part of comparing with" 

669 f" record values {byRecord[dimensionName]}" 

670 ) from None 

671 if len(recs) == 1: 

672 errmsg: list[str] = [] 

673 for k, v in values.items(): 

674 if (recval := getattr(recs[0], k)) != v: 

675 errmsg.append(f"{k}({recval} != {v})") 

676 if errmsg: 

677 raise ValueError( 

678 f"Dimension {dimensionName} in dataId has explicit value" 

679 " inconsistent with records: " + ", ".join(errmsg) 

680 ) 

681 else: 

682 # Multiple matches for an explicit dimension 

683 # should never happen but let downstream complain. 

684 pass 

685 continue 

686 

687 # Build up a WHERE expression 

688 bind = dict(values.items()) 

689 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

690 

691 # Hopefully we get a single record that matches 

692 records = set( 

693 self._registry.queryDimensionRecords( 

694 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

695 ) 

696 ) 

697 

698 if len(records) != 1: 

699 if len(records) > 1: 

700 # visit can have an ambiguous answer without involving 

701 # visit_system. The default visit_system is defined 

702 # by the instrument. 

703 if ( 

704 dimensionName == "visit" 

705 and "visit_system_membership" in self.dimensions 

706 and "visit_system" in self.dimensions["instrument"].metadata 

707 ): 

708 instrument_records = list( 

709 self._registry.queryDimensionRecords( 

710 "instrument", 

711 dataId=newDataId, 

712 **kwargs, 

713 ) 

714 ) 

715 if len(instrument_records) == 1: 

716 visit_system = instrument_records[0].visit_system 

717 if visit_system is None: 

718 # Set to a value that will never match. 

719 visit_system = -1 

720 

721 # Look up each visit in the 

722 # visit_system_membership records. 

723 for rec in records: 

724 membership = list( 

725 self._registry.queryDimensionRecords( 

726 # Use bind to allow zero results. 

727 # This is a fully-specified query. 

728 "visit_system_membership", 

729 where="instrument = inst AND visit_system = system AND visit = v", 

730 bind=dict( 

731 inst=instrument_records[0].name, system=visit_system, v=rec.id 

732 ), 

733 ) 

734 ) 

735 if membership: 

736 # This record is the right answer. 

737 records = {rec} 

738 break 

739 

740 # The ambiguity may have been resolved so check again. 

741 if len(records) > 1: 

742 _LOG.debug( 

743 "Received %d records from constraints of %s", len(records), str(values) 

744 ) 

745 for r in records: 

746 _LOG.debug("- %s", str(r)) 

747 raise ValueError( 

748 f"DataId specification for dimension {dimensionName} is not" 

749 f" uniquely constrained to a single dataset by {values}." 

750 f" Got {len(records)} results." 

751 ) 

752 else: 

753 raise ValueError( 

754 f"DataId specification for dimension {dimensionName} matched no" 

755 f" records when constrained by {values}" 

756 ) 

757 

758 # Get the primary key from the real dimension object 

759 dimension = self.dimensions.dimensions[dimensionName] 

760 if not isinstance(dimension, Dimension): 

761 raise RuntimeError( 

762 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

763 ) 

764 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

765 

766 return newDataId, kwargs 

767 

768 def _findDatasetRef( 

769 self, 

770 datasetRefOrType: DatasetRef | DatasetType | str, 

771 dataId: DataId | None = None, 

772 *, 

773 collections: Any = None, 

774 predict: bool = False, 

775 run: str | None = None, 

776 datastore_records: bool = False, 

777 **kwargs: Any, 

778 ) -> DatasetRef: 

779 """Shared logic for methods that start with a search for a dataset in 

780 the registry. 

781 

782 Parameters 

783 ---------- 

784 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

785 When `DatasetRef` the `dataId` should be `None`. 

786 Otherwise the `DatasetType` or name thereof. 

787 dataId : `dict` or `DataCoordinate`, optional 

788 A `dict` of `Dimension` link name, value pairs that label the 

789 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

790 should be provided as the first argument. 

791 collections : Any, optional 

792 Collections to be searched, overriding ``self.collections``. 

793 Can be any of the types supported by the ``collections`` argument 

794 to butler construction. 

795 predict : `bool`, optional 

796 If `True`, return a newly created `DatasetRef` with a unique 

797 dataset ID if finding a reference in the `Registry` fails. 

798 Defaults to `False`. 

799 run : `str`, optional 

800 Run collection name to use for creating `DatasetRef` for predicted 

801 datasets. Only used if ``predict`` is `True`. 

802 datastore_records : `bool`, optional 

803 If `True` add datastore records to returned `DatasetRef`. 

804 **kwargs 

805 Additional keyword arguments used to augment or construct a 

806 `DataId`. See `DataId` parameters. 

807 

808 Returns 

809 ------- 

810 ref : `DatasetRef` 

811 A reference to the dataset identified by the given arguments. 

812 This can be the same dataset reference as given if it was 

813 resolved. 

814 

815 Raises 

816 ------ 

817 LookupError 

818 Raised if no matching dataset exists in the `Registry` (and 

819 ``predict`` is `False`). 

820 ValueError 

821 Raised if a resolved `DatasetRef` was passed as an input, but it 

822 differs from the one found in the registry. 

823 TypeError 

824 Raised if no collections were provided. 

825 """ 

826 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

827 if isinstance(datasetRefOrType, DatasetRef): 

828 if collections is not None: 

829 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) 

830 # May need to retrieve datastore records if requested. 

831 if datastore_records and datasetRefOrType._datastore_records is None: 

832 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType) 

833 return datasetRefOrType 

834 timespan: Timespan | None = None 

835 

836 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

837 

838 if datasetType.isCalibration(): 

839 # Because this is a calibration dataset, first try to make a 

840 # standardize the data ID without restricting the dimensions to 

841 # those of the dataset type requested, because there may be extra 

842 # dimensions that provide temporal information for a validity-range 

843 # lookup. 

844 dataId = DataCoordinate.standardize( 

845 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

846 ) 

847 if dataId.dimensions.temporal: 

848 dataId = self._registry.expandDataId(dataId) 

849 timespan = dataId.timespan 

850 else: 

851 # Standardize the data ID to just the dimensions of the dataset 

852 # type instead of letting registry.findDataset do it, so we get the 

853 # result even if no dataset is found. 

854 dataId = DataCoordinate.standardize( 

855 dataId, 

856 dimensions=datasetType.dimensions, 

857 defaults=self._registry.defaults.dataId, 

858 **kwargs, 

859 ) 

860 # Always lookup the DatasetRef, even if one is given, to ensure it is 

861 # present in the current collection. 

862 ref = self.find_dataset( 

863 datasetType, 

864 dataId, 

865 collections=collections, 

866 timespan=timespan, 

867 datastore_records=datastore_records, 

868 ) 

869 if ref is None: 

870 if predict: 

871 if run is None: 

872 run = self.run 

873 if run is None: 

874 raise TypeError("Cannot predict dataset ID/location with run=None.") 

875 return DatasetRef(datasetType, dataId, run=run) 

876 else: 

877 if collections is None: 

878 collections = self._registry.defaults.collections 

879 raise LookupError( 

880 f"Dataset {datasetType.name} with data ID {dataId} " 

881 f"could not be found in collections {collections}." 

882 ) 

883 if datasetType != ref.datasetType: 

884 # If they differ it is because the user explicitly specified 

885 # a compatible dataset type to this call rather than using the 

886 # registry definition. The DatasetRef must therefore be recreated 

887 # using the user definition such that the expected type is 

888 # returned. 

889 ref = DatasetRef( 

890 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records 

891 ) 

892 

893 return ref 

894 

895 # TODO: remove on DM-40067. 

896 @transactional 

897 @deprecated( 

898 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." 

899 " Please use Butler.put(). Be aware that you may need to adjust your usage if you" 

900 " were relying on the run parameter to determine the run." 

901 " Will be removed after v26.0.", 

902 version="v26.0", 

903 category=FutureWarning, 

904 ) 

905 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

906 # Docstring inherited. 

907 return self.put(obj, ref) 

908 

909 @transactional 

910 def put( 

911 self, 

912 obj: Any, 

913 datasetRefOrType: DatasetRef | DatasetType | str, 

914 /, 

915 dataId: DataId | None = None, 

916 *, 

917 run: str | None = None, 

918 **kwargs: Any, 

919 ) -> DatasetRef: 

920 """Store and register a dataset. 

921 

922 Parameters 

923 ---------- 

924 obj : `object` 

925 The dataset. 

926 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

927 When `DatasetRef` is provided, ``dataId`` should be `None`. 

928 Otherwise the `DatasetType` or name thereof. If a fully resolved 

929 `DatasetRef` is given the run and ID are used directly. 

930 dataId : `dict` or `DataCoordinate` 

931 A `dict` of `Dimension` link name, value pairs that label the 

932 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

933 should be provided as the second argument. 

934 run : `str`, optional 

935 The name of the run the dataset should be added to, overriding 

936 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

937 **kwargs 

938 Additional keyword arguments used to augment or construct a 

939 `DataCoordinate`. See `DataCoordinate.standardize` 

940 parameters. Not used if a resolve `DatasetRef` is provided. 

941 

942 Returns 

943 ------- 

944 ref : `DatasetRef` 

945 A reference to the stored dataset, updated with the correct id if 

946 given. 

947 

948 Raises 

949 ------ 

950 TypeError 

951 Raised if the butler is read-only or if no run has been provided. 

952 """ 

953 if isinstance(datasetRefOrType, DatasetRef): 

954 # This is a direct put of predefined DatasetRef. 

955 _LOG.debug("Butler put direct: %s", datasetRefOrType) 

956 if run is not None: 

957 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) 

958 # If registry already has a dataset with the same dataset ID, 

959 # dataset type and DataId, then _importDatasets will do nothing and 

960 # just return an original ref. We have to raise in this case, there 

961 # is a datastore check below for that. 

962 self._registry._importDatasets([datasetRefOrType], expand=True) 

963 # Before trying to write to the datastore check that it does not 

964 # know this dataset. This is prone to races, of course. 

965 if self._datastore.knows(datasetRefOrType): 

966 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

967 # Try to write dataset to the datastore, if it fails due to a race 

968 # with another write, the content of stored data may be 

969 # unpredictable. 

970 try: 

971 self._datastore.put(obj, datasetRefOrType) 

972 except IntegrityError as e: 

973 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e 

974 return datasetRefOrType 

975 

976 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

977 if not self.isWriteable(): 

978 raise TypeError("Butler is read-only.") 

979 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

980 

981 # Handle dimension records in dataId 

982 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

983 

984 # Add Registry Dataset entry. 

985 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs) 

986 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

987 self._datastore.put(obj, ref) 

988 

989 return ref 

990 

991 # TODO: remove on DM-40067. 

992 @deprecated( 

993 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

994 " Please use Butler.get(). Will be removed after v26.0.", 

995 version="v26.0", 

996 category=FutureWarning, 

997 ) 

998 def getDirect( 

999 self, 

1000 ref: DatasetRef, 

1001 *, 

1002 parameters: dict[str, Any] | None = None, 

1003 storageClass: StorageClass | str | None = None, 

1004 ) -> Any: 

1005 """Retrieve a stored dataset. 

1006 

1007 Parameters 

1008 ---------- 

1009 ref : `DatasetRef` 

1010 Resolved reference to an already stored dataset. 

1011 parameters : `dict` 

1012 Additional StorageClass-defined options to control reading, 

1013 typically used to efficiently read only a subset of the dataset. 

1014 storageClass : `StorageClass` or `str`, optional 

1015 The storage class to be used to override the Python type 

1016 returned by this method. By default the returned type matches 

1017 the dataset type definition for this dataset. Specifying a 

1018 read `StorageClass` can force a different type to be returned. 

1019 This type must be compatible with the original type. 

1020 

1021 Returns 

1022 ------- 

1023 obj : `object` 

1024 The dataset. 

1025 """ 

1026 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1027 

1028 # TODO: remove on DM-40067. 

1029 @deprecated( 

1030 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

1031 "Please use Butler.getDeferred(). Will be removed after v26.0.", 

1032 version="v26.0", 

1033 category=FutureWarning, 

1034 ) 

1035 def getDirectDeferred( 

1036 self, 

1037 ref: DatasetRef, 

1038 *, 

1039 parameters: dict[str, Any] | None = None, 

1040 storageClass: str | StorageClass | None = None, 

1041 ) -> DeferredDatasetHandle: 

1042 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1043 from a resolved `DatasetRef`. 

1044 

1045 Parameters 

1046 ---------- 

1047 ref : `DatasetRef` 

1048 Resolved reference to an already stored dataset. 

1049 parameters : `dict` 

1050 Additional StorageClass-defined options to control reading, 

1051 typically used to efficiently read only a subset of the dataset. 

1052 storageClass : `StorageClass` or `str`, optional 

1053 The storage class to be used to override the Python type 

1054 returned by this method. By default the returned type matches 

1055 the dataset type definition for this dataset. Specifying a 

1056 read `StorageClass` can force a different type to be returned. 

1057 This type must be compatible with the original type. 

1058 

1059 Returns 

1060 ------- 

1061 obj : `DeferredDatasetHandle` 

1062 A handle which can be used to retrieve a dataset at a later time. 

1063 

1064 Raises 

1065 ------ 

1066 LookupError 

1067 Raised if no matching dataset exists in the `Registry`. 

1068 """ 

1069 # Check that dataset is known to the datastore. 

1070 if not self._datastore.knows(ref): 

1071 raise LookupError(f"Dataset reference {ref} is not known to datastore.") 

1072 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1073 

1074 def getDeferred( 

1075 self, 

1076 datasetRefOrType: DatasetRef | DatasetType | str, 

1077 /, 

1078 dataId: DataId | None = None, 

1079 *, 

1080 parameters: dict | None = None, 

1081 collections: Any = None, 

1082 storageClass: str | StorageClass | None = None, 

1083 **kwargs: Any, 

1084 ) -> DeferredDatasetHandle: 

1085 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1086 after an immediate registry lookup. 

1087 

1088 Parameters 

1089 ---------- 

1090 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1091 When `DatasetRef` the `dataId` should be `None`. 

1092 Otherwise the `DatasetType` or name thereof. 

1093 dataId : `dict` or `DataCoordinate`, optional 

1094 A `dict` of `Dimension` link name, value pairs that label the 

1095 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1096 should be provided as the first argument. 

1097 parameters : `dict` 

1098 Additional StorageClass-defined options to control reading, 

1099 typically used to efficiently read only a subset of the dataset. 

1100 collections : Any, optional 

1101 Collections to be searched, overriding ``self.collections``. 

1102 Can be any of the types supported by the ``collections`` argument 

1103 to butler construction. 

1104 storageClass : `StorageClass` or `str`, optional 

1105 The storage class to be used to override the Python type 

1106 returned by this method. By default the returned type matches 

1107 the dataset type definition for this dataset. Specifying a 

1108 read `StorageClass` can force a different type to be returned. 

1109 This type must be compatible with the original type. 

1110 **kwargs 

1111 Additional keyword arguments used to augment or construct a 

1112 `DataId`. See `DataId` parameters. 

1113 

1114 Returns 

1115 ------- 

1116 obj : `DeferredDatasetHandle` 

1117 A handle which can be used to retrieve a dataset at a later time. 

1118 

1119 Raises 

1120 ------ 

1121 LookupError 

1122 Raised if no matching dataset exists in the `Registry` or 

1123 datastore. 

1124 ValueError 

1125 Raised if a resolved `DatasetRef` was passed as an input, but it 

1126 differs from the one found in the registry. 

1127 TypeError 

1128 Raised if no collections were provided. 

1129 """ 

1130 if isinstance(datasetRefOrType, DatasetRef): 

1131 # Do the quick check first and if that fails, check for artifact 

1132 # existence. This is necessary for datastores that are configured 

1133 # in trust mode where there won't be a record but there will be 

1134 # a file. 

1135 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType): 

1136 ref = datasetRefOrType 

1137 else: 

1138 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1139 else: 

1140 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1141 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1142 

1143 def get( 

1144 self, 

1145 datasetRefOrType: DatasetRef | DatasetType | str, 

1146 /, 

1147 dataId: DataId | None = None, 

1148 *, 

1149 parameters: dict[str, Any] | None = None, 

1150 collections: Any = None, 

1151 storageClass: StorageClass | str | None = None, 

1152 **kwargs: Any, 

1153 ) -> Any: 

1154 """Retrieve a stored dataset. 

1155 

1156 Parameters 

1157 ---------- 

1158 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1159 When `DatasetRef` the `dataId` should be `None`. 

1160 Otherwise the `DatasetType` or name thereof. 

1161 If a resolved `DatasetRef`, the associated dataset 

1162 is returned directly without additional querying. 

1163 dataId : `dict` or `DataCoordinate` 

1164 A `dict` of `Dimension` link name, value pairs that label the 

1165 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1166 should be provided as the first argument. 

1167 parameters : `dict` 

1168 Additional StorageClass-defined options to control reading, 

1169 typically used to efficiently read only a subset of the dataset. 

1170 collections : Any, optional 

1171 Collections to be searched, overriding ``self.collections``. 

1172 Can be any of the types supported by the ``collections`` argument 

1173 to butler construction. 

1174 storageClass : `StorageClass` or `str`, optional 

1175 The storage class to be used to override the Python type 

1176 returned by this method. By default the returned type matches 

1177 the dataset type definition for this dataset. Specifying a 

1178 read `StorageClass` can force a different type to be returned. 

1179 This type must be compatible with the original type. 

1180 **kwargs 

1181 Additional keyword arguments used to augment or construct a 

1182 `DataCoordinate`. See `DataCoordinate.standardize` 

1183 parameters. 

1184 

1185 Returns 

1186 ------- 

1187 obj : `object` 

1188 The dataset. 

1189 

1190 Raises 

1191 ------ 

1192 LookupError 

1193 Raised if no matching dataset exists in the `Registry`. 

1194 TypeError 

1195 Raised if no collections were provided. 

1196 

1197 Notes 

1198 ----- 

1199 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1200 this method requires that the given data ID include temporal dimensions 

1201 beyond the dimensions of the dataset type itself, in order to find the 

1202 dataset with the appropriate validity range. For example, a "bias" 

1203 dataset with native dimensions ``{instrument, detector}`` could be 

1204 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1205 ``exposure`` is a temporal dimension. 

1206 """ 

1207 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1208 ref = self._findDatasetRef( 

1209 datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs 

1210 ) 

1211 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1212 

1213 def getURIs( 

1214 self, 

1215 datasetRefOrType: DatasetRef | DatasetType | str, 

1216 /, 

1217 dataId: DataId | None = None, 

1218 *, 

1219 predict: bool = False, 

1220 collections: Any = None, 

1221 run: str | None = None, 

1222 **kwargs: Any, 

1223 ) -> DatasetRefURIs: 

1224 """Return the URIs associated with the dataset. 

1225 

1226 Parameters 

1227 ---------- 

1228 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1229 When `DatasetRef` the `dataId` should be `None`. 

1230 Otherwise the `DatasetType` or name thereof. 

1231 dataId : `dict` or `DataCoordinate` 

1232 A `dict` of `Dimension` link name, value pairs that label the 

1233 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1234 should be provided as the first argument. 

1235 predict : `bool` 

1236 If `True`, allow URIs to be returned of datasets that have not 

1237 been written. 

1238 collections : Any, optional 

1239 Collections to be searched, overriding ``self.collections``. 

1240 Can be any of the types supported by the ``collections`` argument 

1241 to butler construction. 

1242 run : `str`, optional 

1243 Run to use for predictions, overriding ``self.run``. 

1244 **kwargs 

1245 Additional keyword arguments used to augment or construct a 

1246 `DataCoordinate`. See `DataCoordinate.standardize` 

1247 parameters. 

1248 

1249 Returns 

1250 ------- 

1251 uris : `DatasetRefURIs` 

1252 The URI to the primary artifact associated with this dataset (if 

1253 the dataset was disassembled within the datastore this may be 

1254 `None`), and the URIs to any components associated with the dataset 

1255 artifact. (can be empty if there are no components). 

1256 """ 

1257 ref = self._findDatasetRef( 

1258 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1259 ) 

1260 return self._datastore.getURIs(ref, predict) 

1261 

1262 def getURI( 

1263 self, 

1264 datasetRefOrType: DatasetRef | DatasetType | str, 

1265 /, 

1266 dataId: DataId | None = None, 

1267 *, 

1268 predict: bool = False, 

1269 collections: Any = None, 

1270 run: str | None = None, 

1271 **kwargs: Any, 

1272 ) -> ResourcePath: 

1273 """Return the URI to the Dataset. 

1274 

1275 Parameters 

1276 ---------- 

1277 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1278 When `DatasetRef` the `dataId` should be `None`. 

1279 Otherwise the `DatasetType` or name thereof. 

1280 dataId : `dict` or `DataCoordinate` 

1281 A `dict` of `Dimension` link name, value pairs that label the 

1282 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1283 should be provided as the first argument. 

1284 predict : `bool` 

1285 If `True`, allow URIs to be returned of datasets that have not 

1286 been written. 

1287 collections : Any, optional 

1288 Collections to be searched, overriding ``self.collections``. 

1289 Can be any of the types supported by the ``collections`` argument 

1290 to butler construction. 

1291 run : `str`, optional 

1292 Run to use for predictions, overriding ``self.run``. 

1293 **kwargs 

1294 Additional keyword arguments used to augment or construct a 

1295 `DataCoordinate`. See `DataCoordinate.standardize` 

1296 parameters. 

1297 

1298 Returns 

1299 ------- 

1300 uri : `lsst.resources.ResourcePath` 

1301 URI pointing to the Dataset within the datastore. If the 

1302 Dataset does not exist in the datastore, and if ``predict`` is 

1303 `True`, the URI will be a prediction and will include a URI 

1304 fragment "#predicted". 

1305 If the datastore does not have entities that relate well 

1306 to the concept of a URI the returned URI string will be 

1307 descriptive. The returned URI is not guaranteed to be obtainable. 

1308 

1309 Raises 

1310 ------ 

1311 LookupError 

1312 A URI has been requested for a dataset that does not exist and 

1313 guessing is not allowed. 

1314 ValueError 

1315 Raised if a resolved `DatasetRef` was passed as an input, but it 

1316 differs from the one found in the registry. 

1317 TypeError 

1318 Raised if no collections were provided. 

1319 RuntimeError 

1320 Raised if a URI is requested for a dataset that consists of 

1321 multiple artifacts. 

1322 """ 

1323 primary, components = self.getURIs( 

1324 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1325 ) 

1326 

1327 if primary is None or components: 

1328 raise RuntimeError( 

1329 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1330 "Use Butler.getURIs() instead." 

1331 ) 

1332 return primary 

1333 

1334 def get_dataset_type(self, name: str) -> DatasetType: 

1335 return self._registry.getDatasetType(name) 

1336 

1337 def get_dataset( 

1338 self, 

1339 id: DatasetId, 

1340 storage_class: str | StorageClass | None = None, 

1341 dimension_records: bool = False, 

1342 datastore_records: bool = False, 

1343 ) -> DatasetRef | None: 

1344 ref = self._registry.getDataset(id) 

1345 if ref is not None: 

1346 if dimension_records: 

1347 ref = ref.expanded( 

1348 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions) 

1349 ) 

1350 if storage_class: 

1351 ref = ref.overrideStorageClass(storage_class) 

1352 if datastore_records: 

1353 ref = self._registry.get_datastore_records(ref) 

1354 return ref 

1355 

1356 def find_dataset( 

1357 self, 

1358 dataset_type: DatasetType | str, 

1359 data_id: DataId | None = None, 

1360 *, 

1361 collections: str | Sequence[str] | None = None, 

1362 timespan: Timespan | None = None, 

1363 storage_class: str | StorageClass | None = None, 

1364 dimension_records: bool = False, 

1365 datastore_records: bool = False, 

1366 **kwargs: Any, 

1367 ) -> DatasetRef | None: 

1368 # Handle any parts of the dataID that are not using primary dimension 

1369 # keys. 

1370 if isinstance(dataset_type, str): 

1371 actual_type = self.get_dataset_type(dataset_type) 

1372 else: 

1373 actual_type = dataset_type 

1374 data_id, kwargs = self._rewrite_data_id(data_id, actual_type, **kwargs) 

1375 

1376 ref = self._registry.findDataset( 

1377 dataset_type, 

1378 data_id, 

1379 collections=collections, 

1380 timespan=timespan, 

1381 datastore_records=datastore_records, 

1382 **kwargs, 

1383 ) 

1384 if ref is not None and dimension_records: 

1385 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)) 

1386 if ref is not None and storage_class is not None: 

1387 ref = ref.overrideStorageClass(storage_class) 

1388 return ref 

1389 

1390 def retrieveArtifacts( 

1391 self, 

1392 refs: Iterable[DatasetRef], 

1393 destination: ResourcePathExpression, 

1394 transfer: str = "auto", 

1395 preserve_path: bool = True, 

1396 overwrite: bool = False, 

1397 ) -> list[ResourcePath]: 

1398 # Docstring inherited. 

1399 return self._datastore.retrieveArtifacts( 

1400 refs, 

1401 ResourcePath(destination), 

1402 transfer=transfer, 

1403 preserve_path=preserve_path, 

1404 overwrite=overwrite, 

1405 ) 

1406 

1407 def exists( 

1408 self, 

1409 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1410 /, 

1411 data_id: DataId | None = None, 

1412 *, 

1413 full_check: bool = True, 

1414 collections: Any = None, 

1415 **kwargs: Any, 

1416 ) -> DatasetExistence: 

1417 # Docstring inherited. 

1418 existence = DatasetExistence.UNRECOGNIZED 

1419 

1420 if isinstance(dataset_ref_or_type, DatasetRef): 

1421 if collections is not None: 

1422 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1423 if data_id is not None: 

1424 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1425 ref = dataset_ref_or_type 

1426 registry_ref = self._registry.getDataset(dataset_ref_or_type.id) 

1427 if registry_ref is not None: 

1428 existence |= DatasetExistence.RECORDED 

1429 

1430 if dataset_ref_or_type != registry_ref: 

1431 # This could mean that storage classes differ, so we should 

1432 # check for that but use the registry ref for the rest of 

1433 # the method. 

1434 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1435 # Use the registry version from now on. 

1436 ref = registry_ref 

1437 else: 

1438 raise ValueError( 

1439 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1440 f"in registry but has different incompatible values ({registry_ref})." 

1441 ) 

1442 else: 

1443 try: 

1444 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1445 except (LookupError, TypeError, NoDefaultCollectionError): 

1446 return existence 

1447 existence |= DatasetExistence.RECORDED 

1448 

1449 if self._datastore.knows(ref): 

1450 existence |= DatasetExistence.DATASTORE 

1451 

1452 if full_check: 

1453 if self._datastore.exists(ref): 

1454 existence |= DatasetExistence._ARTIFACT 

1455 elif existence.value != DatasetExistence.UNRECOGNIZED.value: 

1456 # Do not add this flag if we have no other idea about a dataset. 

1457 existence |= DatasetExistence(DatasetExistence._ASSUMED) 

1458 

1459 return existence 

1460 

1461 def _exists_many( 

1462 self, 

1463 refs: Iterable[DatasetRef], 

1464 /, 

1465 *, 

1466 full_check: bool = True, 

1467 ) -> dict[DatasetRef, DatasetExistence]: 

1468 # Docstring inherited. 

1469 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1470 

1471 # Registry does not have a bulk API to check for a ref. 

1472 for ref in refs: 

1473 registry_ref = self._registry.getDataset(ref.id) 

1474 if registry_ref is not None: 

1475 # It is possible, albeit unlikely, that the given ref does 

1476 # not match the one in registry even though the UUID matches. 

1477 # When checking a single ref we raise, but it's impolite to 

1478 # do that when potentially hundreds of refs are being checked. 

1479 # We could change the API to only accept UUIDs and that would 

1480 # remove the ability to even check and remove the worry 

1481 # about differing storage classes. Given the ongoing discussion 

1482 # on refs vs UUIDs and whether to raise or have a new 

1483 # private flag, treat this as a private API for now. 

1484 existence[ref] |= DatasetExistence.RECORDED 

1485 

1486 # Ask datastore if it knows about these refs. 

1487 knows = self._datastore.knows_these(refs) 

1488 for ref, known in knows.items(): 

1489 if known: 

1490 existence[ref] |= DatasetExistence.DATASTORE 

1491 

1492 if full_check: 

1493 mexists = self._datastore.mexists(refs) 

1494 for ref, exists in mexists.items(): 

1495 if exists: 

1496 existence[ref] |= DatasetExistence._ARTIFACT 

1497 else: 

1498 # Do not set this flag if nothing is known about the dataset. 

1499 for ref in existence: 

1500 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1501 existence[ref] |= DatasetExistence._ASSUMED 

1502 

1503 return existence 

1504 

1505 # TODO: remove on DM-40079. 

1506 @deprecated( 

1507 reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v26.0.", 

1508 version="v26.0", 

1509 category=FutureWarning, 

1510 ) 

1511 def datasetExists( 

1512 self, 

1513 datasetRefOrType: DatasetRef | DatasetType | str, 

1514 dataId: DataId | None = None, 

1515 *, 

1516 collections: Any = None, 

1517 **kwargs: Any, 

1518 ) -> bool: 

1519 """Return True if the Dataset is actually present in the Datastore. 

1520 

1521 Parameters 

1522 ---------- 

1523 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1524 When `DatasetRef` the `dataId` should be `None`. 

1525 Otherwise the `DatasetType` or name thereof. 

1526 dataId : `dict` or `DataCoordinate` 

1527 A `dict` of `Dimension` link name, value pairs that label the 

1528 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1529 should be provided as the first argument. 

1530 collections : Any, optional 

1531 Collections to be searched, overriding ``self.collections``. 

1532 Can be any of the types supported by the ``collections`` argument 

1533 to butler construction. 

1534 **kwargs 

1535 Additional keyword arguments used to augment or construct a 

1536 `DataCoordinate`. See `DataCoordinate.standardize` 

1537 parameters. 

1538 

1539 Raises 

1540 ------ 

1541 LookupError 

1542 Raised if the dataset is not even present in the Registry. 

1543 ValueError 

1544 Raised if a resolved `DatasetRef` was passed as an input, but it 

1545 differs from the one found in the registry. 

1546 NoDefaultCollectionError 

1547 Raised if no collections were provided. 

1548 """ 

1549 # A resolved ref may be given that is not known to this butler. 

1550 if isinstance(datasetRefOrType, DatasetRef): 

1551 ref = self._registry.getDataset(datasetRefOrType.id) 

1552 if ref is None: 

1553 raise LookupError( 

1554 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." 

1555 ) 

1556 else: 

1557 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1558 return self._datastore.exists(ref) 

1559 

1560 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1561 # Docstring inherited. 

1562 if not self.isWriteable(): 

1563 raise TypeError("Butler is read-only.") 

1564 names = list(names) 

1565 refs: list[DatasetRef] = [] 

1566 for name in names: 

1567 collectionType = self._registry.getCollectionType(name) 

1568 if collectionType is not CollectionType.RUN: 

1569 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1570 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True)) 

1571 with self._datastore.transaction(), self._registry.transaction(): 

1572 if unstore: 

1573 self._datastore.trash(refs) 

1574 else: 

1575 self._datastore.forget(refs) 

1576 for name in names: 

1577 self._registry.removeCollection(name) 

1578 if unstore: 

1579 # Point of no return for removing artifacts 

1580 self._datastore.emptyTrash() 

1581 

1582 def pruneDatasets( 

1583 self, 

1584 refs: Iterable[DatasetRef], 

1585 *, 

1586 disassociate: bool = True, 

1587 unstore: bool = False, 

1588 tags: Iterable[str] = (), 

1589 purge: bool = False, 

1590 ) -> None: 

1591 # docstring inherited from LimitedButler 

1592 

1593 if not self.isWriteable(): 

1594 raise TypeError("Butler is read-only.") 

1595 if purge: 

1596 if not disassociate: 

1597 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1598 if not unstore: 

1599 raise TypeError("Cannot pass purge=True without unstore=True.") 

1600 elif disassociate: 

1601 tags = tuple(tags) 

1602 if not tags: 

1603 raise TypeError("No tags provided but disassociate=True.") 

1604 for tag in tags: 

1605 collectionType = self._registry.getCollectionType(tag) 

1606 if collectionType is not CollectionType.TAGGED: 

1607 raise TypeError( 

1608 f"Cannot disassociate from collection '{tag}' " 

1609 f"of non-TAGGED type {collectionType.name}." 

1610 ) 

1611 # Transform possibly-single-pass iterable into something we can iterate 

1612 # over multiple times. 

1613 refs = list(refs) 

1614 # Pruning a component of a DatasetRef makes no sense since registry 

1615 # doesn't know about components and datastore might not store 

1616 # components in a separate file 

1617 for ref in refs: 

1618 if ref.datasetType.component(): 

1619 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1620 # We don't need an unreliable Datastore transaction for this, because 

1621 # we've been extra careful to ensure that Datastore.trash only involves 

1622 # mutating the Registry (it can _look_ at Datastore-specific things, 

1623 # but shouldn't change them), and hence all operations here are 

1624 # Registry operations. 

1625 with self._datastore.transaction(), self._registry.transaction(): 

1626 if unstore: 

1627 self._datastore.trash(refs) 

1628 if purge: 

1629 self._registry.removeDatasets(refs) 

1630 elif disassociate: 

1631 assert tags, "Guaranteed by earlier logic in this function." 

1632 for tag in tags: 

1633 self._registry.disassociate(tag, refs) 

1634 # We've exited the Registry transaction, and apparently committed. 

1635 # (if there was an exception, everything rolled back, and it's as if 

1636 # nothing happened - and we never get here). 

1637 # Datastore artifacts are not yet gone, but they're clearly marked 

1638 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1639 # problems we can try again later, and if manual administrative 

1640 # intervention is required, it's pretty clear what that should entail: 

1641 # deleting everything on disk and in private Datastore tables that is 

1642 # in the dataset_location_trash table. 

1643 if unstore: 

1644 # Point of no return for removing artifacts 

1645 self._datastore.emptyTrash() 

1646 

1647 @transactional 

1648 def ingest( 

1649 self, 

1650 *datasets: FileDataset, 

1651 transfer: str | None = "auto", 

1652 run: str | None = None, 

1653 idGenerationMode: DatasetIdGenEnum | None = None, 

1654 record_validation_info: bool = True, 

1655 ) -> None: 

1656 # Docstring inherited. 

1657 if not self.isWriteable(): 

1658 raise TypeError("Butler is read-only.") 

1659 

1660 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1661 if not datasets: 

1662 return 

1663 

1664 if idGenerationMode is not None: 

1665 warnings.warn( 

1666 "The idGenerationMode parameter is no longer used and is ignored. " 

1667 " Will be removed after v26.0", 

1668 FutureWarning, 

1669 stacklevel=2, 

1670 ) 

1671 

1672 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1673 

1674 # We need to reorganize all the inputs so that they are grouped 

1675 # by dataset type and run. Multiple refs in a single FileDataset 

1676 # are required to share the run and dataset type. 

1677 groupedData: MutableMapping[tuple[DatasetType, str], list[FileDataset]] = defaultdict(list) 

1678 

1679 # Track DataIDs that are being ingested so we can spot issues early 

1680 # with duplication. Retain previous FileDataset so we can report it. 

1681 groupedDataIds: MutableMapping[ 

1682 tuple[DatasetType, str], dict[DataCoordinate, FileDataset] 

1683 ] = defaultdict(dict) 

1684 

1685 used_run = False 

1686 

1687 # And the nested loop that populates it: 

1688 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1689 # Somewhere to store pre-existing refs if we have an 

1690 # execution butler. 

1691 existingRefs: list[DatasetRef] = [] 

1692 

1693 for ref in dataset.refs: 

1694 assert ref.run is not None # For mypy 

1695 group_key = (ref.datasetType, ref.run) 

1696 

1697 if ref.dataId in groupedDataIds[group_key]: 

1698 raise ConflictingDefinitionError( 

1699 f"Ingest conflict. Dataset {dataset.path} has same" 

1700 " DataId as other ingest dataset" 

1701 f" {groupedDataIds[group_key][ref.dataId].path} " 

1702 f" ({ref.dataId})" 

1703 ) 

1704 

1705 groupedDataIds[group_key][ref.dataId] = dataset 

1706 

1707 if existingRefs: 

1708 if len(dataset.refs) != len(existingRefs): 

1709 # Keeping track of partially pre-existing datasets is hard 

1710 # and should generally never happen. For now don't allow 

1711 # it. 

1712 raise ConflictingDefinitionError( 

1713 f"For dataset {dataset.path} some dataIds already exist" 

1714 " in registry but others do not. This is not supported." 

1715 ) 

1716 

1717 # Store expanded form in the original FileDataset. 

1718 dataset.refs = existingRefs 

1719 else: 

1720 groupedData[group_key].append(dataset) 

1721 

1722 if not used_run and run is not None: 

1723 warnings.warn( 

1724 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the " 

1725 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.", 

1726 category=FutureWarning, 

1727 stacklevel=3, # Take into account the @transactional decorator. 

1728 ) 

1729 

1730 # Now we can bulk-insert into Registry for each DatasetType. 

1731 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

1732 groupedData.items(), desc="Bulk-inserting datasets by type" 

1733 ): 

1734 refs_to_import = [] 

1735 for dataset in grouped_datasets: 

1736 refs_to_import.extend(dataset.refs) 

1737 

1738 n_refs = len(refs_to_import) 

1739 _LOG.verbose( 

1740 "Importing %d ref%s of dataset type %r into run %r", 

1741 n_refs, 

1742 "" if n_refs == 1 else "s", 

1743 datasetType.name, 

1744 this_run, 

1745 ) 

1746 

1747 # Import the refs and expand the DataCoordinates since we can't 

1748 # guarantee that they are expanded and Datastore will need 

1749 # the records. 

1750 imported_refs = self._registry._importDatasets(refs_to_import, expand=True) 

1751 assert set(imported_refs) == set(refs_to_import) 

1752 

1753 # Replace all the refs in the FileDataset with expanded versions. 

1754 # Pull them off in the order we put them on the list. 

1755 for dataset in grouped_datasets: 

1756 n_dataset_refs = len(dataset.refs) 

1757 dataset.refs = imported_refs[:n_dataset_refs] 

1758 del imported_refs[:n_dataset_refs] 

1759 

1760 # Bulk-insert everything into Datastore. 

1761 # We do not know if any of the registry entries already existed 

1762 # (_importDatasets only complains if they exist but differ) so 

1763 # we have to catch IntegrityError explicitly. 

1764 try: 

1765 self._datastore.ingest( 

1766 *datasets, transfer=transfer, record_validation_info=record_validation_info 

1767 ) 

1768 except IntegrityError as e: 

1769 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e 

1770 

1771 @contextlib.contextmanager 

1772 def export( 

1773 self, 

1774 *, 

1775 directory: str | None = None, 

1776 filename: str | None = None, 

1777 format: str | None = None, 

1778 transfer: str | None = None, 

1779 ) -> Iterator[RepoExportContext]: 

1780 # Docstring inherited. 

1781 if directory is None and transfer is not None: 

1782 raise TypeError("Cannot transfer without providing a directory.") 

1783 if transfer == "move": 

1784 raise TypeError("Transfer may not be 'move': export is read-only") 

1785 if format is None: 

1786 if filename is None: 

1787 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1788 else: 

1789 _, format = os.path.splitext(filename) 

1790 if not format: 

1791 raise ValueError("Please specify a file extension to determine export format.") 

1792 format = format[1:] # Strip leading "."" 

1793 elif filename is None: 

1794 filename = f"export.{format}" 

1795 if directory is not None: 

1796 filename = os.path.join(directory, filename) 

1797 formats = self._config["repo_transfer_formats"] 

1798 if format not in formats: 

1799 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

1800 BackendClass = get_class_of(formats[format, "export"]) 

1801 with open(filename, "w") as stream: 

1802 backend = BackendClass(stream, universe=self.dimensions) 

1803 try: 

1804 helper = RepoExportContext( 

1805 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer 

1806 ) 

1807 yield helper 

1808 except BaseException: 

1809 raise 

1810 else: 

1811 helper._finish() 

1812 

1813 def import_( 

1814 self, 

1815 *, 

1816 directory: ResourcePathExpression | None = None, 

1817 filename: ResourcePathExpression | TextIO | None = None, 

1818 format: str | None = None, 

1819 transfer: str | None = None, 

1820 skip_dimensions: set | None = None, 

1821 ) -> None: 

1822 # Docstring inherited. 

1823 if not self.isWriteable(): 

1824 raise TypeError("Butler is read-only.") 

1825 if format is None: 

1826 if filename is None: 

1827 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1828 else: 

1829 _, format = os.path.splitext(filename) # type: ignore 

1830 elif filename is None: 

1831 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

1832 if directory is not None: 

1833 directory = ResourcePath(directory, forceDirectory=True) 

1834 # mypy doesn't think this will work but it does in python >= 3.10. 

1835 if isinstance(filename, ResourcePathExpression): # type: ignore 

1836 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

1837 if not filename.isabs() and directory is not None: 

1838 potential = directory.join(filename) 

1839 exists_in_cwd = filename.exists() 

1840 exists_in_dir = potential.exists() 

1841 if exists_in_cwd and exists_in_dir: 

1842 _LOG.warning( 

1843 "A relative path for filename was specified (%s) which exists relative to cwd. " 

1844 "Additionally, the file exists relative to the given search directory (%s). " 

1845 "Using the export file in the given directory.", 

1846 filename, 

1847 potential, 

1848 ) 

1849 # Given they specified an explicit directory and that 

1850 # directory has the export file in it, assume that that 

1851 # is what was meant despite the file in cwd. 

1852 filename = potential 

1853 elif exists_in_dir: 

1854 filename = potential 

1855 elif not exists_in_cwd and not exists_in_dir: 

1856 # Raise early. 

1857 raise FileNotFoundError( 

1858 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

1859 ) 

1860 BackendClass: type[RepoImportBackend] = get_class_of( 

1861 self._config["repo_transfer_formats"][format]["import"] 

1862 ) 

1863 

1864 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

1865 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg] 

1866 backend.register() 

1867 with self.transaction(): 

1868 backend.load( 

1869 self._datastore, 

1870 directory=directory, 

1871 transfer=transfer, 

1872 skip_dimensions=skip_dimensions, 

1873 ) 

1874 

1875 if isinstance(filename, ResourcePath): 

1876 # We can not use open() here at the moment because of 

1877 # DM-38589 since yaml does stream.read(8192) in a loop. 

1878 stream = io.StringIO(filename.read().decode()) 

1879 doImport(stream) 

1880 else: 

1881 doImport(filename) # type: ignore 

1882 

1883 def transfer_dimension_records_from( 

1884 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef] 

1885 ) -> None: 

1886 # Allowed dimensions in the target butler. 

1887 elements = frozenset( 

1888 element for element in self.dimensions.elements if element.hasTable() and element.viewOf is None 

1889 ) 

1890 

1891 data_ids = {ref.dataId for ref in source_refs} 

1892 

1893 dimension_records = self._extract_all_dimension_records_from_data_ids( 

1894 source_butler, data_ids, elements 

1895 ) 

1896 

1897 # Insert order is important. 

1898 for element in self.dimensions.sorted(dimension_records.keys()): 

1899 records = [r for r in dimension_records[element].values()] 

1900 # Assume that if the record is already present that we can 

1901 # use it without having to check that the record metadata 

1902 # is consistent. 

1903 self._registry.insertDimensionData(element, *records, skip_existing=True) 

1904 _LOG.debug("Dimension '%s' -- number of records transferred: %d", element.name, len(records)) 

1905 

1906 def _extract_all_dimension_records_from_data_ids( 

1907 self, 

1908 source_butler: LimitedButler | Butler, 

1909 data_ids: set[DataCoordinate], 

1910 allowed_elements: frozenset[DimensionElement], 

1911 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]: 

1912 primary_records = self._extract_dimension_records_from_data_ids( 

1913 source_butler, data_ids, allowed_elements 

1914 ) 

1915 

1916 can_query = True if isinstance(source_butler, Butler) else False 

1917 

1918 additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1919 for original_element, record_mapping in primary_records.items(): 

1920 # Get dimensions that depend on this dimension. 

1921 populated_by = self.dimensions.get_elements_populated_by( 

1922 self.dimensions[original_element.name] # type: ignore 

1923 ) 

1924 

1925 for data_id in record_mapping.keys(): 

1926 for element in populated_by: 

1927 if element not in allowed_elements: 

1928 continue 

1929 if element.name == original_element.name: 

1930 continue 

1931 

1932 if element.name in primary_records: 

1933 # If this element has already been stored avoid 

1934 # re-finding records since that may lead to additional 

1935 # spurious records. e.g. visit is populated_by 

1936 # visit_detector_region but querying 

1937 # visit_detector_region by visit will return all the 

1938 # detectors for this visit -- the visit dataId does not 

1939 # constrain this. 

1940 # To constrain the query the original dataIds would 

1941 # have to be scanned. 

1942 continue 

1943 

1944 if not can_query: 

1945 raise RuntimeError( 

1946 f"Transferring populated_by records like {element.name} requires a full Butler." 

1947 ) 

1948 

1949 records = source_butler.registry.queryDimensionRecords( # type: ignore 

1950 element.name, **data_id.mapping # type: ignore 

1951 ) 

1952 for record in records: 

1953 additional_records[record.definition].setdefault(record.dataId, record) 

1954 

1955 # The next step is to walk back through the additional records to 

1956 # pick up any missing content (such as visit_definition needing to 

1957 # know the exposure). Want to ensure we do not request records we 

1958 # already have. 

1959 missing_data_ids = set() 

1960 for name, record_mapping in additional_records.items(): 

1961 for data_id in record_mapping.keys(): 

1962 if data_id not in primary_records[name]: 

1963 missing_data_ids.add(data_id) 

1964 

1965 # Fill out the new records. Assume that these new records do not 

1966 # also need to carry over additional populated_by records. 

1967 secondary_records = self._extract_dimension_records_from_data_ids( 

1968 source_butler, missing_data_ids, allowed_elements 

1969 ) 

1970 

1971 # Merge the extra sets of records in with the original. 

1972 for name, record_mapping in itertools.chain(additional_records.items(), secondary_records.items()): 

1973 primary_records[name].update(record_mapping) 

1974 

1975 return primary_records 

1976 

1977 def _extract_dimension_records_from_data_ids( 

1978 self, 

1979 source_butler: LimitedButler | Butler, 

1980 data_ids: set[DataCoordinate], 

1981 allowed_elements: frozenset[DimensionElement], 

1982 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]: 

1983 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1984 

1985 for data_id in data_ids: 

1986 # Need an expanded record, if not expanded that we need a full 

1987 # butler with registry (allow mocks with registry too). 

1988 if not data_id.hasRecords(): 

1989 if registry := getattr(source_butler, "registry", None): 

1990 data_id = registry.expandDataId(data_id) 

1991 else: 

1992 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

1993 # If this butler doesn't know about a dimension in the source 

1994 # butler things will break later. 

1995 for element_name in data_id.dimensions.elements: 

1996 record = data_id.records[element_name] 

1997 if record is not None and record.definition in allowed_elements: 

1998 dimension_records[record.definition].setdefault(record.dataId, record) 

1999 

2000 return dimension_records 

2001 

2002 def transfer_from( 

2003 self, 

2004 source_butler: LimitedButler, 

2005 source_refs: Iterable[DatasetRef], 

2006 transfer: str = "auto", 

2007 skip_missing: bool = True, 

2008 register_dataset_types: bool = False, 

2009 transfer_dimensions: bool = False, 

2010 ) -> collections.abc.Collection[DatasetRef]: 

2011 # Docstring inherited. 

2012 if not self.isWriteable(): 

2013 raise TypeError("Butler is read-only.") 

2014 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2015 

2016 # Will iterate through the refs multiple times so need to convert 

2017 # to a list if this isn't a collection. 

2018 if not isinstance(source_refs, collections.abc.Collection): 

2019 source_refs = list(source_refs) 

2020 

2021 original_count = len(source_refs) 

2022 _LOG.info("Transferring %d datasets into %s", original_count, str(self)) 

2023 

2024 # In some situations the datastore artifact may be missing 

2025 # and we do not want that registry entry to be imported. 

2026 # Asking datastore is not sufficient, the records may have been 

2027 # purged, we have to ask for the (predicted) URI and check 

2028 # existence explicitly. Execution butler is set up exactly like 

2029 # this with no datastore records. 

2030 artifact_existence: dict[ResourcePath, bool] = {} 

2031 if skip_missing: 

2032 dataset_existence = source_butler._datastore.mexists( 

2033 source_refs, artifact_existence=artifact_existence 

2034 ) 

2035 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2036 filtered_count = len(source_refs) 

2037 n_missing = original_count - filtered_count 

2038 _LOG.verbose( 

2039 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

2040 n_missing, 

2041 "" if n_missing == 1 else "s", 

2042 filtered_count, 

2043 ) 

2044 

2045 # Importing requires that we group the refs by dataset type and run 

2046 # before doing the import. 

2047 source_dataset_types = set() 

2048 grouped_refs = defaultdict(list) 

2049 for ref in source_refs: 

2050 grouped_refs[ref.datasetType, ref.run].append(ref) 

2051 source_dataset_types.add(ref.datasetType) 

2052 

2053 # Check to see if the dataset type in the source butler has 

2054 # the same definition in the target butler and register missing 

2055 # ones if requested. Registration must happen outside a transaction. 

2056 newly_registered_dataset_types = set() 

2057 for datasetType in source_dataset_types: 

2058 if register_dataset_types: 

2059 # Let this raise immediately if inconsistent. Continuing 

2060 # on to find additional inconsistent dataset types 

2061 # might result in additional unwanted dataset types being 

2062 # registered. 

2063 if self._registry.registerDatasetType(datasetType): 

2064 newly_registered_dataset_types.add(datasetType) 

2065 else: 

2066 # If the dataset type is missing, let it fail immediately. 

2067 target_dataset_type = self.get_dataset_type(datasetType.name) 

2068 if target_dataset_type != datasetType: 

2069 raise ConflictingDefinitionError( 

2070 "Source butler dataset type differs from definition" 

2071 f" in target butler: {datasetType} !=" 

2072 f" {target_dataset_type}" 

2073 ) 

2074 if newly_registered_dataset_types: 

2075 # We may have registered some even if there were inconsistencies 

2076 # but should let people know (or else remove them again). 

2077 _LOG.verbose( 

2078 "Registered the following dataset types in the target Butler: %s", 

2079 ", ".join(d.name for d in newly_registered_dataset_types), 

2080 ) 

2081 else: 

2082 _LOG.verbose("All required dataset types are known to the target Butler") 

2083 

2084 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2085 if transfer_dimensions: 

2086 # Collect all the dimension records for these refs. 

2087 # All dimensions are to be copied but the list of valid dimensions 

2088 # come from this butler's universe. 

2089 elements = frozenset( 

2090 element 

2091 for element in self.dimensions.elements 

2092 if element.hasTable() and element.viewOf is None 

2093 ) 

2094 dataIds = {ref.dataId for ref in source_refs} 

2095 dimension_records = self._extract_all_dimension_records_from_data_ids( 

2096 source_butler, dataIds, elements 

2097 ) 

2098 

2099 handled_collections: set[str] = set() 

2100 

2101 # Do all the importing in a single transaction. 

2102 with self.transaction(): 

2103 if dimension_records: 

2104 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.") 

2105 # Order matters. 

2106 for element in self.dimensions.sorted(dimension_records.keys()): 

2107 records = [r for r in dimension_records[element].values()] 

2108 # Assume that if the record is already present that we can 

2109 # use it without having to check that the record metadata 

2110 # is consistent. 

2111 self._registry.insertDimensionData(element, *records, skip_existing=True) 

2112 

2113 n_imported = 0 

2114 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2115 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2116 ): 

2117 if run not in handled_collections: 

2118 # May need to create output collection. If source butler 

2119 # has a registry, ask for documentation string. 

2120 run_doc = None 

2121 if registry := getattr(source_butler, "registry", None): 

2122 run_doc = registry.getCollectionDocumentation(run) 

2123 registered = self._registry.registerRun(run, doc=run_doc) 

2124 handled_collections.add(run) 

2125 if registered: 

2126 _LOG.verbose("Creating output run %s", run) 

2127 

2128 n_refs = len(refs_to_import) 

2129 _LOG.verbose( 

2130 "Importing %d ref%s of dataset type %s into run %s", 

2131 n_refs, 

2132 "" if n_refs == 1 else "s", 

2133 datasetType.name, 

2134 run, 

2135 ) 

2136 

2137 # Assume we are using UUIDs and the source refs will match 

2138 # those imported. 

2139 imported_refs = self._registry._importDatasets(refs_to_import) 

2140 assert set(imported_refs) == set(refs_to_import) 

2141 n_imported += len(imported_refs) 

2142 

2143 assert len(source_refs) == n_imported 

2144 _LOG.verbose("Imported %d datasets into destination butler", n_imported) 

2145 

2146 # Ask the datastore to transfer. The datastore has to check that 

2147 # the source datastore is compatible with the target datastore. 

2148 accepted, rejected = self._datastore.transfer_from( 

2149 source_butler._datastore, 

2150 source_refs, 

2151 transfer=transfer, 

2152 artifact_existence=artifact_existence, 

2153 ) 

2154 if rejected: 

2155 # For now, accept the registry entries but not the files. 

2156 _LOG.warning( 

2157 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2158 len(rejected), 

2159 len(accepted), 

2160 datasetType, 

2161 run, 

2162 ) 

2163 

2164 return source_refs 

2165 

2166 def validateConfiguration( 

2167 self, 

2168 logFailures: bool = False, 

2169 datasetTypeNames: Iterable[str] | None = None, 

2170 ignore: Iterable[str] | None = None, 

2171 ) -> None: 

2172 # Docstring inherited. 

2173 if datasetTypeNames: 

2174 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames] 

2175 else: 

2176 datasetTypes = list(self._registry.queryDatasetTypes()) 

2177 

2178 # filter out anything from the ignore list 

2179 if ignore: 

2180 ignore = set(ignore) 

2181 datasetTypes = [ 

2182 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2183 ] 

2184 else: 

2185 ignore = set() 

2186 

2187 # For each datasetType that has an instrument dimension, create 

2188 # a DatasetRef for each defined instrument 

2189 datasetRefs = [] 

2190 

2191 # Find all the registered instruments (if "instrument" is in the 

2192 # universe). 

2193 if "instrument" in self.dimensions: 

2194 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")} 

2195 

2196 for datasetType in datasetTypes: 

2197 if "instrument" in datasetType.dimensions: 

2198 # In order to create a conforming dataset ref, create 

2199 # fake DataCoordinate values for the non-instrument 

2200 # dimensions. The type of the value does not matter here. 

2201 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"} 

2202 

2203 for instrument in instruments: 

2204 datasetRef = DatasetRef( 

2205 datasetType, 

2206 DataCoordinate.standardize( 

2207 dataId, instrument=instrument, dimensions=datasetType.dimensions 

2208 ), 

2209 run="validate", 

2210 ) 

2211 datasetRefs.append(datasetRef) 

2212 

2213 entities: list[DatasetType | DatasetRef] = [] 

2214 entities.extend(datasetTypes) 

2215 entities.extend(datasetRefs) 

2216 

2217 datastoreErrorStr = None 

2218 try: 

2219 self._datastore.validateConfiguration(entities, logFailures=logFailures) 

2220 except ValidationError as e: 

2221 datastoreErrorStr = str(e) 

2222 

2223 # Also check that the LookupKeys used by the datastores match 

2224 # registry and storage class definitions 

2225 keys = self._datastore.getLookupKeys() 

2226 

2227 failedNames = set() 

2228 failedDataId = set() 

2229 for key in keys: 

2230 if key.name is not None: 

2231 if key.name in ignore: 

2232 continue 

2233 

2234 # skip if specific datasetType names were requested and this 

2235 # name does not match 

2236 if datasetTypeNames and key.name not in datasetTypeNames: 

2237 continue 

2238 

2239 # See if it is a StorageClass or a DatasetType 

2240 if key.name in self.storageClasses: 

2241 pass 

2242 else: 

2243 try: 

2244 self.get_dataset_type(key.name) 

2245 except KeyError: 

2246 if logFailures: 

2247 _LOG.critical( 

2248 "Key '%s' does not correspond to a DatasetType or StorageClass", key 

2249 ) 

2250 failedNames.add(key) 

2251 else: 

2252 # Dimensions are checked for consistency when the Butler 

2253 # is created and rendezvoused with a universe. 

2254 pass 

2255 

2256 # Check that the instrument is a valid instrument 

2257 # Currently only support instrument so check for that 

2258 if key.dataId: 

2259 dataIdKeys = set(key.dataId) 

2260 if {"instrument"} != dataIdKeys: 

2261 if logFailures: 

2262 _LOG.critical("Key '%s' has unsupported DataId override", key) 

2263 failedDataId.add(key) 

2264 elif key.dataId["instrument"] not in instruments: 

2265 if logFailures: 

2266 _LOG.critical("Key '%s' has unknown instrument", key) 

2267 failedDataId.add(key) 

2268 

2269 messages = [] 

2270 

2271 if datastoreErrorStr: 

2272 messages.append(datastoreErrorStr) 

2273 

2274 for failed, msg in ( 

2275 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2276 (failedDataId, "Keys with bad DataId entries: "), 

2277 ): 

2278 if failed: 

2279 msg += ", ".join(str(k) for k in failed) 

2280 messages.append(msg) 

2281 

2282 if messages: 

2283 raise ValidationError(";\n".join(messages)) 

2284 

2285 @property 

2286 def collections(self) -> Sequence[str]: 

2287 """The collections to search by default, in order 

2288 (`~collections.abc.Sequence` [ `str` ]). 

2289 

2290 This is an alias for ``self.registry.defaults.collections``. It cannot 

2291 be set directly in isolation, but all defaults may be changed together 

2292 by assigning a new `RegistryDefaults` instance to 

2293 ``self.registry.defaults``. 

2294 """ 

2295 return self._registry.defaults.collections 

2296 

2297 @property 

2298 def run(self) -> str | None: 

2299 """Name of the run this butler writes outputs to by default (`str` or 

2300 `None`). 

2301 

2302 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2303 directly in isolation, but all defaults may be changed together by 

2304 assigning a new `RegistryDefaults` instance to 

2305 ``self.registry.defaults``. 

2306 """ 

2307 return self._registry.defaults.run 

2308 

2309 @property 

2310 def registry(self) -> Registry: 

2311 """The object that manages dataset metadata and relationships 

2312 (`Registry`). 

2313 

2314 Many operations that don't involve reading or writing butler datasets 

2315 are accessible only via `Registry` methods. Eventually these methods 

2316 will be replaced by equivalent `Butler` methods. 

2317 """ 

2318 return self._registry_shim 

2319 

2320 @property 

2321 def dimensions(self) -> DimensionUniverse: 

2322 # Docstring inherited. 

2323 return self._registry.dimensions 

2324 

2325 @contextlib.contextmanager 

2326 def _query(self) -> Iterator[Query]: 

2327 # Docstring inherited. 

2328 yield DirectQuery(self._registry) 

2329 

2330 def _query_data_ids( 

2331 self, 

2332 dimensions: DimensionGroup | Iterable[str] | str, 

2333 *, 

2334 data_id: DataId | None = None, 

2335 where: str = "", 

2336 bind: Mapping[str, Any] | None = None, 

2337 expanded: bool = False, 

2338 order_by: Iterable[str] | str | None = None, 

2339 limit: int | None = None, 

2340 offset: int | None = None, 

2341 explain: bool = True, 

2342 **kwargs: Any, 

2343 ) -> list[DataCoordinate]: 

2344 # Docstring inherited. 

2345 query = DirectQuery(self._registry) 

2346 result = query.data_ids(dimensions, data_id=data_id, where=where, bind=bind, **kwargs) 

2347 if expanded: 

2348 result = result.expanded() 

2349 if order_by: 

2350 result = result.order_by(*ensure_iterable(order_by)) 

2351 if limit is not None: 

2352 result = result.limit(limit, offset) 

2353 else: 

2354 if offset is not None: 

2355 raise TypeError("offset is specified without limit") 

2356 data_ids = list(result) 

2357 if explain and not data_ids: 

2358 raise EmptyQueryResultError(list(result.explain_no_results())) 

2359 return data_ids 

2360 

2361 def _query_datasets( 

2362 self, 

2363 dataset_type: Any, 

2364 collections: CollectionArgType | None = None, 

2365 *, 

2366 find_first: bool = True, 

2367 data_id: DataId | None = None, 

2368 where: str = "", 

2369 bind: Mapping[str, Any] | None = None, 

2370 expanded: bool = False, 

2371 explain: bool = True, 

2372 **kwargs: Any, 

2373 ) -> list[DatasetRef]: 

2374 # Docstring inherited. 

2375 query = DirectQuery(self._registry) 

2376 result = query.datasets( 

2377 dataset_type, 

2378 collections, 

2379 find_first=find_first, 

2380 data_id=data_id, 

2381 where=where, 

2382 bind=bind, 

2383 **kwargs, 

2384 ) 

2385 if expanded: 

2386 result = result.expanded() 

2387 refs = list(result) 

2388 if explain and not refs: 

2389 raise EmptyQueryResultError(list(result.explain_no_results())) 

2390 return refs 

2391 

2392 def _query_dimension_records( 

2393 self, 

2394 element: str, 

2395 *, 

2396 data_id: DataId | None = None, 

2397 where: str = "", 

2398 bind: Mapping[str, Any] | None = None, 

2399 order_by: Iterable[str] | str | None = None, 

2400 limit: int | None = None, 

2401 offset: int | None = None, 

2402 explain: bool = True, 

2403 **kwargs: Any, 

2404 ) -> list[DimensionRecord]: 

2405 # Docstring inherited. 

2406 query = DirectQuery(self._registry) 

2407 result = query.dimension_records(element, data_id=data_id, where=where, bind=bind, **kwargs) 

2408 if order_by: 

2409 result = result.order_by(*ensure_iterable(order_by)) 

2410 if limit is not None: 

2411 result = result.limit(limit, offset) 

2412 else: 

2413 if offset is not None: 

2414 raise TypeError("offset is specified without limit") 

2415 data_ids = list(result) 

2416 if explain and not data_ids: 

2417 raise EmptyQueryResultError(list(result.explain_no_results())) 

2418 return data_ids 

2419 

2420 _registry: SqlRegistry 

2421 """The object that manages dataset metadata and relationships 

2422 (`SqlRegistry`). 

2423 

2424 Most operations that don't involve reading or writing butler datasets are 

2425 accessible only via `SqlRegistry` methods. 

2426 """ 

2427 

2428 datastore: Datastore 

2429 """The object that manages actual dataset storage (`Datastore`). 

2430 

2431 Direct user access to the datastore should rarely be necessary; the primary 

2432 exception is the case where a `Datastore` implementation provides extra 

2433 functionality beyond what the base class defines. 

2434 """ 

2435 

2436 storageClasses: StorageClassFactory 

2437 """An object that maps known storage class names to objects that fully 

2438 describe them (`StorageClassFactory`). 

2439 """