Coverage for python/lsst/daf/butler/direct_butler.py: 10%

789 statements  

coverage.py v7.4.1, created at 2024-02-01 11:20 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Butler top level classes. 

29""" 

30from __future__ import annotations 

31 

32__all__ = ( 

33 "DirectButler", 

34 "ButlerValidationError", 

35) 

36 

37import collections.abc 

38import contextlib 

39import io 

40import itertools 

41import logging 

42import numbers 

43import os 

44import warnings 

45from collections import Counter, defaultdict 

46from collections.abc import Iterable, Iterator, Mapping, MutableMapping, Sequence 

47from typing import TYPE_CHECKING, Any, ClassVar, TextIO, cast 

48 

49from lsst.resources import ResourcePath, ResourcePathExpression 

50from lsst.utils.introspection import get_class_of 

51from lsst.utils.iteration import ensure_iterable 

52from lsst.utils.logging import VERBOSE, getLogger 

53from sqlalchemy.exc import IntegrityError 

54 

55from ._butler import Butler 

56from ._butler_config import ButlerConfig 

57from ._butler_instance_options import ButlerInstanceOptions 

58from ._dataset_existence import DatasetExistence 

59from ._dataset_ref import DatasetRef 

60from ._dataset_type import DatasetType 

61from ._deferredDatasetHandle import DeferredDatasetHandle 

62from ._exceptions import EmptyQueryResultError, ValidationError 

63from ._limited_butler import LimitedButler 

64from ._registry_shim import RegistryShim 

65from ._storage_class import StorageClass, StorageClassFactory 

66from ._timespan import Timespan 

67from .datastore import Datastore, NullDatastore 

68from .dimensions import DataCoordinate, Dimension 

69from .direct_query import DirectQuery 

70from .progress import Progress 

71from .registry import ( 

72 CollectionType, 

73 ConflictingDefinitionError, 

74 DataIdError, 

75 MissingDatasetTypeError, 

76 NoDefaultCollectionError, 

77 RegistryDefaults, 

78 _RegistryFactory, 

79) 

80from .registry.sql_registry import SqlRegistry 

81from .transfers import RepoExportContext 

82from .utils import transactional 

83 

84if TYPE_CHECKING: 

85 from lsst.resources import ResourceHandleProtocol 

86 

87 from ._dataset_ref import DatasetId 

88 from ._file_dataset import FileDataset 

89 from ._query import Query 

90 from .datastore import DatasetRefURIs 

91 from .dimensions import ( 

92 DataId, 

93 DataIdValue, 

94 DimensionElement, 

95 DimensionGroup, 

96 DimensionRecord, 

97 DimensionUniverse, 

98 ) 

99 from .registry import CollectionArgType, Registry 

100 from .transfers import RepoImportBackend 

101 

102_LOG = getLogger(__name__) 

103 

104 

105class ButlerValidationError(ValidationError): 

106 """There is a problem with the Butler configuration.""" 

107 

108 pass 

109 

110 

111class DirectButler(Butler): # numpydoc ignore=PR02 

112 """Main entry point for the data access system. 

113 

114 Parameters 

115 ---------- 

116 config : `ButlerConfig` 

117 The configuration for this Butler instance. 

118 registry : `SqlRegistry` 

119 The object that manages dataset metadata and relationships. 

120 datastore : `Datastore`

121 The object that manages actual dataset storage. 

122 storageClasses : `StorageClassFactory`

123 An object that maps known storage class names to objects that fully 

124 describe them. 

125 

126 Notes 

127 ----- 

128 Most users should call the top-level `Butler`.``from_config`` instead of 

129 using this constructor directly. 

130 """ 

131 

132 # This is __new__ instead of __init__ because we have to support 

133 # instantiation via the legacy constructor Butler.__new__(), which 

134 # reads the configuration and selects which subclass to instantiate. The 

135 # interaction between __new__ and __init__ is kind of wacky in Python. If 

136 # we were using __init__ here, __init__ would be called twice (once when 

137 # the DirectButler instance is constructed inside Butler.from_config(), and 

138 # a second time with the original arguments to Butler() when the instance 

139 # is returned from Butler.__new__()).

140 def __new__( 

141 cls, 

142 *, 

143 config: ButlerConfig, 

144 registry: SqlRegistry, 

145 datastore: Datastore, 

146 storageClasses: StorageClassFactory, 

147 ) -> DirectButler: 

148 self = cast(DirectButler, super().__new__(cls)) 

149 self._config = config 

150 self._registry = registry 

151 self._datastore = datastore 

152 self.storageClasses = storageClasses 

153 

154 # For an execution butler the datastore needs a special

155 # dependency-inversion trick. This is not used by a regular butler,

156 # but we do not have a way to distinguish a regular butler from an

157 # execution butler.

158 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

159 

160 self._registry_shim = RegistryShim(self) 

161 

162 return self 

163 
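As the class docstring notes, most users reach this constructor indirectly. A minimal sketch of the usual entry point; the repository path and collection name are hypothetical:

    from lsst.daf.butler import Butler

    # Reads the repository configuration and returns the appropriate Butler
    # subclass, typically a DirectButler for a repository with a SQL registry.
    butler = Butler.from_config(
        "/path/to/repo", collections=["my/collection"], writeable=False
    )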

164 @classmethod 

165 def create_from_config( 

166 cls, 

167 config: ButlerConfig, 

168 *, 

169 options: ButlerInstanceOptions, 

170 without_datastore: bool = False, 

171 ) -> DirectButler: 

172 """Construct a Butler instance from a configuration file. 

173 

174 Parameters 

175 ---------- 

176 config : `ButlerConfig` 

177 The configuration for this Butler instance. 

178 options : `ButlerInstanceOptions` 

179 Default values and other settings for the Butler instance. 

180 without_datastore : `bool`, optional 

181 If `True` do not attach a datastore to this butler. Any attempts 

182 to use a datastore will fail. 

183 

184 Notes 

185 ----- 

186 Most users should call the top-level `Butler`.``from_config`` 

187 instead of using this function directly. 

188 """ 

189 if "run" in config or "collection" in config: 

190 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

191 

192 defaults = RegistryDefaults( 

193 collections=options.collections, run=options.run, infer=options.inferDefaults, **options.kwargs 

194 ) 

195 try: 

196 butlerRoot = config.get("root", config.configDir) 

197 writeable = options.writeable 

198 if writeable is None: 

199 writeable = options.run is not None 

200 registry = _RegistryFactory(config).from_config( 

201 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

202 ) 

203 if without_datastore: 

204 datastore: Datastore = NullDatastore(None, None) 

205 else: 

206 datastore = Datastore.fromConfig( 

207 config, registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

208 ) 

209 # TODO: Once datastore drops dependency on registry we can 

210 # construct datastore first and pass opaque tables to registry 

211 # constructor. 

212 registry.make_datastore_tables(datastore.get_opaque_table_definitions()) 

213 storageClasses = StorageClassFactory() 

214 storageClasses.addFromConfig(config) 

215 

216 return DirectButler( 

217 config=config, registry=registry, datastore=datastore, storageClasses=storageClasses 

218 ) 

219 except Exception: 

220 # Failures here usually mean that the configuration is incomplete,

221 # so just issue an error message that includes the config file URI.

222 _LOG.error(f"Failed to instantiate Butler from config {config.configFile}.") 

223 raise 

224 

225 def _clone( 

226 self, 

227 *, 

228 collections: Any = None, 

229 run: str | None = None, 

230 inferDefaults: bool = True, 

231 **kwargs: Any, 

232 ) -> DirectButler: 

233 # Docstring inherited 

234 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

235 registry = self._registry.copy(defaults) 

236 

237 return DirectButler( 

238 registry=registry, 

239 config=self._config, 

240 datastore=self._datastore.clone(registry.getDatastoreBridgeManager()), 

241 storageClasses=self.storageClasses, 

242 ) 

243 

244 GENERATION: ClassVar[int] = 3 

245 """This is a Generation 3 Butler. 

246 

247 This attribute may be removed in the future, once the Generation 2 Butler 

248 interface has been fully retired; it should only be used in transitional 

249 code. 

250 """ 

251 

252 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

253 """Return DatasetType defined in registry given dataset type name.""" 

254 try: 

255 return self.get_dataset_type(name) 

256 except MissingDatasetTypeError: 

257 return None 

258 

259 @classmethod 

260 def _unpickle( 

261 cls, 

262 config: ButlerConfig, 

263 collections: tuple[str, ...] | None, 

264 run: str | None, 

265 defaultDataId: dict[str, str], 

266 writeable: bool, 

267 ) -> DirectButler: 

268 """Callable used to unpickle a Butler. 

269 

270 We prefer not to use ``Butler.__init__`` directly so we can force some 

271 of its many arguments to be keyword-only (note that ``__reduce__`` 

272 can only invoke callables with positional arguments). 

273 

274 Parameters 

275 ---------- 

276 config : `ButlerConfig` 

277 Butler configuration, already coerced into a true `ButlerConfig` 

278 instance (and hence after any search paths for overrides have been 

279 utilized). 

280 collections : `tuple` [ `str` ] 

281 Names of the default collections to read from. 

282 run : `str`, optional 

283 Name of the default `~CollectionType.RUN` collection to write to. 

284 defaultDataId : `dict` [ `str`, `str` ] 

285 Default data ID values. 

286 writeable : `bool` 

287 Whether the Butler should support write operations. 

288 

289 Returns 

290 ------- 

291 butler : `Butler` 

292 A new `Butler` instance. 

293 """ 

294 return cls.create_from_config( 

295 config=config, 

296 options=ButlerInstanceOptions( 

297 collections=collections, run=run, writeable=writeable, kwargs=defaultDataId 

298 ), 

299 ) 

300 

301 def __reduce__(self) -> tuple: 

302 """Support pickling.""" 

303 return ( 

304 DirectButler._unpickle, 

305 ( 

306 self._config, 

307 self.collections, 

308 self.run, 

309 dict(self._registry.defaults.dataId.required), 

310 self._registry.isWriteable(), 

311 ), 

312 ) 

313 
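A sketch of the round trip that ``__reduce__`` and ``_unpickle`` enable; ``butler`` is assumed to be an existing `DirectButler` instance:

    import pickle

    # The clone is rebuilt from the configuration plus the saved defaults
    # (collections, run, default data ID, writeability).
    restored = pickle.loads(pickle.dumps(butler))
    assert list(restored.collections) == list(butler.collections)
    assert restored.isWriteable() == butler.isWriteable()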

314 def __str__(self) -> str: 

315 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

316 self.collections, self.run, self._datastore, self._registry 

317 ) 

318 

319 def isWriteable(self) -> bool: 

320 # Docstring inherited. 

321 return self._registry.isWriteable() 

322 

323 def _caching_context(self) -> contextlib.AbstractContextManager[None]: 

324 """Context manager that enables caching.""" 

325 return self._registry.caching_context() 

326 

327 @contextlib.contextmanager 

328 def transaction(self) -> Iterator[None]: 

329 """Context manager supporting `Butler` transactions. 

330 

331 Transactions can be nested. 

332 """ 

333 with self._registry.transaction(), self._datastore.transaction(): 

334 yield 

335 
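A sketch of how the combined registry/datastore transaction is used; the in-memory objects, dataset type names, data ID values, and run are hypothetical:

    # If any exception escapes the outer block, both registry and datastore
    # changes are rolled back; nesting is allowed.
    with butler.transaction():
        butler.put(catalog, "sourceTable", visit=123, instrument="MyCam", run="my/run")
        with butler.transaction():
            butler.put(summary, "visitSummary", visit=123, instrument="MyCam", run="my/run")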

336 def _standardizeArgs( 

337 self, 

338 datasetRefOrType: DatasetRef | DatasetType | str, 

339 dataId: DataId | None = None, 

340 for_put: bool = True, 

341 **kwargs: Any, 

342 ) -> tuple[DatasetType, DataId | None]: 

343 """Standardize the arguments passed to several Butler APIs. 

344 

345 Parameters 

346 ---------- 

347 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

348 When `DatasetRef` the `dataId` should be `None`. 

349 Otherwise the `DatasetType` or name thereof. 

350 dataId : `dict` or `DataCoordinate` 

351 A `dict` of `Dimension` link name, value pairs that label the 

352 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

353 should be provided as the first argument.

354 for_put : `bool`, optional 

355 If `True` this call is invoked as part of a `Butler.put()`. 

356 Otherwise it is assumed to be part of a `Butler.get()`. This 

357 parameter is only relevant if there is dataset type 

358 inconsistency. 

359 **kwargs 

360 Additional keyword arguments used to augment or construct a 

361 `DataCoordinate`. See `DataCoordinate.standardize` 

362 parameters. 

363 

364 Returns 

365 ------- 

366 datasetType : `DatasetType` 

367 A `DatasetType` instance extracted from ``datasetRefOrType``. 

368 dataId : `dict` or `DataId`, optional 

369 Argument that can be used (along with ``kwargs``) to construct a 

370 `DataId`. 

371 

372 Notes 

373 ----- 

374 Butler APIs that conceptually need a DatasetRef also allow passing a 

375 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

376 keyword arguments that can be used to construct one) separately. This 

377 method accepts those arguments and always returns a true `DatasetType` 

378 and a `DataId` or `dict`. 

379 

380 Standardization of `dict` vs `DataId` is best handled by passing the 

381 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

382 generally similarly flexible. 

383 """ 

384 externalDatasetType: DatasetType | None = None 

385 internalDatasetType: DatasetType | None = None 

386 if isinstance(datasetRefOrType, DatasetRef): 

387 if dataId is not None or kwargs: 

388 raise ValueError("DatasetRef given, cannot use dataId as well") 

389 externalDatasetType = datasetRefOrType.datasetType 

390 dataId = datasetRefOrType.dataId 

391 else: 

392 # Don't check whether DataId is provided, because Registry APIs 

393 # can usually construct a better error message when it wasn't. 

394 if isinstance(datasetRefOrType, DatasetType): 

395 externalDatasetType = datasetRefOrType 

396 else: 

397 internalDatasetType = self.get_dataset_type(datasetRefOrType) 

398 

399 # Check that they are self-consistent 

400 if externalDatasetType is not None: 

401 internalDatasetType = self.get_dataset_type(externalDatasetType.name) 

402 if externalDatasetType != internalDatasetType: 

403 # We can allow differences if they are compatible, depending 

404 # on whether this is a get or a put. A get requires that 

405 # the python type associated with the datastore can be 

406 # converted to the user type. A put requires that the user 

407 # supplied python type can be converted to the internal 

408 # type expected by registry. 

409 relevantDatasetType = internalDatasetType 

410 if for_put: 

411 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

412 else: 

413 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

414 relevantDatasetType = externalDatasetType 

415 if not is_compatible: 

416 raise ValueError( 

417 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

418 f"registry definition ({internalDatasetType})" 

419 ) 

420 # Override the internal definition. 

421 internalDatasetType = relevantDatasetType 

422 

423 assert internalDatasetType is not None 

424 return internalDatasetType, dataId 

425 

426 def _rewrite_data_id( 

427 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any 

428 ) -> tuple[DataId | None, dict[str, Any]]: 

429 """Rewrite a data ID taking into account dimension records. 

430 

431 Take a Data ID and keyword args and rewrite it if necessary to 

432 allow the user to specify dimension records rather than dimension 

433 primary values. 

434 

435 This allows a user to include a dataId dict with keys of 

436 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

437 the integer exposure ID. It also allows a string to be given 

438 for a dimension value rather than the integer ID if that is more 

439 convenient. For example, rather than having to specify the

440 detector with ``detector.full_name``, a string given for ``detector`` 

441 will be interpreted as the full name and converted to the integer 

442 value. 

443 

444 Keyword arguments can also use strings for dimensions like detector 

445 and exposure but python does not allow them to include ``.`` and 

446 so the ``exposure.day_obs`` syntax can not be used in a keyword 

447 argument. 

448 

449 Parameters 

450 ---------- 

451 dataId : `dict` or `DataCoordinate` 

452 A `dict` of `Dimension` link name, value pairs that will label the 

453 `DatasetRef` within a Collection. 

454 datasetType : `DatasetType` 

455 The dataset type associated with this dataId. Required to 

456 determine the relevant dimensions. 

457 **kwargs 

458 Additional keyword arguments used to augment or construct a 

459 `DataId`. See `DataId` parameters. 

460 

461 Returns 

462 ------- 

463 dataId : `dict` or `DataCoordinate` 

464 The possibly rewritten dataId. If given a `DataCoordinate` and

465 no keyword arguments, the original dataId will be returned 

466 unchanged. 

467 **kwargs : `dict` 

468 Any unused keyword arguments (normally an empty dict).

469 """ 

470 # Do nothing if we have a standalone DataCoordinate. 

471 if isinstance(dataId, DataCoordinate) and not kwargs: 

472 return dataId, kwargs 

473 

474 # Process dimension records that are using record information 

475 # rather than ids 

476 newDataId: dict[str, DataIdValue] = {} 

477 byRecord: dict[str, dict[str, Any]] = defaultdict(dict) 

478 

479 # If all of the dataId comes from keyword parameters we do not need

480 # to do anything here, because the keys cannot be of the form

481 # exposure.obs_id: a "." is not allowed in a keyword parameter.

482 if dataId: 

483 for k, v in dataId.items(): 

484 # If we have a Dimension we do not need to do anything 

485 # because it cannot be a compound key. 

486 if isinstance(k, str) and "." in k: 

487 # Someone is using a more human-readable dataId 

488 dimensionName, record = k.split(".", 1) 

489 byRecord[dimensionName][record] = v 

490 elif isinstance(k, Dimension): 

491 newDataId[k.name] = v 

492 else: 

493 newDataId[k] = v 

494 

495 # Go through the updated dataId and check the type in case someone is 

496 # using an alternate key. We have already filtered out the compound 

497 # keys dimensions.record format. 

498 not_dimensions = {} 

499 

500 # Will need to look in the dataId and the keyword arguments 

501 # and will remove them if they need to be fixed or are unrecognized. 

502 for dataIdDict in (newDataId, kwargs): 

503 # Use a list so we can adjust the dict safely in the loop 

504 for dimensionName in list(dataIdDict): 

505 value = dataIdDict[dimensionName] 

506 try: 

507 dimension = self.dimensions.dimensions[dimensionName] 

508 except KeyError: 

509 # This is not a real dimension 

510 not_dimensions[dimensionName] = value 

511 del dataIdDict[dimensionName] 

512 continue 

513 

514 # Convert an integral type to an explicit int to simplify 

515 # comparisons here 

516 if isinstance(value, numbers.Integral): 

517 value = int(value) 

518 

519 if not isinstance(value, dimension.primaryKey.getPythonType()): 

520 for alternate in dimension.alternateKeys: 

521 if isinstance(value, alternate.getPythonType()): 

522 byRecord[dimensionName][alternate.name] = value 

523 del dataIdDict[dimensionName] 

524 _LOG.debug( 

525 "Converting dimension %s to %s.%s=%s", 

526 dimensionName, 

527 dimensionName, 

528 alternate.name, 

529 value, 

530 ) 

531 break 

532 else: 

533 _LOG.warning( 

534 "Type mismatch found for value '%r' provided for dimension %s. " 

535 "Could not find matching alternative (primary key has type %s) " 

536 "so attempting to use as-is.", 

537 value, 

538 dimensionName, 

539 dimension.primaryKey.getPythonType(), 

540 ) 

541 

542 # By this point kwargs and newDataId should only include valid 

543 # dimensions. Merge kwargs in to the new dataId and log if there 

544 # are dimensions in both (rather than calling update). 

545 for k, v in kwargs.items(): 

546 if k in newDataId and newDataId[k] != v: 

547 _LOG.debug( 

548 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

549 ) 

550 newDataId[k] = v 

551 # No need to retain any values in kwargs now. 

552 kwargs = {} 

553 

554 # If we have some unrecognized dimensions we have to try to connect 

555 # them to records in other dimensions. This is made more complicated 

556 # by some dimensions having records with clashing names. A mitigation 

557 # is that we can tell by this point which dimensions are missing 

558 # for the DatasetType but this does not work for calibrations 

559 # where additional dimensions can be used to constrain the temporal 

560 # axis. 

561 if not_dimensions: 

562 # Search for all dimensions even if we have been given a value 

563 # explicitly. In some cases records are given as well as the 

564 # actual dimension and this should not be an error if they

565 # match. 

566 mandatoryDimensions = datasetType.dimensions.names # - provided 

567 

568 candidateDimensions: set[str] = set() 

569 candidateDimensions.update(mandatoryDimensions) 

570 

571 # For calibrations we may well be needing temporal dimensions 

572 # so rather than always including all dimensions in the scan,

573 # restrict things a little. It is still possible for there

574 # to be confusion over day_obs in visit vs exposure, for example.

575 # If we are not searching calibration collections things may

576 # fail, but they are going to fail anyway because of the

577 # ambiguity of the dataId...

578 if datasetType.isCalibration(): 

579 for dim in self.dimensions.dimensions: 

580 if dim.temporal: 

581 candidateDimensions.add(str(dim)) 

582 

583 # Look up table for the first association with a dimension 

584 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) 

585 

586 # Keep track of whether an item is associated with multiple 

587 # dimensions. 

588 counter: Counter[str] = Counter() 

589 assigned: dict[str, set[str]] = defaultdict(set) 

590 

591 # Go through the missing dimensions and associate the 

592 # given names with records within those dimensions 

593 matched_dims = set() 

594 for dimensionName in candidateDimensions: 

595 dimension = self.dimensions.dimensions[dimensionName] 

596 fields = dimension.metadata.names | dimension.uniqueKeys.names 

597 for field in not_dimensions: 

598 if field in fields: 

599 guessedAssociation[dimensionName][field] = not_dimensions[field] 

600 counter[dimensionName] += 1 

601 assigned[field].add(dimensionName) 

602 matched_dims.add(field) 

603 

604 # Calculate the fields that matched nothing. 

605 never_found = set(not_dimensions) - matched_dims 

606 

607 if never_found: 

608 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

609 

610 # There is a chance we have allocated a single dataId item 

611 # to multiple dimensions. Need to decide which should be retained. 

612 # For now assume that the most popular alternative wins. 

613 # This means that day_obs with seq_num will result in 

614 # exposure.day_obs and not visit.day_obs 

615 # Also prefer an explicitly missing dimension over an inferred 

616 # temporal dimension. 

617 for fieldName, assignedDimensions in assigned.items(): 

618 if len(assignedDimensions) > 1: 

619 # Pick the most popular (preferring mandatory dimensions) 

620 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

621 if requiredButMissing: 

622 candidateDimensions = requiredButMissing 

623 else: 

624 candidateDimensions = assignedDimensions 

625 

626 # If this is a choice between visit and exposure and 

627 # neither was a required part of the dataset type, 

628 # (hence in this branch) always prefer exposure over 

629 # visit since exposures are always defined and visits 

630 # are defined from exposures. 

631 if candidateDimensions == {"exposure", "visit"}: 

632 candidateDimensions = {"exposure"} 

633 

634 # Select the relevant items and get a new restricted 

635 # counter. 

636 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

637 duplicatesCounter: Counter[str] = Counter() 

638 duplicatesCounter.update(theseCounts) 

639 

640 # Choose the most common. If they are equally common 

641 # we will pick the one that was found first. 

642 # Returns a list of tuples 

643 selected = duplicatesCounter.most_common(1)[0][0] 

644 

645 _LOG.debug( 

646 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

647 " Removed ambiguity by choosing dimension %s.", 

648 fieldName, 

649 ", ".join(assignedDimensions), 

650 selected, 

651 ) 

652 

653 for candidateDimension in assignedDimensions: 

654 if candidateDimension != selected: 

655 del guessedAssociation[candidateDimension][fieldName] 

656 

657 # Update the record look up dict with the new associations 

658 for dimensionName, values in guessedAssociation.items(): 

659 if values: # A dict might now be empty 

660 _LOG.debug( 

661 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values 

662 ) 

663 byRecord[dimensionName].update(values) 

664 

665 if byRecord: 

666 # Some record specifiers were found so we need to convert 

667 # them to the Id form 

668 for dimensionName, values in byRecord.items(): 

669 if dimensionName in newDataId: 

670 _LOG.debug( 

671 "DataId specified explicit %s dimension value of %s in addition to" 

672 " general record specifiers for it of %s. Ignoring record information.", 

673 dimensionName, 

674 newDataId[dimensionName], 

675 str(values), 

676 ) 

677 # Get the actual record and compare with these values. 

678 try: 

679 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

680 except DataIdError: 

681 raise ValueError( 

682 f"Could not find dimension '{dimensionName}'" 

683 f" with dataId {newDataId} as part of comparing with" 

684 f" record values {byRecord[dimensionName]}" 

685 ) from None 

686 if len(recs) == 1: 

687 errmsg: list[str] = [] 

688 for k, v in values.items(): 

689 if (recval := getattr(recs[0], k)) != v: 

690 errmsg.append(f"{k}({recval} != {v})") 

691 if errmsg: 

692 raise ValueError( 

693 f"Dimension {dimensionName} in dataId has explicit value" 

694 " inconsistent with records: " + ", ".join(errmsg) 

695 ) 

696 else: 

697 # Multiple matches for an explicit dimension 

698 # should never happen but let downstream complain. 

699 pass 

700 continue 

701 

702 # Build up a WHERE expression 

703 bind = dict(values.items()) 

704 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

705 

706 # Hopefully we get a single record that matches 

707 records = set( 

708 self._registry.queryDimensionRecords( 

709 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

710 ) 

711 ) 

712 

713 if len(records) != 1: 

714 if len(records) > 1: 

715 # visit can have an ambiguous answer without involving 

716 # visit_system. The default visit_system is defined 

717 # by the instrument. 

718 if ( 

719 dimensionName == "visit" 

720 and "visit_system_membership" in self.dimensions 

721 and "visit_system" in self.dimensions["instrument"].metadata 

722 ): 

723 instrument_records = list( 

724 self._registry.queryDimensionRecords( 

725 "instrument", 

726 dataId=newDataId, 

727 **kwargs, 

728 ) 

729 ) 

730 if len(instrument_records) == 1: 

731 visit_system = instrument_records[0].visit_system 

732 if visit_system is None: 

733 # Set to a value that will never match. 

734 visit_system = -1 

735 

736 # Look up each visit in the 

737 # visit_system_membership records. 

738 for rec in records: 

739 membership = list( 

740 self._registry.queryDimensionRecords( 

741 # Use bind to allow zero results. 

742 # This is a fully-specified query. 

743 "visit_system_membership", 

744 where="instrument = inst AND visit_system = system AND visit = v", 

745 bind=dict( 

746 inst=instrument_records[0].name, system=visit_system, v=rec.id 

747 ), 

748 ) 

749 ) 

750 if membership: 

751 # This record is the right answer. 

752 records = {rec} 

753 break 

754 

755 # The ambiguity may have been resolved so check again. 

756 if len(records) > 1: 

757 _LOG.debug( 

758 "Received %d records from constraints of %s", len(records), str(values) 

759 ) 

760 for r in records: 

761 _LOG.debug("- %s", str(r)) 

762 raise ValueError( 

763 f"DataId specification for dimension {dimensionName} is not" 

764 f" uniquely constrained to a single dataset by {values}." 

765 f" Got {len(records)} results." 

766 ) 

767 else: 

768 raise ValueError( 

769 f"DataId specification for dimension {dimensionName} matched no" 

770 f" records when constrained by {values}" 

771 ) 

772 

773 # Get the primary key from the real dimension object 

774 dimension = self.dimensions.dimensions[dimensionName] 

775 if not isinstance(dimension, Dimension): 

776 raise RuntimeError( 

777 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

778 ) 

779 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

780 

781 return newDataId, kwargs 

782 
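A sketch of the data ID forms ``_rewrite_data_id`` accepts through the public read/write methods; the instrument, dataset type, and values are hypothetical:

    # Record-style keys must go in the dataId dict because "." is not legal in
    # a keyword argument; the string detector value is matched against the
    # alternate key (full_name) and converted to the integer primary key.
    raw = butler.get(
        "raw",
        {
            "instrument": "MyCam",
            "exposure.day_obs": 20240201,
            "exposure.seq_num": 17,
            "detector": "R11_S02",
        },
    )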

783 def _findDatasetRef( 

784 self, 

785 datasetRefOrType: DatasetRef | DatasetType | str, 

786 dataId: DataId | None = None, 

787 *, 

788 collections: Any = None, 

789 predict: bool = False, 

790 run: str | None = None, 

791 datastore_records: bool = False, 

792 **kwargs: Any, 

793 ) -> DatasetRef: 

794 """Shared logic for methods that start with a search for a dataset in 

795 the registry. 

796 

797 Parameters 

798 ---------- 

799 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

800 When `DatasetRef` the `dataId` should be `None`. 

801 Otherwise the `DatasetType` or name thereof. 

802 dataId : `dict` or `DataCoordinate`, optional 

803 A `dict` of `Dimension` link name, value pairs that label the 

804 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

805 should be provided as the first argument. 

806 collections : Any, optional 

807 Collections to be searched, overriding ``self.collections``. 

808 Can be any of the types supported by the ``collections`` argument 

809 to butler construction. 

810 predict : `bool`, optional 

811 If `True`, return a newly created `DatasetRef` with a unique 

812 dataset ID if finding a reference in the `Registry` fails. 

813 Defaults to `False`. 

814 run : `str`, optional 

815 Run collection name to use for creating `DatasetRef` for predicted 

816 datasets. Only used if ``predict`` is `True`. 

817 datastore_records : `bool`, optional 

818 If `True` add datastore records to returned `DatasetRef`. 

819 **kwargs 

820 Additional keyword arguments used to augment or construct a 

821 `DataId`. See `DataId` parameters. 

822 

823 Returns 

824 ------- 

825 ref : `DatasetRef` 

826 A reference to the dataset identified by the given arguments. 

827 This can be the same dataset reference as given if it was 

828 resolved. 

829 

830 Raises 

831 ------ 

832 LookupError 

833 Raised if no matching dataset exists in the `Registry` (and 

834 ``predict`` is `False`). 

835 ValueError 

836 Raised if a resolved `DatasetRef` was passed as an input, but it 

837 differs from the one found in the registry. 

838 TypeError 

839 Raised if no collections were provided. 

840 """ 

841 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

842 if isinstance(datasetRefOrType, DatasetRef): 

843 if collections is not None: 

844 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) 

845 # May need to retrieve datastore records if requested. 

846 if datastore_records and datasetRefOrType._datastore_records is None: 

847 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType) 

848 return datasetRefOrType 

849 timespan: Timespan | None = None 

850 

851 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

852 

853 if datasetType.isCalibration(): 

854 # Because this is a calibration dataset, first try to

855 # standardize the data ID without restricting the dimensions to

856 # those of the dataset type requested, because there may be extra 

857 # dimensions that provide temporal information for a validity-range 

858 # lookup. 

859 dataId = DataCoordinate.standardize( 

860 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

861 ) 

862 if dataId.dimensions.temporal: 

863 dataId = self._registry.expandDataId(dataId) 

864 timespan = dataId.timespan 

865 else: 

866 # Standardize the data ID to just the dimensions of the dataset 

867 # type instead of letting registry.findDataset do it, so we get the 

868 # result even if no dataset is found. 

869 dataId = DataCoordinate.standardize( 

870 dataId, 

871 dimensions=datasetType.dimensions, 

872 defaults=self._registry.defaults.dataId, 

873 **kwargs, 

874 ) 

875 # Always lookup the DatasetRef, even if one is given, to ensure it is 

876 # present in the current collection. 

877 ref = self.find_dataset( 

878 datasetType, 

879 dataId, 

880 collections=collections, 

881 timespan=timespan, 

882 datastore_records=datastore_records, 

883 ) 

884 if ref is None: 

885 if predict: 

886 if run is None: 

887 run = self.run 

888 if run is None: 

889 raise TypeError("Cannot predict dataset ID/location with run=None.") 

890 return DatasetRef(datasetType, dataId, run=run) 

891 else: 

892 if collections is None: 

893 collections = self._registry.defaults.collections 

894 raise LookupError( 

895 f"Dataset {datasetType.name} with data ID {dataId} " 

896 f"could not be found in collections {collections}." 

897 ) 

898 if datasetType != ref.datasetType: 

899 # If they differ it is because the user explicitly specified 

900 # a compatible dataset type to this call rather than using the 

901 # registry definition. The DatasetRef must therefore be recreated 

902 # using the user definition such that the expected type is 

903 # returned. 

904 ref = DatasetRef( 

905 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records 

906 ) 

907 

908 return ref 

909 

910 @transactional 

911 def put( 

912 self, 

913 obj: Any, 

914 datasetRefOrType: DatasetRef | DatasetType | str, 

915 /, 

916 dataId: DataId | None = None, 

917 *, 

918 run: str | None = None, 

919 **kwargs: Any, 

920 ) -> DatasetRef: 

921 """Store and register a dataset. 

922 

923 Parameters 

924 ---------- 

925 obj : `object` 

926 The dataset. 

927 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

928 When `DatasetRef` is provided, ``dataId`` should be `None`. 

929 Otherwise the `DatasetType` or name thereof. If a fully resolved 

930 `DatasetRef` is given the run and ID are used directly. 

931 dataId : `dict` or `DataCoordinate` 

932 A `dict` of `Dimension` link name, value pairs that label the 

933 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

934 should be provided as the second argument. 

935 run : `str`, optional 

936 The name of the run the dataset should be added to, overriding 

937 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

938 **kwargs 

939 Additional keyword arguments used to augment or construct a 

940 `DataCoordinate`. See `DataCoordinate.standardize` 

941 parameters. Not used if a resolved `DatasetRef` is provided.

942 

943 Returns 

944 ------- 

945 ref : `DatasetRef` 

946 A reference to the stored dataset, updated with the correct id if 

947 given. 

948 

949 Raises 

950 ------ 

951 TypeError 

952 Raised if the butler is read-only or if no run has been provided. 

953 """ 

954 if isinstance(datasetRefOrType, DatasetRef): 

955 # This is a direct put of predefined DatasetRef. 

956 _LOG.debug("Butler put direct: %s", datasetRefOrType) 

957 if run is not None: 

958 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) 

959 # If registry already has a dataset with the same dataset ID, 

960 # dataset type and DataId, then _importDatasets will do nothing and 

961 # just return the original ref. We still have to raise in this case;

962 # the datastore check below handles that.

963 self._registry._importDatasets([datasetRefOrType], expand=True) 

964 # Before trying to write to the datastore check that it does not 

965 # know this dataset. This is prone to races, of course. 

966 if self._datastore.knows(datasetRefOrType): 

967 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

968 # Try to write the dataset to the datastore; if it fails due to a race

969 # with another write, the content of the stored data may be

970 # unpredictable. 

971 try: 

972 self._datastore.put(obj, datasetRefOrType) 

973 except IntegrityError as e: 

974 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e 

975 return datasetRefOrType 

976 

977 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

978 if not self.isWriteable(): 

979 raise TypeError("Butler is read-only.") 

980 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

981 

982 # Handle dimension records in dataId 

983 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

984 

985 # Add Registry Dataset entry. 

986 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs) 

987 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

988 self._datastore.put(obj, ref) 

989 

990 return ref 

991 
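A sketch of the two modes of ``put`` described above; the in-memory objects, dataset type names, data ID values, and run are hypothetical:

    # Unresolved form: registry assigns a new dataset ID in the given run.
    ref = butler.put(
        table, "objectTable", tract=9813, patch=42, skymap="my_skymap", run="my/run"
    )

    # Resolved form: a fully resolved DatasetRef carries its own run and ID,
    # so `run=` is not used and a conflict raises ConflictingDefinitionError.
    butler.put(table, existing_ref)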

992 def getDeferred( 

993 self, 

994 datasetRefOrType: DatasetRef | DatasetType | str, 

995 /, 

996 dataId: DataId | None = None, 

997 *, 

998 parameters: dict | None = None, 

999 collections: Any = None, 

1000 storageClass: str | StorageClass | None = None, 

1001 **kwargs: Any, 

1002 ) -> DeferredDatasetHandle: 

1003 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1004 after an immediate registry lookup. 

1005 

1006 Parameters 

1007 ---------- 

1008 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1009 When `DatasetRef` the `dataId` should be `None`. 

1010 Otherwise the `DatasetType` or name thereof. 

1011 dataId : `dict` or `DataCoordinate`, optional 

1012 A `dict` of `Dimension` link name, value pairs that label the 

1013 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1014 should be provided as the first argument. 

1015 parameters : `dict` 

1016 Additional StorageClass-defined options to control reading, 

1017 typically used to efficiently read only a subset of the dataset. 

1018 collections : Any, optional 

1019 Collections to be searched, overriding ``self.collections``. 

1020 Can be any of the types supported by the ``collections`` argument 

1021 to butler construction. 

1022 storageClass : `StorageClass` or `str`, optional 

1023 The storage class to be used to override the Python type 

1024 returned by this method. By default the returned type matches 

1025 the dataset type definition for this dataset. Specifying a 

1026 read `StorageClass` can force a different type to be returned. 

1027 This type must be compatible with the original type. 

1028 **kwargs 

1029 Additional keyword arguments used to augment or construct a 

1030 `DataId`. See `DataId` parameters. 

1031 

1032 Returns 

1033 ------- 

1034 obj : `DeferredDatasetHandle` 

1035 A handle which can be used to retrieve a dataset at a later time. 

1036 

1037 Raises 

1038 ------ 

1039 LookupError 

1040 Raised if no matching dataset exists in the `Registry` or 

1041 datastore. 

1042 ValueError 

1043 Raised if a resolved `DatasetRef` was passed as an input, but it 

1044 differs from the one found in the registry. 

1045 TypeError 

1046 Raised if no collections were provided. 

1047 """ 

1048 if isinstance(datasetRefOrType, DatasetRef): 

1049 # Do the quick check first and if that fails, check for artifact 

1050 # existence. This is necessary for datastores that are configured 

1051 # in trust mode where there won't be a record but there will be 

1052 # a file. 

1053 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType): 

1054 ref = datasetRefOrType 

1055 else: 

1056 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1057 else: 

1058 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1059 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1060 
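A sketch of deferred reads; the dataset type, data ID, and component name are hypothetical:

    # The registry lookup happens immediately; the datastore read is delayed
    # until .get() is called on the handle, optionally for a single component.
    handle = butler.getDeferred("calexp", instrument="MyCam", visit=123, detector=4)
    wcs = handle.get(component="wcs")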

1061 def get( 

1062 self, 

1063 datasetRefOrType: DatasetRef | DatasetType | str, 

1064 /, 

1065 dataId: DataId | None = None, 

1066 *, 

1067 parameters: dict[str, Any] | None = None, 

1068 collections: Any = None, 

1069 storageClass: StorageClass | str | None = None, 

1070 **kwargs: Any, 

1071 ) -> Any: 

1072 """Retrieve a stored dataset. 

1073 

1074 Parameters 

1075 ---------- 

1076 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1077 When `DatasetRef` the `dataId` should be `None`. 

1078 Otherwise the `DatasetType` or name thereof. 

1079 If a resolved `DatasetRef`, the associated dataset 

1080 is returned directly without additional querying. 

1081 dataId : `dict` or `DataCoordinate` 

1082 A `dict` of `Dimension` link name, value pairs that label the 

1083 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1084 should be provided as the first argument. 

1085 parameters : `dict` 

1086 Additional StorageClass-defined options to control reading, 

1087 typically used to efficiently read only a subset of the dataset. 

1088 collections : Any, optional 

1089 Collections to be searched, overriding ``self.collections``. 

1090 Can be any of the types supported by the ``collections`` argument 

1091 to butler construction. 

1092 storageClass : `StorageClass` or `str`, optional 

1093 The storage class to be used to override the Python type 

1094 returned by this method. By default the returned type matches 

1095 the dataset type definition for this dataset. Specifying a 

1096 read `StorageClass` can force a different type to be returned. 

1097 This type must be compatible with the original type. 

1098 **kwargs 

1099 Additional keyword arguments used to augment or construct a 

1100 `DataCoordinate`. See `DataCoordinate.standardize` 

1101 parameters. 

1102 

1103 Returns 

1104 ------- 

1105 obj : `object` 

1106 The dataset. 

1107 

1108 Raises 

1109 ------ 

1110 LookupError 

1111 Raised if no matching dataset exists in the `Registry`. 

1112 TypeError 

1113 Raised if no collections were provided. 

1114 

1115 Notes 

1116 ----- 

1117 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1118 this method requires that the given data ID include temporal dimensions 

1119 beyond the dimensions of the dataset type itself, in order to find the 

1120 dataset with the appropriate validity range. For example, a "bias" 

1121 dataset with native dimensions ``{instrument, detector}`` could be 

1122 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1123 ``exposure`` is a temporal dimension. 

1124 """ 

1125 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1126 ref = self._findDatasetRef( 

1127 datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs 

1128 ) 

1129 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1130 
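A sketch of the read-time options; the dataset type, data ID, and bounding box are hypothetical, and the ``bbox`` parameter is assumed to be defined by the dataset's storage class:

    from lsst.geom import Box2I, Point2I

    cutout = butler.get(
        "calexp",
        instrument="MyCam", visit=123, detector=4,
        # StorageClass-defined read parameter: read only a sub-region.
        parameters={"bbox": Box2I(Point2I(0, 0), Point2I(99, 99))},
    )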

1131 def getURIs( 

1132 self, 

1133 datasetRefOrType: DatasetRef | DatasetType | str, 

1134 /, 

1135 dataId: DataId | None = None, 

1136 *, 

1137 predict: bool = False, 

1138 collections: Any = None, 

1139 run: str | None = None, 

1140 **kwargs: Any, 

1141 ) -> DatasetRefURIs: 

1142 """Return the URIs associated with the dataset. 

1143 

1144 Parameters 

1145 ---------- 

1146 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1147 When `DatasetRef` the `dataId` should be `None`. 

1148 Otherwise the `DatasetType` or name thereof. 

1149 dataId : `dict` or `DataCoordinate` 

1150 A `dict` of `Dimension` link name, value pairs that label the 

1151 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1152 should be provided as the first argument. 

1153 predict : `bool` 

1154 If `True`, allow URIs to be returned of datasets that have not 

1155 been written. 

1156 collections : Any, optional 

1157 Collections to be searched, overriding ``self.collections``. 

1158 Can be any of the types supported by the ``collections`` argument 

1159 to butler construction. 

1160 run : `str`, optional 

1161 Run to use for predictions, overriding ``self.run``. 

1162 **kwargs 

1163 Additional keyword arguments used to augment or construct a 

1164 `DataCoordinate`. See `DataCoordinate.standardize` 

1165 parameters. 

1166 

1167 Returns 

1168 ------- 

1169 uris : `DatasetRefURIs` 

1170 The URI to the primary artifact associated with this dataset (if 

1171 the dataset was disassembled within the datastore this may be 

1172 `None`), and the URIs to any components associated with the dataset 

1173 artifact (this can be empty if there are no components).

1174 """ 

1175 ref = self._findDatasetRef( 

1176 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1177 ) 

1178 return self._datastore.getURIs(ref, predict) 

1179 
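A sketch of inspecting the returned `DatasetRefURIs`; the dataset type and data ID are hypothetical:

    uris = butler.getURIs("calexp", instrument="MyCam", visit=123, detector=4)
    if uris.primaryURI is not None:
        print(uris.primaryURI.geturl())
    for component, uri in uris.componentURIs.items():
        print(component, uri.geturl())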

1180 def get_dataset_type(self, name: str) -> DatasetType: 

1181 return self._registry.getDatasetType(name) 

1182 

1183 def get_dataset( 

1184 self, 

1185 id: DatasetId, 

1186 *, 

1187 storage_class: str | StorageClass | None = None, 

1188 dimension_records: bool = False, 

1189 datastore_records: bool = False, 

1190 ) -> DatasetRef | None: 

1191 ref = self._registry.getDataset(id) 

1192 if ref is not None: 

1193 if dimension_records: 

1194 ref = ref.expanded( 

1195 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions) 

1196 ) 

1197 if storage_class: 

1198 ref = ref.overrideStorageClass(storage_class) 

1199 if datastore_records: 

1200 ref = self._registry.get_datastore_records(ref) 

1201 return ref 

1202 

1203 def find_dataset( 

1204 self, 

1205 dataset_type: DatasetType | str, 

1206 data_id: DataId | None = None, 

1207 *, 

1208 collections: str | Sequence[str] | None = None, 

1209 timespan: Timespan | None = None, 

1210 storage_class: str | StorageClass | None = None, 

1211 dimension_records: bool = False, 

1212 datastore_records: bool = False, 

1213 **kwargs: Any, 

1214 ) -> DatasetRef | None: 

1215 # Handle any parts of the dataID that are not using primary dimension 

1216 # keys. 

1217 if isinstance(dataset_type, str): 

1218 actual_type = self.get_dataset_type(dataset_type) 

1219 else: 

1220 actual_type = dataset_type 

1221 

1222 # Store the component for later. 

1223 component_name = actual_type.component() 

1224 if actual_type.isComponent(): 

1225 parent_type = actual_type.makeCompositeDatasetType() 

1226 else: 

1227 parent_type = actual_type 

1228 

1229 data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs) 

1230 

1231 ref = self._registry.findDataset( 

1232 parent_type, 

1233 data_id, 

1234 collections=collections, 

1235 timespan=timespan, 

1236 datastore_records=datastore_records, 

1237 **kwargs, 

1238 ) 

1239 if ref is not None and dimension_records: 

1240 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)) 

1241 if ref is not None and component_name: 

1242 ref = ref.makeComponentRef(component_name) 

1243 if ref is not None and storage_class is not None: 

1244 ref = ref.overrideStorageClass(storage_class) 

1245 

1246 return ref 

1247 
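A sketch of a registry-only lookup with ``find_dataset``; the dataset type, data ID, and collection are hypothetical:

    ref = butler.find_dataset(
        "calexp",
        instrument="MyCam", visit=123, detector=4,
        collections="my/collection",
        dimension_records=True,
    )
    if ref is None:
        print("no matching dataset in the searched collections")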

1248 def retrieveArtifacts( 

1249 self, 

1250 refs: Iterable[DatasetRef], 

1251 destination: ResourcePathExpression, 

1252 transfer: str = "auto", 

1253 preserve_path: bool = True, 

1254 overwrite: bool = False, 

1255 ) -> list[ResourcePath]: 

1256 # Docstring inherited. 

1257 return self._datastore.retrieveArtifacts( 

1258 refs, 

1259 ResourcePath(destination), 

1260 transfer=transfer, 

1261 preserve_path=preserve_path, 

1262 overwrite=overwrite, 

1263 ) 

1264 

1265 def exists( 

1266 self, 

1267 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1268 /, 

1269 data_id: DataId | None = None, 

1270 *, 

1271 full_check: bool = True, 

1272 collections: Any = None, 

1273 **kwargs: Any, 

1274 ) -> DatasetExistence: 

1275 # Docstring inherited. 

1276 existence = DatasetExistence.UNRECOGNIZED 

1277 

1278 if isinstance(dataset_ref_or_type, DatasetRef): 

1279 if collections is not None: 

1280 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1281 if data_id is not None: 

1282 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1283 ref = dataset_ref_or_type 

1284 registry_ref = self._registry.getDataset(dataset_ref_or_type.id) 

1285 if registry_ref is not None: 

1286 existence |= DatasetExistence.RECORDED 

1287 

1288 if dataset_ref_or_type != registry_ref: 

1289 # This could mean that storage classes differ, so we should 

1290 # check for that but use the registry ref for the rest of 

1291 # the method. 

1292 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1293 # Use the registry version from now on. 

1294 ref = registry_ref 

1295 else: 

1296 raise ValueError( 

1297 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1298 f"in registry but has different incompatible values ({registry_ref})." 

1299 ) 

1300 else: 

1301 try: 

1302 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1303 except (LookupError, TypeError, NoDefaultCollectionError): 

1304 return existence 

1305 existence |= DatasetExistence.RECORDED 

1306 

1307 if self._datastore.knows(ref): 

1308 existence |= DatasetExistence.DATASTORE 

1309 

1310 if full_check: 

1311 if self._datastore.exists(ref): 

1312 existence |= DatasetExistence._ARTIFACT 

1313 elif existence.value != DatasetExistence.UNRECOGNIZED.value: 

1314 # Do not add this flag if we have no other idea about a dataset. 

1315 existence |= DatasetExistence(DatasetExistence._ASSUMED) 

1316 

1317 return existence 

1318 
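A sketch of interpreting the `DatasetExistence` flags returned by ``exists``; the dataset type and data ID are hypothetical:

    from lsst.daf.butler import DatasetExistence

    existence = butler.exists("calexp", instrument="MyCam", visit=123, detector=4)
    if existence == DatasetExistence.VERIFIED:
        print("registered, known to the datastore, and the artifact exists")
    elif DatasetExistence.RECORDED in existence:
        print("registered, but the artifact was not verified")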

1319 def _exists_many( 

1320 self, 

1321 refs: Iterable[DatasetRef], 

1322 /, 

1323 *, 

1324 full_check: bool = True, 

1325 ) -> dict[DatasetRef, DatasetExistence]: 

1326 # Docstring inherited. 

1327 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1328 

1329 # Registry does not have a bulk API to check for a ref. 

1330 for ref in refs: 

1331 registry_ref = self._registry.getDataset(ref.id) 

1332 if registry_ref is not None: 

1333 # It is possible, albeit unlikely, that the given ref does 

1334 # not match the one in registry even though the UUID matches. 

1335 # When checking a single ref we raise, but it's impolite to 

1336 # do that when potentially hundreds of refs are being checked. 

1337 # We could change the API to only accept UUIDs and that would 

1338 # remove the ability to even check and remove the worry 

1339 # about differing storage classes. Given the ongoing discussion 

1340 # on refs vs UUIDs and whether to raise or have a new 

1341 # private flag, treat this as a private API for now. 

1342 existence[ref] |= DatasetExistence.RECORDED 

1343 

1344 # Ask datastore if it knows about these refs. 

1345 knows = self._datastore.knows_these(refs) 

1346 for ref, known in knows.items(): 

1347 if known: 

1348 existence[ref] |= DatasetExistence.DATASTORE 

1349 

1350 if full_check: 

1351 mexists = self._datastore.mexists(refs) 

1352 for ref, exists in mexists.items(): 

1353 if exists: 

1354 existence[ref] |= DatasetExistence._ARTIFACT 

1355 else: 

1356 # Do not set this flag if nothing is known about the dataset. 

1357 for ref in existence: 

1358 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1359 existence[ref] |= DatasetExistence._ASSUMED 

1360 

1361 return existence 

1362 

1363 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1364 # Docstring inherited. 

1365 if not self.isWriteable(): 

1366 raise TypeError("Butler is read-only.") 

1367 names = list(names) 

1368 refs: list[DatasetRef] = [] 

1369 for name in names: 

1370 collectionType = self._registry.getCollectionType(name) 

1371 if collectionType is not CollectionType.RUN: 

1372 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1373 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True)) 

1374 with self._datastore.transaction(), self._registry.transaction(): 

1375 if unstore: 

1376 self._datastore.trash(refs) 

1377 else: 

1378 self._datastore.forget(refs) 

1379 for name in names: 

1380 self._registry.removeCollection(name) 

1381 if unstore: 

1382 # Point of no return for removing artifacts 

1383 self._datastore.emptyTrash() 

1384 

1385 def pruneDatasets( 

1386 self, 

1387 refs: Iterable[DatasetRef], 

1388 *, 

1389 disassociate: bool = True, 

1390 unstore: bool = False, 

1391 tags: Iterable[str] = (), 

1392 purge: bool = False, 

1393 ) -> None: 

1394 # docstring inherited from LimitedButler 

1395 

1396 if not self.isWriteable(): 

1397 raise TypeError("Butler is read-only.") 

1398 if purge: 

1399 if not disassociate: 

1400 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1401 if not unstore: 

1402 raise TypeError("Cannot pass purge=True without unstore=True.") 

1403 elif disassociate: 

1404 tags = tuple(tags) 

1405 if not tags: 

1406 raise TypeError("No tags provided but disassociate=True.") 

1407 for tag in tags: 

1408 collectionType = self._registry.getCollectionType(tag) 

1409 if collectionType is not CollectionType.TAGGED: 

1410 raise TypeError( 

1411 f"Cannot disassociate from collection '{tag}' " 

1412 f"of non-TAGGED type {collectionType.name}." 

1413 ) 

1414 # Transform possibly-single-pass iterable into something we can iterate 

1415 # over multiple times. 

1416 refs = list(refs) 

1417 # Pruning a component of a DatasetRef makes no sense since registry 

1418 # doesn't know about components and datastore might not store 

1419 # components in a separate file 

1420 for ref in refs: 

1421 if ref.datasetType.component(): 

1422 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1423 # We don't need an unreliable Datastore transaction for this, because 

1424 # we've been extra careful to ensure that Datastore.trash only involves 

1425 # mutating the Registry (it can _look_ at Datastore-specific things, 

1426 # but shouldn't change them), and hence all operations here are 

1427 # Registry operations. 

1428 with self._datastore.transaction(), self._registry.transaction(): 

1429 if unstore: 

1430 self._datastore.trash(refs) 

1431 if purge: 

1432 self._registry.removeDatasets(refs) 

1433 elif disassociate: 

1434 assert tags, "Guaranteed by earlier logic in this function." 

1435 for tag in tags: 

1436 self._registry.disassociate(tag, refs) 

1437 # We've exited the Registry transaction, and apparently committed. 

1438 # (if there was an exception, everything rolled back, and it's as if 

1439 # nothing happened - and we never get here). 

1440 # Datastore artifacts are not yet gone, but they're clearly marked 

1441 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1442 # problems we can try again later, and if manual administrative 

1443 # intervention is required, it's pretty clear what that should entail: 

1444 # deleting everything on disk and in private Datastore tables that is 

1445 # in the dataset_location_trash table. 

1446 if unstore: 

1447 # Point of no return for removing artifacts 

1448 self._datastore.emptyTrash() 
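
# Editorial sketch (not part of direct_butler.py): calling pruneDatasets with
# argument combinations that pass the validation above. The repository path,
# dataset type and collection names are hypothetical.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", writeable=True)
refs = list(butler.registry.queryDatasets("raw", collections="ingest/run"))

# Remove only the association with a TAGGED collection, keeping the datasets
# and their artifacts.
butler.pruneDatasets(refs, disassociate=True, unstore=False, tags=["my/tagged"], purge=False)

# Fully delete: remove registry entries, TAGGED associations and artifacts.
butler.pruneDatasets(refs, disassociate=True, unstore=True, purge=True)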

1449 

1450 @transactional 

1451 def ingest( 

1452 self, 

1453 *datasets: FileDataset, 

1454 transfer: str | None = "auto", 

1455 record_validation_info: bool = True, 

1456 ) -> None: 

1457 # Docstring inherited. 

1458 if not self.isWriteable(): 

1459 raise TypeError("Butler is read-only.") 

1460 

1461 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1462 if not datasets: 

1463 return 

1464 

1465 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1466 

1467 # We need to reorganize all the inputs so that they are grouped 

1468 # by dataset type and run. Multiple refs in a single FileDataset 

1469 # are required to share the run and dataset type. 

1470 groupedData: MutableMapping[tuple[DatasetType, str], list[FileDataset]] = defaultdict(list) 

1471 

1472 # Track DataIDs that are being ingested so we can spot issues early 

1473 # with duplication. Retain previous FileDataset so we can report it. 

1474 groupedDataIds: MutableMapping[tuple[DatasetType, str], dict[DataCoordinate, FileDataset]] = ( 

1475 defaultdict(dict) 

1476 ) 

1477 

1478 # And the nested loop that populates it: 

1479 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1480 # Somewhere to store pre-existing refs if we have an 

1481 # execution butler. 

1482 existingRefs: list[DatasetRef] = [] 

1483 

1484 for ref in dataset.refs: 

1485 group_key = (ref.datasetType, ref.run) 

1486 

1487 if ref.dataId in groupedDataIds[group_key]: 

1488 raise ConflictingDefinitionError( 

1489 f"Ingest conflict. Dataset {dataset.path} has the same" 

1490 " DataId as other ingest dataset" 

1491 f" {groupedDataIds[group_key][ref.dataId].path}" 

1492 f" ({ref.dataId})" 

1493 ) 

1494 

1495 groupedDataIds[group_key][ref.dataId] = dataset 

1496 

1497 if existingRefs: 

1498 if len(dataset.refs) != len(existingRefs): 

1499 # Keeping track of partially pre-existing datasets is hard 

1500 # and should generally never happen. For now don't allow 

1501 # it. 

1502 raise ConflictingDefinitionError( 

1503 f"For dataset {dataset.path} some dataIds already exist" 

1504 " in registry but others do not. This is not supported." 

1505 ) 

1506 

1507 # Store expanded form in the original FileDataset. 

1508 dataset.refs = existingRefs 

1509 else: 

1510 groupedData[group_key].append(dataset) 

1511 

1512 # Now we can bulk-insert into Registry for each DatasetType. 

1513 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

1514 groupedData.items(), desc="Bulk-inserting datasets by type" 

1515 ): 

1516 refs_to_import = [] 

1517 for dataset in grouped_datasets: 

1518 refs_to_import.extend(dataset.refs) 

1519 

1520 n_refs = len(refs_to_import) 

1521 _LOG.verbose( 

1522 "Importing %d ref%s of dataset type %r into run %r", 

1523 n_refs, 

1524 "" if n_refs == 1 else "s", 

1525 datasetType.name, 

1526 this_run, 

1527 ) 

1528 

1529 # Import the refs and expand the DataCoordinates since we can't 

1530 # guarantee that they are expanded and Datastore will need 

1531 # the records. 

1532 imported_refs = self._registry._importDatasets(refs_to_import, expand=True) 

1533 assert set(imported_refs) == set(refs_to_import) 

1534 

1535 # Replace all the refs in the FileDataset with expanded versions. 

1536 # Pull them off in the order we put them on the list. 

1537 for dataset in grouped_datasets: 

1538 n_dataset_refs = len(dataset.refs) 

1539 dataset.refs = imported_refs[:n_dataset_refs] 

1540 del imported_refs[:n_dataset_refs] 

1541 

1542 # Bulk-insert everything into Datastore. 

1543 # We do not know if any of the registry entries already existed 

1544 # (_importDatasets only complains if they exist but differ) so 

1545 # we have to catch IntegrityError explicitly. 

1546 try: 

1547 self._datastore.ingest( 

1548 *datasets, transfer=transfer, record_validation_info=record_validation_info 

1549 ) 

1550 except IntegrityError as e: 

1551 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e 
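
# Editorial sketch (not part of direct_butler.py): building a FileDataset and
# ingesting it with the method above. The repository path, run, dataset type,
# instrument and file path are all hypothetical.
from lsst.daf.butler import Butler, DataCoordinate, DatasetRef, FileDataset

butler = Butler("/path/to/repo", writeable=True, run="ingest/run")
dataset_type = butler.get_dataset_type("raw")
data_id = DataCoordinate.standardize(
    {"exposure": 42, "detector": 10},
    instrument="HypotheticalCam",
    dimensions=dataset_type.dimensions,
)
ref = DatasetRef(dataset_type, data_id, run="ingest/run")
# transfer="direct" ingests the file in place; "copy", "link" or "auto" are
# other common choices.
butler.ingest(FileDataset(path="/data/raw_42_10.fits", refs=[ref]), transfer="direct")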

1552 

1553 @contextlib.contextmanager 

1554 def export( 

1555 self, 

1556 *, 

1557 directory: str | None = None, 

1558 filename: str | None = None, 

1559 format: str | None = None, 

1560 transfer: str | None = None, 

1561 ) -> Iterator[RepoExportContext]: 

1562 # Docstring inherited. 

1563 if directory is None and transfer is not None: 

1564 raise TypeError("Cannot transfer without providing a directory.") 

1565 if transfer == "move": 

1566 raise TypeError("Transfer may not be 'move': export is read-only") 

1567 if format is None: 

1568 if filename is None: 

1569 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1570 else: 

1571 _, format = os.path.splitext(filename) 

1572 if not format: 

1573 raise ValueError("Please specify a file extension to determine export format.") 

1574 format = format[1:] # Strip leading "." 

1575 elif filename is None: 

1576 filename = f"export.{format}" 

1577 if directory is not None: 

1578 filename = os.path.join(directory, filename) 

1579 formats = self._config["repo_transfer_formats"] 

1580 if format not in formats: 

1581 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

1582 BackendClass = get_class_of(formats[format, "export"]) 

1583 with open(filename, "w") as stream: 

1584 backend = BackendClass(stream, universe=self.dimensions) 

1585 try: 

1586 helper = RepoExportContext( 

1587 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer 

1588 ) 

1589 with self._caching_context(): 

1590 yield helper 

1591 except BaseException: 

1592 raise 

1593 else: 

1594 helper._finish() 
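
# Editorial sketch (not part of direct_butler.py): using the export context
# manager above. The file extension selects the backend ("yaml" here);
# saveDatasets is assumed to be the usual RepoExportContext method, and the
# repository path, dataset type and collection are hypothetical.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo")
refs = butler.registry.queryDatasets("calexp", collections="processed/run")
with butler.export(directory="exported", filename="export.yaml", transfer="copy") as export:
    export.saveDatasets(refs)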

1595 

1596 def import_( 

1597 self, 

1598 *, 

1599 directory: ResourcePathExpression | None = None, 

1600 filename: ResourcePathExpression | TextIO | None = None, 

1601 format: str | None = None, 

1602 transfer: str | None = None, 

1603 skip_dimensions: set | None = None, 

1604 ) -> None: 

1605 # Docstring inherited. 

1606 if not self.isWriteable(): 

1607 raise TypeError("Butler is read-only.") 

1608 if format is None: 

1609 if filename is None: 

1610 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1611 else: 

1612 _, format = os.path.splitext(filename) # type: ignore 

1613 elif filename is None: 

1614 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

1615 if directory is not None: 

1616 directory = ResourcePath(directory, forceDirectory=True) 

1617 # mypy doesn't think this will work but it does in python >= 3.10. 

1618 if isinstance(filename, ResourcePathExpression): # type: ignore 

1619 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

1620 if not filename.isabs() and directory is not None: 

1621 potential = directory.join(filename) 

1622 exists_in_cwd = filename.exists() 

1623 exists_in_dir = potential.exists() 

1624 if exists_in_cwd and exists_in_dir: 

1625 _LOG.warning( 

1626 "A relative path for filename was specified (%s) which exists relative to cwd. " 

1627 "Additionally, the file exists relative to the given search directory (%s). " 

1628 "Using the export file in the given directory.", 

1629 filename, 

1630 potential, 

1631 ) 

1632 # Given they specified an explicit directory and that 

1633 # directory has the export file in it, assume that that 

1634 # is what was meant despite the file in cwd. 

1635 filename = potential 

1636 elif exists_in_dir: 

1637 filename = potential 

1638 elif not exists_in_cwd and not exists_in_dir: 

1639 # Raise early. 

1640 raise FileNotFoundError( 

1641 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

1642 ) 

1643 BackendClass: type[RepoImportBackend] = get_class_of( 

1644 self._config["repo_transfer_formats"][format]["import"] 

1645 ) 

1646 

1647 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

1648 with self._caching_context(): 

1649 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg] 

1650 backend.register() 

1651 with self.transaction(): 

1652 backend.load( 

1653 self._datastore, 

1654 directory=directory, 

1655 transfer=transfer, 

1656 skip_dimensions=skip_dimensions, 

1657 ) 

1658 

1659 if isinstance(filename, ResourcePath): 

1660 # We cannot use open() here at the moment because of 

1661 # DM-38589 since yaml does stream.read(8192) in a loop. 

1662 stream = io.StringIO(filename.read().decode()) 

1663 doImport(stream) 

1664 else: 

1665 doImport(filename) # type: ignore 
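
# Editorial sketch (not part of direct_butler.py): re-importing the export
# file written by the export() sketch above into a second, hypothetical
# repository.
from lsst.daf.butler import Butler

target = Butler("/path/to/other_repo", writeable=True)
target.import_(directory="exported", filename="export.yaml", transfer="copy")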

1666 

1667 def transfer_dimension_records_from( 

1668 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef] 

1669 ) -> None: 

1670 # Allowed dimensions in the target butler. 

1671 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table) 

1672 

1673 data_ids = {ref.dataId for ref in source_refs} 

1674 

1675 dimension_records = self._extract_all_dimension_records_from_data_ids( 

1676 source_butler, data_ids, elements 

1677 ) 

1678 

1679 # Insert order is important. 

1680 for element in self.dimensions.sorted(dimension_records.keys()): 

1681 records = [r for r in dimension_records[element].values()] 

1682 # Assume that if the record is already present we can 

1683 # use it without having to check that the record metadata 

1684 # is consistent. 

1685 self._registry.insertDimensionData(element, *records, skip_existing=True) 

1686 _LOG.debug("Dimension '%s' -- number of records transferred: %d", element.name, len(records)) 
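
# Editorial sketch (not part of direct_butler.py): copying the dimension
# records needed by a set of refs from one butler to another before a
# registry-only import. Paths, dataset type and collection are hypothetical.
from lsst.daf.butler import Butler

source = Butler("/path/to/source_repo")
target = Butler("/path/to/target_repo", writeable=True)
refs = list(source.registry.queryDatasets("raw", collections="ingest/run"))
target.transfer_dimension_records_from(source, refs)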

1687 

1688 def _extract_all_dimension_records_from_data_ids( 

1689 self, 

1690 source_butler: LimitedButler | Butler, 

1691 data_ids: set[DataCoordinate], 

1692 allowed_elements: frozenset[DimensionElement], 

1693 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]: 

1694 primary_records = self._extract_dimension_records_from_data_ids( 

1695 source_butler, data_ids, allowed_elements 

1696 ) 

1697 

1698 can_query = isinstance(source_butler, Butler) 

1699 

1700 additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1701 for original_element, record_mapping in primary_records.items(): 

1702 # Get dimensions that depend on this dimension. 

1703 populated_by = self.dimensions.get_elements_populated_by( 

1704 self.dimensions[original_element.name] # type: ignore 

1705 ) 

1706 

1707 for data_id in record_mapping.keys(): 

1708 for element in populated_by: 

1709 if element not in allowed_elements: 

1710 continue 

1711 if element.name == original_element.name: 

1712 continue 

1713 

1714 if element.name in primary_records: 

1715 # If this element has already been stored, avoid 

1716 # re-finding records since that may lead to additional 

1717 # spurious records. e.g. visit is populated_by 

1718 # visit_detector_region but querying 

1719 # visit_detector_region by visit will return all the 

1720 # detectors for this visit -- the visit dataId does not 

1721 # constrain this. 

1722 # To constrain the query the original dataIds would 

1723 # have to be scanned. 

1724 continue 

1725 

1726 if not can_query: 

1727 raise RuntimeError( 

1728 f"Transferring populated_by records like {element.name} requires a full Butler." 

1729 ) 

1730 

1731 records = source_butler.registry.queryDimensionRecords( # type: ignore 

1732 element.name, **data_id.mapping # type: ignore 

1733 ) 

1734 for record in records: 

1735 additional_records[record.definition].setdefault(record.dataId, record) 

1736 

1737 # The next step is to walk back through the additional records to 

1738 # pick up any missing content (such as visit_definition needing to 

1739 # know the exposure). We want to ensure we do not request records we 

1740 # already have. 

1741 missing_data_ids = set() 

1742 for name, record_mapping in additional_records.items(): 

1743 for data_id in record_mapping.keys(): 

1744 if data_id not in primary_records[name]: 

1745 missing_data_ids.add(data_id) 

1746 

1747 # Fill out the new records. Assume that these new records do not 

1748 # also need to carry over additional populated_by records. 

1749 secondary_records = self._extract_dimension_records_from_data_ids( 

1750 source_butler, missing_data_ids, allowed_elements 

1751 ) 

1752 

1753 # Merge the extra sets of records in with the original. 

1754 for name, record_mapping in itertools.chain(additional_records.items(), secondary_records.items()): 

1755 primary_records[name].update(record_mapping) 

1756 

1757 return primary_records 
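
# Editorial sketch (not part of direct_butler.py): the populated_by lookup the
# loop above relies on. Given a dimension element it returns the elements
# whose records are keyed by it (e.g. visit_detector_region for visit). The
# repository path is hypothetical.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo")
visit = butler.dimensions["visit"]
print([element.name for element in butler.dimensions.get_elements_populated_by(visit)])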

1758 

1759 def _extract_dimension_records_from_data_ids( 

1760 self, 

1761 source_butler: LimitedButler | Butler, 

1762 data_ids: set[DataCoordinate], 

1763 allowed_elements: frozenset[DimensionElement], 

1764 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]: 

1765 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1766 

1767 for data_id in data_ids: 

1768 # Need an expanded record; if it is not expanded we need a full 

1769 # butler with a registry (allow mocks with registry too). 

1770 if not data_id.hasRecords(): 

1771 if registry := getattr(source_butler, "registry", None): 

1772 data_id = registry.expandDataId(data_id) 

1773 else: 

1774 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

1775 # If this butler doesn't know about a dimension in the source 

1776 # butler things will break later. 

1777 for element_name in data_id.dimensions.elements: 

1778 record = data_id.records[element_name] 

1779 if record is not None and record.definition in allowed_elements: 

1780 dimension_records[record.definition].setdefault(record.dataId, record) 

1781 

1782 return dimension_records 

1783 

1784 def transfer_from( 

1785 self, 

1786 source_butler: LimitedButler, 

1787 source_refs: Iterable[DatasetRef], 

1788 transfer: str = "auto", 

1789 skip_missing: bool = True, 

1790 register_dataset_types: bool = False, 

1791 transfer_dimensions: bool = False, 

1792 dry_run: bool = False, 

1793 ) -> collections.abc.Collection[DatasetRef]: 

1794 # Docstring inherited. 

1795 if not self.isWriteable(): 

1796 raise TypeError("Butler is read-only.") 

1797 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

1798 

1799 # Will iterate through the refs multiple times so need to convert 

1800 # to a list if this isn't a collection. 

1801 if not isinstance(source_refs, collections.abc.Collection): 

1802 source_refs = list(source_refs) 

1803 

1804 original_count = len(source_refs) 

1805 _LOG.info("Transferring %d datasets into %s", original_count, str(self)) 

1806 

1807 # In some situations the datastore artifact may be missing 

1808 # and we do not want that registry entry to be imported. 

1809 # Asking the datastore is not sufficient: the records may have been 

1810 # purged, so we have to ask for the (predicted) URI and check 

1811 # existence explicitly. An execution butler is set up exactly like 

1812 # this, with no datastore records. 

1813 artifact_existence: dict[ResourcePath, bool] = {} 

1814 if skip_missing: 

1815 dataset_existence = source_butler._datastore.mexists( 

1816 source_refs, artifact_existence=artifact_existence 

1817 ) 

1818 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

1819 filtered_count = len(source_refs) 

1820 n_missing = original_count - filtered_count 

1821 _LOG.verbose( 

1822 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

1823 n_missing, 

1824 "" if n_missing == 1 else "s", 

1825 filtered_count, 

1826 ) 

1827 

1828 # Importing requires that we group the refs by dataset type and run 

1829 # before doing the import. 

1830 source_dataset_types = set() 

1831 grouped_refs = defaultdict(list) 

1832 for ref in source_refs: 

1833 grouped_refs[ref.datasetType, ref.run].append(ref) 

1834 source_dataset_types.add(ref.datasetType) 

1835 

1836 # Check to see if the dataset type in the source butler has 

1837 # the same definition in the target butler and register missing 

1838 # ones if requested. Registration must happen outside a transaction. 

1839 newly_registered_dataset_types = set() 

1840 for datasetType in source_dataset_types: 

1841 if register_dataset_types: 

1842 # Let this raise immediately if inconsistent. Continuing 

1843 # on to find additional inconsistent dataset types 

1844 # might result in additional unwanted dataset types being 

1845 # registered. 

1846 if self._registry.registerDatasetType(datasetType): 

1847 newly_registered_dataset_types.add(datasetType) 

1848 else: 

1849 # If the dataset type is missing, let it fail immediately. 

1850 target_dataset_type = self.get_dataset_type(datasetType.name) 

1851 if target_dataset_type != datasetType: 

1852 raise ConflictingDefinitionError( 

1853 "Source butler dataset type differs from definition" 

1854 f" in target butler: {datasetType} !=" 

1855 f" {target_dataset_type}" 

1856 ) 

1857 if newly_registered_dataset_types: 

1858 # We may have registered some even if there were inconsistencies 

1859 # but should let people know (or else remove them again). 

1860 _LOG.verbose( 

1861 "Registered the following dataset types in the target Butler: %s", 

1862 ", ".join(d.name for d in newly_registered_dataset_types), 

1863 ) 

1864 else: 

1865 _LOG.verbose("All required dataset types are known to the target Butler") 

1866 

1867 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1868 if transfer_dimensions: 

1869 # Collect all the dimension records for these refs. 

1870 # All dimensions are to be copied but the list of valid dimensions 

1871 # comes from this butler's universe. 

1872 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table) 

1873 dataIds = {ref.dataId for ref in source_refs} 

1874 dimension_records = self._extract_all_dimension_records_from_data_ids( 

1875 source_butler, dataIds, elements 

1876 ) 

1877 

1878 handled_collections: set[str] = set() 

1879 

1880 # Do all the importing in a single transaction. 

1881 with self.transaction(): 

1882 if dimension_records and not dry_run: 

1883 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.") 

1884 # Order matters. 

1885 for element in self.dimensions.sorted(dimension_records.keys()): 

1886 records = [r for r in dimension_records[element].values()] 

1887 # Assume that if the record is already present we can 

1888 # use it without having to check that the record metadata 

1889 # is consistent. 

1890 self._registry.insertDimensionData(element, *records, skip_existing=True) 

1891 

1892 n_imported = 0 

1893 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

1894 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

1895 ): 

1896 if run not in handled_collections: 

1897 # May need to create output collection. If source butler 

1898 # has a registry, ask for documentation string. 

1899 run_doc = None 

1900 if registry := getattr(source_butler, "registry", None): 

1901 run_doc = registry.getCollectionDocumentation(run) 

1902 if not dry_run: 

1903 registered = self._registry.registerRun(run, doc=run_doc) 

1904 else: 

1905 registered = True 

1906 handled_collections.add(run) 

1907 if registered: 

1908 _LOG.verbose("Creating output run %s", run) 

1909 

1910 n_refs = len(refs_to_import) 

1911 _LOG.verbose( 

1912 "Importing %d ref%s of dataset type %s into run %s", 

1913 n_refs, 

1914 "" if n_refs == 1 else "s", 

1915 datasetType.name, 

1916 run, 

1917 ) 

1918 

1919 # Assume we are using UUIDs and the source refs will match 

1920 # those imported. 

1921 if not dry_run: 

1922 imported_refs = self._registry._importDatasets(refs_to_import) 

1923 else: 

1924 imported_refs = refs_to_import 

1925 assert set(imported_refs) == set(refs_to_import) 

1926 n_imported += len(imported_refs) 

1927 

1928 assert len(source_refs) == n_imported 

1929 _LOG.verbose("Imported %d datasets into destination butler", n_imported) 

1930 

1931 # Ask the datastore to transfer. The datastore has to check that 

1932 # the source datastore is compatible with the target datastore. 

1933 accepted, rejected = self._datastore.transfer_from( 

1934 source_butler._datastore, 

1935 source_refs, 

1936 transfer=transfer, 

1937 artifact_existence=artifact_existence, 

1938 dry_run=dry_run, 

1939 ) 

1940 if rejected: 

1941 # For now, accept the registry entries but not the files. 

1942 _LOG.warning( 

1943 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

1944 len(rejected), 

1945 len(accepted), 

1946 datasetType, 

1947 run, 

1948 ) 

1949 

1950 return source_refs 
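
# Editorial sketch (not part of direct_butler.py): a typical transfer_from
# call exercising the steps above (dimension records, dataset type
# registration, run creation, registry import, datastore transfer). Paths,
# dataset type and collection names are hypothetical.
from lsst.daf.butler import Butler

source = Butler("/path/to/source_repo")
target = Butler("/path/to/target_repo", writeable=True)
refs = source.registry.queryDatasets("calexp", collections="processed/run")
transferred = target.transfer_from(
    source,
    refs,
    transfer="copy",
    register_dataset_types=True,
    transfer_dimensions=True,
)
print(f"Transferred {len(transferred)} datasets.")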

1951 

1952 def validateConfiguration( 

1953 self, 

1954 logFailures: bool = False, 

1955 datasetTypeNames: Iterable[str] | None = None, 

1956 ignore: Iterable[str] | None = None, 

1957 ) -> None: 

1958 # Docstring inherited. 

1959 if datasetTypeNames: 

1960 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames] 

1961 else: 

1962 datasetTypes = list(self._registry.queryDatasetTypes()) 

1963 

1964 # filter out anything from the ignore list 

1965 if ignore: 

1966 ignore = set(ignore) 

1967 datasetTypes = [ 

1968 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

1969 ] 

1970 else: 

1971 ignore = set() 

1972 

1973 # For each datasetType that has an instrument dimension, create 

1974 # a DatasetRef for each defined instrument 

1975 datasetRefs = [] 

1976 

1977 # Find all the registered instruments (if "instrument" is in the 

1978 # universe). 

1979 if "instrument" in self.dimensions: 

1980 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")} 

1981 

1982 for datasetType in datasetTypes: 

1983 if "instrument" in datasetType.dimensions: 

1984 # In order to create a conforming dataset ref, create 

1985 # fake DataCoordinate values for the non-instrument 

1986 # dimensions. The type of the value does not matter here. 

1987 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"} 

1988 

1989 for instrument in instruments: 

1990 datasetRef = DatasetRef( 

1991 datasetType, 

1992 DataCoordinate.standardize( 

1993 dataId, instrument=instrument, dimensions=datasetType.dimensions 

1994 ), 

1995 run="validate", 

1996 ) 

1997 datasetRefs.append(datasetRef) 

1998 

1999 entities: list[DatasetType | DatasetRef] = [] 

2000 entities.extend(datasetTypes) 

2001 entities.extend(datasetRefs) 

2002 

2003 datastoreErrorStr = None 

2004 try: 

2005 self._datastore.validateConfiguration(entities, logFailures=logFailures) 

2006 except ValidationError as e: 

2007 datastoreErrorStr = str(e) 

2008 

2009 # Also check that the LookupKeys used by the datastores match 

2010 # registry and storage class definitions 

2011 keys = self._datastore.getLookupKeys() 

2012 

2013 failedNames = set() 

2014 failedDataId = set() 

2015 for key in keys: 

2016 if key.name is not None: 

2017 if key.name in ignore: 

2018 continue 

2019 

2020 # skip if specific datasetType names were requested and this 

2021 # name does not match 

2022 if datasetTypeNames and key.name not in datasetTypeNames: 

2023 continue 

2024 

2025 # See if it is a StorageClass or a DatasetType 

2026 if key.name in self.storageClasses: 

2027 pass 

2028 else: 

2029 try: 

2030 self.get_dataset_type(key.name) 

2031 except KeyError: 

2032 if logFailures: 

2033 _LOG.critical( 

2034 "Key '%s' does not correspond to a DatasetType or StorageClass", key 

2035 ) 

2036 failedNames.add(key) 

2037 else: 

2038 # Dimensions are checked for consistency when the Butler 

2039 # is created and rendezvoused with a universe. 

2040 pass 

2041 

2042 # Check that the instrument is a valid instrument 

2043 # Currently we only support instrument, so check for that. 

2044 if key.dataId: 

2045 dataIdKeys = set(key.dataId) 

2046 if {"instrument"} != dataIdKeys: 

2047 if logFailures: 

2048 _LOG.critical("Key '%s' has unsupported DataId override", key) 

2049 failedDataId.add(key) 

2050 elif key.dataId["instrument"] not in instruments: 

2051 if logFailures: 

2052 _LOG.critical("Key '%s' has unknown instrument", key) 

2053 failedDataId.add(key) 

2054 

2055 messages = [] 

2056 

2057 if datastoreErrorStr: 

2058 messages.append(datastoreErrorStr) 

2059 

2060 for failed, msg in ( 

2061 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2062 (failedDataId, "Keys with bad DataId entries: "), 

2063 ): 

2064 if failed: 

2065 msg += ", ".join(str(k) for k in failed) 

2066 messages.append(msg) 

2067 

2068 if messages: 

2069 raise ValidationError(";\n".join(messages)) 
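
# Editorial sketch (not part of direct_butler.py): running the configuration
# check above and reporting problems. ValidationError is assumed to be
# importable from the package top level; the repository path and ignored
# dataset type name are hypothetical.
from lsst.daf.butler import Butler, ValidationError

butler = Butler("/path/to/repo")
try:
    butler.validateConfiguration(logFailures=True, ignore=["testDatasetType"])
except ValidationError as err:
    print(f"Repository configuration problems found:\n{err}")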

2070 

2071 @property 

2072 def collections(self) -> Sequence[str]: 

2073 """The collections to search by default, in order 

2074 (`~collections.abc.Sequence` [ `str` ]). 

2075 

2076 This is an alias for ``self.registry.defaults.collections``. It cannot 

2077 be set directly in isolation, but all defaults may be changed together 

2078 by assigning a new `RegistryDefaults` instance to 

2079 ``self.registry.defaults``. 

2080 """ 

2081 return self._registry.defaults.collections 

2082 

2083 @property 

2084 def run(self) -> str | None: 

2085 """Name of the run this butler writes outputs to by default (`str` or 

2086 `None`). 

2087 

2088 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2089 directly in isolation, but all defaults may be changed together by 

2090 assigning a new `RegistryDefaults` instance to 

2091 ``self.registry.defaults``. 

2092 """ 

2093 return self._registry.defaults.run 
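
# Editorial sketch (not part of direct_butler.py): as the docstrings above
# explain, ``collections`` and ``run`` cannot be assigned individually; all
# defaults are replaced together by assigning a new RegistryDefaults. The
# repository path and collection names are hypothetical.
from lsst.daf.butler import Butler
from lsst.daf.butler.registry import RegistryDefaults

butler = Butler("/path/to/repo", writeable=True)
butler.registry.defaults = RegistryDefaults(collections=["refcats", "calib"], run="u/someone/run")
print(butler.collections, butler.run)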

2094 

2095 @property 

2096 def registry(self) -> Registry: 

2097 """The object that manages dataset metadata and relationships 

2098 (`Registry`). 

2099 

2100 Many operations that don't involve reading or writing butler datasets 

2101 are accessible only via `Registry` methods. Eventually these methods 

2102 will be replaced by equivalent `Butler` methods. 

2103 """ 

2104 return self._registry_shim 

2105 

2106 @property 

2107 def dimensions(self) -> DimensionUniverse: 

2108 # Docstring inherited. 

2109 return self._registry.dimensions 

2110 

2111 @contextlib.contextmanager 

2112 def _query(self) -> Iterator[Query]: 

2113 # Docstring inherited. 

2114 with self._caching_context(): 

2115 yield DirectQuery(self._registry) 

2116 

2117 def _query_data_ids( 

2118 self, 

2119 dimensions: DimensionGroup | Iterable[str] | str, 

2120 *, 

2121 data_id: DataId | None = None, 

2122 where: str = "", 

2123 bind: Mapping[str, Any] | None = None, 

2124 expanded: bool = False, 

2125 order_by: Iterable[str] | str | None = None, 

2126 limit: int | None = None, 

2127 offset: int | None = None, 

2128 explain: bool = True, 

2129 **kwargs: Any, 

2130 ) -> list[DataCoordinate]: 

2131 # Docstring inherited. 

2132 query = DirectQuery(self._registry) 

2133 result = query.data_ids(dimensions, data_id=data_id, where=where, bind=bind, **kwargs) 

2134 if expanded: 

2135 result = result.expanded() 

2136 if order_by: 

2137 result = result.order_by(*ensure_iterable(order_by)) 

2138 if limit is not None: 

2139 result = result.limit(limit, offset) 

2140 else: 

2141 if offset is not None: 

2142 raise TypeError("offset is specified without limit") 

2143 data_ids = list(result) 

2144 if explain and not data_ids: 

2145 raise EmptyQueryResultError(list(result.explain_no_results())) 

2146 return data_ids 

2147 

2148 def _query_datasets( 

2149 self, 

2150 dataset_type: Any, 

2151 collections: CollectionArgType | None = None, 

2152 *, 

2153 find_first: bool = True, 

2154 data_id: DataId | None = None, 

2155 where: str = "", 

2156 bind: Mapping[str, Any] | None = None, 

2157 expanded: bool = False, 

2158 explain: bool = True, 

2159 **kwargs: Any, 

2160 ) -> list[DatasetRef]: 

2161 # Docstring inherited. 

2162 query = DirectQuery(self._registry) 

2163 result = query.datasets( 

2164 dataset_type, 

2165 collections, 

2166 find_first=find_first, 

2167 data_id=data_id, 

2168 where=where, 

2169 bind=bind, 

2170 **kwargs, 

2171 ) 

2172 if expanded: 

2173 result = result.expanded() 

2174 refs = list(result) 

2175 if explain and not refs: 

2176 raise EmptyQueryResultError(list(result.explain_no_results())) 

2177 return refs 

2178 

2179 def _query_dimension_records( 

2180 self, 

2181 element: str, 

2182 *, 

2183 data_id: DataId | None = None, 

2184 where: str = "", 

2185 bind: Mapping[str, Any] | None = None, 

2186 order_by: Iterable[str] | str | None = None, 

2187 limit: int | None = None, 

2188 offset: int | None = None, 

2189 explain: bool = True, 

2190 **kwargs: Any, 

2191 ) -> list[DimensionRecord]: 

2192 # Docstring inherited. 

2193 query = DirectQuery(self._registry) 

2194 result = query.dimension_records(element, data_id=data_id, where=where, bind=bind, **kwargs) 

2195 if order_by: 

2196 result = result.order_by(*ensure_iterable(order_by)) 

2197 if limit is not None: 

2198 result = result.limit(limit, offset) 

2199 else: 

2200 if offset is not None: 

2201 raise TypeError("offset is specified without limit") 

2202 data_ids = list(result) 

2203 if explain and not data_ids: 

2204 raise EmptyQueryResultError(list(result.explain_no_results())) 

2205 return data_ids 
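
# Editorial sketch (not part of direct_butler.py): the public registry
# equivalents of the private _query_* helpers above, including the order_by /
# limit chaining they use. The repository path, instrument and dimension
# values are hypothetical.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo")
data_ids = list(
    butler.registry.queryDataIds(["visit", "detector"], instrument="HypotheticalCam", where="visit > 100")
    .order_by("visit")
    .limit(10)
)
for record in butler.registry.queryDimensionRecords("detector", instrument="HypotheticalCam"):
    print(record.id, record.full_name)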

2206 

2207 def _preload_cache(self) -> None: 

2208 """Immediately load caches that are used for common operations.""" 

2209 self._registry.preload_cache() 

2210 

2211 _config: ButlerConfig 

2212 """Configuration for this Butler instance.""" 

2213 

2214 _registry: SqlRegistry 

2215 """The object that manages dataset metadata and relationships 

2216 (`SqlRegistry`). 

2217 

2218 Most operations that don't involve reading or writing butler datasets are 

2219 accessible only via `SqlRegistry` methods. 

2220 """ 

2221 

2222 datastore: Datastore 

2223 """The object that manages actual dataset storage (`Datastore`). 

2224 

2225 Direct user access to the datastore should rarely be necessary; the primary 

2226 exception is the case where a `Datastore` implementation provides extra 

2227 functionality beyond what the base class defines. 

2228 """ 

2229 

2230 storageClasses: StorageClassFactory 

2231 """An object that maps known storage class names to objects that fully 

2232 describe them (`StorageClassFactory`). 

2233 """ 

2234 

2235 _registry_shim: RegistryShim 

2236 """Shim object to provide a legacy public interface for querying via 

2237 the ``registry`` property. 

2238 """