Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "Registry", 

26) 

27 

28from collections import defaultdict 

29import contextlib 

30import logging 

31from typing import ( 

32 Any, 

33 Dict, 

34 Iterable, 

35 Iterator, 

36 List, 

37 Mapping, 

38 Optional, 

39 Set, 

40 Type, 

41 TYPE_CHECKING, 

42 Union, 

43) 

44 

45import sqlalchemy 

46 

47from ..core import ( 

48 Config, 

49 DataCoordinate, 

50 DataCoordinateIterable, 

51 DataId, 

52 DatasetAssociation, 

53 DatasetRef, 

54 DatasetType, 

55 ddl, 

56 Dimension, 

57 DimensionElement, 

58 DimensionGraph, 

59 DimensionRecord, 

60 DimensionUniverse, 

61 NamedKeyMapping, 

62 NameLookupMapping, 

63 StorageClassFactory, 

64 Timespan, 

65) 

66from . import queries 

67from ..core.utils import doImport, iterable, transactional 

68from ._config import RegistryConfig 

69from ._collectionType import CollectionType 

70from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

71from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis 

72from .interfaces import ChainedCollectionRecord, RunRecord 

73from .versions import ButlerVersionsManager, DigestMismatchError 

74 

75if TYPE_CHECKING: 75 ↛ 76line 75 didn't jump to line 76, because the condition on line 75 was never true

76 from ..butlerConfig import ButlerConfig 

77 from .interfaces import ( 

78 ButlerAttributeManager, 

79 CollectionManager, 

80 Database, 

81 OpaqueTableStorageManager, 

82 DimensionRecordStorageManager, 

83 DatasetRecordStorageManager, 

84 DatastoreRegistryBridgeManager, 

85 ) 

86 

87 

88_LOG = logging.getLogger(__name__) 

89 

90 

91class Registry: 

92 """Registry interface. 

93 

94 Parameters 

95 ---------- 

96 database : `Database` 

97 Database instance to store Registry. 

98 universe : `DimensionUniverse` 

99 Full set of dimensions for Registry. 

100 attributes : `type` 

101 Manager class implementing `ButlerAttributeManager`. 

102 opaque : `type` 

103 Manager class implementing `OpaqueTableStorageManager`. 

104 dimensions : `type` 

105 Manager class implementing `DimensionRecordStorageManager`. 

106 collections : `type` 

107 Manager class implementing `CollectionManager`. 

108 datasets : `type` 

109 Manager class implementing `DatasetRecordStorageManager`. 

110 datastoreBridges : `type` 

111 Manager class implementing `DatastoreRegistryBridgeManager`. 

112 writeable : `bool`, optional 

113 If True then Registry will support write operations. 

114 create : `bool`, optional 

115 If True then database schema will be initialized, it must be empty 

116 before instantiating Registry. 

117 """ 

118 

119 defaultConfigFile: Optional[str] = None 

120 """Path to configuration defaults. Accessed within the ``configs`` resource 

121 or relative to a search path. Can be None if no defaults specified. 

122 """ 

123 

124 @classmethod 

125 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False, 

126 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry: 

127 """Create `Registry` subclass instance from `config`. 

128 

129 Uses ``registry.cls`` from `config` to determine which subclass to 

130 instantiate. 

131 

132 Parameters 

133 ---------- 

134 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

135 Registry configuration 

136 create : `bool`, optional 

137 Assume empty Registry and create a new one. 

138 butlerRoot : `str`, optional 

139 Path to the repository root this `Registry` will manage. 

140 writeable : `bool`, optional 

141 If `True` (default) create a read-write connection to the database. 

142 

143 Returns 

144 ------- 

145 registry : `Registry` (subclass) 

146 A new `Registry` subclass instance. 

147 """ 

148 if not isinstance(config, RegistryConfig): 

149 if isinstance(config, str) or isinstance(config, Config): 

150 config = RegistryConfig(config) 

151 else: 

152 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

153 config.replaceRoot(butlerRoot) 

154 DatabaseClass = config.getDatabaseClass() 

155 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

156 namespace=config.get("namespace"), writeable=writeable) 

157 universe = DimensionUniverse(config) 

158 attributes = doImport(config["managers", "attributes"]) 

159 opaque = doImport(config["managers", "opaque"]) 

160 dimensions = doImport(config["managers", "dimensions"]) 

161 collections = doImport(config["managers", "collections"]) 

162 datasets = doImport(config["managers", "datasets"]) 

163 datastoreBridges = doImport(config["managers", "datastores"]) 

164 

165 return cls(database, universe, dimensions=dimensions, attributes=attributes, opaque=opaque, 

166 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges, 

167 writeable=writeable, create=create) 

168 

169 def __init__(self, database: Database, universe: DimensionUniverse, *, 

170 attributes: Type[ButlerAttributeManager], 

171 opaque: Type[OpaqueTableStorageManager], 

172 dimensions: Type[DimensionRecordStorageManager], 

173 collections: Type[CollectionManager], 

174 datasets: Type[DatasetRecordStorageManager], 

175 datastoreBridges: Type[DatastoreRegistryBridgeManager], 

176 writeable: bool = True, 

177 create: bool = False): 

178 self._db = database 

179 self.storageClasses = StorageClassFactory() 

180 with self._db.declareStaticTables(create=create) as context: 

181 self._attributes = attributes.initialize(self._db, context) 

182 self._dimensions = dimensions.initialize(self._db, context, universe=universe) 

183 self._collections = collections.initialize(self._db, context) 

184 self._datasets = datasets.initialize(self._db, context, 

185 collections=self._collections, 

186 universe=self.dimensions) 

187 self._opaque = opaque.initialize(self._db, context) 

188 self._datastoreBridges = datastoreBridges.initialize(self._db, context, 

189 opaque=self._opaque, 

190 datasets=datasets, 

191 universe=self.dimensions) 

192 versions = ButlerVersionsManager( 

193 self._attributes, 

194 dict( 

195 attributes=self._attributes, 

196 opaque=self._opaque, 

197 dimensions=self._dimensions, 

198 collections=self._collections, 

199 datasets=self._datasets, 

200 datastores=self._datastoreBridges, 

201 ) 

202 ) 

203 # store managers and their versions in attributes table 

204 context.addInitializer(lambda db: versions.storeManagersConfig()) 

205 context.addInitializer(lambda db: versions.storeManagersVersions()) 

206 

207 if not create: 

208 # verify that configured versions are compatible with schema 

209 versions.checkManagersConfig() 

210 versions.checkManagersVersions(writeable) 

211 try: 

212 versions.checkManagersDigests() 

213 except DigestMismatchError as exc: 

214 # potentially digest mismatch is a serious error but during 

215 # development it could be benign, treat this as warning for 

216 # now. 

217 _LOG.warning(f"Registry schema digest mismatch: {exc}") 

218 

219 self._collections.refresh() 

220 self._datasets.refresh(universe=self._dimensions.universe) 

221 

222 def __str__(self) -> str: 

223 return str(self._db) 

224 

225 def __repr__(self) -> str: 

226 return f"Registry({self._db!r}, {self.dimensions!r})" 

227 

228 def isWriteable(self) -> bool: 

229 """Return `True` if this registry allows write operations, and `False` 

230 otherwise. 

231 """ 

232 return self._db.isWriteable() 

233 

234 @property 

235 def dimensions(self) -> DimensionUniverse: 

236 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

237 """ 

238 return self._dimensions.universe 

239 

240 @contextlib.contextmanager 

241 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

242 """Return a context manager that represents a transaction. 

243 """ 

244 try: 

245 with self._db.transaction(savepoint=savepoint): 

246 yield 

247 except BaseException: 

248 # TODO: this clears the caches sometimes when we wouldn't actually 

249 # need to. Can we avoid that? 

250 self._dimensions.clearCaches() 

251 raise 

252 

253 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

254 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

255 other data repository client. 

256 

257 Opaque table records can be added via `insertOpaqueData`, retrieved via 

258 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

259 

260 Parameters 

261 ---------- 

262 tableName : `str` 

263 Logical name of the opaque table. This may differ from the 

264 actual name used in the database by a prefix and/or suffix. 

265 spec : `ddl.TableSpec` 

266 Specification for the table to be added. 

267 """ 

268 self._opaque.register(tableName, spec) 

269 

270 @transactional 

271 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

272 """Insert records into an opaque table. 

273 

274 Parameters 

275 ---------- 

276 tableName : `str` 

277 Logical name of the opaque table. Must match the name used in a 

278 previous call to `registerOpaqueTable`. 

279 data 

280 Each additional positional argument is a dictionary that represents 

281 a single row to be added. 

282 """ 

283 self._opaque[tableName].insert(*data) 

284 

285 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

286 """Retrieve records from an opaque table. 

287 

288 Parameters 

289 ---------- 

290 tableName : `str` 

291 Logical name of the opaque table. Must match the name used in a 

292 previous call to `registerOpaqueTable`. 

293 where 

294 Additional keyword arguments are interpreted as equality 

295 constraints that restrict the returned rows (combined with AND); 

296 keyword arguments are column names and values are the values they 

297 must have. 

298 

299 Yields 

300 ------ 

301 row : `dict` 

302 A dictionary representing a single result row. 

303 """ 

304 yield from self._opaque[tableName].fetch(**where) 

305 

306 @transactional 

307 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

308 """Remove records from an opaque table. 

309 

310 Parameters 

311 ---------- 

312 tableName : `str` 

313 Logical name of the opaque table. Must match the name used in a 

314 previous call to `registerOpaqueTable`. 

315 where 

316 Additional keyword arguments are interpreted as equality 

317 constraints that restrict the deleted rows (combined with AND); 

318 keyword arguments are column names and values are the values they 

319 must have. 

320 """ 

321 self._opaque[tableName].delete(**where) 

322 

323 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED) -> None: 

324 """Add a new collection if one with the given name does not exist. 

325 

326 Parameters 

327 ---------- 

328 name : `str` 

329 The name of the collection to create. 

330 type : `CollectionType` 

331 Enum value indicating the type of collection to create. 

332 

333 Notes 

334 ----- 

335 This method cannot be called within transactions, as it needs to be 

336 able to perform its own transaction to be concurrent. 

337 """ 

338 self._collections.register(name, type) 

339 

340 def getCollectionType(self, name: str) -> CollectionType: 

341 """Return an enumeration value indicating the type of the given 

342 collection. 

343 

344 Parameters 

345 ---------- 

346 name : `str` 

347 The name of the collection. 

348 

349 Returns 

350 ------- 

351 type : `CollectionType` 

352 Enum value indicating the type of this collection. 

353 

354 Raises 

355 ------ 

356 MissingCollectionError 

357 Raised if no collection with the given name exists. 

358 """ 

359 return self._collections.find(name).type 

360 

361 def registerRun(self, name: str) -> None: 

362 """Add a new run if one with the given name does not exist. 

363 

364 Parameters 

365 ---------- 

366 name : `str` 

367 The name of the run to create. 

368 

369 Notes 

370 ----- 

371 This method cannot be called within transactions, as it needs to be 

372 able to perform its own transaction to be concurrent. 

373 """ 

374 self._collections.register(name, CollectionType.RUN) 

375 

376 @transactional 

377 def removeCollection(self, name: str) -> None: 

378 """Completely remove the given collection. 

379 

380 Parameters 

381 ---------- 

382 name : `str` 

383 The name of the collection to remove. 

384 

385 Raises 

386 ------ 

387 MissingCollectionError 

388 Raised if no collection with the given name exists. 

389 

390 Notes 

391 ----- 

392 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

393 in it are also fully removed. This requires that those datasets be 

394 removed (or at least trashed) from any datastores that hold them first. 

395 

396 A collection may not be deleted as long as it is referenced by a 

397 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

398 be deleted or redefined first. 

399 """ 

400 self._collections.remove(name) 

401 

402 def getCollectionChain(self, parent: str) -> CollectionSearch: 

403 """Return the child collections in a `~CollectionType.CHAINED` 

404 collection. 

405 

406 Parameters 

407 ---------- 

408 parent : `str` 

409 Name of the chained collection. Must have already been added via 

410 a call to `Registry.registerCollection`. 

411 

412 Returns 

413 ------- 

414 children : `CollectionSearch` 

415 An object that defines the search path of the collection. 

416 See :ref:`daf_butler_collection_expressions` for more information. 

417 

418 Raises 

419 ------ 

420 MissingCollectionError 

421 Raised if ``parent`` does not exist in the `Registry`. 

422 TypeError 

423 Raised if ``parent`` does not correspond to a 

424 `~CollectionType.CHAINED` collection. 

425 """ 

426 record = self._collections.find(parent) 

427 if record.type is not CollectionType.CHAINED: 

428 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

429 assert isinstance(record, ChainedCollectionRecord) 

430 return record.children 

431 

432 @transactional 

433 def setCollectionChain(self, parent: str, children: Any) -> None: 

434 """Define or redefine a `~CollectionType.CHAINED` collection. 

435 

436 Parameters 

437 ---------- 

438 parent : `str` 

439 Name of the chained collection. Must have already been added via 

440 a call to `Registry.registerCollection`. 

441 children : `Any` 

442 An expression defining an ordered search of child collections, 

443 generally an iterable of `str`. Restrictions on the dataset types 

444 to be searched can also be included, by passing mapping or an 

445 iterable containing tuples; see 

446 :ref:`daf_butler_collection_expressions` for more information. 

447 

448 Raises 

449 ------ 

450 MissingCollectionError 

451 Raised when any of the given collections do not exist in the 

452 `Registry`. 

453 TypeError 

454 Raised if ``parent`` does not correspond to a 

455 `~CollectionType.CHAINED` collection. 

456 ValueError 

457 Raised if the given collections contains a cycle. 

458 """ 

459 record = self._collections.find(parent) 

460 if record.type is not CollectionType.CHAINED: 

461 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

462 assert isinstance(record, ChainedCollectionRecord) 

463 children = CollectionSearch.fromExpression(children) 

464 if children != record.children: 

465 record.update(self._collections, children) 

466 

467 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

468 """ 

469 Add a new `DatasetType` to the Registry. 

470 

471 It is not an error to register the same `DatasetType` twice. 

472 

473 Parameters 

474 ---------- 

475 datasetType : `DatasetType` 

476 The `DatasetType` to be added. 

477 

478 Returns 

479 ------- 

480 inserted : `bool` 

481 `True` if ``datasetType`` was inserted, `False` if an identical 

482 existing `DatsetType` was found. Note that in either case the 

483 DatasetType is guaranteed to be defined in the Registry 

484 consistently with the given definition. 

485 

486 Raises 

487 ------ 

488 ValueError 

489 Raised if the dimensions or storage class are invalid. 

490 ConflictingDefinitionError 

491 Raised if this DatasetType is already registered with a different 

492 definition. 

493 

494 Notes 

495 ----- 

496 This method cannot be called within transactions, as it needs to be 

497 able to perform its own transaction to be concurrent. 

498 """ 

499 _, inserted = self._datasets.register(datasetType) 

500 return inserted 

501 

502 def removeDatasetType(self, name: str) -> None: 

503 """Remove the named `DatasetType` from the registry. 

504 

505 .. warning:: 

506 

507 Registry caches the dataset type definitions. This means that 

508 deleting the dataset type definition may result in unexpected 

509 behavior from other butler processes that are active that have 

510 not seen the deletion. 

511 

512 Parameters 

513 ---------- 

514 name : `str` 

515 Name of the type to be removed. 

516 

517 Raises 

518 ------ 

519 lsst.daf.butler.registry.OrphanedRecordError 

520 Raised if an attempt is made to remove the dataset type definition 

521 when there are already datasets associated with it. 

522 

523 Notes 

524 ----- 

525 If the dataset type is not registered the method will return without 

526 action. 

527 """ 

528 self._datasets.remove(name, universe=self._dimensions.universe) 

529 

530 def getDatasetType(self, name: str) -> DatasetType: 

531 """Get the `DatasetType`. 

532 

533 Parameters 

534 ---------- 

535 name : `str` 

536 Name of the type. 

537 

538 Returns 

539 ------- 

540 type : `DatasetType` 

541 The `DatasetType` associated with the given name. 

542 

543 Raises 

544 ------ 

545 KeyError 

546 Requested named DatasetType could not be found in registry. 

547 """ 

548 return self._datasets[name].datasetType 

549 

550 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

551 collections: Any, timespan: Optional[Timespan] = None, 

552 **kwargs: Any) -> Optional[DatasetRef]: 

553 """Find a dataset given its `DatasetType` and data ID. 

554 

555 This can be used to obtain a `DatasetRef` that permits the dataset to 

556 be read from a `Datastore`. If the dataset is a component and can not 

557 be found using the provided dataset type, a dataset ref for the parent 

558 will be returned instead but with the correct dataset type. 

559 

560 Parameters 

561 ---------- 

562 datasetType : `DatasetType` or `str` 

563 A `DatasetType` or the name of one. 

564 dataId : `dict` or `DataCoordinate`, optional 

565 A `dict`-like object containing the `Dimension` links that identify 

566 the dataset within a collection. 

567 collections 

568 An expression that fully or partially identifies the collections 

569 to search for the dataset, such as a `str`, `DatasetType`, or 

570 iterable thereof. See :ref:`daf_butler_collection_expressions` 

571 for more information. 

572 timespan : `Timespan`, optional 

573 A timespan that the validity range of the dataset must overlap. 

574 If not provided, any `~CollectionType.CALIBRATION` collections 

575 matched by the ``collections`` argument will not be searched. 

576 **kwargs 

577 Additional keyword arguments passed to 

578 `DataCoordinate.standardize` to convert ``dataId`` to a true 

579 `DataCoordinate` or augment an existing one. 

580 

581 Returns 

582 ------- 

583 ref : `DatasetRef` 

584 A reference to the dataset, or `None` if no matching Dataset 

585 was found. 

586 

587 Raises 

588 ------ 

589 LookupError 

590 Raised if one or more data ID keys are missing. 

591 KeyError 

592 Raised if the dataset type does not exist. 

593 MissingCollectionError 

594 Raised if any of ``collections`` does not exist in the registry. 

595 

596 Notes 

597 ----- 

598 This method simply returns `None` and does not raise an exception even 

599 when the set of collections searched is intrinsically incompatible with 

600 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but 

601 only `~CollectionType.CALIBRATION` collections are being searched. 

602 This may make it harder to debug some lookup failures, but the behavior 

603 is intentional; we consider it more important that failed searches are 

604 reported consistently, regardless of the reason, and that adding 

605 additional collections that do not contain a match to the search path 

606 never changes the behavior. 

607 """ 

608 if isinstance(datasetType, DatasetType): 

609 storage = self._datasets[datasetType.name] 

610 else: 

611 storage = self._datasets[datasetType] 

612 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

613 universe=self.dimensions, **kwargs) 

614 collections = CollectionSearch.fromExpression(collections) 

615 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType): 

616 if (collectionRecord.type is CollectionType.CALIBRATION 

617 and (not storage.datasetType.isCalibration() or timespan is None)): 

618 continue 

619 result = storage.find(collectionRecord, dataId, timespan=timespan) 

620 if result is not None: 

621 return result 

622 

623 return None 

624 

625 @transactional 

626 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

627 run: str) -> List[DatasetRef]: 

628 """Insert one or more datasets into the `Registry` 

629 

630 This always adds new datasets; to associate existing datasets with 

631 a new collection, use ``associate``. 

632 

633 Parameters 

634 ---------- 

635 datasetType : `DatasetType` or `str` 

636 A `DatasetType` or the name of one. 

637 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

638 Dimension-based identifiers for the new datasets. 

639 run : `str` 

640 The name of the run that produced the datasets. 

641 

642 Returns 

643 ------- 

644 refs : `list` of `DatasetRef` 

645 Resolved `DatasetRef` instances for all given data IDs (in the same 

646 order). 

647 

648 Raises 

649 ------ 

650 ConflictingDefinitionError 

651 If a dataset with the same dataset type and data ID as one of those 

652 given already exists in ``run``. 

653 MissingCollectionError 

654 Raised if ``run`` does not exist in the registry. 

655 """ 

656 if isinstance(datasetType, DatasetType): 

657 storage = self._datasets.find(datasetType.name) 

658 if storage is None: 

659 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

660 else: 

661 storage = self._datasets.find(datasetType) 

662 if storage is None: 

663 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

664 runRecord = self._collections.find(run) 

665 if runRecord.type is not CollectionType.RUN: 

666 raise TypeError("Given collection is of type {runRecord.type.name}; RUN collection required.") 

667 assert isinstance(runRecord, RunRecord) 

668 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

669 for dataId in dataIds] 

670 try: 

671 refs = list(storage.insert(runRecord, expandedDataIds)) 

672 except sqlalchemy.exc.IntegrityError as err: 

673 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

674 f"one or more datasets of type {storage.datasetType} into " 

675 f"collection '{run}'. " 

676 f"This probably means a dataset with the same data ID " 

677 f"and dataset type already exists, but it may also mean a " 

678 f"dimension row is missing.") from err 

679 return refs 

680 

681 def getDataset(self, id: int) -> Optional[DatasetRef]: 

682 """Retrieve a Dataset entry. 

683 

684 Parameters 

685 ---------- 

686 id : `int` 

687 The unique identifier for the dataset. 

688 

689 Returns 

690 ------- 

691 ref : `DatasetRef` or `None` 

692 A ref to the Dataset, or `None` if no matching Dataset 

693 was found. 

694 """ 

695 ref = self._datasets.getDatasetRef(id, universe=self.dimensions) 

696 if ref is None: 

697 return None 

698 return ref 

699 

700 @transactional 

701 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

702 """Remove datasets from the Registry. 

703 

704 The datasets will be removed unconditionally from all collections, and 

705 any `Quantum` that consumed this dataset will instead be marked with 

706 having a NULL input. `Datastore` records will *not* be deleted; the 

707 caller is responsible for ensuring that the dataset has already been 

708 removed from all Datastores. 

709 

710 Parameters 

711 ---------- 

712 refs : `Iterable` of `DatasetRef` 

713 References to the datasets to be removed. Must include a valid 

714 ``id`` attribute, and should be considered invalidated upon return. 

715 

716 Raises 

717 ------ 

718 AmbiguousDatasetError 

719 Raised if any ``ref.id`` is `None`. 

720 OrphanedRecordError 

721 Raised if any dataset is still present in any `Datastore`. 

722 """ 

723 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

724 storage = self._datasets.find(datasetType.name) 

725 assert storage is not None 

726 try: 

727 storage.delete(refsForType) 

728 except sqlalchemy.exc.IntegrityError as err: 

729 raise OrphanedRecordError("One or more datasets is still " 

730 "present in one or more Datastores.") from err 

731 

732 @transactional 

733 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

734 """Add existing datasets to a `~CollectionType.TAGGED` collection. 

735 

736 If a DatasetRef with the same exact integer ID is already in a 

737 collection nothing is changed. If a `DatasetRef` with the same 

738 `DatasetType` and data ID but with different integer ID 

739 exists in the collection, `ConflictingDefinitionError` is raised. 

740 

741 Parameters 

742 ---------- 

743 collection : `str` 

744 Indicates the collection the datasets should be associated with. 

745 refs : `Iterable` [ `DatasetRef` ] 

746 An iterable of resolved `DatasetRef` instances that already exist 

747 in this `Registry`. 

748 

749 Raises 

750 ------ 

751 ConflictingDefinitionError 

752 If a Dataset with the given `DatasetRef` already exists in the 

753 given collection. 

754 AmbiguousDatasetError 

755 Raised if ``any(ref.id is None for ref in refs)``. 

756 MissingCollectionError 

757 Raised if ``collection`` does not exist in the registry. 

758 TypeError 

759 Raise adding new datasets to the given ``collection`` is not 

760 allowed. 

761 """ 

762 collectionRecord = self._collections.find(collection) 

763 if collectionRecord.type is not CollectionType.TAGGED: 

764 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

765 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

766 storage = self._datasets.find(datasetType.name) 

767 assert storage is not None 

768 try: 

769 storage.associate(collectionRecord, refsForType) 

770 except sqlalchemy.exc.IntegrityError as err: 

771 raise ConflictingDefinitionError( 

772 f"Constraint violation while associating dataset of type {datasetType.name} with " 

773 f"collection {collection}. This probably means that one or more datasets with the same " 

774 f"dataset type and data ID already exist in the collection, but it may also indicate " 

775 f"that the datasets do not exist." 

776 ) from err 

777 

778 @transactional 

779 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

780 """Remove existing datasets from a `~CollectionType.TAGGED` collection. 

781 

782 ``collection`` and ``ref`` combinations that are not currently 

783 associated are silently ignored. 

784 

785 Parameters 

786 ---------- 

787 collection : `str` 

788 The collection the datasets should no longer be associated with. 

789 refs : `Iterable` [ `DatasetRef` ] 

790 An iterable of resolved `DatasetRef` instances that already exist 

791 in this `Registry`. 

792 

793 Raises 

794 ------ 

795 AmbiguousDatasetError 

796 Raised if any of the given dataset references is unresolved. 

797 MissingCollectionError 

798 Raised if ``collection`` does not exist in the registry. 

799 TypeError 

800 Raise adding new datasets to the given ``collection`` is not 

801 allowed. 

802 """ 

803 collectionRecord = self._collections.find(collection) 

804 if collectionRecord.type is not CollectionType.TAGGED: 

805 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

806 "expected TAGGED.") 

807 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

808 storage = self._datasets.find(datasetType.name) 

809 assert storage is not None 

810 storage.disassociate(collectionRecord, refsForType) 

811 

812 @transactional 

813 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

814 """Associate one or more datasets with a calibration collection and a 

815 validity range within it. 

816 

817 Parameters 

818 ---------- 

819 collection : `str` 

820 The name of an already-registered `~CollectionType.CALIBRATION` 

821 collection. 

822 refs : `Iterable` [ `DatasetRef` ] 

823 Datasets to be associated. 

824 timespan : `Timespan` 

825 The validity range for these datasets within the collection. 

826 

827 Raises 

828 ------ 

829 AmbiguousDatasetError 

830 Raised if any of the given `DatasetRef` instances is unresolved. 

831 ConflictingDefinitionError 

832 Raised if the collection already contains a different dataset with 

833 the same `DatasetType` and data ID and an overlapping validity 

834 range. 

835 TypeError 

836 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

837 collection or if one or more datasets are of a dataset type for 

838 which `DatasetType.isCalibration` returns `False`. 

839 """ 

840 collectionRecord = self._collections.find(collection) 

841 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

842 storage = self._datasets[datasetType.name] 

843 storage.certify(collectionRecord, refsForType, timespan) 

844 

845 @transactional 

846 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *, 

847 dataIds: Optional[Iterable[DataId]] = None) -> None: 

848 """Remove or adjust datasets to clear a validity range within a 

849 calibration collection. 

850 

851 Parameters 

852 ---------- 

853 collection : `str` 

854 The name of an already-registered `~CollectionType.CALIBRATION` 

855 collection. 

856 datasetType : `str` or `DatasetType` 

857 Name or `DatasetType` instance for the datasets to be decertified. 

858 timespan : `Timespan`, optional 

859 The validity range to remove datasets from within the collection. 

860 Datasets that overlap this range but are not contained by it will 

861 have their validity ranges adjusted to not overlap it, which may 

862 split a single dataset validity range into two. 

863 dataIds : `Iterable` [ `DataId` ], optional 

864 Data IDs that should be decertified within the given validity range 

865 If `None`, all data IDs for ``self.datasetType`` will be 

866 decertified. 

867 

868 Raises 

869 ------ 

870 TypeError 

871 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

872 collection or if ``datasetType.isCalibration() is False``. 

873 """ 

874 collectionRecord = self._collections.find(collection) 

875 if isinstance(datasetType, str): 

876 storage = self._datasets[datasetType] 

877 else: 

878 storage = self._datasets[datasetType.name] 

879 standardizedDataIds = None 

880 if dataIds is not None: 

881 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) 

882 for d in dataIds] 

883 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

884 

885 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

886 """Return an object that allows a new `Datastore` instance to 

887 communicate with this `Registry`. 

888 

889 Returns 

890 ------- 

891 manager : `DatastoreRegistryBridgeManager` 

892 Object that mediates communication between this `Registry` and its 

893 associated datastores. 

894 """ 

895 return self._datastoreBridges 

896 

897 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

898 """Retrieve datastore locations for a given dataset. 

899 

900 Parameters 

901 ---------- 

902 ref : `DatasetRef` 

903 A reference to the dataset for which to retrieve storage 

904 information. 

905 

906 Returns 

907 ------- 

908 datastores : `Iterable` [ `str` ] 

909 All the matching datastores holding this dataset. 

910 

911 Raises 

912 ------ 

913 AmbiguousDatasetError 

914 Raised if ``ref.id`` is `None`. 

915 """ 

916 return self._datastoreBridges.findDatastores(ref) 

917 

918 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

919 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

920 **kwargs: Any) -> DataCoordinate: 

921 """Expand a dimension-based data ID to include additional information. 

922 

923 Parameters 

924 ---------- 

925 dataId : `DataCoordinate` or `dict`, optional 

926 Data ID to be expanded; augmented and overridden by ``kwds``. 

927 graph : `DimensionGraph`, optional 

928 Set of dimensions for the expanded ID. If `None`, the dimensions 

929 will be inferred from the keys of ``dataId`` and ``kwds``. 

930 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph`` 

931 are silently ignored, providing a way to extract and expand a 

932 subset of a data ID. 

933 records : `Mapping` [`str`, `DimensionRecord`], optional 

934 Dimension record data to use before querying the database for that 

935 data, keyed by element name. 

936 **kwargs 

937 Additional keywords are treated like additional key-value pairs for 

938 ``dataId``, extending and overriding 

939 

940 Returns 

941 ------- 

942 expanded : `DataCoordinate` 

943 A data ID that includes full metadata for all of the dimensions it 

944 identifieds, i.e. guarantees that ``expanded.hasRecords()`` and 

945 ``expanded.hasFull()`` both return `True`. 

946 """ 

947 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs) 

948 if standardized.hasRecords(): 

949 return standardized 

950 if records is None: 

951 records = {} 

952 elif isinstance(records, NamedKeyMapping): 

953 records = records.byName() 

954 else: 

955 records = dict(records) 

956 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

957 records.update(dataId.records.byName()) 

958 keys = standardized.byName() 

959 for element in standardized.graph.primaryKeyTraversalOrder: 

960 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

961 if record is ...: 

962 if isinstance(element, Dimension) and keys.get(element.name) is None: 

963 if element in standardized.graph.required: 

964 raise LookupError( 

965 f"No value or null value for required dimension {element.name}." 

966 ) 

967 keys[element.name] = None 

968 record = None 

969 else: 

970 storage = self._dimensions[element] 

971 dataIdSet = DataCoordinateIterable.fromScalar( 

972 DataCoordinate.standardize(keys, graph=element.graph) 

973 ) 

974 fetched = tuple(storage.fetch(dataIdSet)) 

975 try: 

976 (record,) = fetched 

977 except ValueError: 

978 record = None 

979 records[element.name] = record 

980 if record is not None: 

981 for d in element.implied: 

982 value = getattr(record, d.name) 

983 if keys.setdefault(d.name, value) != value: 

984 raise InconsistentDataIdError( 

985 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

986 f"but {element.name} implies {d.name}={value!r}." 

987 ) 

988 else: 

989 if element in standardized.graph.required: 

990 raise LookupError( 

991 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

992 ) 

993 if element.alwaysJoin: 

994 raise InconsistentDataIdError( 

995 f"Could not fetch record for element {element.name} via keys {keys}, ", 

996 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

997 "related." 

998 ) 

999 for d in element.implied: 

1000 keys.setdefault(d.name, None) 

1001 records.setdefault(d.name, None) 

1002 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

1003 

1004 def insertDimensionData(self, element: Union[DimensionElement, str], 

1005 *data: Union[Mapping[str, Any], DimensionRecord], 

1006 conform: bool = True) -> None: 

1007 """Insert one or more dimension records into the database. 

1008 

1009 Parameters 

1010 ---------- 

1011 element : `DimensionElement` or `str` 

1012 The `DimensionElement` or name thereof that identifies the table 

1013 records will be inserted into. 

1014 data : `dict` or `DimensionRecord` (variadic) 

1015 One or more records to insert. 

1016 conform : `bool`, optional 

1017 If `False` (`True` is default) perform no checking or conversions, 

1018 and assume that ``element`` is a `DimensionElement` instance and 

1019 ``data`` is a one or more `DimensionRecord` instances of the 

1020 appropriate subclass. 

1021 """ 

1022 if conform: 

1023 if isinstance(element, str): 

1024 element = self.dimensions[element] 

1025 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1026 for row in data] 

1027 else: 

1028 # Ignore typing since caller said to trust them with conform=False. 

1029 records = data # type: ignore 

1030 storage = self._dimensions[element] # type: ignore 

1031 storage.insert(*records) 

1032 

1033 def syncDimensionData(self, element: Union[DimensionElement, str], 

1034 row: Union[Mapping[str, Any], DimensionRecord], 

1035 conform: bool = True) -> bool: 

1036 """Synchronize the given dimension record with the database, inserting 

1037 if it does not already exist and comparing values if it does. 

1038 

1039 Parameters 

1040 ---------- 

1041 element : `DimensionElement` or `str` 

1042 The `DimensionElement` or name thereof that identifies the table 

1043 records will be inserted into. 

1044 row : `dict` or `DimensionRecord` 

1045 The record to insert. 

1046 conform : `bool`, optional 

1047 If `False` (`True` is default) perform no checking or conversions, 

1048 and assume that ``element`` is a `DimensionElement` instance and 

1049 ``data`` is a one or more `DimensionRecord` instances of the 

1050 appropriate subclass. 

1051 

1052 Returns 

1053 ------- 

1054 inserted : `bool` 

1055 `True` if a new row was inserted, `False` otherwise. 

1056 

1057 Raises 

1058 ------ 

1059 ConflictingDefinitionError 

1060 Raised if the record exists in the database (according to primary 

1061 key lookup) but is inconsistent with the given one. 

1062 

1063 Notes 

1064 ----- 

1065 This method cannot be called within transactions, as it needs to be 

1066 able to perform its own transaction to be concurrent. 

1067 """ 

1068 if conform: 

1069 if isinstance(element, str): 

1070 element = self.dimensions[element] 

1071 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1072 else: 

1073 # Ignore typing since caller said to trust them with conform=False. 

1074 record = row # type: ignore 

1075 storage = self._dimensions[element] # type: ignore 

1076 return storage.sync(record) 

1077 

1078 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

1079 ) -> Iterator[DatasetType]: 

1080 """Iterate over the dataset types whose names match an expression. 

1081 

1082 Parameters 

1083 ---------- 

1084 expression : `Any`, optional 

1085 An expression that fully or partially identifies the dataset types 

1086 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1087 `...` can be used to return all dataset types, and is the default. 

1088 See :ref:`daf_butler_dataset_type_expressions` for more 

1089 information. 

1090 components : `bool`, optional 

1091 If `True`, apply all expression patterns to component dataset type 

1092 names as well. If `False`, never apply patterns to components. 

1093 If `None` (default), apply patterns to components only if their 

1094 parent datasets were not matched by the expression. 

1095 Fully-specified component datasets (`str` or `DatasetType` 

1096 instances) are always included. 

1097 

1098 Yields 

1099 ------ 

1100 datasetType : `DatasetType` 

1101 A `DatasetType` instance whose name matches ``expression``. 

1102 """ 

1103 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

1104 if wildcard is Ellipsis: 

1105 for datasetType in self._datasets: 

1106 # The dataset type can no longer be a component 

1107 yield datasetType 

1108 if components and datasetType.isComposite(): 

1109 # Automatically create the component dataset types 

1110 for component in datasetType.makeAllComponentDatasetTypes(): 

1111 yield component 

1112 return 

1113 done: Set[str] = set() 

1114 for name in wildcard.strings: 

1115 storage = self._datasets.find(name) 

1116 if storage is not None: 

1117 done.add(storage.datasetType.name) 

1118 yield storage.datasetType 

1119 if wildcard.patterns: 

1120 # If components (the argument) is None, we'll save component 

1121 # dataset that we might want to match, but only if their parents 

1122 # didn't get included. 

1123 componentsForLater = [] 

1124 for registeredDatasetType in self._datasets: 

1125 # Components are not stored in registry so expand them here 

1126 allDatasetTypes = [registeredDatasetType] \ 

1127 + registeredDatasetType.makeAllComponentDatasetTypes() 

1128 for datasetType in allDatasetTypes: 

1129 if datasetType.name in done: 

1130 continue 

1131 parentName, componentName = datasetType.nameAndComponent() 

1132 if componentName is not None and not components: 

1133 if components is None and parentName not in done: 

1134 componentsForLater.append(datasetType) 

1135 continue 

1136 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1137 done.add(datasetType.name) 

1138 yield datasetType 

1139 # Go back and try to match saved components. 

1140 for datasetType in componentsForLater: 

1141 parentName, _ = datasetType.nameAndComponent() 

1142 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1143 yield datasetType 

1144 

1145 def queryCollections(self, expression: Any = ..., 

1146 datasetType: Optional[DatasetType] = None, 

1147 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1148 flattenChains: bool = False, 

1149 includeChains: Optional[bool] = None) -> Iterator[str]: 

1150 """Iterate over the collections whose names match an expression. 

1151 

1152 Parameters 

1153 ---------- 

1154 expression : `Any`, optional 

1155 An expression that fully or partially identifies the collections 

1156 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1157 `...` can be used to return all collections, and is the default. 

1158 See :ref:`daf_butler_collection_expressions` for more 

1159 information. 

1160 datasetType : `DatasetType`, optional 

1161 If provided, only yield collections that should be searched for 

1162 this dataset type according to ``expression``. If this is 

1163 not provided, any dataset type restrictions in ``expression`` are 

1164 ignored. 

1165 collectionTypes : `AbstractSet` [ `CollectionType` ], optional 

1166 If provided, only yield collections of these types. 

1167 flattenChains : `bool`, optional 

1168 If `True` (`False` is default), recursively yield the child 

1169 collections of matching `~CollectionType.CHAINED` collections. 

1170 includeChains : `bool`, optional 

1171 If `True`, yield records for matching `~CollectionType.CHAINED` 

1172 collections. Default is the opposite of ``flattenChains``: include 

1173 either CHAINED collections or their children, but not both. 

1174 

1175 Yields 

1176 ------ 

1177 collection : `str` 

1178 The name of a collection that matches ``expression``. 

1179 """ 

1180 query = CollectionQuery.fromExpression(expression) 

1181 for record in query.iter(self._collections, datasetType=datasetType, 

1182 collectionTypes=frozenset(collectionTypes), 

1183 flattenChains=flattenChains, includeChains=includeChains): 

1184 yield record.name 

1185 

1186 def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder: 

1187 """Return a `QueryBuilder` instance capable of constructing and 

1188 managing more complex queries than those obtainable via `Registry` 

1189 interfaces. 

1190 

1191 This is an advanced interface; downstream code should prefer 

1192 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

1193 are sufficient. 

1194 

1195 Parameters 

1196 ---------- 

1197 summary : `queries.QuerySummary` 

1198 Object describing and categorizing the full set of dimensions that 

1199 will be included in the query. 

1200 

1201 Returns 

1202 ------- 

1203 builder : `queries.QueryBuilder` 

1204 Object that can be used to construct and perform advanced queries. 

1205 """ 

1206 return queries.QueryBuilder( 

1207 summary, 

1208 queries.RegistryManagers( 

1209 collections=self._collections, 

1210 dimensions=self._dimensions, 

1211 datasets=self._datasets 

1212 ) 

1213 ) 

1214 

1215 def queryDatasets(self, datasetType: Any, *, 

1216 collections: Any, 

1217 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1218 dataId: Optional[DataId] = None, 

1219 where: Optional[str] = None, 

1220 deduplicate: bool = False, 

1221 components: Optional[bool] = None, 

1222 **kwargs: Any) -> queries.DatasetQueryResults: 

1223 """Query for and iterate over dataset references matching user-provided 

1224 criteria. 

1225 

1226 Parameters 

1227 ---------- 

1228 datasetType 

1229 An expression that fully or partially identifies the dataset types 

1230 to be queried. Allowed types include `DatasetType`, `str`, 

1231 `re.Pattern`, and iterables thereof. The special value `...` can 

1232 be used to query all dataset types. See 

1233 :ref:`daf_butler_dataset_type_expressions` for more information. 

1234 collections 

1235 An expression that fully or partially identifies the collections 

1236 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1237 thereof. `...` can be used to datasets from all 

1238 `~CollectionType.RUN` collections (no other collections are 

1239 necessary, because all datasets are in a ``RUN`` collection). See 

1240 :ref:`daf_butler_collection_expressions` for more information. 

1241 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1242 Dimensions to include in the query (in addition to those used 

1243 to identify the queried dataset type(s)), either to constrain 

1244 the resulting datasets to those for which a matching dimension 

1245 exists, or to relate the dataset type's dimensions to dimensions 

1246 referenced by the ``dataId`` or ``where`` arguments. 

1247 dataId : `dict` or `DataCoordinate`, optional 

1248 A data ID whose key-value pairs are used as equality constraints 

1249 in the query. 

1250 where : `str`, optional 

1251 A string expression similar to a SQL WHERE clause. May involve 

1252 any column of a dimension table or (as a shortcut for the primary 

1253 key column of a dimension table) dimension name. See 

1254 :ref:`daf_butler_dimension_expressions` for more information. 

1255 deduplicate : `bool`, optional 

1256 If `True` (`False` is default), for each result data ID, only 

1257 yield one `DatasetRef` of each `DatasetType`, from the first 

1258 collection in which a dataset of that dataset type appears 

1259 (according to the order of ``collections`` passed in). If `True`, 

1260 ``collections`` must not contain regular expressions and may not 

1261 be `...`. 

1262 components : `bool`, optional 

1263 If `True`, apply all dataset expression patterns to component 

1264 dataset type names as well. If `False`, never apply patterns to 

1265 components. If `None` (default), apply patterns to components only 

1266 if their parent datasets were not matched by the expression. 

1267 Fully-specified component datasets (`str` or `DatasetType` 

1268 instances) are always included. 

1269 **kwargs 

1270 Additional keyword arguments are forwarded to 

1271 `DataCoordinate.standardize` when processing the ``dataId`` 

1272 argument (and may be used to provide a constraining data ID even 

1273 when the ``dataId`` argument is `None`). 

1274 

1275 Returns 

1276 ------- 

1277 refs : `queries.DatasetQueryResults` 

1278 Dataset references matching the given query criteria. 

1279 

1280 Raises 

1281 ------ 

1282 TypeError 

1283 Raised when the arguments are incompatible, such as when a 

1284 collection wildcard is passed when ``deduplicate`` is `True`. 

1285 

1286 Notes 

1287 ----- 

1288 When multiple dataset types are queried in a single call, the 

1289 results of this operation are equivalent to querying for each dataset 

1290 type separately in turn, and no information about the relationships 

1291 between datasets of different types is included. In contexts where 

1292 that kind of information is important, the recommended pattern is to 

1293 use `queryDataIds` to first obtain data IDs (possibly with the 

1294 desired dataset types and collections passed as constraints to the 

1295 query), and then use multiple (generally much simpler) calls to 

1296 `queryDatasets` with the returned data IDs passed as constraints. 

1297 """ 

1298 # Standardize the collections expression. 

1299 if deduplicate: 

1300 collections = CollectionSearch.fromExpression(collections) 

1301 else: 

1302 collections = CollectionQuery.fromExpression(collections) 

1303 # Standardize and expand the data ID provided as a constraint. 

1304 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1305 

1306 # We can only query directly if given a non-component DatasetType 

1307 # instance. If we were given an expression or str or a component 

1308 # DatasetType instance, we'll populate this dict, recurse, and return. 

1309 # If we already have a non-component DatasetType, it will remain None 

1310 # and we'll run the query directly. 

1311 composition: Optional[ 

1312 Dict[ 

1313 DatasetType, # parent dataset type 

1314 List[Optional[str]] # component name, or None for parent 

1315 ] 

1316 ] = None 

1317 if not isinstance(datasetType, DatasetType): 

1318 # We were given a dataset type expression (which may be as simple 

1319 # as a str). Loop over all matching datasets, delegating handling 

1320 # of the `components` argument to queryDatasetTypes, as we populate 

1321 # the composition dict. 

1322 composition = defaultdict(list) 

1323 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

1324 parentName, componentName = trueDatasetType.nameAndComponent() 

1325 if componentName is not None: 

1326 parentDatasetType = self.getDatasetType(parentName) 

1327 composition.setdefault(parentDatasetType, []).append(componentName) 

1328 else: 

1329 composition.setdefault(trueDatasetType, []).append(None) 

1330 elif datasetType.isComponent(): 

1331 # We were given a true DatasetType instance, but it's a component. 

1332 # the composition dict will have exactly one item. 

1333 parentName, componentName = datasetType.nameAndComponent() 

1334 parentDatasetType = self.getDatasetType(parentName) 

1335 composition = {parentDatasetType: [componentName]} 

1336 if composition is not None: 

1337 # We need to recurse. Do that once for each parent dataset type. 

1338 chain = [] 

1339 for parentDatasetType, componentNames in composition.items(): 

1340 parentResults = self.queryDatasets( 

1341 parentDatasetType, 

1342 collections=collections, 

1343 dimensions=dimensions, 

1344 dataId=standardizedDataId, 

1345 where=where, 

1346 deduplicate=deduplicate 

1347 ) 

1348 if isinstance(parentResults, queries.ParentDatasetQueryResults): 

1349 chain.append( 

1350 parentResults.withComponents(componentNames) 

1351 ) 

1352 else: 

1353 # Should only happen if we know there would be no results. 

1354 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \ 

1355 and not parentResults._chain 

1356 return queries.ChainedDatasetQueryResults(chain) 

1357 # If we get here, there's no need to recurse (or we are already 

1358 # recursing; there can only ever be one level of recursion). 

1359 

1360 # The full set of dimensions in the query is the combination of those 

1361 # needed for the DatasetType and those explicitly requested, if any. 

1362 requestedDimensionNames = set(datasetType.dimensions.names) 

1363 if dimensions is not None: 

1364 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1365 # Construct the summary structure needed to construct a QueryBuilder. 

1366 summary = queries.QuerySummary( 

1367 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1368 dataId=standardizedDataId, 

1369 expression=where, 

1370 ) 

1371 builder = self.makeQueryBuilder(summary) 

1372 # Add the dataset subquery to the query, telling the QueryBuilder to 

1373 # include the rank of the selected collection in the results only if we 

1374 # need to deduplicate. Note that if any of the collections are 

1375 # actually wildcard expressions, and we've asked for deduplication, 

1376 # this will raise TypeError for us. 

1377 if not builder.joinDataset(datasetType, collections, isResult=True, deduplicate=deduplicate): 

1378 return queries.ChainedDatasetQueryResults(()) 

1379 query = builder.finish() 

1380 return queries.ParentDatasetQueryResults(self._db, query, components=[None]) 

1381 

1382 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1383 dataId: Optional[DataId] = None, 

1384 datasets: Any = None, 

1385 collections: Any = None, 

1386 where: Optional[str] = None, 

1387 components: Optional[bool] = None, 

1388 **kwargs: Any) -> queries.DataCoordinateQueryResults: 

1389 """Query for data IDs matching user-provided criteria. 

1390 

1391 Parameters 

1392 ---------- 

1393 dimensions : `Dimension` or `str`, or iterable thereof 

1394 The dimensions of the data IDs to yield, as either `Dimension` 

1395 instances or `str`. Will be automatically expanded to a complete 

1396 `DimensionGraph`. 

1397 dataId : `dict` or `DataCoordinate`, optional 

1398 A data ID whose key-value pairs are used as equality constraints 

1399 in the query. 

1400 datasets : `Any`, optional 

1401 An expression that fully or partially identifies dataset types 

1402 that should constrain the yielded data IDs. For example, including 

1403 "raw" here would constrain the yielded ``instrument``, 

1404 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1405 those for which at least one "raw" dataset exists in 

1406 ``collections``. Allowed types include `DatasetType`, `str`, 

1407 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1408 expressions, ``...`` is not permitted - it doesn't make sense to 

1409 constrain data IDs on the existence of *all* datasets. 

1410 See :ref:`daf_butler_dataset_type_expressions` for more 

1411 information. 

1412 collections: `Any`, optional 

1413 An expression that fully or partially identifies the collections 

1414 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1415 thereof. `...` can be used to return all collections. Must be 

1416 provided if ``datasets`` is, and is ignored if it is not. See 

1417 :ref:`daf_butler_collection_expressions` for more information. 

1418 where : `str`, optional 

1419 A string expression similar to a SQL WHERE clause. May involve 

1420 any column of a dimension table or (as a shortcut for the primary 

1421 key column of a dimension table) dimension name. See 

1422 :ref:`daf_butler_dimension_expressions` for more information. 

1423 components : `bool`, optional 

1424 If `True`, apply all dataset expression patterns to component 

1425 dataset type names as well. If `False`, never apply patterns to 

1426 components. If `None` (default), apply patterns to components only 

1427 if their parent datasets were not matched by the expression. 

1428 Fully-specified component datasets (`str` or `DatasetType` 

1429 instances) are always included. 

1430 **kwargs 

1431 Additional keyword arguments are forwarded to 

1432 `DataCoordinate.standardize` when processing the ``dataId`` 

1433 argument (and may be used to provide a constraining data ID even 

1434 when the ``dataId`` argument is `None`). 

1435 

1436 Returns 

1437 ------- 

1438 dataIds : `DataCoordinateQueryResults` 

1439 Data IDs matching the given query parameters. These are guaranteed 

1440 to identify all dimensions (`DataCoordinate.hasFull` returns 

1441 `True`), but will not contain `DimensionRecord` objects 

1442 (`DataCoordinate.hasRecords` returns `False`). Call 

1443 `DataCoordinateQueryResults.expanded` on the returned object to 

1444 fetch those (and consider using 

1445 `DataCoordinateQueryResults.materialize` on the returned object 

1446 first if the expected number of rows is very large). See 

1447 documentation for those methods for additional information. 

1448 """ 

1449 dimensions = iterable(dimensions) 

1450 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1451 standardizedDatasetTypes = set() 

1452 requestedDimensions = self.dimensions.extract(dimensions) 

1453 queryDimensionNames = set(requestedDimensions.names) 

1454 if datasets is not None: 

1455 if collections is None: 

1456 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1457 for datasetType in self.queryDatasetTypes(datasets, components=components): 

1458 queryDimensionNames.update(datasetType.dimensions.names) 

1459 # If any matched dataset type is a component, just operate on 

1460 # its parent instead, because Registry doesn't know anything 

1461 # about what components exist, and here (unlike queryDatasets) 

1462 # we don't care about returning them. 

1463 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

1464 if componentName is not None: 

1465 datasetType = self.getDatasetType(parentDatasetTypeName) 

1466 standardizedDatasetTypes.add(datasetType) 

1467 # Preprocess collections expression in case the original included 

1468 # single-pass iterators (we'll want to use it multiple times 

1469 # below). 

1470 collections = CollectionQuery.fromExpression(collections) 

1471 

1472 summary = queries.QuerySummary( 

1473 requested=DimensionGraph(self.dimensions, names=queryDimensionNames), 

1474 dataId=standardizedDataId, 

1475 expression=where, 

1476 ) 

1477 builder = self.makeQueryBuilder(summary) 

1478 for datasetType in standardizedDatasetTypes: 

1479 builder.joinDataset(datasetType, collections, isResult=False) 

1480 query = builder.finish() 

1481 return queries.DataCoordinateQueryResults(self._db, query) 

1482 

1483 def queryDimensionRecords(self, element: Union[DimensionElement, str], *, 

1484 dataId: Optional[DataId] = None, 

1485 datasets: Any = None, 

1486 collections: Any = None, 

1487 where: Optional[str] = None, 

1488 components: Optional[bool] = None, 

1489 **kwargs: Any) -> Iterator[DimensionRecord]: 

1490 """Query for dimension information matching user-provided criteria. 

1491 

1492 Parameters 

1493 ---------- 

1494 element : `DimensionElement` or `str` 

1495 The dimension element to obtain r 

1496 dataId : `dict` or `DataCoordinate`, optional 

1497 A data ID whose key-value pairs are used as equality constraints 

1498 in the query. 

1499 datasets : `Any`, optional 

1500 An expression that fully or partially identifies dataset types 

1501 that should constrain the yielded records. See `queryDataIds` and 

1502 :ref:`daf_butler_dataset_type_expressions` for more information. 

1503 collections: `Any`, optional 

1504 An expression that fully or partially identifies the collections 

1505 to search for datasets. See `queryDataIds` and 

1506 :ref:`daf_butler_collection_expressions` for more information. 

1507 where : `str`, optional 

1508 A string expression similar to a SQL WHERE clause. See 

1509 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more 

1510 information. 

1511 components : `bool`, optional 

1512 Whether to apply dataset expressions to components as well. 

1513 See `queryDataIds` for more information. 

1514 **kwargs 

1515 Additional keyword arguments are forwarded to 

1516 `DataCoordinate.standardize` when processing the ``dataId`` 

1517 argument (and may be used to provide a constraining data ID even 

1518 when the ``dataId`` argument is `None`). 

1519 

1520 Returns 

1521 ------- 

1522 dataIds : `DataCoordinateQueryResults` 

1523 Data IDs matching the given query parameters. 

1524 """ 

1525 if not isinstance(element, DimensionElement): 

1526 element = self.dimensions[element] 

1527 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections, 

1528 where=where, components=components, **kwargs) 

1529 return iter(self._dimensions[element].fetch(dataIds)) 

1530 

1531 def queryDatasetAssociations( 

1532 self, 

1533 datasetType: Union[str, DatasetType], 

1534 collections: Any = ..., 

1535 *, 

1536 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1537 flattenChains: bool = False, 

1538 ) -> Iterator[DatasetAssociation]: 

1539 """Iterate over dataset-collection combinations where the dataset is in 

1540 the collection. 

1541 

1542 This method is a temporary placeholder for better support for 

1543 assocation results in `queryDatasets`. It will probably be 

1544 removed in the future, and should be avoided in production code 

1545 whenever possible. 

1546 

1547 Parameters 

1548 ---------- 

1549 datasetType : `DatasetType` or `str` 

1550 A dataset type object or the name of one. 

1551 collections: `Any`, optional 

1552 An expression that fully or partially identifies the collections 

1553 to search for datasets. See `queryCollections` and 

1554 :ref:`daf_butler_collection_expressions` for more information. 

1555 collectionTypes : `AbstractSet` [ `CollectionType` ], optional 

1556 If provided, only yield associations from collections of these 

1557 types. 

1558 flattenChains : `bool`, optional 

1559 If `True` (default) search in the children of 

1560 `~CollectionType.CHAINED` collections. If `False`, ``CHAINED`` 

1561 collections are ignored. 

1562 

1563 Yields 

1564 ------ 

1565 association : `DatasetAssociation` 

1566 Object representing the relationship beween a single dataset and 

1567 a single collection. 

1568 """ 

1569 collections = CollectionQuery.fromExpression(collections) 

1570 tsRepr = self._db.getTimespanRepresentation() 

1571 if isinstance(datasetType, str): 

1572 storage = self._datasets[datasetType] 

1573 else: 

1574 storage = self._datasets[datasetType.name] 

1575 for collectionRecord in collections.iter(self._collections, datasetType=datasetType, 

1576 collectionTypes=frozenset(collectionTypes), 

1577 flattenChains=flattenChains): 

1578 query = storage.select(collectionRecord) 

1579 if query is None: 

1580 continue 

1581 for row in self._db.query(query.combine()): 

1582 dataId = DataCoordinate.fromRequiredValues( 

1583 storage.datasetType.dimensions, 

1584 tuple(row[name] for name in storage.datasetType.dimensions.required.names) 

1585 ) 

1586 runRecord = self._collections[row[self._collections.getRunForeignKeyName()]] 

1587 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, 

1588 conform=False) 

1589 if collectionRecord.type is CollectionType.CALIBRATION: 

1590 timespan = tsRepr.extract(row) 

1591 else: 

1592 timespan = None 

1593 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1594 

1595 storageClasses: StorageClassFactory 

1596 """All storage classes known to the registry (`StorageClassFactory`). 

1597 """