
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "ConsistentDataIds", 

26 "Registry", 

27) 

28 

29from collections import defaultdict 

30import contextlib 

31from dataclasses import dataclass 

32import sys 

33from typing import ( 

34 Any, 

35 Dict, 

36 Iterable, 

37 Iterator, 

38 List, 

39 Mapping, 

40 Optional, 

41 Set, 

42 Type, 

43 TYPE_CHECKING, 

44 Union, 

45) 

46 

47import astropy.time 

48import sqlalchemy 

49 

50import lsst.sphgeom 

51from ..core import ( 

52 Config, 

53 DataCoordinate, 

54 DataId, 

55 DatasetRef, 

56 DatasetType, 

57 ddl, 

58 Dimension, 

59 DimensionElement, 

60 DimensionGraph, 

61 DimensionRecord, 

62 DimensionUniverse, 

63 ExpandedDataCoordinate, 

64 NamedKeyDict, 

65 Timespan, 

66 StorageClassFactory, 

67) 

68from ..core.utils import doImport, iterable, transactional 

69from ._config import RegistryConfig 

70from .queries import ( 

71 QueryBuilder, 

72 QuerySummary, 

73) 

74from .tables import makeRegistryTableSpecs 

75from ._collectionType import CollectionType 

76from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

77from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis 

78from .interfaces import ChainedCollectionRecord, RunRecord 

79 

80if TYPE_CHECKING: 

81 from ..butlerConfig import ButlerConfig 

82 from ..core import ( 

83 Quantum 

84 ) 

85 from .interfaces import ( 

86 CollectionManager, 

87 Database, 

88 OpaqueTableStorageManager, 

89 DimensionRecordStorageManager, 

90 DatasetRecordStorageManager, 

91 DatastoreRegistryBridgeManager, 

92 ) 

93 

94 

95@dataclass 

96class ConsistentDataIds: 

97 """A struct used to report relationships between data IDs by 

98 `Registry.relateDataIds`. 

99 

100 If an instance of this class is returned (instead of `None`), the data IDs 

101 are "not inconsistent" - any keys they have in common have the same value, 

102 and any spatial or temporal relationships they have at least might involve 

103 an overlap. To capture this, any instance of `ConsistentDataIds` coerces 

104 to `True` in boolean contexts. 

105 """ 

106 

107 overlaps: bool 

108 """If `True`, the data IDs have at least one key in common, associated with 

109 the same value. 

110 

111 Note that data IDs are not inconsistent even if overlaps is `False` - they 

112 may simply have no keys in common, which means they cannot have 

113 inconsistent values for any keys. They may even be equal, in the case that 

114 both data IDs are empty. 

115 

116 This field does _not_ indicate whether a spatial or temporal overlap 

117 relationship exists. 

118 """ 

119 

120 contains: bool 

121 """If `True`, all keys in the first data ID are in the second, and are 

122 associated with the same values. 

123 

124 This includes the case where the first data ID is empty. 

125 """ 

126 

127 within: bool 

128 """If `True`, all keys in the second data ID are in the first, and are 

129 associated with the same values. 

130 

131 This includes the case where the second data ID is empty. 

132 """ 

133 

134 @property 

135 def equal(self) -> bool: 

136 """If `True`, the two data IDs are the same. 

137 

138 Data IDs are equal if they have both a `contains` and a `within` 

139 relationship. 

140 """ 

141 return self.contains and self.within 

142 

143 @property 

144 def disjoint(self) -> bool: 

145 """If `True`, the two data IDs have no keys in common. 

146 

147 This is simply the opposite of `overlaps`. Disjoint data IDs are by 

148 definition not inconsistent. 

149 """ 

150 return not self.overlaps 

151 

152 def __bool__(self) -> bool: 

153 return True 

154 

155 

156class Registry: 

157 """Registry interface. 

158 

159 Parameters 

160 ---------- 

161 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

162 Registry configuration 

163 """ 

164 

165 defaultConfigFile = None 

166 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or 

167 absolute path. Can be `None` if no defaults are specified. 

168 """ 

169 

170 @classmethod 

171 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False, 

172 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry: 

173 """Create `Registry` subclass instance from `config`. 

174 

175 Uses ``registry.cls`` from `config` to determine which subclass to 

176 instantiate. 

177 

178 Parameters 

179 ---------- 

180 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

181 Registry configuration 

182 create : `bool`, optional 

183 If `True`, assume an empty Registry and create a new one. 

184 butlerRoot : `str`, optional 

185 Path to the repository root this `Registry` will manage. 

186 writeable : `bool`, optional 

187 If `True` (default) create a read-write connection to the database. 

188 

189 Returns 

190 ------- 

191 registry : `Registry` (subclass) 

192 A new `Registry` subclass instance. 
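
Examples
--------
An illustrative sketch only; the configuration path is hypothetical:

>>> from lsst.daf.butler import Registry
>>> registry = Registry.fromConfig("/path/to/butler.yaml", writeable=False)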

193 """ 

194 if not isinstance(config, RegistryConfig): 

195 if isinstance(config, str) or isinstance(config, Config): 

196 config = RegistryConfig(config) 

197 else: 

198 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

199 config.replaceRoot(butlerRoot) 

200 DatabaseClass = config.getDatabaseClass() 

201 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

202 namespace=config.get("namespace"), writeable=writeable) 

203 universe = DimensionUniverse(config) 

204 opaque = doImport(config["managers", "opaque"]) 

205 dimensions = doImport(config["managers", "dimensions"]) 

206 collections = doImport(config["managers", "collections"]) 

207 datasets = doImport(config["managers", "datasets"]) 

208 datastoreBridges = doImport(config["managers", "datastores"]) 

209 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections, 

210 datasets=datasets, datastoreBridges=datastoreBridges, create=create) 

211 

212 def __init__(self, database: Database, universe: DimensionUniverse, *, 

213 opaque: Type[OpaqueTableStorageManager], 

214 dimensions: Type[DimensionRecordStorageManager], 

215 collections: Type[CollectionManager], 

216 datasets: Type[DatasetRecordStorageManager], 

217 datastoreBridges: Type[DatastoreRegistryBridgeManager], 

218 create: bool = False): 

219 self._db = database 

220 self.storageClasses = StorageClassFactory() 

221 with self._db.declareStaticTables(create=create) as context: 

222 self._dimensions = dimensions.initialize(self._db, context, universe=universe) 

223 self._collections = collections.initialize(self._db, context) 

224 self._datasets = datasets.initialize(self._db, context, 

225 collections=self._collections, 

226 universe=self.dimensions) 

227 self._opaque = opaque.initialize(self._db, context) 

228 self._datastoreBridges = datastoreBridges.initialize(self._db, context, 

229 opaque=self._opaque, 

230 datasets=datasets, 

231 universe=self.dimensions) 

232 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions, 

233 self._collections, 

234 self._datasets)) 

235 self._collections.refresh() 

236 self._datasets.refresh(universe=self._dimensions.universe) 

237 

238 def __str__(self) -> str: 

239 return str(self._db) 

240 

241 def __repr__(self) -> str: 

242 return f"Registry({self._db!r}, {self.dimensions!r})" 

243 

244 def isWriteable(self) -> bool: 

245 """Return `True` if this registry allows write operations, and `False` 

246 otherwise. 

247 """ 

248 return self._db.isWriteable() 

249 

250 @property 

251 def dimensions(self) -> DimensionUniverse: 

252 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

253 """ 

254 return self._dimensions.universe 

255 

256 @contextlib.contextmanager 

257 def transaction(self) -> Iterator[None]: 

258 """Return a context manager that represents a transaction. 

259 """ 

260 # TODO make savepoint=False the default. 

261 try: 

262 with self._db.transaction(): 

263 yield 

264 except BaseException: 

265 # TODO: this clears the caches sometimes when we wouldn't actually 

266 # need to. Can we avoid that? 

267 self._dimensions.clearCaches() 

268 raise 

269 

270 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

271 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

272 other data repository client. 

273 

274 Opaque table records can be added via `insertOpaqueData`, retrieved via 

275 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

276 

277 Parameters 

278 ---------- 

279 tableName : `str` 

280 Logical name of the opaque table. This may differ from the 

281 actual name used in the database by a prefix and/or suffix. 

282 spec : `ddl.TableSpec` 

283 Specification for the table to be added. 
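
Examples
--------
An illustrative sketch; the table name and field definition are
hypothetical:

>>> import sqlalchemy
>>> from lsst.daf.butler import ddl
>>> spec = ddl.TableSpec(fields=[
...     ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256,
...                   primaryKey=True),
... ])
>>> registry.registerOpaqueTable("my_datastore_records", spec)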

284 """ 

285 self._opaque.register(tableName, spec) 

286 

287 @transactional 

288 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

289 """Insert records into an opaque table. 

290 

291 Parameters 

292 ---------- 

293 tableName : `str` 

294 Logical name of the opaque table. Must match the name used in a 

295 previous call to `registerOpaqueTable`. 

296 data 

297 Each additional positional argument is a dictionary that represents 

298 a single row to be added. 

299 """ 

300 self._opaque[tableName].insert(*data) 

301 

302 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

303 """Retrieve records from an opaque table. 

304 

305 Parameters 

306 ---------- 

307 tableName : `str` 

308 Logical name of the opaque table. Must match the name used in a 

309 previous call to `registerOpaqueTable`. 

310 where 

311 Additional keyword arguments are interpreted as equality 

312 constraints that restrict the returned rows (combined with AND); 

313 keyword arguments are column names and values are the values they 

314 must have. 

315 

316 Yields 

317 ------ 

318 row : `dict` 

319 A dictionary representing a single result row. 

320 """ 

321 yield from self._opaque[tableName].fetch(**where) 

322 

323 @transactional 

324 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

325 """Remove records from an opaque table. 

326 

327 Parameters 

328 ---------- 

329 tableName : `str` 

330 Logical name of the opaque table. Must match the name used in a 

331 previous call to `registerOpaqueTable`. 

332 where 

333 Additional keyword arguments are interpreted as equality 

334 constraints that restrict the deleted rows (combined with AND); 

335 keyword arguments are column names and values are the values they 

336 must have. 

337 """ 

338 self._opaque[tableName].delete(**where) 

339 

340 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED) -> None: 

341 """Add a new collection if one with the given name does not exist. 

342 

343 Parameters 

344 ---------- 

345 name : `str` 

346 The name of the collection to create. 

347 type : `CollectionType` 

348 Enum value indicating the type of collection to create. 

349 

350 Notes 

351 ----- 

352 This method cannot be called within transactions, as it needs to be 

353 able to perform its own transaction to be concurrent. 
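
Examples
--------
An illustrative sketch; the collection name is hypothetical:

>>> from lsst.daf.butler import CollectionType
>>> registry.registerCollection("my/tagged", CollectionType.TAGGED)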

354 """ 

355 self._collections.register(name, type) 

356 

357 def getCollectionType(self, name: str) -> CollectionType: 

358 """Return an enumeration value indicating the type of the given 

359 collection. 

360 

361 Parameters 

362 ---------- 

363 name : `str` 

364 The name of the collection. 

365 

366 Returns 

367 ------- 

368 type : `CollectionType` 

369 Enum value indicating the type of this collection. 

370 

371 Raises 

372 ------ 

373 MissingCollectionError 

374 Raised if no collection with the given name exists. 

375 """ 

376 return self._collections.find(name).type 

377 

378 def registerRun(self, name: str) -> None: 

379 """Add a new run if one with the given name does not exist. 

380 

381 Parameters 

382 ---------- 

383 name : `str` 

384 The name of the run to create. 

385 

386 Notes 

387 ----- 

388 This method cannot be called within transactions, as it needs to be 

389 able to perform its own transaction to be concurrent. 

390 """ 

391 self._collections.register(name, CollectionType.RUN) 

392 

393 @transactional 

394 def removeCollection(self, name: str) -> None: 

395 """Completely remove the given collection. 

396 

397 Parameters 

398 ---------- 

399 name : `str` 

400 The name of the collection to remove. 

401 

402 Raises 

403 ------ 

404 MissingCollectionError 

405 Raised if no collection with the given name exists. 

406 

407 Notes 

408 ----- 

409 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

410 in it are also fully removed. This requires that those datasets be 

411 removed (or at least trashed) from any datastores that hold them first. 

412 

413 A collection may not be deleted as long as it is referenced by a 

414 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

415 be deleted or redefined first. 

416 """ 

417 self._collections.remove(name) 

418 

419 def getCollectionChain(self, parent: str) -> CollectionSearch: 

420 """Return the child collections in a `~CollectionType.CHAINED` 

421 collection. 

422 

423 Parameters 

424 ---------- 

425 parent : `str` 

426 Name of the chained collection. Must have already been added via 

427 a call to `Registry.registerCollection`. 

428 

429 Returns 

430 ------- 

431 children : `CollectionSearch` 

432 An object that defines the search path of the collection. 

433 See :ref:`daf_butler_collection_expressions` for more information. 

434 

435 Raises 

436 ------ 

437 MissingCollectionError 

438 Raised if ``parent`` does not exist in the `Registry`. 

439 TypeError 

440 Raised if ``parent`` does not correspond to a 

441 `~CollectionType.CHAINED` collection. 

442 """ 

443 record = self._collections.find(parent) 

444 if record.type is not CollectionType.CHAINED: 

445 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

446 assert isinstance(record, ChainedCollectionRecord) 

447 return record.children 

448 

449 @transactional 

450 def setCollectionChain(self, parent: str, children: Any) -> None: 

451 """Define or redefine a `~CollectionType.CHAINED` collection. 

452 

453 Parameters 

454 ---------- 

455 parent : `str` 

456 Name of the chained collection. Must have already been added via 

457 a call to `Registry.registerCollection`. 

458 children : `Any` 

459 An expression defining an ordered search of child collections, 

460 generally an iterable of `str`. Restrictions on the dataset types 

461 to be searched can also be included, by passing a mapping or an 

462 iterable containing tuples; see 

463 :ref:`daf_butler_collection_expressions` for more information. 

464 

465 Raises 

466 ------ 

467 MissingCollectionError 

468 Raised when any of the given collections do not exist in the 

469 `Registry`. 

470 TypeError 

471 Raised if ``parent`` does not correspond to a 

472 `~CollectionType.CHAINED` collection. 

473 ValueError 

474 Raised if the given collections contain a cycle. 
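
Examples
--------
An illustrative sketch; the collection names are hypothetical and the
child collections must already exist:

>>> registry.registerCollection("chain", CollectionType.CHAINED)
>>> registry.setCollectionChain("chain", ["run/a", "run/b"])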

475 """ 

476 record = self._collections.find(parent) 

477 if record.type is not CollectionType.CHAINED: 

478 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

479 assert isinstance(record, ChainedCollectionRecord) 

480 children = CollectionSearch.fromExpression(children) 

481 if children != record.children: 

482 record.update(self._collections, children) 

483 

484 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

485 """ 

486 Add a new `DatasetType` to the Registry. 

487 

488 It is not an error to register the same `DatasetType` twice. 

489 

490 Parameters 

491 ---------- 

492 datasetType : `DatasetType` 

493 The `DatasetType` to be added. 

494 

495 Returns 

496 ------- 

497 inserted : `bool` 

498 `True` if ``datasetType`` was inserted, `False` if an identical 

499 existing `DatasetType` was found. Note that in either case the 

500 DatasetType is guaranteed to be defined in the Registry 

501 consistently with the given definition. 

502 

503 Raises 

504 ------ 

505 ValueError 

506 Raised if the dimensions or storage class are invalid. 

507 ConflictingDefinitionError 

508 Raised if this DatasetType is already registered with a different 

509 definition. 

510 

511 Notes 

512 ----- 

513 This method cannot be called within transactions, as it needs to be 

514 able to perform its own transaction to be concurrent. 
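
Examples
--------
An illustrative sketch; the dataset type name, dimensions, and storage
class are hypothetical and must be known to this registry's universe
and storage class factory:

>>> from lsst.daf.butler import DatasetType
>>> datasetType = DatasetType("calexp", ("instrument", "visit", "detector"),
...                           "ExposureF", universe=registry.dimensions)
>>> inserted = registry.registerDatasetType(datasetType)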

515 """ 

516 _, inserted = self._datasets.register(datasetType) 

517 return inserted 

518 

519 def getDatasetType(self, name: str) -> DatasetType: 

520 """Get the `DatasetType`. 

521 

522 Parameters 

523 ---------- 

524 name : `str` 

525 Name of the type. 

526 

527 Returns 

528 ------- 

529 type : `DatasetType` 

530 The `DatasetType` associated with the given name. 

531 

532 Raises 

533 ------ 

534 KeyError 

535 Raised if the named DatasetType could not be found in the registry. 

536 """ 

537 storage = self._datasets.find(name) 

538 if storage is None: 

539 raise KeyError(f"DatasetType '{name}' could not be found.") 

540 return storage.datasetType 

541 

542 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

543 collections: Any, **kwargs: Any) -> Optional[DatasetRef]: 

544 """Find a dataset given its `DatasetType` and data ID. 

545 

546 This can be used to obtain a `DatasetRef` that permits the dataset to 

547 be read from a `Datastore`. If the dataset is a component and cannot 

548 be found using the provided dataset type, a dataset ref for the parent 

549 will be returned instead but with the correct dataset type. 

550 

551 Parameters 

552 ---------- 

553 datasetType : `DatasetType` or `str` 

554 A `DatasetType` or the name of one. 

555 dataId : `dict` or `DataCoordinate`, optional 

556 A `dict`-like object containing the `Dimension` links that identify 

557 the dataset within a collection. 

558 collections 

559 An expression that fully or partially identifies the collections 

560 to search for the dataset, such as a `str`, `re.Pattern`, or 

561 iterable thereof. `...` can be used to return all collections. 

562 See :ref:`daf_butler_collection_expressions` for more information. 

563 **kwargs 

564 Additional keyword arguments passed to 

565 `DataCoordinate.standardize` to convert ``dataId`` to a true 

566 `DataCoordinate` or augment an existing one. 

567 

568 Returns 

569 ------- 

570 ref : `DatasetRef` 

571 A reference to the dataset, or `None` if no matching Dataset 

572 was found. 

573 

574 Raises 

575 ------ 

576 LookupError 

577 Raised if one or more data ID keys are missing or the dataset type 

578 does not exist. 

579 MissingCollectionError 

580 Raised if any of ``collections`` does not exist in the registry. 
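
Examples
--------
An illustrative sketch; the dataset type, data ID values, and
collection name are hypothetical:

>>> ref = registry.findDataset("calexp", instrument="MyCam", visit=42,
...                            detector=1, collections=["my_run"])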

581 """ 

582 if isinstance(datasetType, DatasetType): 

583 storage = self._datasets.find(datasetType.name) 

584 if storage is None: 

585 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

586 else: 

587 storage = self._datasets.find(datasetType) 

588 if storage is None: 

589 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

590 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

591 universe=self.dimensions, **kwargs) 

592 collections = CollectionSearch.fromExpression(collections) 

593 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType): 

594 result = storage.find(collectionRecord, dataId) 

595 if result is not None: 

596 return result 

597 

598 # Fall back to the parent if we got nothing and this was a component. 

599 if storage.datasetType.isComponent(): 

600 parentType, _ = storage.datasetType.nameAndComponent() 

601 parentRef = self.findDataset(parentType, dataId, collections=collections, **kwargs) 

602 if parentRef is not None: 

603 # Should already conform and we know no components 

604 return DatasetRef(storage.datasetType, parentRef.dataId, id=parentRef.id, 

605 run=parentRef.run, conform=False, hasParentId=True) 

606 

607 return None 

608 

609 @transactional 

610 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

611 run: str, *, producer: Optional[Quantum] = None) -> List[DatasetRef]: 

612 """Insert one or more datasets into the `Registry` 

613 

614 This always adds new datasets; to associate existing datasets with 

615 a new collection, use ``associate``. 

616 

617 Parameters 

618 ---------- 

619 datasetType : `DatasetType` or `str` 

620 A `DatasetType` or the name of one. 

621 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

622 Dimension-based identifiers for the new datasets. 

623 run : `str` 

624 The name of the run that produced the datasets. 

625 producer : `Quantum` 

626 Unit of work that produced the datasets. May be `None` to store 

627 no provenance information, but if present the `Quantum` must 

628 already have been added to the Registry. 

629 

630 Returns 

631 ------- 

632 refs : `list` of `DatasetRef` 

633 Resolved `DatasetRef` instances for all given data IDs (in the same 

634 order). 

635 

636 Raises 

637 ------ 

638 ConflictingDefinitionError 

639 If a dataset with the same dataset type and data ID as one of those 

640 given already exists in ``run``. 

641 MissingCollectionError 

642 Raised if ``run`` does not exist in the registry. 
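
Examples
--------
An illustrative sketch; the run name and data ID values are
hypothetical, and the run and all referenced dimension records must
already exist:

>>> registry.registerRun("my_run")
>>> refs = registry.insertDatasets(
...     "calexp",
...     [{"instrument": "MyCam", "visit": 42, "detector": 1}],
...     run="my_run",
... )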

643 """ 

644 if isinstance(datasetType, DatasetType): 

645 storage = self._datasets.find(datasetType.name) 

646 if storage is None: 

647 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

648 else: 

649 storage = self._datasets.find(datasetType) 

650 if storage is None: 

651 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

652 runRecord = self._collections.find(run) 

653 if runRecord.type is not CollectionType.RUN: 

654 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

655 assert isinstance(runRecord, RunRecord) 

656 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

657 for dataId in dataIds] 

658 try: 

659 refs = list(storage.insert(runRecord, expandedDataIds, quantum=producer)) 

660 except sqlalchemy.exc.IntegrityError as err: 

661 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

662 f"one or more datasets of type {storage.datasetType} into " 

663 f"collection '{run}'. " 

664 f"This probably means a dataset with the same data ID " 

665 f"and dataset type already exists, but it may also mean a " 

666 f"dimension row is missing.") from err 

667 return refs 

668 

669 def getDataset(self, id: int) -> Optional[DatasetRef]: 

670 """Retrieve a Dataset entry. 

671 

672 Parameters 

673 ---------- 

674 id : `int` 

675 The unique identifier for the dataset. 

676 

677 Returns 

678 ------- 

679 ref : `DatasetRef` or `None` 

680 A ref to the Dataset, or `None` if no matching Dataset 

681 was found. 

682 """ 

683 ref = self._datasets.getDatasetRef(id, universe=self.dimensions) 

684 if ref is None: 

685 return None 

686 return ref 

687 

688 @transactional 

689 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

690 """Remove datasets from the Registry. 

691 

692 The datasets will be removed unconditionally from all collections, and 

693 any `Quantum` that consumed them will instead be marked as 

694 having a NULL input. `Datastore` records will *not* be deleted; the 

695 caller is responsible for ensuring that the dataset has already been 

696 removed from all Datastores. 

697 

698 Parameters 

699 ---------- 

700 refs : `Iterable` of `DatasetRef` 

701 References to the datasets to be removed. Must include a valid 

702 ``id`` attribute, and should be considered invalidated upon return. 

703 

704 Raises 

705 ------ 

706 AmbiguousDatasetError 

707 Raised if any ``ref.id`` is `None`. 

708 OrphanedRecordError 

709 Raised if any dataset is still present in any `Datastore`. 

710 """ 

711 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

712 storage = self._datasets.find(datasetType.name) 

713 assert storage is not None 

714 try: 

715 storage.delete(refsForType) 

716 except sqlalchemy.exc.IntegrityError as err: 

717 raise OrphanedRecordError("One or more datasets is still " 

718 "present in one or more Datastores.") from err 

719 

720 @transactional 

721 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

722 """Add existing datasets to a `~CollectionType.TAGGED` collection. 

723 

724 If a `DatasetRef` with the same integer ID is already in the 

725 collection, nothing is changed. If a `DatasetRef` with the same 

726 `DatasetType` and data ID but with a different integer ID 

727 exists in the collection, `ConflictingDefinitionError` is raised. 

728 

729 Parameters 

730 ---------- 

731 collection : `str` 

732 Indicates the collection the datasets should be associated with. 

733 refs : `Iterable` [ `DatasetRef` ] 

734 An iterable of resolved `DatasetRef` instances that already exist 

735 in this `Registry`. 

736 

737 Raises 

738 ------ 

739 ConflictingDefinitionError 

740 Raised if a different dataset with the same `DatasetType` and data 

741 ID already exists in the given collection. 

742 AmbiguousDatasetError 

743 Raised if ``any(ref.id is None for ref in refs)``. 

744 MissingCollectionError 

745 Raised if ``collection`` does not exist in the registry. 

746 TypeError 

747 Raised if adding new datasets to the given ``collection`` is not 

748 allowed. 
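
Examples
--------
An illustrative sketch; the collection name is hypothetical and
``refs`` is assumed to be an iterable of resolved `DatasetRef`
instances (e.g. from `insertDatasets` or `queryDatasets`):

>>> registry.registerCollection("my/tag", CollectionType.TAGGED)
>>> registry.associate("my/tag", refs)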

749 """ 

750 collectionRecord = self._collections.find(collection) 

751 if collectionRecord.type is not CollectionType.TAGGED: 

752 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

753 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

754 storage = self._datasets.find(datasetType.name) 

755 assert storage is not None 

756 try: 

757 storage.associate(collectionRecord, refsForType) 

758 except sqlalchemy.exc.IntegrityError as err: 

759 raise ConflictingDefinitionError( 

760 f"Constraint violation while associating dataset of type {datasetType.name} with " 

761 f"collection {collection}. This probably means that one or more datasets with the same " 

762 f"dataset type and data ID already exist in the collection, but it may also indicate " 

763 f"that the datasets do not exist." 

764 ) from err 

765 

766 @transactional 

767 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

768 """Remove existing datasets from a `~CollectionType.TAGGED` collection. 

769 

770 ``collection`` and ``ref`` combinations that are not currently 

771 associated are silently ignored. 

772 

773 Parameters 

774 ---------- 

775 collection : `str` 

776 The collection the datasets should no longer be associated with. 

777 refs : `Iterable` [ `DatasetRef` ] 

778 An iterable of resolved `DatasetRef` instances that already exist 

779 in this `Registry`. 

780 

781 Raises 

782 ------ 

783 AmbiguousDatasetError 

784 Raised if any of the given dataset references is unresolved. 

785 MissingCollectionError 

786 Raised if ``collection`` does not exist in the registry. 

787 TypeError 

788 Raised if removing datasets from the given ``collection`` is not 

789 allowed. 

790 """ 

791 collectionRecord = self._collections.find(collection) 

792 if collectionRecord.type is not CollectionType.TAGGED: 

793 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

794 "expected TAGGED.") 

795 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

796 storage = self._datasets.find(datasetType.name) 

797 assert storage is not None 

798 storage.disassociate(collectionRecord, refsForType) 

799 

800 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

801 """Return an object that allows a new `Datastore` instance to 

802 communicate with this `Registry`. 

803 

804 Returns 

805 ------- 

806 manager : `DatastoreRegistryBridgeManager` 

807 Object that mediates communication between this `Registry` and its 

808 associated datastores. 

809 """ 

810 return self._datastoreBridges 

811 

812 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

813 """Retrieve datastore locations for a given dataset. 

814 

815 Parameters 

816 ---------- 

817 ref : `DatasetRef` 

818 A reference to the dataset for which to retrieve storage 

819 information. 

820 

821 Returns 

822 ------- 

823 datastores : `Iterable` [ `str` ] 

824 All the matching datastores holding this dataset. 

825 

826 Raises 

827 ------ 

828 AmbiguousDatasetError 

829 Raised if ``ref.id`` is `None`. 

830 """ 

831 return self._datastoreBridges.findDatastores(ref) 

832 

833 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

834 records: Optional[Mapping[DimensionElement, Optional[DimensionRecord]]] = None, 

835 **kwargs: Any) -> ExpandedDataCoordinate: 

836 """Expand a dimension-based data ID to include additional information. 

837 

838 Parameters 

839 ---------- 

840 dataId : `DataCoordinate` or `dict`, optional 

841 Data ID to be expanded; augmented and overridden by ``kwargs``. 

842 graph : `DimensionGraph`, optional 

843 Set of dimensions for the expanded ID. If `None`, the dimensions 

844 will be inferred from the keys of ``dataId`` and ``kwargs``. 

845 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph`` 

846 are silently ignored, providing a way to extract and expand a 

847 subset of a data ID. 

848 records : `Mapping` [`DimensionElement`, `DimensionRecord`], optional 

849 Dimension record data to use before querying the database for that 

850 data. 

851 **kwargs 

852 Additional keywords are treated like additional key-value pairs for 

853 ``dataId``, extending and overriding it. 

854 

855 Returns 

856 ------- 

857 expanded : `ExpandedDataCoordinate` 

858 A data ID that includes full metadata for all of the dimensions it 

859 identifies. 
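
Examples
--------
An illustrative sketch; the dimension names and values are
hypothetical and the corresponding records must exist:

>>> dataId = registry.expandDataId(instrument="MyCam", detector=1)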

860 """ 

861 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs) 

862 if isinstance(standardized, ExpandedDataCoordinate): 

863 return standardized 

864 elif isinstance(dataId, ExpandedDataCoordinate): 

865 records = NamedKeyDict(records) if records is not None else NamedKeyDict() 

866 records.update(dataId.records) 

867 else: 

868 records = NamedKeyDict(records) if records is not None else NamedKeyDict() 

869 keys = dict(standardized.byName()) 

870 regions: List[lsst.sphgeom.ConvexPolygon] = [] 

871 timespans: List[Timespan[astropy.time.Time]] = [] 

872 for element in standardized.graph.primaryKeyTraversalOrder: 

873 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

874 if record is ...: 

875 storage = self._dimensions[element] 

876 record = storage.fetch(keys) 

877 records[element] = record 

878 if record is not None: 

879 for d in element.implied: 

880 value = getattr(record, d.name) 

881 if keys.setdefault(d.name, value) != value: 

882 raise InconsistentDataIdError( 

883 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

884 f"but {element.name} implies {d.name}={value!r}." 

885 ) 

886 if element in standardized.graph.spatial and record.region is not None: 

887 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions): 

888 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} " 

889 f"is disjoint with those for other elements.") 

890 regions.append(record.region) 

891 if element in standardized.graph.temporal: 

892 if any(not record.timespan.overlaps(t) for t in timespans): 

893 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}" 

894 f" is disjoint with those for other elements.") 

895 timespans.append(record.timespan) 

896 else: 

897 if element in standardized.graph.required: 

898 raise LookupError( 

899 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

900 ) 

901 if element.alwaysJoin: 

902 raise InconsistentDataIdError( 

903 f"Could not fetch record for element {element.name} via keys {keys}, ", 

904 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

905 "related." 

906 ) 

907 records.update((d, None) for d in element.implied) 

908 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records) 

909 

910 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]: 

911 """Compare the keys and values of a pair of data IDs for consistency. 

912 

913 See `ConsistentDataIds` for more information. 

914 

915 Parameters 

916 ---------- 

917 a : `dict` or `DataCoordinate` 

918 First data ID to be compared. 

919 b : `dict` or `DataCoordinate` 

920 Second data ID to be compared. 

921 

922 Returns 

923 ------- 

924 relationship : `ConsistentDataIds` or `None` 

925 Relationship information. This is not `None` and coerces to 

926 `True` in boolean contexts if and only if the data IDs are 

927 consistent in terms of all common key-value pairs, all many-to-many 

928 join tables, and all spatial and temporal relationships. 
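
Examples
--------
An illustrative sketch; the dimension values are hypothetical:

>>> rel = registry.relateDataIds({"instrument": "MyCam", "visit": 42},
...                              {"instrument": "MyCam"})
>>> if rel:
...     print(rel.contains, rel.within)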

929 """ 

930 a = DataCoordinate.standardize(a, universe=self.dimensions) 

931 b = DataCoordinate.standardize(b, universe=self.dimensions) 

932 aFull = getattr(a, "full", None) 

933 bFull = getattr(b, "full", None) 

934 aBest = aFull if aFull is not None else a 

935 bBest = bFull if bFull is not None else b 

936 jointKeys = aBest.keys() & bBest.keys() 

937 # If any common values are not equal, we know they are inconsistent. 

938 if any(aBest[k] != bBest[k] for k in jointKeys): 

939 return None 

940 # If the graphs are equal, we know the data IDs are. 

941 if a.graph == b.graph: 

942 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys)) 

943 # Result is still inconclusive. Try to expand a data ID containing 

944 # keys from both; that will fail if they are inconsistent. 

945 # First, if either input was already an ExpandedDataCoordinate, extract 

946 # its records so we don't have to query for them. 

947 records: NamedKeyDict[DimensionElement, Optional[DimensionRecord]] = NamedKeyDict() 

948 if isinstance(a, ExpandedDataCoordinate): 

949 records.update(a.records) 

950 if isinstance(b, ExpandedDataCoordinate): 

951 records.update(b.records) 

952 try: 

953 self.expandDataId({**a.byName(), **b.byName()}, graph=(a.graph | b.graph), records=records) 

954 except InconsistentDataIdError: 

955 return None 

956 # We know the answer is not `None`; time to figure out what it is. 

957 return ConsistentDataIds( 

958 contains=(a.graph >= b.graph), 

959 within=(a.graph <= b.graph), 

960 overlaps=bool(a.graph & b.graph), 

961 ) 

962 

963 def insertDimensionData(self, element: Union[DimensionElement, str], 

964 *data: Union[Mapping[str, Any], DimensionRecord], 

965 conform: bool = True) -> None: 

966 """Insert one or more dimension records into the database. 

967 

968 Parameters 

969 ---------- 

970 element : `DimensionElement` or `str` 

971 The `DimensionElement` or name thereof that identifies the table 

972 records will be inserted into. 

973 data : `dict` or `DimensionRecord` (variadic) 

974 One or more records to insert. 

975 conform : `bool`, optional 

976 If `False` (`True` is default) perform no checking or conversions, 

977 and assume that ``element`` is a `DimensionElement` instance and 

978 ``data`` is one or more `DimensionRecord` instances of the 

979 appropriate subclass. 
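
Examples
--------
An illustrative sketch; the record contents are hypothetical and must
match the schema of the named dimension element:

>>> registry.insertDimensionData("instrument", {"name": "MyCam"})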

980 """ 

981 if conform: 

982 if isinstance(element, str): 

983 element = self.dimensions[element] 

984 records = [row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row) 

985 for row in data] 

986 else: 

987 # Ignore typing since caller said to trust them with conform=False. 

988 records = data # type: ignore 

989 storage = self._dimensions[element] # type: ignore 

990 storage.insert(*records) 

991 

992 def syncDimensionData(self, element: Union[DimensionElement, str], 

993 row: Union[Mapping[str, Any], DimensionRecord], 

994 conform: bool = True) -> bool: 

995 """Synchronize the given dimension record with the database, inserting 

996 if it does not already exist and comparing values if it does. 

997 

998 Parameters 

999 ---------- 

1000 element : `DimensionElement` or `str` 

1001 The `DimensionElement` or name thereof that identifies the table 

1002 records will be inserted into. 

1003 row : `dict` or `DimensionRecord` 

1004 The record to insert. 

1005 conform : `bool`, optional 

1006 If `False` (`True` is default) perform no checking or conversions, 

1007 and assume that ``element`` is a `DimensionElement` instance and 

1008 ``row`` is a `DimensionRecord` instance of the 

1009 appropriate subclass. 

1010 

1011 Returns 

1012 ------- 

1013 inserted : `bool` 

1014 `True` if a new row was inserted, `False` otherwise. 

1015 

1016 Raises 

1017 ------ 

1018 ConflictingDefinitionError 

1019 Raised if the record exists in the database (according to primary 

1020 key lookup) but is inconsistent with the given one. 

1021 

1022 Notes 

1023 ----- 

1024 This method cannot be called within transactions, as it needs to be 

1025 able to perform its own transaction to be concurrent. 

1026 """ 

1027 if conform: 

1028 if isinstance(element, str): 

1029 element = self.dimensions[element] 

1030 record = row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row) 

1031 else: 

1032 # Ignore typing since caller said to trust them with conform=False. 

1033 record = row # type: ignore 

1034 storage = self._dimensions[element] # type: ignore 

1035 return storage.sync(record) 

1036 

1037 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

1038 ) -> Iterator[DatasetType]: 

1039 """Iterate over the dataset types whose names match an expression. 

1040 

1041 Parameters 

1042 ---------- 

1043 expression : `Any`, optional 

1044 An expression that fully or partially identifies the dataset types 

1045 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1046 `...` can be used to return all dataset types, and is the default. 

1047 See :ref:`daf_butler_dataset_type_expressions` for more 

1048 information. 

1049 components : `bool`, optional 

1050 If `True`, apply all expression patterns to component dataset type 

1051 names as well. If `False`, never apply patterns to components. 

1052 If `None` (default), apply patterns to components only if their 

1053 parent datasets were not matched by the expression. 

1054 Fully-specified component datasets (`str` or `DatasetType` 

1055 instances) are always included. 

1056 

1057 Yields 

1058 ------ 

1059 datasetType : `DatasetType` 

1060 A `DatasetType` instance whose name matches ``expression``. 
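
Examples
--------
An illustrative sketch; the pattern is hypothetical:

>>> import re
>>> for datasetType in registry.queryDatasetTypes(re.compile("calexp.*")):
...     print(datasetType.name)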

1061 """ 

1062 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

1063 if wildcard is Ellipsis: 

1064 for datasetType in self._datasets: 

1065 if components or not datasetType.isComponent(): 

1066 yield datasetType 

1067 return 

1068 done: Set[str] = set() 

1069 for name in wildcard.strings: 

1070 storage = self._datasets.find(name) 

1071 if storage is not None: 

1072 done.add(storage.datasetType.name) 

1073 yield storage.datasetType 

1074 if wildcard.patterns: 

1075 # If components (the argument) is None, we'll save component 

1076 # datasets that we might want to match, but only if their parents 

1077 # didn't get included. 

1078 componentsForLater = [] 

1079 for datasetType in self._datasets: 

1080 if datasetType.name in done: 

1081 continue 

1082 parentName, componentName = datasetType.nameAndComponent() 

1083 if componentName is not None and not components: 

1084 if components is None and parentName not in done: 

1085 componentsForLater.append(datasetType) 

1086 continue 

1087 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1088 done.add(datasetType.name) 

1089 yield datasetType 

1090 # Go back and try to match saved components. 

1091 for datasetType in componentsForLater: 

1092 parentName, _ = datasetType.nameAndComponent() 

1093 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1094 yield datasetType 

1095 

1096 def queryCollections(self, expression: Any = ..., 

1097 datasetType: Optional[DatasetType] = None, 

1098 collectionType: Optional[CollectionType] = None, 

1099 flattenChains: bool = False, 

1100 includeChains: Optional[bool] = None) -> Iterator[str]: 

1101 """Iterate over the collections whose names match an expression. 

1102 

1103 Parameters 

1104 ---------- 

1105 expression : `Any`, optional 

1106 An expression that fully or partially identifies the collections 

1107 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1108 `...` can be used to return all collections, and is the default. 

1109 See :ref:`daf_butler_collection_expressions` for more 

1110 information. 

1111 datasetType : `DatasetType`, optional 

1112 If provided, only yield collections that should be searched for 

1113 this dataset type according to ``expression``. If this is 

1114 not provided, any dataset type restrictions in ``expression`` are 

1115 ignored. 

1116 collectionType : `CollectionType`, optional 

1117 If provided, only yield collections of this type. 

1118 flattenChains : `bool`, optional 

1119 If `True` (`False` is default), recursively yield the child 

1120 collections of matching `~CollectionType.CHAINED` collections. 

1121 includeChains : `bool`, optional 

1122 If `True`, yield records for matching `~CollectionType.CHAINED` 

1123 collections. Default is the opposite of ``flattenChains``: include 

1124 either CHAINED collections or their children, but not both. 

1125 

1126 Yields 

1127 ------ 

1128 collection : `str` 

1129 The name of a collection that matches ``expression``. 

1130 """ 

1131 query = CollectionQuery.fromExpression(expression) 

1132 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType, 

1133 flattenChains=flattenChains, includeChains=includeChains): 

1134 yield record.name 

1135 

1136 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder: 

1137 """Return a `QueryBuilder` instance capable of constructing and 

1138 managing more complex queries than those obtainable via `Registry` 

1139 interfaces. 

1140 

1141 This is an advanced interface; downstream code should prefer 

1142 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those 

1143 are sufficient. 

1144 

1145 Parameters 

1146 ---------- 

1147 summary : `QuerySummary` 

1148 Object describing and categorizing the full set of dimensions that 

1149 will be included in the query. 

1150 

1151 Returns 

1152 ------- 

1153 builder : `QueryBuilder` 

1154 Object that can be used to construct and perform advanced queries. 

1155 """ 

1156 return QueryBuilder(summary=summary, 

1157 collections=self._collections, 

1158 dimensions=self._dimensions, 

1159 datasets=self._datasets) 

1160 

1161 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1162 dataId: Optional[DataId] = None, 

1163 datasets: Any = None, 

1164 collections: Any = None, 

1165 where: Optional[str] = None, 

1166 expand: bool = True, 

1167 components: Optional[bool] = None, 

1168 **kwargs: Any) -> Iterator[DataCoordinate]: 

1169 """Query for and iterate over data IDs matching user-provided criteria. 

1170 

1171 Parameters 

1172 ---------- 

1173 dimensions : `Dimension` or `str`, or iterable thereof 

1174 The dimensions of the data IDs to yield, as either `Dimension` 

1175 instances or `str`. Will be automatically expanded to a complete 

1176 `DimensionGraph`. 

1177 dataId : `dict` or `DataCoordinate`, optional 

1178 A data ID whose key-value pairs are used as equality constraints 

1179 in the query. 

1180 datasets : `Any`, optional 

1181 An expression that fully or partially identifies dataset types 

1182 that should constrain the yielded data IDs. For example, including 

1183 "raw" here would constrain the yielded ``instrument``, 

1184 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1185 those for which at least one "raw" dataset exists in 

1186 ``collections``. Allowed types include `DatasetType`, `str`, 

1187 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1188 expressions, `...` is not permitted - it doesn't make sense to 

1189 constrain data IDs on the existence of *all* datasets. 

1190 See :ref:`daf_butler_dataset_type_expressions` for more 

1191 information. 

1192 collections : `Any`, optional 

1193 An expression that fully or partially identifies the collections 

1194 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1195 thereof. `...` can be used to return all collections. Must be 

1196 provided if ``datasets`` is, and is ignored if it is not. See 

1197 :ref:`daf_butler_collection_expressions` for more information. 

1198 where : `str`, optional 

1199 A string expression similar to a SQL WHERE clause. May involve 

1200 any column of a dimension table or (as a shortcut for the primary 

1201 key column of a dimension table) dimension name. See 

1202 :ref:`daf_butler_dimension_expressions` for more information. 

1203 expand : `bool`, optional 

1204 If `True` (default) yield `ExpandedDataCoordinate` instead of 

1205 minimal `DataCoordinate` base-class instances. 

1206 components : `bool`, optional 

1207 If `True`, apply all dataset expression patterns to component 

1208 dataset type names as well. If `False`, never apply patterns to 

1209 components. If `None` (default), apply patterns to components only 

1210 if their parent datasets were not matched by the expression. 

1211 Fully-specified component datasets (`str` or `DatasetType` 

1212 instances) are always included. 

1213 **kwargs 

1214 Additional keyword arguments are forwarded to 

1215 `DataCoordinate.standardize` when processing the ``dataId`` 

1216 argument (and may be used to provide a constraining data ID even 

1217 when the ``dataId`` argument is `None`). 

1218 

1219 Yields 

1220 ------ 

1221 dataId : `DataCoordinate` 

1222 Data IDs matching the given query parameters. Order is 

1223 unspecified. 
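
Examples
--------
An illustrative sketch; the dimension, dataset type, and collection
names are hypothetical:

>>> for dataId in registry.queryDimensions(["exposure", "detector"],
...                                        datasets="raw",
...                                        collections="my_run"):
...     print(dataId)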

1224 """ 

1225 dimensions = iterable(dimensions) 

1226 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1227 standardizedDatasetTypes = set() 

1228 requestedDimensionNames = set(self.dimensions.extract(dimensions).names) 

1229 if datasets is not None: 

1230 if collections is None: 

1231 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1232 for datasetType in self.queryDatasetTypes(datasets, components=components): 

1233 requestedDimensionNames.update(datasetType.dimensions.names) 

1234 # If any matched dataset type is a component, just operate on 

1235 # its parent instead, because Registry doesn't know anything 

1236 # about what components exist, and here (unlike queryDatasets) 

1237 # we don't care about returning them. 

1238 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

1239 if componentName is not None: 

1240 datasetType = self.getDatasetType(parentDatasetTypeName) 

1241 standardizedDatasetTypes.add(datasetType) 

1242 # Preprocess collections expression in case the original included 

1243 # single-pass iterators (we'll want to use it multiple times 

1244 # below). 

1245 collections = CollectionQuery.fromExpression(collections) 

1246 

1247 summary = QuerySummary( 

1248 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1249 dataId=standardizedDataId, 

1250 expression=where, 

1251 ) 

1252 builder = self.makeQueryBuilder(summary) 

1253 for datasetType in standardizedDatasetTypes: 

1254 builder.joinDataset(datasetType, collections, isResult=False) 

1255 query = builder.finish() 

1256 predicate = query.predicate() 

1257 for row in self._db.query(query.sql): 

1258 if predicate(row): 

1259 result = query.extractDataId(row) 

1260 if expand: 

1261 yield self.expandDataId(result, records=standardizedDataId.records) 

1262 else: 

1263 yield result 

1264 

1265 def queryDatasets(self, datasetType: Any, *, 

1266 collections: Any, 

1267 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1268 dataId: Optional[DataId] = None, 

1269 where: Optional[str] = None, 

1270 deduplicate: bool = False, 

1271 expand: bool = True, 

1272 components: Optional[bool] = None, 

1273 **kwargs: Any) -> Iterator[DatasetRef]: 

1274 """Query for and iterate over dataset references matching user-provided 

1275 criteria. 

1276 

1277 Parameters 

1278 ---------- 

1279 datasetType 

1280 An expression that fully or partially identifies the dataset types 

1281 to be queried. Allowed types include `DatasetType`, `str`, 

1282 `re.Pattern`, and iterables thereof. The special value `...` can 

1283 be used to query all dataset types. See 

1284 :ref:`daf_butler_dataset_type_expressions` for more information. 

1285 collections 

1286 An expression that fully or partially identifies the collections 

1287 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1288 thereof. `...` can be used to return all collections. See 

1289 :ref:`daf_butler_collection_expressions` for more information. 

1290 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1291 Dimensions to include in the query (in addition to those used 

1292 to identify the queried dataset type(s)), either to constrain 

1293 the resulting datasets to those for which a matching dimension 

1294 exists, or to relate the dataset type's dimensions to dimensions 

1295 referenced by the ``dataId`` or ``where`` arguments. 

1296 dataId : `dict` or `DataCoordinate`, optional 

1297 A data ID whose key-value pairs are used as equality constraints 

1298 in the query. 

1299 where : `str`, optional 

1300 A string expression similar to a SQL WHERE clause. May involve 

1301 any column of a dimension table or (as a shortcut for the primary 

1302 key column of a dimension table) dimension name. See 

1303 :ref:`daf_butler_dimension_expressions` for more information. 

1304 deduplicate : `bool`, optional 

1305 If `True` (`False` is default), for each result data ID, only 

1306 yield one `DatasetRef` of each `DatasetType`, from the first 

1307 collection in which a dataset of that dataset type appears 

1308 (according to the order of ``collections`` passed in). If `True`, 

1309 ``collections`` must not contain regular expressions and may not 

1310 be `...`. 

1311 expand : `bool`, optional 

1312 If `True` (default) attach `ExpandedDataCoordinate` instead of 

1313 minimal `DataCoordinate` base-class instances. 

1314 components : `bool`, optional 

1315 If `True`, apply all dataset expression patterns to component 

1316 dataset type names as well. If `False`, never apply patterns to 

1317 components. If `None` (default), apply patterns to components only 

1318 if their parent datasets were not matched by the expression. 

1319 Fully-specified component datasets (`str` or `DatasetType` 

1320 instances) are always included. 

1321 **kwargs 

1322 Additional keyword arguments are forwarded to 

1323 `DataCoordinate.standardize` when processing the ``dataId`` 

1324 argument (and may be used to provide a constraining data ID even 

1325 when the ``dataId`` argument is `None`). 

1326 

1327 Yields 

1328 ------ 

1329 ref : `DatasetRef` 

1330 Dataset references matching the given query criteria. These 

1331 are grouped by `DatasetType` if the query evaluates to multiple 

1332 dataset types, but order is otherwise unspecified. 

1333 

1334 Raises 

1335 ------ 

1336 TypeError 

1337 Raised when the arguments are incompatible, such as when a 

1338 collection wildcard is passed when ``deduplicate`` is `True`. 

1339 

1340 Notes 

1341 ----- 

1342 When multiple dataset types are queried in a single call, the 

1343 results of this operation are equivalent to querying for each dataset 

1344 type separately in turn, and no information about the relationships 

1345 between datasets of different types is included. In contexts where 

1346 that kind of information is important, the recommended pattern is to 

1347 use `queryDimensions` to first obtain data IDs (possibly with the 

1348 desired dataset types and collections passed as constraints to the 

1349 query), and then use multiple (generally much simpler) calls to 

1350 `queryDatasets` with the returned data IDs passed as constraints. 
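
Examples
--------
An illustrative sketch; the dataset type, collection name, and data ID
values are hypothetical:

>>> refs = list(registry.queryDatasets("calexp", collections=["my_run"],
...                                    instrument="MyCam", visit=42))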

1351 """ 

1352 # Standardize the collections expression. 

1353 if deduplicate: 

1354 collections = CollectionSearch.fromExpression(collections) 

1355 else: 

1356 collections = CollectionQuery.fromExpression(collections) 

1357 # Standardize and expand the data ID provided as a constraint. 

1358 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1359 

1360 # We can only query directly if given a non-component DatasetType 

1361 # instance. If we were given an expression or str or a component 

1362 # DatasetType instance, we'll populate this dict, recurse, and return. 

1363 # If we already have a non-component DatasetType, it will remain None 

1364 # and we'll run the query directly. 

1365 composition: Optional[ 

1366 Dict[ 

1367 DatasetType, # parent dataset type 

1368 List[Optional[str]] # component name, or None for parent 

1369 ] 

1370 ] = None 

1371 if not isinstance(datasetType, DatasetType): 

1372 # We were given a dataset type expression (which may be as simple 

1373 # as a str). Loop over all matching datasets, delegating handling 

1374 # of the `components` argument to queryDatasetTypes, as we populate 

1375 # the composition dict. 

1376 composition = defaultdict(list) 

1377 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

1378 parentName, componentName = trueDatasetType.nameAndComponent() 

1379 if componentName is not None: 

1380 parentDatasetType = self.getDatasetType(parentName) 

1381 composition.setdefault(parentDatasetType, []).append(componentName) 

1382 else: 

1383 composition.setdefault(trueDatasetType, []).append(None) 

1384 elif datasetType.isComponent(): 

1385 # We were given a true DatasetType instance, but it's a component. 

1386 # The composition dict will have exactly one item. 

1387 parentName, componentName = datasetType.nameAndComponent() 

1388 parentDatasetType = self.getDatasetType(parentName) 

1389 composition = {parentDatasetType: [componentName]} 

1390 if composition is not None: 

1391 # We need to recurse. Do that once for each parent dataset type. 

1392 for parentDatasetType, componentNames in composition.items(): 

1393 for parentRef in self.queryDatasets(parentDatasetType, collections=collections, 

1394 dimensions=dimensions, dataId=standardizedDataId, 

1395 where=where, deduplicate=deduplicate): 

1396 # Loop over the requested components, yielding one ref for each 

1397 # component of each parent ref. 

1398 for componentName in componentNames: 

1399 if componentName is None: 

1400 yield parentRef 

1401 else: 

1402 yield parentRef.makeComponentRef(componentName) 

1403 return 

1404 # If we get here, there's no need to recurse (or we are already 

1405 # recursing; there can only ever be one level of recursion). 

1406 

1407 # The full set of dimensions in the query is the combination of those 

1408 # needed for the DatasetType and those explicitly requested, if any. 

1409 requestedDimensionNames = set(datasetType.dimensions.names) 

1410 if dimensions is not None: 

1411 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1412 # Construct the summary structure needed to construct a QueryBuilder. 

1413 summary = QuerySummary( 

1414 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1415 dataId=standardizedDataId, 

1416 expression=where, 

1417 ) 

1418 builder = self.makeQueryBuilder(summary) 

1419 # Add the dataset subquery to the query, telling the QueryBuilder to 

1420 # include the rank of the selected collection in the results only if we 

1421 # need to deduplicate. Note that if any of the collections are 

1422 # actually wildcard expressions, and we've asked for deduplication, 

1423 # this will raise TypeError for us. 

1424 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate): 

1425 return 

1426 query = builder.finish() 

1427 predicate = query.predicate() 

1428 if not deduplicate: 

1429 # No need to de-duplicate across collections. 

1430 for row in self._db.query(query.sql): 

1431 if predicate(row): 

1432 dataId = query.extractDataId(row, graph=datasetType.dimensions) 

1433 if expand: 

1434 dataId = self.expandDataId(dataId, records=standardizedDataId.records) 

1435 yield query.extractDatasetRef(row, datasetType, dataId)[0] 

1436 else: 

1437 # For each data ID, yield only the DatasetRef with the lowest 

1438 # collection rank. 

1439 bestRefs = {} 

1440 bestRanks: Dict[DataCoordinate, int] = {} 

1441 for row in self._db.query(query.sql): 

1442 if predicate(row): 

1443 ref, rank = query.extractDatasetRef(row, datasetType) 

1444 bestRank = bestRanks.get(ref.dataId, sys.maxsize) 

1445 assert rank is not None 

1446 if rank < bestRank: 

1447 bestRefs[ref.dataId] = ref 

1448 bestRanks[ref.dataId] = rank 

1449 # If the caller requested expanded data IDs, we defer that until here 

1450 # so we do as little expansion as possible. 

1451 if expand: 

1452 for ref in bestRefs.values(): 

1453 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records) 

1454 yield ref.expanded(dataId) 

1455 else: 

1456 yield from bestRefs.values() 

1457 

1458 storageClasses: StorageClassFactory 

1459 """All storage classes known to the registry (`StorageClassFactory`). 

1460 """