
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "Registry", 

26) 

27 

28from collections import defaultdict 

29import contextlib 

30import sys 

31from typing import ( 

32 Any, 

33 Dict, 

34 Iterable, 

35 Iterator, 

36 List, 

37 Mapping, 

38 Optional, 

39 Set, 

40 Type, 

41 TYPE_CHECKING, 

42 Union, 

43) 

44 

45import sqlalchemy 

46 

47from ..core import ( 

48 Config, 

49 DataCoordinate, 

50 DataCoordinateIterable, 

51 DataId, 

52 DatasetRef, 

53 DatasetType, 

54 ddl, 

55 Dimension, 

56 DimensionElement, 

57 DimensionGraph, 

58 DimensionRecord, 

59 DimensionUniverse, 

60 NamedKeyMapping, 

61 NameLookupMapping, 

62 StorageClassFactory, 

63) 

64from ..core.utils import doImport, iterable, transactional 

65from ._config import RegistryConfig 

66from .queries import ( 

67 QueryBuilder, 

68 QuerySummary, 

69) 

70from ._collectionType import CollectionType 

71from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

72from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis 

73from .interfaces import ChainedCollectionRecord, RunRecord 

74from .versions import ButlerVersionsManager 

75 

76if TYPE_CHECKING: 

77 from ..butlerConfig import ButlerConfig 

78 from .interfaces import ( 

79 ButlerAttributeManager, 

80 CollectionManager, 

81 Database, 

82 OpaqueTableStorageManager, 

83 DimensionRecordStorageManager, 

84 DatasetRecordStorageManager, 

85 DatastoreRegistryBridgeManager, 

86 ) 

87 

88 

89class Registry: 

90 """Registry interface. 

91 

92 Parameters 

93 ---------- 

94 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

95 Registry configuration. 

96 """ 

97 

98 defaultConfigFile: Optional[str] = None 

99 """Path to configuration defaults. Accessed within the ``config`` resource 

100 or relative to a search path. Can be None if no defaults specified. 

101 """ 

102 

103 @classmethod 

104 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False, 

105 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry: 

106 """Create `Registry` subclass instance from `config`. 

107 

108 Uses ``registry.cls`` from `config` to determine which subclass to 

109 instantiate. 

110 

111 Parameters 

112 ---------- 

113 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

114 Registry configuration. 

115 create : `bool`, optional 

116 If `True`, assume an empty Registry and create a new one. 

117 butlerRoot : `str`, optional 

118 Path to the repository root this `Registry` will manage. 

119 writeable : `bool`, optional 

120 If `True` (default) create a read-write connection to the database. 

121 

122 Returns 

123 ------- 

124 registry : `Registry` (subclass) 

125 A new `Registry` subclass instance. 
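
Examples
--------
A minimal sketch of constructing a registry client; the configuration
path and the import location shown here are illustrative assumptions,
not part of this module:

>>> from lsst.daf.butler.registry import Registry, RegistryConfig
>>> config = RegistryConfig("registry.yaml")  # hypothetical config file
>>> registry = Registry.fromConfig(config, writeable=False)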

126 """ 

127 if not isinstance(config, RegistryConfig): 

128 if isinstance(config, str) or isinstance(config, Config): 

129 config = RegistryConfig(config) 

130 else: 

131 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

132 config.replaceRoot(butlerRoot) 

133 DatabaseClass = config.getDatabaseClass() 

134 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

135 namespace=config.get("namespace"), writeable=writeable) 

136 universe = DimensionUniverse(config) 

137 attributes = doImport(config["managers", "attributes"]) 

138 opaque = doImport(config["managers", "opaque"]) 

139 dimensions = doImport(config["managers", "dimensions"]) 

140 collections = doImport(config["managers", "collections"]) 

141 datasets = doImport(config["managers", "datasets"]) 

142 datastoreBridges = doImport(config["managers", "datastores"]) 

143 versions = ButlerVersionsManager.fromConfig(config.get("schema_versions")) 

144 

145 return cls(database, universe, dimensions=dimensions, attributes=attributes, opaque=opaque, 

146 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges, 

147 versions=versions, writeable=writeable, create=create) 

148 

149 def __init__(self, database: Database, universe: DimensionUniverse, *, 

150 attributes: Type[ButlerAttributeManager], 

151 opaque: Type[OpaqueTableStorageManager], 

152 dimensions: Type[DimensionRecordStorageManager], 

153 collections: Type[CollectionManager], 

154 datasets: Type[DatasetRecordStorageManager], 

155 datastoreBridges: Type[DatastoreRegistryBridgeManager], 

156 versions: ButlerVersionsManager, 

157 writeable: bool = True, 

158 create: bool = False): 

159 self._db = database 

160 self.storageClasses = StorageClassFactory() 

161 with self._db.declareStaticTables(create=create) as context: 

162 self._attributes = attributes.initialize(self._db, context) 

163 self._dimensions = dimensions.initialize(self._db, context, universe=universe) 

164 self._collections = collections.initialize(self._db, context) 

165 self._datasets = datasets.initialize(self._db, context, 

166 collections=self._collections, 

167 universe=self.dimensions) 

168 self._opaque = opaque.initialize(self._db, context) 

169 self._datastoreBridges = datastoreBridges.initialize(self._db, context, 

170 opaque=self._opaque, 

171 datasets=datasets, 

172 universe=self.dimensions) 

173 context.addInitializer(lambda db: versions.storeVersions(self._attributes)) 

174 

175 # This call does not do anything right now as we do not have a way to 

176 # split tables between sub-schemas yet. 

177 versions.checkVersionDigests() 

178 if not create: 

179 # verify that configured versions are compatible with schema 

180 versions.checkStoredVersions(self._attributes, writeable) 

181 

182 self._collections.refresh() 

183 self._datasets.refresh(universe=self._dimensions.universe) 

184 

185 def __str__(self) -> str: 

186 return str(self._db) 

187 

188 def __repr__(self) -> str: 

189 return f"Registry({self._db!r}, {self.dimensions!r})" 

190 

191 def isWriteable(self) -> bool: 

192 """Return `True` if this registry allows write operations, and `False` 

193 otherwise. 

194 """ 

195 return self._db.isWriteable() 

196 

197 @property 

198 def dimensions(self) -> DimensionUniverse: 

199 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

200 """ 

201 return self._dimensions.universe 

202 

203 @contextlib.contextmanager 

204 def transaction(self) -> Iterator[None]: 

205 """Return a context manager that represents a transaction. 

206 """ 

207 # TODO make savepoint=False the default. 

208 try: 

209 with self._db.transaction(): 

210 yield 

211 except BaseException: 

212 # TODO: this clears the caches sometimes when we wouldn't actually 

213 # need to. Can we avoid that? 

214 self._dimensions.clearCaches() 

215 raise 

216 

217 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

218 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

219 other data repository client. 

220 

221 Opaque table records can be added via `insertOpaqueData`, retrieved via 

222 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

223 

224 Parameters 

225 ---------- 

226 tableName : `str` 

227 Logical name of the opaque table. This may differ from the 

228 actual name used in the database by a prefix and/or suffix. 

229 spec : `ddl.TableSpec` 

230 Specification for the table to be added. 
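
Examples
--------
A sketch of the opaque-table round trip; ``spec`` is assumed to be a
`ddl.TableSpec` describing the columns, and the table and column names
are hypothetical:

>>> registry.registerOpaqueTable("datastore_records", spec)
>>> registry.insertOpaqueData("datastore_records",
...                           {"dataset_id": 1, "path": "a.fits"})
>>> rows = list(registry.fetchOpaqueData("datastore_records", dataset_id=1))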

231 """ 

232 self._opaque.register(tableName, spec) 

233 

234 @transactional 

235 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

236 """Insert records into an opaque table. 

237 

238 Parameters 

239 ---------- 

240 tableName : `str` 

241 Logical name of the opaque table. Must match the name used in a 

242 previous call to `registerOpaqueTable`. 

243 data 

244 Each additional positional argument is a dictionary that represents 

245 a single row to be added. 

246 """ 

247 self._opaque[tableName].insert(*data) 

248 

249 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

250 """Retrieve records from an opaque table. 

251 

252 Parameters 

253 ---------- 

254 tableName : `str` 

255 Logical name of the opaque table. Must match the name used in a 

256 previous call to `registerOpaqueTable`. 

257 where 

258 Additional keyword arguments are interpreted as equality 

259 constraints that restrict the returned rows (combined with AND); 

260 keyword arguments are column names and values are the values they 

261 must have. 

262 

263 Yields 

264 ------ 

265 row : `dict` 

266 A dictionary representing a single result row. 

267 """ 

268 yield from self._opaque[tableName].fetch(**where) 

269 

270 @transactional 

271 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

272 """Remove records from an opaque table. 

273 

274 Parameters 

275 ---------- 

276 tableName : `str` 

277 Logical name of the opaque table. Must match the name used in a 

278 previous call to `registerOpaqueTable`. 

279 where 

280 Additional keyword arguments are interpreted as equality 

281 constraints that restrict the deleted rows (combined with AND); 

282 keyword arguments are column names and values are the values they 

283 must have. 

284 """ 

285 self._opaque[tableName].delete(**where) 

286 

287 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED) -> None: 

288 """Add a new collection if one with the given name does not exist. 

289 

290 Parameters 

291 ---------- 

292 name : `str` 

293 The name of the collection to create. 

294 type : `CollectionType` 

295 Enum value indicating the type of collection to create. 

296 

297 Notes 

298 ----- 

299 This method cannot be called within transactions, as it needs to be 

300 able to perform its own transaction to be concurrent. 

301 """ 

302 self._collections.register(name, type) 

303 

304 def getCollectionType(self, name: str) -> CollectionType: 

305 """Return an enumeration value indicating the type of the given 

306 collection. 

307 

308 Parameters 

309 ---------- 

310 name : `str` 

311 The name of the collection. 

312 

313 Returns 

314 ------- 

315 type : `CollectionType` 

316 Enum value indicating the type of this collection. 

317 

318 Raises 

319 ------ 

320 MissingCollectionError 

321 Raised if no collection with the given name exists. 

322 """ 

323 return self._collections.find(name).type 

324 

325 def registerRun(self, name: str) -> None: 

326 """Add a new run if one with the given name does not exist. 

327 

328 Parameters 

329 ---------- 

330 name : `str` 

331 The name of the run to create. 

332 

333 Notes 

334 ----- 

335 This method cannot be called within transactions, as it needs to be 

336 able to perform its own transaction to be concurrent. 

337 """ 

338 self._collections.register(name, CollectionType.RUN) 

339 

340 @transactional 

341 def removeCollection(self, name: str) -> None: 

342 """Completely remove the given collection. 

343 

344 Parameters 

345 ---------- 

346 name : `str` 

347 The name of the collection to remove. 

348 

349 Raises 

350 ------ 

351 MissingCollectionError 

352 Raised if no collection with the given name exists. 

353 

354 Notes 

355 ----- 

356 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

357 in it are also fully removed. This requires that those datasets be 

358 removed (or at least trashed) from any datastores that hold them first. 

359 

360 A collection may not be deleted as long as it is referenced by a 

361 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

362 be deleted or redefined first. 

363 """ 

364 self._collections.remove(name) 

365 

366 def getCollectionChain(self, parent: str) -> CollectionSearch: 

367 """Return the child collections in a `~CollectionType.CHAINED` 

368 collection. 

369 

370 Parameters 

371 ---------- 

372 parent : `str` 

373 Name of the chained collection. Must have already been added via 

374 a call to `Registry.registerCollection`. 

375 

376 Returns 

377 ------- 

378 children : `CollectionSearch` 

379 An object that defines the search path of the collection. 

380 See :ref:`daf_butler_collection_expressions` for more information. 

381 

382 Raises 

383 ------ 

384 MissingCollectionError 

385 Raised if ``parent`` does not exist in the `Registry`. 

386 TypeError 

387 Raised if ``parent`` does not correspond to a 

388 `~CollectionType.CHAINED` collection. 

389 """ 

390 record = self._collections.find(parent) 

391 if record.type is not CollectionType.CHAINED: 

392 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

393 assert isinstance(record, ChainedCollectionRecord) 

394 return record.children 

395 

396 @transactional 

397 def setCollectionChain(self, parent: str, children: Any) -> None: 

398 """Define or redefine a `~CollectionType.CHAINED` collection. 

399 

400 Parameters 

401 ---------- 

402 parent : `str` 

403 Name of the chained collection. Must have already been added via 

404 a call to `Registry.registerCollection`. 

405 children : `Any` 

406 An expression defining an ordered search of child collections, 

407 generally an iterable of `str`. Restrictions on the dataset types 

408 to be searched can also be included, by passing a mapping or an 

409 iterable containing tuples; see 

410 :ref:`daf_butler_collection_expressions` for more information. 

411 

412 Raises 

413 ------ 

414 MissingCollectionError 

415 Raised when any of the given collections do not exist in the 

416 `Registry`. 

417 TypeError 

418 Raised if ``parent`` does not correspond to a 

419 `~CollectionType.CHAINED` collection. 

420 ValueError 

421 Raised if the given collections contain a cycle. 
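
Examples
--------
A sketch of placing two runs behind one chained collection; all
collection names are hypothetical and `CollectionType` is assumed to
have been imported from this package:

>>> registry.registerRun("MyCam/runs/1")
>>> registry.registerRun("MyCam/runs/2")
>>> registry.registerCollection("MyCam/defaults", CollectionType.CHAINED)
>>> registry.setCollectionChain("MyCam/defaults",
...                             ["MyCam/runs/1", "MyCam/runs/2"])
>>> children = registry.getCollectionChain("MyCam/defaults")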

422 """ 

423 record = self._collections.find(parent) 

424 if record.type is not CollectionType.CHAINED: 

425 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

426 assert isinstance(record, ChainedCollectionRecord) 

427 children = CollectionSearch.fromExpression(children) 

428 if children != record.children: 

429 record.update(self._collections, children) 

430 

431 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

432 """ 

433 Add a new `DatasetType` to the Registry. 

434 

435 It is not an error to register the same `DatasetType` twice. 

436 

437 Parameters 

438 ---------- 

439 datasetType : `DatasetType` 

440 The `DatasetType` to be added. 

441 

442 Returns 

443 ------- 

444 inserted : `bool` 

445 `True` if ``datasetType`` was inserted, `False` if an identical 

446 existing `DatasetType` was found. Note that in either case the 

447 DatasetType is guaranteed to be defined in the Registry 

448 consistently with the given definition. 

449 

450 Raises 

451 ------ 

452 ValueError 

453 Raised if the dimensions or storage class are invalid. 

454 ConflictingDefinitionError 

455 Raised if this DatasetType is already registered with a different 

456 definition. 

457 

458 Notes 

459 ----- 

460 This method cannot be called within transactions, as it needs to be 

461 able to perform its own transaction to be concurrent. 
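
Examples
--------
A sketch only; it assumes these dimension and storage class names are
defined in this registry's configuration and that `DatasetType` has
been imported from the package:

>>> datasetType = DatasetType("calexp",
...                           dimensions=["instrument", "visit", "detector"],
...                           storageClass="ExposureF",
...                           universe=registry.dimensions)
>>> inserted = registry.registerDatasetType(datasetType)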

462 """ 

463 _, inserted = self._datasets.register(datasetType) 

464 return inserted 

465 

466 def getDatasetType(self, name: str) -> DatasetType: 

467 """Get the `DatasetType`. 

468 

469 Parameters 

470 ---------- 

471 name : `str` 

472 Name of the type. 

473 

474 Returns 

475 ------- 

476 type : `DatasetType` 

477 The `DatasetType` associated with the given name. 

478 

479 Raises 

480 ------ 

481 KeyError 

482 Raised if the requested DatasetType could not be found in the registry. 

483 """ 

484 storage = self._datasets.find(name) 

485 if storage is None: 

486 raise KeyError(f"DatasetType '{name}' could not be found.") 

487 return storage.datasetType 

488 

489 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

490 collections: Any, **kwargs: Any) -> Optional[DatasetRef]: 

491 """Find a dataset given its `DatasetType` and data ID. 

492 

493 This can be used to obtain a `DatasetRef` that permits the dataset to 

494 be read from a `Datastore`. If the dataset is a component and can not 

495 be found using the provided dataset type, a dataset ref for the parent 

496 will be returned instead but with the correct dataset type. 

497 

498 Parameters 

499 ---------- 

500 datasetType : `DatasetType` or `str` 

501 A `DatasetType` or the name of one. 

502 dataId : `dict` or `DataCoordinate`, optional 

503 A `dict`-like object containing the `Dimension` links that identify 

504 the dataset within a collection. 

505 collections 

506 An expression that fully or partially identifies the collections 

507 to search for the dataset, such as a `str`, `re.Pattern`, or 

508 iterable thereof. `...` can be used to return all collections. 

509 See :ref:`daf_butler_collection_expressions` for more information. 

510 **kwargs 

511 Additional keyword arguments passed to 

512 `DataCoordinate.standardize` to convert ``dataId`` to a true 

513 `DataCoordinate` or augment an existing one. 

514 

515 Returns 

516 ------- 

517 ref : `DatasetRef` 

518 A reference to the dataset, or `None` if no matching Dataset 

519 was found. 

520 

521 Raises 

522 ------ 

523 LookupError 

524 Raised if one or more data ID keys are missing or the dataset type 

525 does not exist. 

526 MissingCollectionError 

527 Raised if any of ``collections`` does not exist in the registry. 
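
Examples
--------
An illustrative lookup; the dataset type name, data ID values, and
collection name are hypothetical:

>>> ref = registry.findDataset("flat", instrument="MyCam", detector=12,
...                            physical_filter="MyCam-R",
...                            collections=["MyCam/calib"])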

528 """ 

529 if isinstance(datasetType, DatasetType): 

530 storage = self._datasets.find(datasetType.name) 

531 if storage is None: 

532 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

533 else: 

534 storage = self._datasets.find(datasetType) 

535 if storage is None: 

536 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

537 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

538 universe=self.dimensions, **kwargs) 

539 collections = CollectionSearch.fromExpression(collections) 

540 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType): 

541 result = storage.find(collectionRecord, dataId) 

542 if result is not None: 

543 return result 

544 

545 return None 

546 

547 @transactional 

548 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

549 run: str) -> List[DatasetRef]: 

550 """Insert one or more datasets into the `Registry` 

551 

552 This always adds new datasets; to associate existing datasets with 

553 a new collection, use ``associate``. 

554 

555 Parameters 

556 ---------- 

557 datasetType : `DatasetType` or `str` 

558 A `DatasetType` or the name of one. 

559 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

560 Dimension-based identifiers for the new datasets. 

561 run : `str` 

562 The name of the run that produced the datasets. 

563 

564 Returns 

565 ------- 

566 refs : `list` of `DatasetRef` 

567 Resolved `DatasetRef` instances for all given data IDs (in the same 

568 order). 

569 

570 Raises 

571 ------ 

572 ConflictingDefinitionError 

573 If a dataset with the same dataset type and data ID as one of those 

574 given already exists in ``run``. 

575 MissingCollectionError 

576 Raised if ``run`` does not exist in the registry. 
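
Examples
--------
A sketch of inserting one dataset; the dataset type, data ID values,
and run name are hypothetical and must already be registered:

>>> registry.registerRun("MyCam/raw/all")
>>> (ref,) = registry.insertDatasets(
...     "raw",
...     [{"instrument": "MyCam", "exposure": 101, "detector": 12}],
...     run="MyCam/raw/all",
... )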

577 """ 

578 if isinstance(datasetType, DatasetType): 

579 storage = self._datasets.find(datasetType.name) 

580 if storage is None: 

581 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

582 else: 

583 storage = self._datasets.find(datasetType) 

584 if storage is None: 

585 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

586 runRecord = self._collections.find(run) 

587 if runRecord.type is not CollectionType.RUN: 

588 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

589 assert isinstance(runRecord, RunRecord) 

590 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

591 for dataId in dataIds] 

592 try: 

593 refs = list(storage.insert(runRecord, expandedDataIds)) 

594 except sqlalchemy.exc.IntegrityError as err: 

595 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

596 f"one or more datasets of type {storage.datasetType} into " 

597 f"collection '{run}'. " 

598 f"This probably means a dataset with the same data ID " 

599 f"and dataset type already exists, but it may also mean a " 

600 f"dimension row is missing.") from err 

601 return refs 

602 

603 def getDataset(self, id: int) -> Optional[DatasetRef]: 

604 """Retrieve a Dataset entry. 

605 

606 Parameters 

607 ---------- 

608 id : `int` 

609 The unique identifier for the dataset. 

610 

611 Returns 

612 ------- 

613 ref : `DatasetRef` or `None` 

614 A ref to the Dataset, or `None` if no matching Dataset 

615 was found. 

616 """ 

617 ref = self._datasets.getDatasetRef(id, universe=self.dimensions) 

618 if ref is None: 

619 return None 

620 return ref 

621 

622 @transactional 

623 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

624 """Remove datasets from the Registry. 

625 

626 The datasets will be removed unconditionally from all collections, and 

627 any `Quantum` that consumed this dataset will instead be marked as 

628 having a NULL input. `Datastore` records will *not* be deleted; the 

629 caller is responsible for ensuring that the dataset has already been 

630 removed from all Datastores. 

631 

632 Parameters 

633 ---------- 

634 refs : `Iterable` of `DatasetRef` 

635 References to the datasets to be removed. Must include a valid 

636 ``id`` attribute, and should be considered invalidated upon return. 

637 

638 Raises 

639 ------ 

640 AmbiguousDatasetError 

641 Raised if any ``ref.id`` is `None`. 

642 OrphanedRecordError 

643 Raised if any dataset is still present in any `Datastore`. 

644 """ 

645 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

646 storage = self._datasets.find(datasetType.name) 

647 assert storage is not None 

648 try: 

649 storage.delete(refsForType) 

650 except sqlalchemy.exc.IntegrityError as err: 

651 raise OrphanedRecordError("One or more datasets are still " 

652 "present in one or more Datastores.") from err 

653 

654 @transactional 

655 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

656 """Add existing datasets to a `~CollectionType.TAGGED` collection. 

657 

658 If a `DatasetRef` with the same exact integer ID is already in a 

659 collection, nothing is changed. If a `DatasetRef` with the same 

660 `DatasetType` and data ID but with different integer ID 

661 exists in the collection, `ConflictingDefinitionError` is raised. 

662 

663 Parameters 

664 ---------- 

665 collection : `str` 

666 Indicates the collection the datasets should be associated with. 

667 refs : `Iterable` [ `DatasetRef` ] 

668 An iterable of resolved `DatasetRef` instances that already exist 

669 in this `Registry`. 

670 

671 Raises 

672 ------ 

673 ConflictingDefinitionError 

674 If a Dataset with the given `DatasetRef` already exists in the 

675 given collection. 

676 AmbiguousDatasetError 

677 Raised if ``any(ref.id is None for ref in refs)``. 

678 MissingCollectionError 

679 Raised if ``collection`` does not exist in the registry. 

680 TypeError 

681 Raised if adding new datasets to the given ``collection`` is not 

682 allowed. 
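
Examples
--------
A sketch of tagging previously inserted datasets; the collection name
is hypothetical, ``refs`` is assumed to hold resolved `DatasetRef`
instances from this registry, and `CollectionType` is assumed to have
been imported from this package:

>>> registry.registerCollection("MyCam/best-seeing", CollectionType.TAGGED)
>>> registry.associate("MyCam/best-seeing", refs)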

683 """ 

684 collectionRecord = self._collections.find(collection) 

685 if collectionRecord.type is not CollectionType.TAGGED: 

686 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

687 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

688 storage = self._datasets.find(datasetType.name) 

689 assert storage is not None 

690 try: 

691 storage.associate(collectionRecord, refsForType) 

692 except sqlalchemy.exc.IntegrityError as err: 

693 raise ConflictingDefinitionError( 

694 f"Constraint violation while associating dataset of type {datasetType.name} with " 

695 f"collection {collection}. This probably means that one or more datasets with the same " 

696 f"dataset type and data ID already exist in the collection, but it may also indicate " 

697 f"that the datasets do not exist." 

698 ) from err 

699 

700 @transactional 

701 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

702 """Remove existing datasets from a `~CollectionType.TAGGED` collection. 

703 

704 ``collection`` and ``ref`` combinations that are not currently 

705 associated are silently ignored. 

706 

707 Parameters 

708 ---------- 

709 collection : `str` 

710 The collection the datasets should no longer be associated with. 

711 refs : `Iterable` [ `DatasetRef` ] 

712 An iterable of resolved `DatasetRef` instances that already exist 

713 in this `Registry`. 

714 

715 Raises 

716 ------ 

717 AmbiguousDatasetError 

718 Raised if any of the given dataset references is unresolved. 

719 MissingCollectionError 

720 Raised if ``collection`` does not exist in the registry. 

721 TypeError 

722 Raised if removing datasets from the given ``collection`` is not 

723 allowed, i.e. it is not a `~CollectionType.TAGGED` collection. 

724 """ 

725 collectionRecord = self._collections.find(collection) 

726 if collectionRecord.type is not CollectionType.TAGGED: 

727 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

728 "expected TAGGED.") 

729 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

730 storage = self._datasets.find(datasetType.name) 

731 assert storage is not None 

732 storage.disassociate(collectionRecord, refsForType) 

733 

734 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

735 """Return an object that allows a new `Datastore` instance to 

736 communicate with this `Registry`. 

737 

738 Returns 

739 ------- 

740 manager : `DatastoreRegistryBridgeManager` 

741 Object that mediates communication between this `Registry` and its 

742 associated datastores. 

743 """ 

744 return self._datastoreBridges 

745 

746 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

747 """Retrieve datastore locations for a given dataset. 

748 

749 Parameters 

750 ---------- 

751 ref : `DatasetRef` 

752 A reference to the dataset for which to retrieve storage 

753 information. 

754 

755 Returns 

756 ------- 

757 datastores : `Iterable` [ `str` ] 

758 All the matching datastores holding this dataset. 

759 

760 Raises 

761 ------ 

762 AmbiguousDatasetError 

763 Raised if ``ref.id`` is `None`. 

764 """ 

765 return self._datastoreBridges.findDatastores(ref) 

766 

767 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

768 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

769 **kwargs: Any) -> DataCoordinate: 

770 """Expand a dimension-based data ID to include additional information. 

771 

772 Parameters 

773 ---------- 

774 dataId : `DataCoordinate` or `dict`, optional 

775 Data ID to be expanded; augmented and overridden by ``kwargs``. 

776 graph : `DimensionGraph`, optional 

777 Set of dimensions for the expanded ID. If `None`, the dimensions 

778 will be inferred from the keys of ``dataId`` and ``kwargs``. 

779 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph`` 

780 are silently ignored, providing a way to extract and expand a 

781 subset of a data ID. 

782 records : `Mapping` [`str`, `DimensionRecord`], optional 

783 Dimension record data to use before querying the database for that 

784 data, keyed by element name. 

785 **kwargs 

786 Additional keywords are treated like additional key-value pairs for 

787 ``dataId``, extending and overriding it. 

788 

789 Returns 

790 ------- 

791 expanded : `DataCoordinate` 

792 A data ID that includes full metadata for all of the dimensions it 

793 identifies, i.e. it guarantees that ``expanded.hasRecords()`` and 

794 ``expanded.hasFull()`` both return `True`. 
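
Examples
--------
Illustrative only; the instrument and detector values are hypothetical:

>>> dataId = registry.expandDataId(instrument="MyCam", detector=12)
>>> dataId.hasRecords()
True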

795 """ 

796 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs) 

797 if standardized.hasRecords(): 

798 return standardized 

799 if records is None: 

800 records = {} 

801 elif isinstance(records, NamedKeyMapping): 

802 records = records.byName() 

803 else: 

804 records = dict(records) 

805 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

806 records.update(dataId.records.byName()) 

807 keys = standardized.byName() 

808 for element in standardized.graph.primaryKeyTraversalOrder: 

809 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

810 if record is ...: 

811 if isinstance(element, Dimension) and keys.get(element.name) is None: 

812 if element in standardized.graph.required: 

813 raise LookupError( 

814 f"No value or null value for required dimension {element.name}." 

815 ) 

816 keys[element.name] = None 

817 record = None 

818 else: 

819 storage = self._dimensions[element] 

820 dataIdSet = DataCoordinateIterable.fromScalar( 

821 DataCoordinate.standardize(keys, graph=element.graph) 

822 ) 

823 fetched = tuple(storage.fetch(dataIdSet)) 

824 try: 

825 (record,) = fetched 

826 except ValueError: 

827 record = None 

828 records[element.name] = record 

829 if record is not None: 

830 for d in element.implied: 

831 value = getattr(record, d.name) 

832 if keys.setdefault(d.name, value) != value: 

833 raise InconsistentDataIdError( 

834 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

835 f"but {element.name} implies {d.name}={value!r}." 

836 ) 

837 else: 

838 if element in standardized.graph.required: 

839 raise LookupError( 

840 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

841 ) 

842 if element.alwaysJoin: 

843 raise InconsistentDataIdError( 

844 f"Could not fetch record for element {element.name} via keys {keys}, ", 

845 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

846 "related." 

847 ) 

848 for d in element.implied: 

849 keys.setdefault(d.name, None) 

850 records.setdefault(d.name, None) 

851 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

852 

853 def insertDimensionData(self, element: Union[DimensionElement, str], 

854 *data: Union[Mapping[str, Any], DimensionRecord], 

855 conform: bool = True) -> None: 

856 """Insert one or more dimension records into the database. 

857 

858 Parameters 

859 ---------- 

860 element : `DimensionElement` or `str` 

861 The `DimensionElement` or name thereof that identifies the table 

862 records will be inserted into. 

863 data : `dict` or `DimensionRecord` (variadic) 

864 One or more records to insert. 

865 conform : `bool`, optional 

866 If `False` (`True` is default) perform no checking or conversions, 

867 and assume that ``element`` is a `DimensionElement` instance and 

868 ``data`` is one or more `DimensionRecord` instances of the 

869 appropriate subclass. 
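
Examples
--------
A sketch of adding one dimension record; the element name is standard
but the record keys shown are illustrative and may be incomplete for a
real dimension universe:

>>> registry.insertDimensionData(
...     "detector",
...     {"instrument": "MyCam", "id": 12, "full_name": "R12"},
... )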

870 """ 

871 if conform: 

872 if isinstance(element, str): 

873 element = self.dimensions[element] 

874 records = [row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row) 

875 for row in data] 

876 else: 

877 # Ignore typing since caller said to trust them with conform=False. 

878 records = data # type: ignore 

879 storage = self._dimensions[element] # type: ignore 

880 storage.insert(*records) 

881 

882 def syncDimensionData(self, element: Union[DimensionElement, str], 

883 row: Union[Mapping[str, Any], DimensionRecord], 

884 conform: bool = True) -> bool: 

885 """Synchronize the given dimension record with the database, inserting 

886 if it does not already exist and comparing values if it does. 

887 

888 Parameters 

889 ---------- 

890 element : `DimensionElement` or `str` 

891 The `DimensionElement` or name thereof that identifies the table 

892 records will be inserted into. 

893 row : `dict` or `DimensionRecord` 

894 The record to insert. 

895 conform : `bool`, optional 

896 If `False` (`True` is default) perform no checking or conversions, 

897 and assume that ``element`` is a `DimensionElement` instance and 

898 ``row`` is a `DimensionRecord` instance of the 

899 appropriate subclass. 

900 

901 Returns 

902 ------- 

903 inserted : `bool` 

904 `True` if a new row was inserted, `False` otherwise. 

905 

906 Raises 

907 ------ 

908 ConflictingDefinitionError 

909 Raised if the record exists in the database (according to primary 

910 key lookup) but is inconsistent with the given one. 

911 

912 Notes 

913 ----- 

914 This method cannot be called within transactions, as it needs to be 

915 able to perform its own transaction to be concurrent. 

916 """ 

917 if conform: 

918 if isinstance(element, str): 

919 element = self.dimensions[element] 

920 record = row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row) 

921 else: 

922 # Ignore typing since caller said to trust them with conform=False. 

923 record = row # type: ignore 

924 storage = self._dimensions[element] # type: ignore 

925 return storage.sync(record) 

926 

927 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

928 ) -> Iterator[DatasetType]: 

929 """Iterate over the dataset types whose names match an expression. 

930 

931 Parameters 

932 ---------- 

933 expression : `Any`, optional 

934 An expression that fully or partially identifies the dataset types 

935 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

936 `...` can be used to return all dataset types, and is the default. 

937 See :ref:`daf_butler_dataset_type_expressions` for more 

938 information. 

939 components : `bool`, optional 

940 If `True`, apply all expression patterns to component dataset type 

941 names as well. If `False`, never apply patterns to components. 

942 If `None` (default), apply patterns to components only if their 

943 parent datasets were not matched by the expression. 

944 Fully-specified component datasets (`str` or `DatasetType` 

945 instances) are always included. 

946 

947 Yields 

948 ------ 

949 datasetType : `DatasetType` 

950 A `DatasetType` instance whose name matches ``expression``. 
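
Examples
--------
Matching registered dataset type names against a regular expression;
the pattern is illustrative:

>>> import re
>>> matching = list(registry.queryDatasetTypes(re.compile(r"^calexp")))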

951 """ 

952 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

953 if wildcard is Ellipsis: 

954 for datasetType in self._datasets: 

955 # The dataset type can no longer be a component 

956 yield datasetType 

957 if components and datasetType.isComposite(): 

958 # Automatically create the component dataset types 

959 for component in datasetType.makeAllComponentDatasetTypes(): 

960 yield component 

961 return 

962 done: Set[str] = set() 

963 for name in wildcard.strings: 

964 storage = self._datasets.find(name) 

965 if storage is not None: 

966 done.add(storage.datasetType.name) 

967 yield storage.datasetType 

968 if wildcard.patterns: 

969 # If components (the argument) is None, we'll save component 

970 # datasets that we might want to match, but only if their parents 

971 # didn't get included. 

972 componentsForLater = [] 

973 for registeredDatasetType in self._datasets: 

974 # Components are not stored in registry so expand them here 

975 allDatasetTypes = [registeredDatasetType] \ 

976 + registeredDatasetType.makeAllComponentDatasetTypes() 

977 for datasetType in allDatasetTypes: 

978 if datasetType.name in done: 

979 continue 

980 parentName, componentName = datasetType.nameAndComponent() 

981 if componentName is not None and not components: 

982 if components is None and parentName not in done: 

983 componentsForLater.append(datasetType) 

984 continue 

985 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

986 done.add(datasetType.name) 

987 yield datasetType 

988 # Go back and try to match saved components. 

989 for datasetType in componentsForLater: 

990 parentName, _ = datasetType.nameAndComponent() 

991 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

992 yield datasetType 

993 

994 def queryCollections(self, expression: Any = ..., 

995 datasetType: Optional[DatasetType] = None, 

996 collectionType: Optional[CollectionType] = None, 

997 flattenChains: bool = False, 

998 includeChains: Optional[bool] = None) -> Iterator[str]: 

999 """Iterate over the collections whose names match an expression. 

1000 

1001 Parameters 

1002 ---------- 

1003 expression : `Any`, optional 

1004 An expression that fully or partially identifies the collections 

1005 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1006 `...` can be used to return all collections, and is the default. 

1007 See :ref:`daf_butler_collection_expressions` for more 

1008 information. 

1009 datasetType : `DatasetType`, optional 

1010 If provided, only yield collections that should be searched for 

1011 this dataset type according to ``expression``. If this is 

1012 not provided, any dataset type restrictions in ``expression`` are 

1013 ignored. 

1014 collectionType : `CollectionType`, optional 

1015 If provided, only yield collections of this type. 

1016 flattenChains : `bool`, optional 

1017 If `True` (`False` is default), recursively yield the child 

1018 collections of matching `~CollectionType.CHAINED` collections. 

1019 includeChains : `bool`, optional 

1020 If `True`, yield records for matching `~CollectionType.CHAINED` 

1021 collections. Default is the opposite of ``flattenChains``: include 

1022 either CHAINED collections or their children, but not both. 

1023 

1024 Yields 

1025 ------ 

1026 collection : `str` 

1027 The name of a collection that matches ``expression``. 
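
Examples
--------
Listing every `~CollectionType.RUN` collection; purely illustrative,
and `CollectionType` is assumed to have been imported from this
package:

>>> runs = list(registry.queryCollections(..., collectionType=CollectionType.RUN))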

1028 """ 

1029 query = CollectionQuery.fromExpression(expression) 

1030 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType, 

1031 flattenChains=flattenChains, includeChains=includeChains): 

1032 yield record.name 

1033 

1034 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder: 

1035 """Return a `QueryBuilder` instance capable of constructing and 

1036 managing more complex queries than those obtainable via `Registry` 

1037 interfaces. 

1038 

1039 This is an advanced interface; downstream code should prefer 

1040 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those 

1041 are sufficient. 

1042 

1043 Parameters 

1044 ---------- 

1045 summary : `QuerySummary` 

1046 Object describing and categorizing the full set of dimensions that 

1047 will be included in the query. 

1048 

1049 Returns 

1050 ------- 

1051 builder : `QueryBuilder` 

1052 Object that can be used to construct and perform advanced queries. 

1053 """ 

1054 return QueryBuilder(summary=summary, 

1055 collections=self._collections, 

1056 dimensions=self._dimensions, 

1057 datasets=self._datasets) 

1058 

1059 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1060 dataId: Optional[DataId] = None, 

1061 datasets: Any = None, 

1062 collections: Any = None, 

1063 where: Optional[str] = None, 

1064 expand: bool = True, 

1065 components: Optional[bool] = None, 

1066 **kwargs: Any) -> Iterator[DataCoordinate]: 

1067 """Query for and iterate over data IDs matching user-provided criteria. 

1068 

1069 Parameters 

1070 ---------- 

1071 dimensions : `Dimension` or `str`, or iterable thereof 

1072 The dimensions of the data IDs to yield, as either `Dimension` 

1073 instances or `str`. Will be automatically expanded to a complete 

1074 `DimensionGraph`. 

1075 dataId : `dict` or `DataCoordinate`, optional 

1076 A data ID whose key-value pairs are used as equality constraints 

1077 in the query. 

1078 datasets : `Any`, optional 

1079 An expression that fully or partially identifies dataset types 

1080 that should constrain the yielded data IDs. For example, including 

1081 "raw" here would constrain the yielded ``instrument``, 

1082 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1083 those for which at least one "raw" dataset exists in 

1084 ``collections``. Allowed types include `DatasetType`, `str`, 

1085 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1086 expressions, `...` is not permitted; it doesn't make sense to 

1087 constrain data IDs on the existence of *all* datasets. 

1088 See :ref:`daf_butler_dataset_type_expressions` for more 

1089 information. 

1090 collections : `Any`, optional 

1091 An expression that fully or partially identifies the collections 

1092 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1093 thereof. `...` can be used to return all collections. Must be 

1094 provided if ``datasets`` is, and is ignored if it is not. See 

1095 :ref:`daf_butler_collection_expressions` for more information. 

1096 where : `str`, optional 

1097 A string expression similar to a SQL WHERE clause. May involve 

1098 any column of a dimension table or (as a shortcut for the primary 

1099 key column of a dimension table) dimension name. See 

1100 :ref:`daf_butler_dimension_expressions` for more information. 

1101 expand : `bool`, optional 

1102 If `True` (default) yield `DataCoordinate` instances for which 

1103 `~DataCoordinate.hasRecords` is guaranteed to return `True`, 

1104 performing extra database fetches as necessary. 

1105 components : `bool`, optional 

1106 If `True`, apply all dataset expression patterns to component 

1107 dataset type names as well. If `False`, never apply patterns to 

1108 components. If `None` (default), apply patterns to components only 

1109 if their parent datasets were not matched by the expression. 

1110 Fully-specified component datasets (`str` or `DatasetType` 

1111 instances) are always included. 

1112 **kwargs 

1113 Additional keyword arguments are forwarded to 

1114 `DataCoordinate.standardize` when processing the ``dataId`` 

1115 argument (and may be used to provide a constraining data ID even 

1116 when the ``dataId`` argument is `None`). 

1117 

1118 Yields 

1119 ------ 

1120 dataId : `DataCoordinate` 

1121 Data IDs matching the given query parameters. Order is 

1122 unspecified. 
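
Examples
--------
A sketch of querying data IDs constrained by the existence of a
dataset; the dataset type, collection, and ``where`` expression are
hypothetical:

>>> dataIds = list(registry.queryDimensions(
...     ["exposure", "detector"],
...     datasets="raw",
...     collections="MyCam/raw/all",
...     where="detector = 12",
... ))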

1123 """ 

1124 dimensions = iterable(dimensions) 

1125 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1126 standardizedDatasetTypes = set() 

1127 requestedDimensionNames = set(self.dimensions.extract(dimensions).names) 

1128 if datasets is not None: 

1129 if collections is None: 

1130 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1131 for datasetType in self.queryDatasetTypes(datasets, components=components): 

1132 requestedDimensionNames.update(datasetType.dimensions.names) 

1133 # If any matched dataset type is a component, just operate on 

1134 # its parent instead, because Registry doesn't know anything 

1135 # about what components exist, and here (unlike queryDatasets) 

1136 # we don't care about returning them. 

1137 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

1138 if componentName is not None: 

1139 datasetType = self.getDatasetType(parentDatasetTypeName) 

1140 standardizedDatasetTypes.add(datasetType) 

1141 # Preprocess collections expression in case the original included 

1142 # single-pass iterators (we'll want to use it multiple times 

1143 # below). 

1144 collections = CollectionQuery.fromExpression(collections) 

1145 

1146 summary = QuerySummary( 

1147 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1148 dataId=standardizedDataId, 

1149 expression=where, 

1150 ) 

1151 builder = self.makeQueryBuilder(summary) 

1152 for datasetType in standardizedDatasetTypes: 

1153 builder.joinDataset(datasetType, collections, isResult=False) 

1154 query = builder.finish() 

1155 predicate = query.predicate() 

1156 for row in self._db.query(query.sql): 

1157 if predicate(row): 

1158 result = query.extractDataId(row) 

1159 if expand: 

1160 yield self.expandDataId( 

1161 result, 

1162 records=standardizedDataId.records, 

1163 ) 

1164 else: 

1165 yield result 

1166 

1167 def queryDatasets(self, datasetType: Any, *, 

1168 collections: Any, 

1169 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1170 dataId: Optional[DataId] = None, 

1171 where: Optional[str] = None, 

1172 deduplicate: bool = False, 

1173 expand: bool = True, 

1174 components: Optional[bool] = None, 

1175 **kwargs: Any) -> Iterator[DatasetRef]: 

1176 """Query for and iterate over dataset references matching user-provided 

1177 criteria. 

1178 

1179 Parameters 

1180 ---------- 

1181 datasetType 

1182 An expression that fully or partially identifies the dataset types 

1183 to be queried. Allowed types include `DatasetType`, `str`, 

1184 `re.Pattern`, and iterables thereof. The special value `...` can 

1185 be used to query all dataset types. See 

1186 :ref:`daf_butler_dataset_type_expressions` for more information. 

1187 collections 

1188 An expression that fully or partially identifies the collections 

1189 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1190 thereof. `...` can be used to return all collections. See 

1191 :ref:`daf_butler_collection_expressions` for more information. 

1192 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1193 Dimensions to include in the query (in addition to those used 

1194 to identify the queried dataset type(s)), either to constrain 

1195 the resulting datasets to those for which a matching dimension 

1196 exists, or to relate the dataset type's dimensions to dimensions 

1197 referenced by the ``dataId`` or ``where`` arguments. 

1198 dataId : `dict` or `DataCoordinate`, optional 

1199 A data ID whose key-value pairs are used as equality constraints 

1200 in the query. 

1201 where : `str`, optional 

1202 A string expression similar to a SQL WHERE clause. May involve 

1203 any column of a dimension table or (as a shortcut for the primary 

1204 key column of a dimension table) dimension name. See 

1205 :ref:`daf_butler_dimension_expressions` for more information. 

1206 deduplicate : `bool`, optional 

1207 If `True` (`False` is default), for each result data ID, only 

1208 yield one `DatasetRef` of each `DatasetType`, from the first 

1209 collection in which a dataset of that dataset type appears 

1210 (according to the order of ``collections`` passed in). If `True`, 

1211 ``collections`` must not contain regular expressions and may not 

1212 be `...`. 

1213 expand : `bool`, optional 

1214 If `True` (default) attach `DataCoordinate` instances for which 

1215 `~DataCoordinate.hasRecords` is guaranteed to return `True`, 

1216 performing extra database fetches as necessary. 

1217 components : `bool`, optional 

1218 If `True`, apply all dataset expression patterns to component 

1219 dataset type names as well. If `False`, never apply patterns to 

1220 components. If `None` (default), apply patterns to components only 

1221 if their parent datasets were not matched by the expression. 

1222 Fully-specified component datasets (`str` or `DatasetType` 

1223 instances) are always included. 

1224 **kwargs 

1225 Additional keyword arguments are forwarded to 

1226 `DataCoordinate.standardize` when processing the ``dataId`` 

1227 argument (and may be used to provide a constraining data ID even 

1228 when the ``dataId`` argument is `None`). 

1229 

1230 Yields 

1231 ------ 

1232 ref : `DatasetRef` 

1233 Dataset references matching the given query criteria. These 

1234 are grouped by `DatasetType` if the query evaluates to multiple 

1235 dataset types, but order is otherwise unspecified. 

1236 

1237 Raises 

1238 ------ 

1239 TypeError 

1240 Raised when the arguments are incompatible, such as when a 

1241 collection wildcard is passed when ``deduplicate`` is `True`. 

1242 

1243 Notes 

1244 ----- 

1245 When multiple dataset types are queried in a single call, the 

1246 results of this operation are equivalent to querying for each dataset 

1247 type separately in turn, and no information about the relationships 

1248 between datasets of different types is included. In contexts where 

1249 that kind of information is important, the recommended pattern is to 

1250 use `queryDimensions` to first obtain data IDs (possibly with the 

1251 desired dataset types and collections passed as constraints to the 

1252 query), and then use multiple (generally much simpler) calls to 

1253 `queryDatasets` with the returned data IDs passed as constraints. 
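
Examples
--------
A sketch that searches a single run for one dataset type; the names
and the ``where`` expression are hypothetical:

>>> refs = list(registry.queryDatasets(
...     "calexp",
...     collections=["MyCam/runs/1"],
...     where="visit = 100 AND detector = 12",
... ))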

1254 """ 

1255 # Standardize the collections expression. 

1256 if deduplicate: 

1257 collections = CollectionSearch.fromExpression(collections) 

1258 else: 

1259 collections = CollectionQuery.fromExpression(collections) 

1260 # Standardize and expand the data ID provided as a constraint. 

1261 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1262 

1263 # We can only query directly if given a non-component DatasetType 

1264 # instance. If we were given an expression or str or a component 

1265 # DatasetType instance, we'll populate this dict, recurse, and return. 

1266 # If we already have a non-component DatasetType, it will remain None 

1267 # and we'll run the query directly. 

1268 composition: Optional[ 

1269 Dict[ 

1270 DatasetType, # parent dataset type 

1271 List[Optional[str]] # component name, or None for parent 

1272 ] 

1273 ] = None 

1274 if not isinstance(datasetType, DatasetType): 

1275 # We were given a dataset type expression (which may be as simple 

1276 # as a str). Loop over all matching datasets, delegating handling 

1277 # of the `components` argument to queryDatasetTypes, as we populate 

1278 # the composition dict. 

1279 composition = defaultdict(list) 

1280 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

1281 parentName, componentName = trueDatasetType.nameAndComponent() 

1282 if componentName is not None: 

1283 parentDatasetType = self.getDatasetType(parentName) 

1284 composition.setdefault(parentDatasetType, []).append(componentName) 

1285 else: 

1286 composition.setdefault(trueDatasetType, []).append(None) 

1287 elif datasetType.isComponent(): 

1288 # We were given a true DatasetType instance, but it's a component. 

1289 # The composition dict will have exactly one item. 

1290 parentName, componentName = datasetType.nameAndComponent() 

1291 parentDatasetType = self.getDatasetType(parentName) 

1292 composition = {parentDatasetType: [componentName]} 

1293 if composition is not None: 

1294 # We need to recurse. Do that once for each parent dataset type. 

1295 for parentDatasetType, componentNames in composition.items(): 

1296 for parentRef in self.queryDatasets(parentDatasetType, collections=collections, 

1297 dimensions=dimensions, dataId=standardizedDataId, 

1298 where=where, deduplicate=deduplicate): 

1299 # Loop over components, yielding one ref for each component 

1300 # requested. 

1301 for componentName in componentNames: 

1302 if componentName is None: 

1303 yield parentRef 

1304 else: 

1305 yield parentRef.makeComponentRef(componentName) 

1306 return 

1307 # If we get here, there's no need to recurse (or we are already 

1308 # recursing; there can only ever be one level of recursion). 

1309 

1310 # The full set of dimensions in the query is the combination of those 

1311 # needed for the DatasetType and those explicitly requested, if any. 

1312 requestedDimensionNames = set(datasetType.dimensions.names) 

1313 if dimensions is not None: 

1314 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1315 # Construct the summary structure needed to construct a QueryBuilder. 

1316 summary = QuerySummary( 

1317 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1318 dataId=standardizedDataId, 

1319 expression=where, 

1320 ) 

1321 builder = self.makeQueryBuilder(summary) 

1322 # Add the dataset subquery to the query, telling the QueryBuilder to 

1323 # include the rank of the selected collection in the results only if we 

1324 # need to deduplicate. Note that if any of the collections are 

1325 # actually wildcard expressions, and we've asked for deduplication, 

1326 # this will raise TypeError for us. 

1327 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate): 

1328 return 

1329 query = builder.finish() 

1330 predicate = query.predicate() 

1331 if not deduplicate: 

1332 # No need to de-duplicate across collections. 

1333 for row in self._db.query(query.sql): 

1334 if predicate(row): 

1335 dataId = query.extractDataId(row, graph=datasetType.dimensions) 

1336 if expand: 

1337 dataId = self.expandDataId( 

1338 dataId, 

1339 records=standardizedDataId.records 

1340 ) 

1341 yield query.extractDatasetRef(row, datasetType, dataId)[0] 

1342 else: 

1343 # For each data ID, yield only the DatasetRef with the lowest 

1344 # collection rank. 

1345 bestRefs = {} 

1346 bestRanks: Dict[DataCoordinate, int] = {} 

1347 for row in self._db.query(query.sql): 

1348 if predicate(row): 

1349 ref, rank = query.extractDatasetRef(row, datasetType) 

1350 bestRank = bestRanks.get(ref.dataId, sys.maxsize) 

1351 assert rank is not None 

1352 if rank < bestRank: 

1353 bestRefs[ref.dataId] = ref 

1354 bestRanks[ref.dataId] = rank 

1355 # If caller requested expanded data IDs, we defer that until here 

1356 # so we do as little expansion as possible. 

1357 if expand: 

1358 for ref in bestRefs.values(): 

1359 dataId = self.expandDataId( 

1360 ref.dataId, 

1361 records=standardizedDataId.records 

1362 ) 

1363 yield ref.expanded(dataId) 

1364 else: 

1365 yield from bestRefs.values() 

1366 

1367 storageClasses: StorageClassFactory 

1368 """All storage classes known to the registry (`StorageClassFactory`). 

1369 """