1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "ConsistentDataIds", 

26 "Registry", 

27) 

28 

29from collections import defaultdict 

30import contextlib 

31from dataclasses import dataclass 

32import sys 

33from typing import ( 

34 Any, 

35 Iterable, 

36 Iterator, 

37 List, 

38 Mapping, 

39 Optional, 

40 Set, 

41 Type, 

42 TYPE_CHECKING, 

43 Union, 

44) 

45 

46import sqlalchemy 

47 

48import lsst.sphgeom 

49from ..core import ( 

50 Config, 

51 DataCoordinate, 

52 DataId, 

53 DatasetRef, 

54 DatasetType, 

55 Dimension, 

56 DimensionElement, 

57 DimensionGraph, 

58 DimensionRecord, 

59 DimensionUniverse, 

60 ExpandedDataCoordinate, 

61 FakeDatasetRef, 

62 StorageClassFactory, 

63) 

64from ..core import ddl 

65from ..core.utils import doImport, iterable, transactional 

66from ._config import RegistryConfig 

67from .queries import ( 

68 QueryBuilder, 

69 QuerySummary, 

70) 

71from .tables import makeRegistryTableSpecs 

72from ._collectionType import CollectionType 

73from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

74from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch 

75 

76if TYPE_CHECKING: 

77 from ..butlerConfig import ButlerConfig 

78 from ..core import ( 

79 Quantum 

80 ) 

81 from .interfaces import ( 

82 CollectionManager, 

83 Database, 

84 OpaqueTableStorageManager, 

85 DimensionRecordStorageManager, 

86 DatasetRecordStorageManager, 

87 ) 

88 

89 

90@dataclass 

91class ConsistentDataIds: 

92 """A struct used to report relationships between data IDs by 

93 `Registry.relateDataIds`. 

94 

95 If an instance of this class is returned (instead of `None`), the data IDs 

96 are "not inconsistent" - any keys they have in common have the same value, 

97 and any spatial or temporal relationships they have at least might involve 

98 an overlap. To capture this, any instance of `ConsistentDataIds` coerces 

99 to `True` in boolean contexts. 

100 """ 

101 

102 overlaps: bool 

103 """If `True`, the data IDs have at least one key in common, associated with 

104 the same value. 

105 

106 Note that data IDs are not inconsistent even if overlaps is `False` - they 

107 may simply have no keys in common, which means they cannot have 

108 inconsistent values for any keys. They may even be equal, in the case that 

109 both data IDs are empty. 

110 

111 This field does _not_ indicate whether a spatial or temporal overlap 

112 relationship exists. 

113 """ 

114 

115 contains: bool 

116 """If `True`, all keys in the first data ID are in the second, and are 

117 associated with the same values. 

118 

119 This includes the case where the first data ID is empty. 

120 """ 

121 

122 within: bool 

123 """If `True`, all keys in the second data ID are in the first, and are 

124 associated with the same values. 

125 

126 This includes the case where the second data ID is empty. 

127 """ 

128 

129 @property 

130 def equal(self) -> bool: 

131 """If `True`, the two data IDs are the same. 

132 

133 Data IDs are equal if they have both a `contains` and a `within` 

134 relationship. 

135 """ 

136 return self.contains and self.within 

137 

138 @property 

139 def disjoint(self) -> bool: 

140 """If `True`, the two data IDs have no keys in common. 

141 

142 This is simply the opposite of `overlaps`. Disjoint data IDs are by 

143 definition not inconsistent. 

144 """ 

145 return not self.overlaps 

146 

147 def __bool__(self) -> bool: 

148 return True 

149 

150 

151class Registry: 

152 """Registry interface. 

153 

154 Parameters 

155 ---------- 

156 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

157 Registry configuration 

158 """ 

159 

160 defaultConfigFile = None 

161 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or 

162 absolute path. Can be None if no defaults specified. 

163 """ 

164 

165 @classmethod 

166 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False, 

167 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry: 

168 """Create `Registry` subclass instance from `config`. 

169 

170 Uses ``registry.cls`` from `config` to determine which subclass to 

171 instantiate. 

172 

173 Parameters 

174 ---------- 

175 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

176 Registry configuration 

177 create : `bool`, optional 

178 Assume empty Registry and create a new one. 

179 butlerRoot : `str`, optional 

180 Path to the repository root this `Registry` will manage. 

181 writeable : `bool`, optional 

182 If `True` (default) create a read-write connection to the database. 

183 

184 Returns 

185 ------- 

186 registry : `Registry` (subclass) 

187 A new `Registry` subclass instance. 

188 """ 

189 if not isinstance(config, RegistryConfig): 

190 if isinstance(config, str) or isinstance(config, Config): 

191 config = RegistryConfig(config) 

192 else: 

193 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

194 config.replaceRoot(butlerRoot) 

195 DatabaseClass = config.getDatabaseClass() 

196 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

197 namespace=config.get("namespace"), writeable=writeable) 

198 universe = DimensionUniverse(config) 

199 opaque = doImport(config["managers", "opaque"]) 

200 dimensions = doImport(config["managers", "dimensions"]) 

201 collections = doImport(config["managers", "collections"]) 

202 datasets = doImport(config["managers", "datasets"]) 

203 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections, 

204 datasets=datasets, create=create) 

205 
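# Usage sketch (illustrative; not part of the original module).  Assumes a
# registry configuration at the hypothetical path "registry.yaml" whose
# database and manager entries point at a writeable (e.g. SQLite) backend.
from lsst.daf.butler.registry import Registry

registry = Registry.fromConfig("registry.yaml", create=True, writeable=True)
assert registry.isWriteable()
print(registry.dimensions)  # the DimensionUniverse loaded from the config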

206 def __init__(self, database: Database, universe: DimensionUniverse, *, 

207 opaque: Type[OpaqueTableStorageManager], 

208 dimensions: Type[DimensionRecordStorageManager], 

209 collections: Type[CollectionManager], 

210 datasets: Type[DatasetRecordStorageManager], 

211 create: bool = False): 

212 self._db = database 

213 self.storageClasses = StorageClassFactory() 

214 with self._db.declareStaticTables(create=create) as context: 

215 self._dimensions = dimensions.initialize(self._db, context, universe=universe) 

216 self._collections = collections.initialize(self._db, context) 

217 self._datasets = datasets.initialize(self._db, context, 

218 collections=self._collections, 

219 universe=self.dimensions) 

220 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions, 

221 self._collections, 

222 self._datasets)) 

223 self._opaque = opaque.initialize(self._db, context) 

224 self._collections.refresh() 

225 self._datasets.refresh(universe=self._dimensions.universe) 

226 

227 def __str__(self) -> str: 

228 return str(self._db) 

229 

230 def __repr__(self) -> str: 

231 return f"Registry({self._db!r}, {self.dimensions!r})" 

232 

233 def isWriteable(self) -> bool: 

234 """Return `True` if this registry allows write operations, and `False` 

235 otherwise. 

236 """ 

237 return self._db.isWriteable() 

238 

239 @property 

240 def dimensions(self) -> DimensionUniverse: 

241 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

242 """ 

243 return self._dimensions.universe 

244 

245 @contextlib.contextmanager 

246 def transaction(self): 

247 """Return a context manager that represents a transaction. 

248 """ 

249 # TODO make savepoint=False the default. 

250 try: 

251 with self._db.transaction(): 

252 yield 

253 except BaseException: 

254 # TODO: this clears the caches sometimes when we wouldn't actually 

255 # need to. Can we avoid that? 

256 self._dimensions.clearCaches() 

257 raise 

258 

259 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec): 

260 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

261 other data repository client. 

262 

263 Opaque table records can be added via `insertOpaqueData`, retrieved via 

264 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

265 

266 Parameters 

267 ---------- 

268 tableName : `str` 

269 Logical name of the opaque table. This may differ from the 

270 actual name used in the database by a prefix and/or suffix. 

271 spec : `ddl.TableSpec` 

272 Specification for the table to be added. 

273 """ 

274 self._opaque.register(tableName, spec) 

275 

276 @transactional 

277 def insertOpaqueData(self, tableName: str, *data: dict): 

278 """Insert records into an opaque table. 

279 

280 Parameters 

281 ---------- 

282 tableName : `str` 

283 Logical name of the opaque table. Must match the name used in a 

284 previous call to `registerOpaqueTable`. 

285 data 

286 Each additional positional argument is a dictionary that represents 

287 a single row to be added. 

288 """ 

289 self._opaque[tableName].insert(*data) 

290 

291 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

292 """Retrieve records from an opaque table. 

293 

294 Parameters 

295 ---------- 

296 tableName : `str` 

297 Logical name of the opaque table. Must match the name used in a 

298 previous call to `registerOpaqueTable`. 

299 where 

300 Additional keyword arguments are interpreted as equality 

301 constraints that restrict the returned rows (combined with AND); 

302 keyword arguments are column names and values are the values they 

303 must have. 

304 

305 Yields 

306 ------ 

307 row : `dict` 

308 A dictionary representing a single result row. 

309 """ 

310 yield from self._opaque[tableName].fetch(**where) 

311 

312 @transactional 

313 def deleteOpaqueData(self, tableName: str, **where: Any): 

314 """Remove records from an opaque table. 

315 

316 Parameters 

317 ---------- 

318 tableName : `str` 

319 Logical name of the opaque table. Must match the name used in a 

320 previous call to `registerOpaqueTable`. 

321 where 

322 Additional keyword arguments are interpreted as equality 

323 constraints that restrict the deleted rows (combined with AND); 

324 keyword arguments are column names and values are the values they 

325 must have. 

326 """ 

327 self._opaque[tableName].delete(**where) 

328 
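# Usage sketch (illustrative; not part of the original module): registering an
# opaque table and round-tripping records through it with the methods above.
# ``registry`` is the writeable `Registry` from the earlier sketch; the table
# name and the `ddl.FieldSpec` arguments are hypothetical and indicative only;
# consult `ddl.TableSpec`/`ddl.FieldSpec` for the exact options.
import sqlalchemy
from lsst.daf.butler.core import ddl

spec = ddl.TableSpec(fields=[
    ddl.FieldSpec(name="dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
    ddl.FieldSpec(name="path", dtype=sqlalchemy.String, length=256),
])
registry.registerOpaqueTable("my_datastore_records", spec)
registry.insertOpaqueData("my_datastore_records", {"dataset_id": 1, "path": "a/b.fits"})
rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=1))
registry.deleteOpaqueData("my_datastore_records", dataset_id=1)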

329 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED): 

330 """Add a new collection if one with the given name does not exist. 

331 

332 Parameters 

333 ---------- 

334 name : `str` 

335 The name of the collection to create. 

336 type : `CollectionType` 

337 Enum value indicating the type of collection to create. 

338 

339 Notes 

340 ----- 

341 This method cannot be called within transactions, as it needs to be 

342 able to perform its own transaction to be concurrent. 

343 """ 

344 self._collections.register(name, type) 

345 

346 def getCollectionType(self, name: str) -> CollectionType: 

347 """Return an enumeration value indicating the type of the given 

348 collection. 

349 

350 Parameters 

351 ---------- 

352 name : `str` 

353 The name of the collection. 

354 

355 Returns 

356 ------- 

357 type : `CollectionType` 

358 Enum value indicating the type of this collection. 

359 

360 Raises 

361 ------ 

362 MissingCollectionError 

363 Raised if no collection with the given name exists. 

364 """ 

365 return self._collections.find(name).type 

366 

367 def registerRun(self, name: str): 

368 """Add a new run if one with the given name does not exist. 

369 

370 Parameters 

371 ---------- 

372 name : `str` 

373 The name of the run to create. 

374 

375 Notes 

376 ----- 

377 This method cannot be called within transactions, as it needs to be 

378 able to perform its own transaction to be concurrent. 

379 """ 

380 self._collections.register(name, CollectionType.RUN) 

381 
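# Usage sketch (illustrative; not part of the original module): registering a
# RUN and a TAGGED collection and checking their types.  The collection names
# are hypothetical and ``registry`` is the writeable `Registry` from the
# earlier sketch.
from lsst.daf.butler.registry import CollectionType

registry.registerRun("my_run")
registry.registerCollection("my_tag", CollectionType.TAGGED)
assert registry.getCollectionType("my_run") is CollectionType.RUN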

382 @transactional 

383 def removeCollection(self, name: str): 

384 """Completely remove the given collection. 

385 

386 Parameters 

387 ---------- 

388 name : `str` 

389 The name of the collection to remove. 

390 

391 Raises 

392 ------ 

393 MissingCollectionError 

394 Raised if no collection with the given name exists. 

395 

396 Notes 

397 ----- 

398 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

399 in it are also fully removed. This requires that those datasets be 

400 removed (or at least trashed) from any datastores that hold them first. 

401 

402 A collection may not be deleted as long as it is referenced by a 

403 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

404 be deleted or redefined first. 

405 """ 

406 self._collections.remove(name) 

407 

408 def getCollectionChain(self, parent: str) -> CollectionSearch: 

409 """Return the child collections in a `~CollectionType.CHAINED` 

410 collection. 

411 

412 Parameters 

413 ---------- 

414 parent : `str` 

415 Name of the chained collection. Must have already been added via 

416 a call to `Registry.registerCollection`. 

417 

418 Returns 

419 ------- 

420 children : `CollectionSearch` 

421 An object that defines the search path of the collection. 

422 See :ref:`daf_butler_collection_expressions` for more information. 

423 

424 Raises 

425 ------ 

426 MissingCollectionError 

427 Raised if ``parent`` does not exist in the `Registry`. 

428 TypeError 

429 Raised if ``parent`` does not correspond to a 

430 `~CollectionType.CHAINED` collection. 

431 """ 

432 record = self._collections.find(parent) 

433 if record.type is not CollectionType.CHAINED: 

434 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

435 return record.children 

436 

437 @transactional 

438 def setCollectionChain(self, parent: str, children: Any): 

439 """Define or redefine a `~CollectionType.CHAINED` collection. 

440 

441 Parameters 

442 ---------- 

443 parent : `str` 

444 Name of the chained collection. Must have already been added via 

445 a call to `Registry.registerCollection`. 

446 children : `Any` 

447 An expression defining an ordered search of child collections, 

448 generally an iterable of `str`. Restrictions on the dataset types 

449 to be searched can also be included, by passing mapping or an 

450 iterable containing tuples; see 

451 :ref:`daf_butler_collection_expressions` for more information. 

452 

453 Raises 

454 ------ 

455 MissingCollectionError 

456 Raised when any of the given collections do not exist in the 

457 `Registry`. 

458 TypeError 

459 Raised if ``parent`` does not correspond to a 

460 `~CollectionType.CHAINED` collection. 

461 ValueError 

462 Raised if the given collections contain a cycle. 

463 """ 

464 record = self._collections.find(parent) 

465 if record.type is not CollectionType.CHAINED: 

466 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

467 children = CollectionSearch.fromExpression(children) 

468 if children != record.children: 

469 record.update(self._collections, children) 

470 
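# Usage sketch (illustrative; not part of the original module): defining a
# CHAINED collection that searches two hypothetical child collections (both
# assumed to have been registered already) in order.
from lsst.daf.butler.registry import CollectionType

registry.registerCollection("defaults", CollectionType.CHAINED)
registry.setCollectionChain("defaults", ["my_run", "calib/stable"])
print(registry.getCollectionChain("defaults"))  # the CollectionSearch over the children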

471 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

472 """ 

473 Add a new `DatasetType` to the Registry. 

474 

475 It is not an error to register the same `DatasetType` twice. 

476 

477 Parameters 

478 ---------- 

479 datasetType : `DatasetType` 

480 The `DatasetType` to be added. 

481 

482 Returns 

483 ------- 

484 inserted : `bool` 

485 `True` if ``datasetType`` was inserted, `False` if an identical 

486 existing `DatasetType` was found. Note that in either case the 

487 DatasetType is guaranteed to be defined in the Registry 

488 consistently with the given definition. 

489 

490 Raises 

491 ------ 

492 ValueError 

493 Raised if the dimensions or storage class are invalid. 

494 ConflictingDefinitionError 

495 Raised if this DatasetType is already registered with a different 

496 definition. 

497 

498 Notes 

499 ----- 

500 This method cannot be called within transactions, as it needs to be 

501 able to perform its own transaction to be concurrent. 

502 """ 

503 _, inserted = self._datasets.register(datasetType) 

504 return inserted 

505 

506 def getDatasetType(self, name: str) -> DatasetType: 

507 """Get the `DatasetType`. 

508 

509 Parameters 

510 ---------- 

511 name : `str` 

512 Name of the type. 

513 

514 Returns 

515 ------- 

516 type : `DatasetType` 

517 The `DatasetType` associated with the given name. 

518 

519 Raises 

520 ------ 

521 KeyError 

522 Raised if the named DatasetType could not be found in the registry. 

523 """ 

524 storage = self._datasets.find(name) 

525 if storage is None: 

526 raise KeyError(f"DatasetType '{name}' could not be found.") 

527 return storage.datasetType 

528 
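# Usage sketch (illustrative; not part of the original module): registering a
# hypothetical "flat" dataset type and reading its definition back.  The
# dimension names and storage class are examples only, and the `DatasetType`
# constructor is shown in its conventional (name, dimensions, storageClass)
# form; see `DatasetType` itself for the authoritative signature.
from lsst.daf.butler import DatasetType

flatType = DatasetType("flat",
                       dimensions=["instrument", "detector", "physical_filter"],
                       storageClass="ExposureF",
                       universe=registry.dimensions)
inserted = registry.registerDatasetType(flatType)  # True only on first registration
print(registry.getDatasetType("flat"))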

529 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

530 collections: Any, **kwargs: Any) -> Optional[DatasetRef]: 

531 """Find a dataset given its `DatasetType` and data ID. 

532 

533 This can be used to obtain a `DatasetRef` that permits the dataset to 

534 be read from a `Datastore`. 

535 

536 Parameters 

537 ---------- 

538 datasetType : `DatasetType` or `str` 

539 A `DatasetType` or the name of one. 

540 dataId : `dict` or `DataCoordinate`, optional 

541 A `dict`-like object containing the `Dimension` links that identify 

542 the dataset within a collection. 

543 collections 

544 An expression that fully or partially identifies the collections 

545 to search for the dataset, such as a `str`, `re.Pattern`, or 

546 iterable thereof. `...` can be used to return all collections. 

547 See :ref:`daf_butler_collection_expressions` for more information. 

548 **kwargs 

549 Additional keyword arguments passed to 

550 `DataCoordinate.standardize` to convert ``dataId`` to a true 

551 `DataCoordinate` or augment an existing one. 

552 

553 Returns 

554 ------- 

555 ref : `DatasetRef` 

556 A reference to the dataset, or `None` if no matching Dataset 

557 was found. 

558 

559 Raises 

560 ------ 

561 LookupError 

562 Raised if one or more data ID keys are missing or the dataset type 

563 does not exist. 

564 MissingCollectionError 

565 Raised if any of ``collections`` does not exist in the registry. 

566 """ 

567 if isinstance(datasetType, DatasetType): 

568 storage = self._datasets.find(datasetType.name) 

569 if storage is None: 

570 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

571 else: 

572 storage = self._datasets.find(datasetType) 

573 if storage is None: 

574 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

575 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

576 universe=self.dimensions, **kwargs) 

577 collections = CollectionSearch.fromExpression(collections) 

578 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType): 

579 result = storage.find(collectionRecord, dataId) 

580 if result is not None: 

581 if result.datasetType.isComposite(): 

582 result = self._datasets.fetchComponents(result) 

583 return result 

584 return None 

585 
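# Usage sketch (illustrative; not part of the original module): looking up a
# single dataset by dataset type and data ID.  The data ID values and the
# collection name are hypothetical.
ref = registry.findDataset("flat",
                           instrument="HSC", detector=50, physical_filter="HSC-I",
                           collections=["my_run"])
if ref is not None:
    print(ref.id, ref.dataId)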

586 @transactional 

587 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

588 run: str, *, producer: Optional[Quantum] = None, recursive: bool = False 

589 ) -> List[DatasetRef]: 

590 """Insert one or more datasets into the `Registry` 

591 

592 This always adds new datasets; to associate existing datasets with 

593 a new collection, use ``associate``. 

594 

595 Parameters 

596 ---------- 

597 datasetType : `DatasetType` or `str` 

598 A `DatasetType` or the name of one. 

599 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

600 Dimension-based identifiers for the new datasets. 

601 run : `str` 

602 The name of the run that produced the datasets. 

603 producer : `Quantum` 

604 Unit of work that produced the datasets. May be `None` to store 

605 no provenance information, but if present the `Quantum` must 

606 already have been added to the Registry. 

607 recursive : `bool` 

608 If True, recursively add datasets and attach entries for component 

609 datasets as well. 

610 

611 Returns 

612 ------- 

613 refs : `list` of `DatasetRef` 

614 Resolved `DatasetRef` instances for all given data IDs (in the same 

615 order). 

616 

617 Raises 

618 ------ 

619 ConflictingDefinitionError 

620 If a dataset with the same dataset type and data ID as one of those 

621 given already exists in ``run``. 

622 MissingCollectionError 

623 Raised if ``run`` does not exist in the registry. 

624 """ 

625 if isinstance(datasetType, DatasetType): 

626 storage = self._datasets.find(datasetType.name) 

627 if storage is None: 

628 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

629 else: 

630 storage = self._datasets.find(datasetType) 

631 if storage is None: 

632 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

633 runRecord = self._collections.find(run) 

634 dataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds] 

635 try: 

636 refs = list(storage.insert(runRecord, dataIds, quantum=producer)) 

637 except sqlalchemy.exc.IntegrityError as err: 

638 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

639 f"one or more datasets of type {storage.datasetType} into " 

640 f"collection '{run}'. " 

641 f"This probably means a dataset with the same data ID " 

642 f"and dataset type already exists, but it may also mean a " 

643 f"dimension row is missing.") from err 

644 if recursive and storage.datasetType.isComposite(): 

645 # Insert component rows by recursing. 

646 composites = defaultdict(dict) 

647 # TODO: we really shouldn't be inserting all components defined by 

648 # the storage class, because there's no guarantee all of them are 

649 # actually present in these datasets. 

650 for componentName in storage.datasetType.storageClass.components: 

651 componentDatasetType = storage.datasetType.makeComponentDatasetType(componentName) 

652 componentRefs = self.insertDatasets(componentDatasetType, 

653 dataIds=dataIds, 

654 run=run, 

655 producer=producer, 

656 recursive=True) 

657 for parentRef, componentRef in zip(refs, componentRefs): 

658 composites[parentRef][componentName] = componentRef 

659 if composites: 

660 refs = list(self._datasets.attachComponents(composites.items())) 

661 return refs 

662 
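# Usage sketch (illustrative; not part of the original module): inserting two
# datasets of the previously-registered "flat" dataset type into a RUN
# collection.  The dimension records behind these data IDs are assumed to
# exist already (see `insertDimensionData` further down).
refs = registry.insertDatasets(
    "flat",
    dataIds=[dict(instrument="HSC", detector=50, physical_filter="HSC-I"),
             dict(instrument="HSC", detector=51, physical_filter="HSC-I")],
    run="my_run",
)
print([ref.id for ref in refs])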

663 def getDataset(self, id: int) -> Optional[DatasetRef]: 

664 """Retrieve a Dataset entry. 

665 

666 Parameters 

667 ---------- 

668 id : `int` 

669 The unique identifier for the dataset. 

670 

671 Returns 

672 ------- 

673 ref : `DatasetRef` or `None` 

674 A ref to the Dataset, or `None` if no matching Dataset 

675 was found. 

676 """ 

677 ref = self._datasets.getDatasetRef(id) 

678 if ref is None: 

679 return None 

680 if ref.datasetType.isComposite(): 

681 return self._datasets.fetchComponents(ref) 

682 return ref 

683 

684 @transactional 

685 def removeDatasets(self, refs: Iterable[DatasetRef], *, recursive: bool = True): 

686 """Remove datasets from the Registry. 

687 

688 The datasets will be removed unconditionally from all collections, and 

689 any `Quantum` that consumed this dataset will instead be marked with 

690 having a NULL input. `Datastore` records will *not* be deleted; the 

691 caller is responsible for ensuring that the dataset has already been 

692 removed from all Datastores. 

693 

694 Parameters 

695 ---------- 

696 refs : `Iterable` of `DatasetRef` 

697 References to the datasets to be removed. Must include a valid 

698 ``id`` attribute, and should be considered invalidated upon return. 

699 recursive : `bool`, optional 

700 If `True`, remove all component datasets as well. Note that 

701 this only removes components that are actually included in the 

702 given `DatasetRef` instances, which may not be the same as those in 

703 the database (especially if they were obtained from 

704 `queryDatasets`, which does not populate `DatasetRef.components`). 

705 

706 Raises 

707 ------ 

708 AmbiguousDatasetError 

709 Raised if any ``ref.id`` is `None`. 

710 OrphanedRecordError 

711 Raised if any dataset is still present in any `Datastore`. 

712 """ 

713 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items(): 

714 storage = self._datasets.find(datasetType.name) 

715 try: 

716 storage.delete(refsForType) 

717 except sqlalchemy.exc.IntegrityError as err: 

718 raise OrphanedRecordError("One or more datasets is still " 

719 "present in one or more Datastores.") from err 

720 

721 @transactional 

722 def attachComponents(self, parent: DatasetRef, components: Mapping[str, DatasetRef]): 

723 """Attach components to a dataset. 

724 

725 Parameters 

726 ---------- 

727 parent : `DatasetRef` 

728 A reference to the parent dataset. 

729 components : `Mapping` [ `str`, `DatasetRef` ] 

730 Mapping from component name to the `DatasetRef` for that component. 

731 

732 Returns 

733 ------- 

734 ref : `DatasetRef` 

735 An updated version of ``parent`` with components included. 

741 

742 Raises 

743 ------ 

744 AmbiguousDatasetError 

745 Raised if ``parent.id`` or any `DatasetRef.id` in ``components`` 

746 is `None`. 

747 """ 

748 for name, ref in components.items(): 

749 if ref.datasetType.storageClass != parent.datasetType.storageClass.components[name]: 

750 raise TypeError(f"Expected storage class " 

751 f"'{parent.datasetType.storageClass.components[name].name}' " 

752 f"for component '{name}' of dataset {parent}; got " 

753 f"dataset {ref} with storage class " 

754 f"'{ref.datasetType.storageClass.name}'.") 

755 ref, = self._datasets.attachComponents([(parent, components)]) 

756 return ref 

757 

758 @transactional 

759 def associate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True): 

760 """Add existing datasets to a `~CollectionType.TAGGED` collection. 

761 

762 If a DatasetRef with the same exact integer ID is already in a 

763 collection nothing is changed. If a `DatasetRef` with the same 

764 `DatasetType` and data ID but with different integer ID 

765 exists in the collection, `ConflictingDefinitionError` is raised. 

766 

767 Parameters 

768 ---------- 

769 collection : `str` 

770 Indicates the collection the datasets should be associated with. 

771 refs : `Iterable` [ `DatasetRef` ] 

772 An iterable of resolved `DatasetRef` instances that already exist 

773 in this `Registry`. 

774 recursive : `bool`, optional 

775 If `True`, associate all component datasets as well. Note that 

776 this only associates components that are actually included in the 

777 given `DatasetRef` instances, which may not be the same as those in 

778 the database (especially if they were obtained from 

779 `queryDatasets`, which does not populate `DatasetRef.components`). 

780 

781 Raises 

782 ------ 

783 ConflictingDefinitionError 

784 If a Dataset with the given `DatasetRef` already exists in the 

785 given collection. 

786 AmbiguousDatasetError 

787 Raised if ``any(ref.id is None for ref in refs)``. 

788 MissingCollectionError 

789 Raised if ``collection`` does not exist in the registry. 

790 TypeError 

791 Raised if adding new datasets to the given ``collection`` is not 

792 allowed. 

793 """ 

794 collectionRecord = self._collections.find(collection) 

795 if collectionRecord.type is not CollectionType.TAGGED: 

796 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

797 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items(): 

798 storage = self._datasets.find(datasetType.name) 

799 try: 

800 storage.associate(collectionRecord, refsForType) 

801 except sqlalchemy.exc.IntegrityError as err: 

802 raise ConflictingDefinitionError( 

803 f"Constraint violation while associating dataset of type {datasetType.name} with " 

804 f"collection {collection}. This probably means that one or more datasets with the same " 

805 f"dataset type and data ID already exist in the collection, but it may also indicate " 

806 f"that the datasets do not exist." 

807 ) from err 

808 

809 @transactional 

810 def disassociate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True): 

811 """Remove existing datasets from a `~CollectionType.TAGGED` collection. 

812 

813 ``collection`` and ``ref`` combinations that are not currently 

814 associated are silently ignored. 

815 

816 Parameters 

817 ---------- 

818 collection : `str` 

819 The collection the datasets should no longer be associated with. 

820 refs : `Iterable` [ `DatasetRef` ] 

821 An iterable of resolved `DatasetRef` instances that already exist 

822 in this `Registry`. 

823 recursive : `bool`, optional 

824 If `True`, disassociate all component datasets as well. Note that 

825 this only disassociates components that are actually included in 

826 the given `DatasetRef` instances, which may not be the same as 

827 those in the database (especially if they were obtained from 

828 `queryDatasets`, which does not populate `DatasetRef.components`). 

829 

830 Raises 

831 ------ 

832 AmbiguousDatasetError 

833 Raised if any of the given dataset references is unresolved. 

834 MissingCollectionError 

835 Raised if ``collection`` does not exist in the registry. 

836 TypeError 

837 Raised if removing datasets from the given ``collection`` is not 

838 allowed. 

839 """ 

840 collectionRecord = self._collections.find(collection) 

841 if collectionRecord.type is not CollectionType.TAGGED: 

842 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

843 "expected TAGGED.") 

844 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items(): 

845 storage = self._datasets.find(datasetType.name) 

846 storage.disassociate(collectionRecord, refsForType) 

847 
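# Usage sketch (illustrative; not part of the original module): tagging
# existing, resolved datasets in the hypothetical TAGGED collection "my_tag"
# and untagging them again.  `queryDatasets` (defined further down in this
# class) returns resolved refs, as `associate`/`disassociate` require.
refs = list(registry.queryDatasets("flat", collections=["my_run"]))
registry.associate("my_tag", refs)
registry.disassociate("my_tag", refs)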

848 @transactional 

849 def insertDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]): 

850 """Record that a datastore holds the given datasets. 

851 

852 Typically used by `Datastore`. 

853 

854 Parameters 

855 ---------- 

856 datastoreName : `str` 

857 Name of the datastore holding these datasets. 

858 refs : `~collections.abc.Iterable` of `DatasetRef` 

859 References to the datasets. 

860 

861 Raises 

862 ------ 

863 AmbiguousDatasetError 

864 Raised if ``any(ref.id is None for ref in refs)``. 

865 """ 

866 self._db.insert( 

867 self._tables.dataset_location, 

868 *[{"datastore_name": datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs] 

869 ) 

870 

871 @transactional 

872 def moveDatasetLocationToTrash(self, datastoreName: str, refs: Iterable[DatasetRef]): 

873 """Move the dataset location information to trash. 

874 

875 Parameters 

876 ---------- 

877 datastoreName : `str` 

878 Name of the datastore holding these datasets. 

879 refs : `~collections.abc.Iterable` of `DatasetRef` 

880 References to the datasets. 

881 """ 

882 # We only want to move rows that already exist in the main table 

883 filtered = self.checkDatasetLocations(datastoreName, refs) 

884 self.canDeleteDatasetLocations(datastoreName, filtered) 

885 self.removeDatasetLocation(datastoreName, filtered) 

886 

887 @transactional 

888 def canDeleteDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]): 

889 """Record that a datastore can delete this dataset 

890 

891 Parameters 

892 ---------- 

893 datastoreName : `str` 

894 Name of the datastore holding these datasets. 

895 refs : `~collections.abc.Iterable` of `DatasetRef` 

896 References to the datasets. 

897 

898 Raises 

899 ------ 

900 AmbiguousDatasetError 

901 Raised if ``any(ref.id is None for ref in refs)``. 

902 """ 

903 self._db.insert( 

904 self._tables.dataset_location_trash, 

905 *[{"datastore_name": datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs] 

906 ) 

907 

908 def checkDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]) -> List[DatasetRef]: 

909 """Check which refs are listed for this datastore. 

910 

911 Parameters 

912 ---------- 

913 datastoreName : `str` 

914 Name of the datastore holding these datasets. 

915 refs : `~collections.abc.Iterable` of `DatasetRef` 

916 References to the datasets. 

917 

918 Returns 

919 ------- 

920 present : `list` of `DatasetRef` 

921 All the `DatasetRef` that are listed. 

922 """ 

923 

924 table = self._tables.dataset_location 

925 result = self._db.query( 

926 sqlalchemy.sql.select( 

927 [table.columns.datastore_name, table.columns.dataset_id] 

928 ).where( 

929 sqlalchemy.sql.and_(table.columns.dataset_id.in_([ref.id for ref in refs]), 

930 table.columns.datastore_name == datastoreName) 

931 ) 

932 ).fetchall() 

933 

934 matched_ids = {r["dataset_id"] for r in result} 

935 return [ref for ref in refs if ref.id in matched_ids] 

936 

937 def getDatasetLocations(self, ref: DatasetRef) -> Set[str]: 

938 """Retrieve datastore locations for a given dataset. 

939 

940 Typically used by `Datastore`. 

941 

942 Parameters 

943 ---------- 

944 ref : `DatasetRef` 

945 A reference to the dataset for which to retrieve storage 

946 information. 

947 

948 Returns 

949 ------- 

950 datastores : `set` of `str` 

951 All the matching datastores holding this dataset. Empty set 

952 if the dataset does not exist anywhere. 

953 

954 Raises 

955 ------ 

956 AmbiguousDatasetError 

957 Raised if ``ref.id`` is `None`. 

958 """ 

959 table = self._tables.dataset_location 

960 result = self._db.query( 

961 sqlalchemy.sql.select( 

962 [table.columns.datastore_name] 

963 ).where( 

964 table.columns.dataset_id == ref.id 

965 ) 

966 ).fetchall() 

967 return {r["datastore_name"] for r in result} 

968 

969 @transactional 

970 def getTrashedDatasets(self, datastoreName: str) -> Set[FakeDatasetRef]: 

971 """Retrieve all the dataset ref IDs that are in the trash 

972 associated with the specified datastore. 

973 

974 Parameters 

975 ---------- 

976 datastoreName : `str` 

977 The relevant datastore name to use. 

978 

979 Returns 

980 ------- 

981 ids : `set` of `FakeDatasetRef` 

982 The IDs of datasets that can be safely removed from this datastore. 

983 Can be empty. 

984 """ 

985 table = self._tables.dataset_location_trash 

986 result = self._db.query( 

987 sqlalchemy.sql.select( 

988 [table.columns.dataset_id] 

989 ).where( 

990 table.columns.datastore_name == datastoreName 

991 ) 

992 ).fetchall() 

993 return {FakeDatasetRef(r["dataset_id"]) for r in result} 

994 

995 @transactional 

996 def emptyDatasetLocationsTrash(self, datastoreName: str, refs: Iterable[FakeDatasetRef]) -> None: 

997 """Remove datastore location associated with these datasets from trash. 

998 

999 Typically used by `Datastore` when a dataset is removed. 

1000 

1001 Parameters 

1002 ---------- 

1003 datastoreName : `str` 

1004 Name of this `Datastore`. 

1005 refs : iterable of `FakeDatasetRef` 

1006 The dataset IDs to be removed. 

1007 

1008 Raises 

1009 ------ 

1010 AmbiguousDatasetError 

1011 Raised if ``ref.id`` is `None`. 

1012 """ 

1013 if not refs: 

1014 return 

1015 self._db.delete( 

1016 self._tables.dataset_location_trash, 

1017 ["dataset_id", "datastore_name"], 

1018 *[{"dataset_id": ref.id, "datastore_name": datastoreName} for ref in refs] 

1019 ) 

1020 

1021 @transactional 

1022 def removeDatasetLocation(self, datastoreName: str, refs: Iterable[DatasetRef]) -> None: 

1023 """Remove datastore location associated with this dataset. 

1024 

1025 Typically used by `Datastore` when a dataset is removed. 

1026 

1027 Parameters 

1028 ---------- 

1029 datastoreName : `str` 

1030 Name of this `Datastore`. 

1031 refs : iterable of `DatasetRef` 

1032 References to the datasets for which information is to be removed. 

1033 

1034 Raises 

1035 ------ 

1036 AmbiguousDatasetError 

1037 Raised if ``ref.id`` is `None`. 

1038 """ 

1039 if not refs: 

1040 return 

1041 self._db.delete( 

1042 self._tables.dataset_location, 

1043 ["dataset_id", "datastore_name"], 

1044 *[{"dataset_id": ref.getCheckedId(), "datastore_name": datastoreName} for ref in refs] 

1045 ) 

1046 

1047 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

1048 records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds): 

1049 """Expand a dimension-based data ID to include additional information. 

1050 

1051 Parameters 

1052 ---------- 

1053 dataId : `DataCoordinate` or `dict`, optional 

1054 Data ID to be expanded; augmented and overridden by ``kwds``. 

1055 graph : `DimensionGraph`, optional 

1056 Set of dimensions for the expanded ID. If `None`, the dimensions 

1057 will be inferred from the keys of ``dataId`` and ``kwds``. 

1058 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph`` 

1059 are silently ignored, providing a way to extract and expand a 

1060 subset of a data ID. 

1061 records : mapping [`DimensionElement`, `DimensionRecord`], optional 

1062 Dimension record data to use before querying the database for that 

1063 data. 

1064 **kwds 

1065 Additional keywords are treated like additional key-value pairs for 

1066 ``dataId``, extending and overriding it. 

1067 

1068 Returns 

1069 ------- 

1070 expanded : `ExpandedDataCoordinate` 

1071 A data ID that includes full metadata for all of the dimensions it 

1072 identifies. 

1073 """ 

1074 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds) 

1075 if isinstance(standardized, ExpandedDataCoordinate): 

1076 return standardized 

1077 elif isinstance(dataId, ExpandedDataCoordinate): 

1078 records = dict(records) if records is not None else {} 

1079 records.update(dataId.records) 

1080 else: 

1081 records = dict(records) if records is not None else {} 

1082 keys = dict(standardized) 

1083 regions = [] 

1084 timespans = [] 

1085 for element in standardized.graph.primaryKeyTraversalOrder: 

1086 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

1087 if record is ...: 

1088 storage = self._dimensions[element] 

1089 record = storage.fetch(keys) 

1090 records[element] = record 

1091 if record is not None: 

1092 for d in element.implied: 

1093 value = getattr(record, d.name) 

1094 if keys.setdefault(d, value) != value: 

1095 raise InconsistentDataIdError(f"Data ID {standardized} has {d.name}={keys[d]!r}, " 

1096 f"but {element.name} implies {d.name}={value!r}.") 

1097 if element in standardized.graph.spatial and record.region is not None: 

1098 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions): 

1099 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} " 

1100 f"is disjoint with those for other elements.") 

1101 regions.append(record.region) 

1102 if element in standardized.graph.temporal: 

1103 if any(not record.timespan.overlaps(t) for t in timespans): 

1104 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}" 

1105 f" is disjoint with those for other elements.") 

1106 timespans.append(record.timespan) 

1107 else: 

1108 if element in standardized.graph.required: 

1109 raise LookupError( 

1110 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

1111 ) 

1112 if element.alwaysJoin: 

1113 raise InconsistentDataIdError( 

1114 f"Could not fetch record for element {element.name} via keys {keys}, ", 

1115 f"but it is marked alwaysJoin=True; this means one or more dimensions are not " 

1116 f"related." 

1117 ) 

1118 records.update((d, None) for d in element.implied) 

1119 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records) 

1120 
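# Usage sketch (illustrative; not part of the original module): expanding a
# minimal exposure-level data ID into an `ExpandedDataCoordinate`.  The values
# are hypothetical and the corresponding dimension records are assumed to have
# been inserted already.
dataId = registry.expandDataId(instrument="HSC", exposure=903342, detector=50)
print(dataId.full)  # all key-value pairs, including implied dimensions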

1121 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]: 

1122 """Compare the keys and values of a pair of data IDs for consistency. 

1123 

1124 See `ConsistentDataIds` for more information. 

1125 

1126 Parameters 

1127 ---------- 

1128 a : `dict` or `DataCoordinate` 

1129 First data ID to be compared. 

1130 b : `dict` or `DataCoordinate` 

1131 Second data ID to be compared. 

1132 

1133 Returns 

1134 ------- 

1135 relationship : `ConsistentDataIds` or `None` 

1136 Relationship information. This is not `None` and coerces to 

1137 `True` in boolean contexts if and only if the data IDs are 

1138 consistent in terms of all common key-value pairs, all many-to-many 

1139 join tables, and all spatial and temporal relationships. 

1140 """ 

1141 a = DataCoordinate.standardize(a, universe=self.dimensions) 

1142 b = DataCoordinate.standardize(b, universe=self.dimensions) 

1143 aFull = getattr(a, "full", None) 

1144 bFull = getattr(b, "full", None) 

1145 aBest = aFull if aFull is not None else a 

1146 bBest = bFull if bFull is not None else b 

1147 jointKeys = aBest.keys() & bBest.keys() 

1148 # If any common values are not equal, we know they are inconsistent. 

1149 if any(aBest[k] != bBest[k] for k in jointKeys): 

1150 return None 

1151 # If the graphs are equal, we know the data IDs are. 

1152 if a.graph == b.graph: 

1153 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys)) 

1154 # Result is still inconclusive. Try to expand a data ID containing 

1155 # keys from both; that will fail if they are inconsistent. 

1156 # First, if either input was already an ExpandedDataCoordinate, extract 

1157 # its records so we don't have to query for them. 

1158 records = {} 

1159 if hasattr(a, "records"): 

1160 records.update(a.records) 

1161 if hasattr(b, "records"): 

1162 records.update(b.records) 

1163 try: 

1164 self.expandDataId({**a, **b}, graph=(a.graph | b.graph), records=records) 

1165 except InconsistentDataIdError: 

1166 return None 

1167 # We know the answer is not `None`; time to figure out what it is. 

1168 return ConsistentDataIds( 

1169 contains=(a.graph >= b.graph), 

1170 within=(a.graph <= b.graph), 

1171 overlaps=bool(a.graph & b.graph), 

1172 ) 

1173 
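# Usage sketch (illustrative; not part of the original module): comparing two
# hypothetical data IDs.  A non-`None` result coerces to `True`, so it can be
# used directly in a condition.
rel = registry.relateDataIds({"instrument": "HSC", "exposure": 903342},
                             {"instrument": "HSC"})
if rel:  # not None, so the data IDs are not inconsistent
    print(rel.contains, rel.within, rel.overlaps, rel.equal)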

1174 def insertDimensionData(self, element: Union[DimensionElement, str], 

1175 *data: Union[dict, DimensionRecord], 

1176 conform: bool = True): 

1177 """Insert one or more dimension records into the database. 

1178 

1179 Parameters 

1180 ---------- 

1181 element : `DimensionElement` or `str` 

1182 The `DimensionElement` or name thereof that identifies the table 

1183 records will be inserted into. 

1184 data : `dict` or `DimensionRecord` (variadic) 

1185 One or more records to insert. 

1186 conform : `bool`, optional 

1187 If `False` (`True` is default) perform no checking or conversions, 

1188 and assume that ``element`` is a `DimensionElement` instance and 

1189 ``data`` is one or more `DimensionRecord` instances of the 

1190 appropriate subclass. 

1191 """ 

1192 if conform: 

1193 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement. 

1194 records = [element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row 

1195 for row in data] 

1196 else: 

1197 records = data 

1198 storage = self._dimensions[element] 

1199 storage.insert(*records) 

1200 

1201 def syncDimensionData(self, element: Union[DimensionElement, str], 

1202 row: Union[dict, DimensionRecord], 

1203 conform: bool = True) -> bool: 

1204 """Synchronize the given dimension record with the database, inserting 

1205 if it does not already exist and comparing values if it does. 

1206 

1207 Parameters 

1208 ---------- 

1209 element : `DimensionElement` or `str` 

1210 The `DimensionElement` or name thereof that identifies the table 

1211 records will be inserted into. 

1212 row : `dict` or `DimensionRecord` 

1213 The record to insert. 

1214 conform : `bool`, optional 

1215 If `False` (`True` is default) perform no checking or conversions, 

1216 and assume that ``element`` is a `DimensionElement` instance and 

1217 ``row`` is a `DimensionRecord` instance of the appropriate 

1218 subclass. 

1219 

1220 Returns 

1221 ------- 

1222 inserted : `bool` 

1223 `True` if a new row was inserted, `False` otherwise. 

1224 

1225 Raises 

1226 ------ 

1227 ConflictingDefinitionError 

1228 Raised if the record exists in the database (according to primary 

1229 key lookup) but is inconsistent with the given one. 

1230 

1231 Notes 

1232 ----- 

1233 This method cannot be called within transactions, as it needs to be 

1234 able to perform its own transaction to be concurrent. 

1235 """ 

1236 if conform: 

1237 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement. 

1238 record = element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row 

1239 else: 

1240 record = row 

1241 storage = self._dimensions[element] 

1242 return storage.sync(record) 

1243 
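# Usage sketch (illustrative; not part of the original module): inserting and
# then synchronizing an "instrument" dimension record.  The field names shown
# here are assumptions about the configured dimension universe; the accepted
# keys are defined by the dimension configuration, not by this module.
row = {"name": "HSC", "visit_max": 21474800, "exposure_max": 21474800,
       "detector_max": 200}
registry.insertDimensionData("instrument", row)
inserted = registry.syncDimensionData("instrument", row)
assert not inserted  # already present and consistent, so nothing was inserted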

1244 def queryDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]: 

1245 """Iterate over the dataset types whose names match an expression. 

1246 

1247 Parameters 

1248 ---------- 

1249 expression : `Any`, optional 

1250 An expression that fully or partially identifies the dataset types 

1251 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1252 `...` can be used to return all dataset types, and is the default. 

1253 See :ref:`daf_butler_dataset_type_expressions` for more 

1254 information. 

1255 

1256 Yields 

1257 ------ 

1258 datasetType : `DatasetType` 

1259 A `DatasetType` instance whose name matches ``expression``. 

1260 """ 

1261 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

1262 if wildcard is ...: 

1263 yield from self._datasets 

1264 return 

1265 done = set() 

1266 for name in wildcard.strings: 

1267 storage = self._datasets.find(name) 

1268 if storage is not None: 

1269 done.add(storage.datasetType) 

1270 yield storage.datasetType 

1271 if wildcard.patterns: 

1272 for datasetType in self._datasets: 

1273 if datasetType.name in done: 

1274 continue 

1275 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1276 yield datasetType 

1277 

1278 def queryCollections(self, expression: Any = ..., 

1279 datasetType: Optional[DatasetType] = None, 

1280 collectionType: Optional[CollectionType] = None, 

1281 flattenChains: bool = False, 

1282 includeChains: Optional[bool] = None) -> Iterator[str]: 

1283 """Iterate over the collections whose names match an expression. 

1284 

1285 Parameters 

1286 ---------- 

1287 expression : `Any`, optional 

1288 An expression that fully or partially identifies the collections 

1289 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1290 `...` can be used to return all collections, and is the default. 

1291 See :ref:`daf_butler_collection_expressions` for more 

1292 information. 

1293 datasetType : `DatasetType`, optional 

1294 If provided, only yield collections that should be searched for 

1295 this dataset type according to ``expression``. If this is 

1296 not provided, any dataset type restrictions in ``expression`` are 

1297 ignored. 

1298 collectionType : `CollectionType`, optional 

1299 If provided, only yield collections of this type. 

1300 flattenChains : `bool`, optional 

1301 If `True` (`False` is default), recursively yield the child 

1302 collections of matching `~CollectionType.CHAINED` collections. 

1303 includeChains : `bool`, optional 

1304 If `True`, yield records for matching `~CollectionType.CHAINED` 

1305 collections. Default is the opposite of ``flattenChains``: include 

1306 either CHAINED collections or their children, but not both. 

1307 

1308 Yields 

1309 ------ 

1310 collection : `str` 

1311 The name of a collection that matches ``expression``. 

1312 """ 

1313 query = CollectionQuery.fromExpression(expression) 

1314 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType, 

1315 flattenChains=flattenChains, includeChains=includeChains): 

1316 yield record.name 

1317 
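# Usage sketch (illustrative; not part of the original module): wildcard
# queries over dataset types and collections.  The regular expression is
# hypothetical, and `...` (all collections) follows the expression forms
# described above.
import re

from lsst.daf.butler.registry import CollectionType

calibTypes = list(registry.queryDatasetTypes(re.compile("flat|bias|dark")))
runNames = list(registry.queryCollections(..., collectionType=CollectionType.RUN))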

1318 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder: 

1319 """Return a `QueryBuilder` instance capable of constructing and 

1320 managing more complex queries than those obtainable via `Registry` 

1321 interfaces. 

1322 

1323 This is an advanced interface; downstream code should prefer 

1324 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those 

1325 are sufficient. 

1326 

1327 Parameters 

1328 ---------- 

1329 summary : `QuerySummary` 

1330 Object describing and categorizing the full set of dimensions that 

1331 will be included in the query. 

1332 

1333 Returns 

1334 ------- 

1335 builder : `QueryBuilder` 

1336 Object that can be used to construct and perform advanced queries. 

1337 """ 

1338 return QueryBuilder(summary=summary, 

1339 collections=self._collections, 

1340 dimensions=self._dimensions, 

1341 datasets=self._datasets) 

1342 

1343 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1344 dataId: Optional[DataId] = None, 

1345 datasets: Any = None, 

1346 collections: Any = None, 

1347 where: Optional[str] = None, 

1348 expand: bool = True, 

1349 **kwds) -> Iterator[DataCoordinate]: 

1350 """Query for and iterate over data IDs matching user-provided criteria. 

1351 

1352 Parameters 

1353 ---------- 

1354 dimensions : `Dimension` or `str`, or iterable thereof 

1355 The dimensions of the data IDs to yield, as either `Dimension` 

1356 instances or `str`. Will be automatically expanded to a complete 

1357 `DimensionGraph`. 

1358 dataId : `dict` or `DataCoordinate`, optional 

1359 A data ID whose key-value pairs are used as equality constraints 

1360 in the query. 

1361 datasets : `Any`, optional 

1362 An expression that fully or partially identifies dataset types 

1363 that should constrain the yielded data IDs. For example, including 

1364 "raw" here would constrain the yielded ``instrument``, 

1365 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1366 those for which at least one "raw" dataset exists in 

1367 ``collections``. Allowed types include `DatasetType`, `str`, 

1368 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1369 expressions, `...` is not permitted - it doesn't make sense to 

1370 constrain data IDs on the existence of *all* datasets. 

1371 See :ref:`daf_butler_dataset_type_expressions` for more 

1372 information. 

1373 collections : `Any`, optional 

1374 An expression that fully or partially identifies the collections 

1375 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1376 thereof. `...` can be used to return all collections. Must be 

1377 provided if ``datasets`` is, and is ignored if it is not. See 

1378 :ref:`daf_butler_collection_expressions` for more information. 

1379 where : `str`, optional 

1380 A string expression similar to a SQL WHERE clause. May involve 

1381 any column of a dimension table or (as a shortcut for the primary 

1382 key column of a dimension table) dimension name. See 

1383 :ref:`daf_butler_dimension_expressions` for more information. 

1384 expand : `bool`, optional 

1385 If `True` (default) yield `ExpandedDataCoordinate` instead of 

1386 minimal `DataCoordinate` base-class instances. 

1387 kwds 

1388 Additional keyword arguments are forwarded to 

1389 `DataCoordinate.standardize` when processing the ``dataId`` 

1390 argument (and may be used to provide a constraining data ID even 

1391 when the ``dataId`` argument is `None`). 

1392 

1393 Yields 

1394 ------ 

1395 dataId : `DataCoordinate` 

1396 Data IDs matching the given query parameters. Order is 

1397 unspecified. 

1398 """ 

1399 dimensions = iterable(dimensions) 

1400 standardizedDataId = self.expandDataId(dataId, **kwds) 

1401 standardizedDatasetTypes = [] 

1402 requestedDimensionNames = set(self.dimensions.extract(dimensions).names) 

1403 if datasets is not None: 

1404 if collections is None: 

1405 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1406 for datasetType in self.queryDatasetTypes(datasets): 

1407 requestedDimensionNames.update(datasetType.dimensions.names) 

1408 standardizedDatasetTypes.append(datasetType) 

1409 # Preprocess collections expression in case the original included 

1410 # single-pass iterators (we'll want to use it multiple times 

1411 # below). 

1412 collections = CollectionQuery.fromExpression(collections) 

1413 

1414 summary = QuerySummary( 

1415 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1416 dataId=standardizedDataId, 

1417 expression=where, 

1418 ) 

1419 builder = self.makeQueryBuilder(summary) 

1420 for datasetType in standardizedDatasetTypes: 

1421 builder.joinDataset(datasetType, collections, isResult=False) 

1422 query = builder.finish() 

1423 predicate = query.predicate() 

1424 for row in self._db.query(query.sql): 

1425 if predicate(row): 

1426 result = query.extractDataId(row) 

1427 if expand: 

1428 yield self.expandDataId(result, records=standardizedDataId.records) 

1429 else: 

1430 yield result 

1431 
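# Usage sketch (illustrative; not part of the original module): iterating over
# exposure+detector data IDs constrained by the existence of "raw" datasets in
# any collection (the dataset type name and the WHERE expression are
# hypothetical).
for dataId in registry.queryDimensions(["exposure", "detector"],
                                       datasets="raw",
                                       collections=...,
                                       where="instrument='HSC' AND detector=50"):
    print(dataId)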

1432 def queryDatasets(self, datasetType: Any, *, 

1433 collections: Any, 

1434 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1435 dataId: Optional[DataId] = None, 

1436 where: Optional[str] = None, 

1437 deduplicate: bool = False, 

1438 expand: bool = True, 

1439 **kwds) -> Iterator[DatasetRef]: 

1440 """Query for and iterate over dataset references matching user-provided 

1441 criteria. 

1442 

1443 Parameters 

1444 ---------- 

1445 datasetType 

1446 An expression that fully or partially identifies the dataset types 

1447 to be queried. Allowed types include `DatasetType`, `str`, 

1448 `re.Pattern`, and iterables thereof. The special value `...` can 

1449 be used to query all dataset types. See 

1450 :ref:`daf_butler_dataset_type_expressions` for more information. 

1451 collections 

1452 An expression that fully or partially identifies the collections 

1453 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1454 thereof. `...` can be used to return all collections. See 

1455 :ref:`daf_butler_collection_expressions` for more information. 

1456 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1457 Dimensions to include in the query (in addition to those used 

1458 to identify the queried dataset type(s)), either to constrain 

1459 the resulting datasets to those for which a matching dimension 

1460 exists, or to relate the dataset type's dimensions to dimensions 

1461 referenced by the ``dataId`` or ``where`` arguments. 

1462 dataId : `dict` or `DataCoordinate`, optional 

1463 A data ID whose key-value pairs are used as equality constraints 

1464 in the query. 

1465 where : `str`, optional 

1466 A string expression similar to a SQL WHERE clause. May involve 

1467 any column of a dimension table or (as a shortcut for the primary 

1468 key column of a dimension table) dimension name. See 

1469 :ref:`daf_butler_dimension_expressions` for more information. 

1470 deduplicate : `bool`, optional 

1471 If `True` (`False` is default), for each result data ID, only 

1472 yield one `DatasetRef` of each `DatasetType`, from the first 

1473 collection in which a dataset of that dataset type appears 

1474 (according to the order of ``collections`` passed in). If `True`, 

1475 ``collections`` must not contain regular expressions and may not 

1476 be `...`. 

1477 expand : `bool`, optional 

1478 If `True` (default) attach `ExpandedDataCoordinate` instead of 

1479 minimal `DataCoordinate` base-class instances. 

1480 kwds 

1481 Additional keyword arguments are forwarded to 

1482 `DataCoordinate.standardize` when processing the ``dataId`` 

1483 argument (and may be used to provide a constraining data ID even 

1484 when the ``dataId`` argument is `None`). 

1485 

1486 Yields 

1487 ------ 

1488 ref : `DatasetRef` 

1489 Dataset references matching the given query criteria. These 

1490 are grouped by `DatasetType` if the query evaluates to multiple 

1491 dataset types, but order is otherwise unspecified. 

1492 

1493 Raises 

1494 ------ 

1495 TypeError 

1496 Raised when the arguments are incompatible, such as when a 

1497 collection wildcard is passed when ``deduplicate`` is `True`. 

1498 

1499 Notes 

1500 ----- 

1501 When multiple dataset types are queried in a single call, the 

1502 results of this operation are equivalent to querying for each dataset 

1503 type separately in turn, and no information about the relationships 

1504 between datasets of different types is included. In contexts where 

1505 that kind of information is important, the recommended pattern is to 

1506 use `queryDimensions` to first obtain data IDs (possibly with the 

1507 desired dataset types and collections passed as constraints to the 

1508 query), and then use multiple (generally much simpler) calls to 

1509 `queryDatasets` with the returned data IDs passed as constraints. 

1510 """ 

1511 # Standardize the collections expression. 

1512 if deduplicate: 

1513 collections = CollectionSearch.fromExpression(collections) 

1514 else: 

1515 collections = CollectionQuery.fromExpression(collections) 

1516 # Standardize and expand the data ID provided as a constraint. 

1517 standardizedDataId = self.expandDataId(dataId, **kwds) 

1518 # If the datasetType passed isn't actually a DatasetType, expand it 

1519 # (it could be an expression that yields multiple DatasetTypes) and 

1520 # recurse. 

1521 if not isinstance(datasetType, DatasetType): 

1522 for trueDatasetType in self.queryDatasetTypes(datasetType): 

1523 yield from self.queryDatasets(trueDatasetType, collections=collections, 

1524 dimensions=dimensions, dataId=standardizedDataId, 

1525 where=where, deduplicate=deduplicate, expand=expand) 

1526 return 

1527 # The full set of dimensions in the query is the combination of those 

1528 # needed for the DatasetType and those explicitly requested, if any. 

1529 requestedDimensionNames = set(datasetType.dimensions.names) 

1530 if dimensions is not None: 

1531 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1532 # Construct the summary structure needed to construct a QueryBuilder. 

1533 summary = QuerySummary( 

1534 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1535 dataId=standardizedDataId, 

1536 expression=where, 

1537 ) 

1538 builder = self.makeQueryBuilder(summary) 

1539 # Add the dataset subquery to the query, telling the QueryBuilder to 

1540 # include the rank of the selected collection in the results only if we 

1541 # need to deduplicate. Note that if any of the collections are 

1542 # actually wildcard expressions, and we've asked for deduplication, 

1543 # this will raise TypeError for us. 

1544 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate): 

1545 return 

1546 query = builder.finish() 

1547 predicate = query.predicate() 

1548 if not deduplicate: 

1549 # No need to de-duplicate across collections. 

1550 for row in self._db.query(query.sql): 

1551 if predicate(row): 

1552 dataId = query.extractDataId(row, graph=datasetType.dimensions) 

1553 if expand: 

1554 dataId = self.expandDataId(dataId, records=standardizedDataId.records) 

1555 yield query.extractDatasetRef(row, datasetType, dataId)[0] 

1556 else: 

1557 # For each data ID, yield only the DatasetRef with the lowest 

1558 # collection rank. 

1559 bestRefs = {} 

1560 bestRanks = {} 

1561 for row in self._db.query(query.sql): 

1562 if predicate(row): 

1563 ref, rank = query.extractDatasetRef(row, datasetType) 

1564 bestRank = bestRanks.get(ref.dataId, sys.maxsize) 

1565 if rank < bestRank: 

1566 bestRefs[ref.dataId] = ref 

1567 bestRanks[ref.dataId] = rank 

1568 # If caller requested expanded data IDs, we defer that until here 

1569 # so we do as little expansion as possible. 

1570 if expand: 

1571 for ref in bestRefs.values(): 

1572 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records) 

1573 yield ref.expanded(dataId) 

1574 else: 

1575 yield from bestRefs.values() 

1576 
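# Usage sketch (illustrative; not part of the original module): finding every
# "flat" dataset for one detector, searching two hypothetical collections in
# order and keeping only the first match per data ID.
refs = registry.queryDatasets("flat",
                              collections=["my_run", "calib/stable"],
                              where="instrument='HSC' AND detector=50",
                              deduplicate=True)
for ref in refs:
    print(ref.datasetType.name, ref.dataId)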

1577 dimensions: DimensionUniverse 

1578 """The universe of all dimensions known to the registry 

1579 (`DimensionUniverse`). 

1580 """ 

1581 

1582 storageClasses: StorageClassFactory 

1583 """All storage classes known to the registry (`StorageClassFactory`). 

1584 """