1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "ConsistentDataIds", 

26 "Registry", 

27) 

28 

29from collections import defaultdict 

30import contextlib 

31from dataclasses import dataclass 

32import sys 

33from typing import ( 

34 Any, 

35 Iterable, 

36 Iterator, 

37 List, 

38 Mapping, 

39 Optional, 

40 Set, 

41 Type, 

42 TYPE_CHECKING, 

43 Union, 

44) 

45 

46import sqlalchemy 

47 

48import lsst.sphgeom 

49from ..core import ( 

50 Config, 

51 DataCoordinate, 

52 DataId, 

53 DatasetRef, 

54 DatasetType, 

55 Dimension, 

56 DimensionElement, 

57 DimensionGraph, 

58 DimensionRecord, 

59 DimensionUniverse, 

60 ExpandedDataCoordinate, 

61 FakeDatasetRef, 

62 StorageClassFactory, 

63) 

64from ..core import ddl 

65from ..core.utils import doImport, iterable, transactional 

66from ._config import RegistryConfig 

67from .queries import ( 

68 QueryBuilder, 

69 QuerySummary, 

70) 

71from .tables import makeRegistryTableSpecs 

72from ._collectionType import CollectionType 

73from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

74from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch 

75 

76if TYPE_CHECKING: 

77 from ..butlerConfig import ButlerConfig 

78 from ..core import ( 

79 Quantum 

80 ) 

81 from .interfaces import ( 

82 CollectionManager, 

83 Database, 

84 OpaqueTableStorageManager, 

85 DimensionRecordStorageManager, 

86 DatasetRecordStorageManager, 

87 ) 

88 

89 

90@dataclass 

91class ConsistentDataIds: 

92 """A struct used to report relationships between data IDs by 

93 `Registry.relateDataIds`. 

94 

95 If an instance of this class is returned (instead of `None`), the data IDs 

96 are "not inconsistent" - any keys they have in common have the same value, 

97 and any spatial or temporal relationships they have at least might involve 

98 an overlap. To capture this, any instance of `ConsistentDataIds` coerces 

99 to `True` in boolean contexts. 

100 """ 

101 

102 overlaps: bool 

103 """If `True`, the data IDs have at least one key in common, associated with 

104 the same value. 

105 

106 Note that data IDs are not inconsistent even if overlaps is `False` - they 

107 may simply have no keys in common, which means they cannot have 

108 inconsistent values for any keys. They may even be equal, in the case that 

109 both data IDs are empty. 

110 

111 This field does _not_ indicate whether a spatial or temporal overlap 

112 relationship exists. 

113 """ 

114 

115 contains: bool 

116 """If `True`, all keys in the second data ID are in the first, and are 

117 associated with the same values. 

118 

119 This includes the case where the second data ID is empty. 

120 """ 

121 

122 within: bool 

123 """If `True`, all keys in the first data ID are in the second, and are 

124 associated with the same values. 

125 

126 This includes the case where the first data ID is empty. 

127 """ 

128 

129 @property 

130 def equal(self) -> bool: 

131 """If `True`, the two data IDs are the same. 

132 

133 Data IDs are equal if they have both a `contains` and a `within` 

134 relationship. 

135 """ 

136 return self.contains and self.within 

137 

138 @property 

139 def disjoint(self) -> bool: 

140 """If `True`, the two data IDs have no keys in common. 

141 

142 This is simply the opposite of `overlaps`. Disjoint data IDs are by 

143 definition not inconsistent. 

144 """ 

145 return not self.overlaps 

146 

147 def __bool__(self) -> bool: 

148 return True 

149 

150 
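# Example (illustrative, not part of the original source): this struct is
# normally produced by `Registry.relateDataIds`, but its truthiness semantics
# can be seen by constructing one directly.
#
#     ids = ConsistentDataIds(overlaps=True, contains=True, within=False)
#     bool(ids)      # True: any ConsistentDataIds instance is truthy.
#     ids.equal      # False: requires both contains and within.
#     ids.disjoint   # False: the data IDs share at least one key.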

151class Registry: 

152 """Registry interface. 

153 

154 Parameters 

155 ---------- 

156 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

157 Registry configuration. 

158 """ 

159 

160 defaultConfigFile = None 

161 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or 

162 absolute path. Can be None if no defaults specified. 

163 """ 

164 

165 @classmethod 

166 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False, 

167 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry: 

168 """Create `Registry` subclass instance from `config`. 

169 

170 Uses ``registry.cls`` from `config` to determine which subclass to 

171 instantiate. 

172 

173 Parameters 

174 ---------- 

175 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

176 Registry configuration. 

177 create : `bool`, optional 

178 If `True`, assume an empty Registry and create a new one. 

179 butlerRoot : `str`, optional 

180 Path to the repository root this `Registry` will manage. 

181 writeable : `bool`, optional 

182 If `True` (default) create a read-write connection to the database. 

183 

184 Returns 

185 ------- 

186 registry : `Registry` (subclass) 

187 A new `Registry` subclass instance. 

188 """ 

189 if not isinstance(config, RegistryConfig): 

190 if isinstance(config, str) or isinstance(config, Config): 

191 config = RegistryConfig(config) 

192 else: 

193 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

194 config.replaceRoot(butlerRoot) 

195 DatabaseClass = config.getDatabaseClass() 

196 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

197 namespace=config.get("namespace"), writeable=writeable) 

198 universe = DimensionUniverse(config) 

199 opaque = doImport(config["managers", "opaque"]) 

200 dimensions = doImport(config["managers", "dimensions"]) 

201 collections = doImport(config["managers", "collections"]) 

202 datasets = doImport(config["managers", "datasets"]) 

203 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections, 

204 datasets=datasets, create=create) 

205 
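# Example (hedged sketch): constructing a Registry from an existing repository
# configuration. The path "repo/butler.yaml" is hypothetical; any
# ButlerConfig/RegistryConfig/Config object or configuration path accepted by
# RegistryConfig should work the same way.
#
#     registry = Registry.fromConfig("repo/butler.yaml", writeable=False)
#     print(registry.dimensions)   # the DimensionUniverse for this repository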

206 def __init__(self, database: Database, universe: DimensionUniverse, *, 

207 opaque: Type[OpaqueTableStorageManager], 

208 dimensions: Type[DimensionRecordStorageManager], 

209 collections: Type[CollectionManager], 

210 datasets: Type[DatasetRecordStorageManager], 

211 create: bool = False): 

212 self._db = database 

213 self.storageClasses = StorageClassFactory() 

214 with self._db.declareStaticTables(create=create) as context: 

215 self._dimensions = dimensions.initialize(self._db, context, universe=universe) 

216 self._collections = collections.initialize(self._db, context) 

217 self._datasets = datasets.initialize(self._db, context, 

218 collections=self._collections, 

219 universe=self.dimensions) 

220 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions, 

221 self._collections, 

222 self._datasets)) 

223 self._opaque = opaque.initialize(self._db, context) 

224 self._collections.refresh() 

225 self._datasets.refresh(universe=self._dimensions.universe) 

226 

227 def __str__(self) -> str: 

228 return str(self._db) 

229 

230 def __repr__(self) -> str: 

231 return f"Registry({self._db!r}, {self.dimensions!r})" 

232 

233 def isWriteable(self) -> bool: 

234 """Return `True` if this registry allows write operations, and `False` 

235 otherwise. 

236 """ 

237 return self._db.isWriteable() 

238 

239 @property 

240 def dimensions(self) -> DimensionUniverse: 

241 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

242 """ 

243 return self._dimensions.universe 

244 

245 @contextlib.contextmanager 

246 def transaction(self): 

247 """Return a context manager that represents a transaction. 

248 """ 

249 # TODO make savepoint=False the default. 

250 try: 

251 with self._db.transaction(): 

252 yield 

253 except BaseException: 

254 # TODO: this clears the caches sometimes when we wouldn't actually 

255 # need to. Can we avoid that? 

256 self._dimensions.clearCaches() 

257 raise 

258 
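# Example (illustrative sketch): grouping related inserts so they succeed or
# fail together. The record keys shown are hypothetical and depend on the
# configured dimension universe.
#
#     with registry.transaction():
#         registry.insertDimensionData("instrument", {"name": "DummyCam"})
#         registry.insertDimensionData(
#             "physical_filter",
#             {"instrument": "DummyCam", "name": "d-r", "abstract_filter": "r"})
#     # If the second insert raises, the first is rolled back as well.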

259 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec): 

260 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

261 other data repository client. 

262 

263 Opaque table records can be added via `insertOpaqueData`, retrieved via 

264 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

265 

266 Parameters 

267 ---------- 

268 tableName : `str` 

269 Logical name of the opaque table. This may differ from the 

270 actual name used in the database by a prefix and/or suffix. 

271 spec : `ddl.TableSpec` 

272 Specification for the table to be added. 

273 """ 

274 self._opaque.register(tableName, spec) 

275 

276 @transactional 

277 def insertOpaqueData(self, tableName: str, *data: dict): 

278 """Insert records into an opaque table. 

279 

280 Parameters 

281 ---------- 

282 tableName : `str` 

283 Logical name of the opaque table. Must match the name used in a 

284 previous call to `registerOpaqueTable`. 

285 data 

286 Each additional positional argument is a dictionary that represents 

287 a single row to be added. 

288 """ 

289 self._opaque[tableName].insert(*data) 

290 

291 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

292 """Retrieve records from an opaque table. 

293 

294 Parameters 

295 ---------- 

296 tableName : `str` 

297 Logical name of the opaque table. Must match the name used in a 

298 previous call to `registerOpaqueTable`. 

299 where 

300 Additional keyword arguments are interpreted as equality 

301 constraints that restrict the returned rows (combined with AND); 

302 keyword arguments are column names and values are the values they 

303 must have. 

304 

305 Yields 

306 ------ 

307 row : `dict` 

308 A dictionary representing a single result row. 

309 """ 

310 yield from self._opaque[tableName].fetch(**where) 

311 
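# Example (hedged sketch): round-tripping rows through an opaque table. The
# table name "datastore_records" and its columns are hypothetical and must
# first have been declared via registerOpaqueTable with a matching
# ddl.TableSpec.
#
#     registry.insertOpaqueData("datastore_records",
#                               {"dataset_id": 42, "path": "a/b/c.fits"},
#                               {"dataset_id": 43, "path": "a/b/d.fits"})
#     for row in registry.fetchOpaqueData("datastore_records", dataset_id=42):
#         print(row["path"])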

312 @transactional 

313 def deleteOpaqueData(self, tableName: str, **where: Any): 

314 """Remove records from an opaque table. 

315 

316 Parameters 

317 ---------- 

318 tableName : `str` 

319 Logical name of the opaque table. Must match the name used in a 

320 previous call to `registerOpaqueTable`. 

321 where 

322 Additional keyword arguments are interpreted as equality 

323 constraints that restrict the deleted rows (combined with AND); 

324 keyword arguments are column names and values are the values they 

325 must have. 

326 """ 

327 self._opaque[tableName].delete(**where) 

328 

329 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED): 

330 """Add a new collection if one with the given name does not exist. 

331 

332 Parameters 

333 ---------- 

334 name : `str` 

335 The name of the collection to create. 

336 type : `CollectionType` 

337 Enum value indicating the type of collection to create. 

338 

339 Notes 

340 ----- 

341 This method cannot be called within transactions, as it needs to be 

342 able to perform its own transaction to be concurrent. 

343 """ 

344 self._collections.register(name, type) 

345 

346 def getCollectionType(self, name: str) -> CollectionType: 

347 """Return an enumeration value indicating the type of the given 

348 collection. 

349 

350 Parameters 

351 ---------- 

352 name : `str` 

353 The name of the collection. 

354 

355 Returns 

356 ------- 

357 type : `CollectionType` 

358 Enum value indicating the type of this collection. 

359 

360 Raises 

361 ------ 

362 MissingCollectionError 

363 Raised if no collection with the given name exists. 

364 """ 

365 return self._collections.find(name).type 

366 

367 def registerRun(self, name: str): 

368 """Add a new run if one with the given name does not exist. 

369 

370 Parameters 

371 ---------- 

372 name : `str` 

373 The name of the run to create. 

374 

375 Notes 

376 ----- 

377 This method cannot be called within transactions, as it needs to be 

378 able to perform its own transaction to be concurrent. 

379 """ 

380 self._collections.register(name, CollectionType.RUN) 

381 
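# Example (illustrative): creating collections of different types and checking
# them afterwards. Collection names are hypothetical.
#
#     registry.registerRun("HSC/runs/ci")
#     registry.registerCollection("HSC/tagged", type=CollectionType.TAGGED)
#     registry.getCollectionType("HSC/runs/ci")   # CollectionType.RUN
#     registry.getCollectionType("HSC/tagged")    # CollectionType.TAGGED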

382 @transactional 

383 def removeCollection(self, name: str): 

384 """Completely remove the given collection. 

385 

386 Parameters 

387 ---------- 

388 name : `str` 

389 The name of the collection to remove. 

390 

391 Raises 

392 ------ 

393 MissingCollectionError 

394 Raised if no collection with the given name exists. 

395 

396 Notes 

397 ----- 

398 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

399 in it are also fully removed. This requires that those datasets be 

400 removed (or at least trashed) from any datastores that hold them first. 

401 

402 A collection may not be deleted as long as it is referenced by a 

403 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

404 be deleted or redefined first. 

405 """ 

406 self._collections.remove(name) 

407 

408 def getCollectionChain(self, parent: str) -> CollectionSearch: 

409 """Return the child collections in a `~CollectionType.CHAINED` 

410 collection. 

411 

412 Parameters 

413 ---------- 

414 parent : `str` 

415 Name of the chained collection. Must have already been added via 

416 a call to `Registry.registerCollection`. 

417 

418 Returns 

419 ------- 

420 children : `CollectionSearch` 

421 An object that defines the search path of the collection. 

422 See :ref:`daf_butler_collection_expressions` for more information. 

423 

424 Raises 

425 ------ 

426 MissingCollectionError 

427 Raised if ``parent`` does not exist in the `Registry`. 

428 TypeError 

429 Raised if ``parent`` does not correspond to a 

430 `~CollectionType.CHAINED` collection. 

431 """ 

432 record = self._collections.find(parent) 

433 if record.type is not CollectionType.CHAINED: 

434 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

435 return record.children 

436 

437 @transactional 

438 def setCollectionChain(self, parent: str, children: Any): 

439 """Define or redefine a `~CollectionType.CHAINED` collection. 

440 

441 Parameters 

442 ---------- 

443 parent : `str` 

444 Name of the chained collection. Must have already been added via 

445 a call to `Registry.registerCollection`. 

446 children : `Any` 

447 An expression defining an ordered search of child collections, 

448 generally an iterable of `str`. Restrictions on the dataset types 

449 to be searched can also be included by passing a mapping or an 

450 iterable containing tuples; see 

451 :ref:`daf_butler_collection_expressions` for more information. 

452 

453 Raises 

454 ------ 

455 MissingCollectionError 

456 Raised when any of the given collections do not exist in the 

457 `Registry`. 

458 TypeError 

459 Raised if ``parent`` does not correspond to a 

460 `~CollectionType.CHAINED` collection. 

461 ValueError 

462 Raised if the given collections contain a cycle. 

463 """ 

464 record = self._collections.find(parent) 

465 if record.type is not CollectionType.CHAINED: 

466 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

467 children = CollectionSearch.fromExpression(children) 

468 if children != record.children: 

469 record.update(self._collections, children) 

470 
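# Example (illustrative sketch): defining a CHAINED collection that searches a
# run and a tagged collection in order. The names are hypothetical and the
# child collections must already exist.
#
#     registry.registerCollection("HSC/defaults", type=CollectionType.CHAINED)
#     registry.setCollectionChain("HSC/defaults", ["HSC/runs/ci", "HSC/tagged"])
#     registry.getCollectionChain("HSC/defaults")  # CollectionSearch over both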

471 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

472 """ 

473 Add a new `DatasetType` to the Registry. 

474 

475 It is not an error to register the same `DatasetType` twice. 

476 

477 Parameters 

478 ---------- 

479 datasetType : `DatasetType` 

480 The `DatasetType` to be added. 

481 

482 Returns 

483 ------- 

484 inserted : `bool` 

485 `True` if ``datasetType`` was inserted, `False` if an identical 

486 existing `DatasetType` was found. Note that in either case the 

487 DatasetType is guaranteed to be defined in the Registry 

488 consistently with the given definition. 

489 

490 Raises 

491 ------ 

492 ValueError 

493 Raised if the dimensions or storage class are invalid. 

494 ConflictingDefinitionError 

495 Raised if this DatasetType is already registered with a different 

496 definition. 

497 

498 Notes 

499 ----- 

500 This method cannot be called within transactions, as it needs to be 

501 able to perform its own transaction to be concurrent. 

502 """ 

503 _, inserted = self._datasets.register(datasetType) 

504 return inserted 

505 

506 def getDatasetType(self, name: str) -> DatasetType: 

507 """Get the `DatasetType`. 

508 

509 Parameters 

510 ---------- 

511 name : `str` 

512 Name of the type. 

513 

514 Returns 

515 ------- 

516 type : `DatasetType` 

517 The `DatasetType` associated with the given name. 

518 

519 Raises 

520 ------ 

521 KeyError 

522 Raised if the requested DatasetType could not be found in the registry. 

523 """ 

524 storage = self._datasets.find(name) 

525 if storage is None: 

526 raise KeyError(f"DatasetType '{name}' could not be found.") 

527 return storage.datasetType 

528 
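# Example (hedged sketch): registering and retrieving a dataset type. The
# name, dimensions, and storage class are hypothetical, and this assumes the
# DatasetType constructor accepts a DimensionGraph and a storage class name.
#
#     flatType = DatasetType(
#         "flat",
#         dimensions=registry.dimensions.extract(
#             ["instrument", "detector", "physical_filter"]),
#         storageClass="ExposureF")
#     registry.registerDatasetType(flatType)   # True on first call
#     registry.getDatasetType("flat")          # should compare equal to flatType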

529 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

530 collections: Any, **kwargs: Any) -> Optional[DatasetRef]: 

531 """Find a dataset given its `DatasetType` and data ID. 

532 

533 This can be used to obtain a `DatasetRef` that permits the dataset to 

534 be read from a `Datastore`. If the dataset is a component and can not 

535 be found using the provided dataset type, a dataset ref for the parent 

536 will be returned instead but with the correct dataset type. 

537 

538 Parameters 

539 ---------- 

540 datasetType : `DatasetType` or `str` 

541 A `DatasetType` or the name of one. 

542 dataId : `dict` or `DataCoordinate`, optional 

543 A `dict`-like object containing the `Dimension` links that identify 

544 the dataset within a collection. 

545 collections 

546 An expression that fully or partially identifies the collections 

547 to search for the dataset, such as a `str`, `re.Pattern`, or 

548 iterable thereof. `...` can be used to return all collections. 

549 See :ref:`daf_butler_collection_expressions` for more information. 

550 **kwargs 

551 Additional keyword arguments passed to 

552 `DataCoordinate.standardize` to convert ``dataId`` to a true 

553 `DataCoordinate` or augment an existing one. 

554 

555 Returns 

556 ------- 

557 ref : `DatasetRef` 

558 A reference to the dataset, or `None` if no matching Dataset 

559 was found. 

560 

561 Raises 

562 ------ 

563 LookupError 

564 Raised if one or more data ID keys are missing or the dataset type 

565 does not exist. 

566 MissingCollectionError 

567 Raised if any of ``collections`` does not exist in the registry. 

568 """ 

569 if isinstance(datasetType, DatasetType): 

570 storage = self._datasets.find(datasetType.name) 

571 if storage is None: 

572 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

573 else: 

574 storage = self._datasets.find(datasetType) 

575 if storage is None: 

576 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

577 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

578 universe=self.dimensions, **kwargs) 

579 collections = CollectionSearch.fromExpression(collections) 

580 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType): 

581 result = storage.find(collectionRecord, dataId) 

582 if result is not None: 

583 if result.datasetType.isComposite(): 

584 result = self._datasets.fetchComponents(result) 

585 return result 

586 

587 # fallback to the parent if we got nothing and this was a component 

588 if storage.datasetType.isComponent(): 

589 parentType, _ = storage.datasetType.nameAndComponent() 

590 parentRef = self.findDataset(parentType, dataId, collections=collections, **kwargs) 

591 if parentRef is not None: 

592 # Should already conform and we know no components 

593 return DatasetRef(storage.datasetType, parentRef.dataId, id=parentRef.id, 

594 run=parentRef.run, conform=False, hasParentId=True) 

595 

596 return None 

597 
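# Example (illustrative sketch): looking up a single dataset by dataset type,
# data ID, and collection search path. All names and values are hypothetical.
#
#     ref = registry.findDataset("flat",
#                                instrument="HSC", detector=16,
#                                physical_filter="HSC-R",
#                                collections=["HSC/runs/ci", "HSC/calib"])
#     if ref is not None:
#         print(ref.id, ref.run)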

598 @transactional 

599 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

600 run: str, *, producer: Optional[Quantum] = None, recursive: bool = False 

601 ) -> List[DatasetRef]: 

602 """Insert one or more datasets into the `Registry` 

603 

604 This always adds new datasets; to associate existing datasets with 

605 a new collection, use ``associate``. 

606 

607 Parameters 

608 ---------- 

609 datasetType : `DatasetType` or `str` 

610 A `DatasetType` or the name of one. 

611 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

612 Dimension-based identifiers for the new datasets. 

613 run : `str` 

614 The name of the run that produced the datasets. 

615 producer : `Quantum` 

616 Unit of work that produced the datasets. May be `None` to store 

617 no provenance information, but if present the `Quantum` must 

618 already have been added to the Registry. 

619 recursive : `bool` 

620 If True, recursively add datasets and attach entries for component 

621 datasets as well. 

622 

623 Returns 

624 ------- 

625 refs : `list` of `DatasetRef` 

626 Resolved `DatasetRef` instances for all given data IDs (in the same 

627 order). 

628 

629 Raises 

630 ------ 

631 ConflictingDefinitionError 

632 If a dataset with the same dataset type and data ID as one of those 

633 given already exists in ``run``. 

634 MissingCollectionError 

635 Raised if ``run`` does not exist in the registry. 

636 """ 

637 if isinstance(datasetType, DatasetType): 

638 storage = self._datasets.find(datasetType.name) 

639 if storage is None: 

640 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

641 else: 

642 storage = self._datasets.find(datasetType) 

643 if storage is None: 

644 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

645 runRecord = self._collections.find(run) 

646 dataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds] 

647 try: 

648 refs = list(storage.insert(runRecord, dataIds, quantum=producer)) 

649 except sqlalchemy.exc.IntegrityError as err: 

650 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

651 f"one or more datasets of type {storage.datasetType} into " 

652 f"collection '{run}'. " 

653 f"This probably means a dataset with the same data ID " 

654 f"and dataset type already exists, but it may also mean a " 

655 f"dimension row is missing.") from err 

656 if recursive and storage.datasetType.isComposite(): 

657 # Insert component rows by recursing. 

658 composites = defaultdict(dict) 

659 # TODO: we really shouldn't be inserting all components defined by 

660 # the storage class, because there's no guarantee all of them are 

661 # actually present in these datasets. 

662 for componentName in storage.datasetType.storageClass.components: 

663 componentDatasetType = storage.datasetType.makeComponentDatasetType(componentName) 

664 componentRefs = self.insertDatasets(componentDatasetType, 

665 dataIds=dataIds, 

666 run=run, 

667 producer=producer, 

668 recursive=True) 

669 for parentRef, componentRef in zip(refs, componentRefs): 

670 composites[parentRef][componentName] = componentRef 

671 if composites: 

672 refs = list(self._datasets.attachComponents(composites.items())) 

673 return refs 

674 
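# Example (hedged sketch): inserting new datasets into a RUN collection. The
# names are hypothetical; the run must have been created with registerRun and
# the dimension records referenced by the data IDs must already exist.
#
#     refs = registry.insertDatasets(
#         "flat",
#         dataIds=[{"instrument": "HSC", "detector": 16,
#                   "physical_filter": "HSC-R"}],
#         run="HSC/runs/ci",
#     )
#     print(refs[0].id, refs[0].run)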

675 def getDataset(self, id: int) -> Optional[DatasetRef]: 

676 """Retrieve a Dataset entry. 

677 

678 Parameters 

679 ---------- 

680 id : `int` 

681 The unique identifier for the dataset. 

682 

683 Returns 

684 ------- 

685 ref : `DatasetRef` or `None` 

686 A ref to the Dataset, or `None` if no matching Dataset 

687 was found. 

688 """ 

689 ref = self._datasets.getDatasetRef(id) 

690 if ref is None: 

691 return None 

692 if ref.datasetType.isComposite(): 

693 return self._datasets.fetchComponents(ref) 

694 return ref 

695 

696 @transactional 

697 def removeDatasets(self, refs: Iterable[DatasetRef], *, recursive: bool = True): 

698 """Remove datasets from the Registry. 

699 

700 The datasets will be removed unconditionally from all collections, and 

701 any `Quantum` that consumed this dataset will instead be marked with 

702 having a NULL input. `Datastore` records will *not* be deleted; the 

703 caller is responsible for ensuring that the dataset has already been 

704 removed from all Datastores. 

705 

706 Parameters 

707 ---------- 

708 refs : `Iterable` of `DatasetRef` 

709 References to the datasets to be removed. Must include a valid 

710 ``id`` attribute, and should be considered invalidated upon return. 

711 recursive : `bool`, optional 

712 If `True`, remove all component datasets as well. Note that 

713 this only removes components that are actually included in the 

714 given `DatasetRef` instances, which may not be the same as those in 

715 the database (especially if they were obtained from 

716 `queryDatasets`, which does not populate `DatasetRef.components`). 

717 

718 Raises 

719 ------ 

720 AmbiguousDatasetError 

721 Raised if any ``ref.id`` is `None`. 

722 OrphanedRecordError 

723 Raised if any dataset is still present in any `Datastore`. 

724 """ 

725 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items(): 

726 storage = self._datasets.find(datasetType.name) 

727 try: 

728 storage.delete(refsForType) 

729 except sqlalchemy.exc.IntegrityError as err: 

730 raise OrphanedRecordError("One or more datasets is still " 

731 "present in one or more Datastores.") from err 

732 

733 @transactional 

734 def attachComponents(self, parent: DatasetRef, components: Mapping[str, DatasetRef]): 

735 """Attach components to a dataset. 

736 

737 Parameters 

738 ---------- 

739 parent : `DatasetRef` 

740 A reference to the parent dataset. 

741 components : `Mapping` [ `str`, `DatasetRef` ] 

742 Mapping from component name to the `DatasetRef` for that component. 

743 

744 Returns 

745 ------- 

746 ref : `DatasetRef` 

747 An updated version of ``parent`` with components included. 

748 

749 Raises 

750 ------ 

751 AmbiguousDatasetError 

752 Raised if ``parent.id`` or any `DatasetRef.id` in ``components`` 

753 is `None`. 

754 """ 

755 for name, ref in components.items(): 

756 if ref.datasetType.storageClass != parent.datasetType.storageClass.components[name]: 

757 raise TypeError(f"Expected storage class " 

758 f"'{parent.datasetType.storageClass.components[name].name}' " 

759 f"for component '{name}' of dataset {parent}; got " 

760 f"dataset {ref} with storage class " 

761 f"'{ref.datasetType.storageClass.name}'.") 

762 ref, = self._datasets.attachComponents([(parent, components)]) 

763 return ref 

764 

765 @transactional 

766 def associate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True): 

767 """Add existing datasets to a `~CollectionType.TAGGED` collection. 

768 

769 If a `DatasetRef` with the same integer ID is already in the 

770 collection, nothing is changed. If a `DatasetRef` with the same 

771 `DatasetType` and data ID but with different integer ID 

772 exists in the collection, `ConflictingDefinitionError` is raised. 

773 

774 Parameters 

775 ---------- 

776 collection : `str` 

777 Indicates the collection the datasets should be associated with. 

778 refs : `Iterable` [ `DatasetRef` ] 

779 An iterable of resolved `DatasetRef` instances that already exist 

780 in this `Registry`. 

781 recursive : `bool`, optional 

782 If `True`, associate all component datasets as well. Note that 

783 this only associates components that are actually included in the 

784 given `DatasetRef` instances, which may not be the same as those in 

785 the database (especially if they were obtained from 

786 `queryDatasets`, which does not populate `DatasetRef.components`). 

787 

788 Raises 

789 ------ 

790 ConflictingDefinitionError 

791 If a Dataset with the given `DatasetRef` already exists in the 

792 given collection. 

793 AmbiguousDatasetError 

794 Raised if ``any(ref.id is None for ref in refs)``. 

795 MissingCollectionError 

796 Raised if ``collection`` does not exist in the registry. 

797 TypeError 

798 Raised if adding new datasets to the given ``collection`` is not 

799 allowed. 

800 """ 

801 collectionRecord = self._collections.find(collection) 

802 if collectionRecord.type is not CollectionType.TAGGED: 

803 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

804 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items(): 

805 storage = self._datasets.find(datasetType.name) 

806 try: 

807 storage.associate(collectionRecord, refsForType) 

808 except sqlalchemy.exc.IntegrityError as err: 

809 raise ConflictingDefinitionError( 

810 f"Constraint violation while associating dataset of type {datasetType.name} with " 

811 f"collection {collection}. This probably means that one or more datasets with the same " 

812 f"dataset type and data ID already exist in the collection, but it may also indicate " 

813 f"that the datasets do not exist." 

814 ) from err 

815 

816 @transactional 

817 def disassociate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True): 

818 """Remove existing datasets from a `~CollectionType.TAGGED` collection. 

819 

820 ``collection`` and ``ref`` combinations that are not currently 

821 associated are silently ignored. 

822 

823 Parameters 

824 ---------- 

825 collection : `str` 

826 The collection the datasets should no longer be associated with. 

827 refs : `Iterable` [ `DatasetRef` ] 

828 An iterable of resolved `DatasetRef` instances that already exist 

829 in this `Registry`. 

830 recursive : `bool`, optional 

831 If `True`, disassociate all component datasets as well. Note that 

832 this only disassociates components that are actually included in 

833 the given `DatasetRef` instances, which may not be the same as 

834 those in the database (especially if they were obtained from 

835 `queryDatasets`, which does not populate `DatasetRef.components`). 

836 

837 Raises 

838 ------ 

839 AmbiguousDatasetError 

840 Raised if any of the given dataset references is unresolved. 

841 MissingCollectionError 

842 Raised if ``collection`` does not exist in the registry. 

843 TypeError 

844 Raised if removing datasets from the given ``collection`` is not 

845 allowed. 

846 """ 

847 collectionRecord = self._collections.find(collection) 

848 if collectionRecord.type is not CollectionType.TAGGED: 

849 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

850 "expected TAGGED.") 

851 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items(): 

852 storage = self._datasets.find(datasetType.name) 

853 storage.disassociate(collectionRecord, refsForType) 

854 

855 @transactional 

856 def insertDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]): 

857 """Record that a datastore holds the given datasets. 

858 

859 Typically used by `Datastore`. 

860 

861 Parameters 

862 ---------- 

863 datastoreName : `str` 

864 Name of the datastore holding these datasets. 

865 refs : `~collections.abc.Iterable` of `DatasetRef` 

866 References to the datasets. 

867 

868 Raises 

869 ------ 

870 AmbiguousDatasetError 

871 Raised if ``any(ref.id is None for ref in refs)``. 

872 """ 

873 self._db.insert( 

874 self._tables.dataset_location, 

875 *[{"datastore_name": datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs] 

876 ) 

877 

878 @transactional 

879 def moveDatasetLocationToTrash(self, datastoreName: str, refs: Iterable[DatasetRef]): 

880 """Move the dataset location information to trash. 

881 

882 Parameters 

883 ---------- 

884 datastoreName : `str` 

885 Name of the datastore holding these datasets. 

886 refs : `~collections.abc.Iterable` of `DatasetRef` 

887 References to the datasets. 

888 """ 

889 # We only want to move rows that already exist in the main table 

890 filtered = self.checkDatasetLocations(datastoreName, refs) 

891 self.canDeleteDatasetLocations(datastoreName, filtered) 

892 self.removeDatasetLocation(datastoreName, filtered) 

893 

894 @transactional 

895 def canDeleteDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]): 

896 """Record that a datastore can delete these datasets. 

897 

898 Parameters 

899 ---------- 

900 datastoreName : `str` 

901 Name of the datastore holding these datasets. 

902 refs : `~collections.abc.Iterable` of `DatasetRef` 

903 References to the datasets. 

904 

905 Raises 

906 ------ 

907 AmbiguousDatasetError 

908 Raised if ``any(ref.id is None for ref in refs)``. 

909 """ 

910 self._db.insert( 

911 self._tables.dataset_location_trash, 

912 *[{"datastore_name": datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs] 

913 ) 

914 

915 def checkDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]) -> List[DatasetRef]: 

916 """Check which refs are listed for this datastore. 

917 

918 Parameters 

919 ---------- 

920 datastoreName : `str` 

921 Name of the datastore holding these datasets. 

922 refs : `~collections.abc.Iterable` of `DatasetRef` 

923 References to the datasets. 

924 

925 Returns 

926 ------- 

927 present : `list` of `DatasetRef` 

928 All the `DatasetRef` that are listed. 

929 """ 

930 

931 table = self._tables.dataset_location 

932 result = self._db.query( 

933 sqlalchemy.sql.select( 

934 [table.columns.datastore_name, table.columns.dataset_id] 

935 ).where( 

936 sqlalchemy.sql.and_(table.columns.dataset_id.in_([ref.id for ref in refs]), 

937 table.columns.datastore_name == datastoreName) 

938 ) 

939 ).fetchall() 

940 

941 matched_ids = {r["dataset_id"] for r in result} 

942 return [ref for ref in refs if ref.id in matched_ids] 

943 

944 def getDatasetLocations(self, ref: DatasetRef) -> Set[str]: 

945 """Retrieve datastore locations for a given dataset. 

946 

947 Typically used by `Datastore`. 

948 

949 Parameters 

950 ---------- 

951 ref : `DatasetRef` 

952 A reference to the dataset for which to retrieve storage 

953 information. 

954 

955 Returns 

956 ------- 

957 datastores : `set` of `str` 

958 All the matching datastores holding this dataset. Empty set 

959 if the dataset does not exist anywhere. 

960 

961 Raises 

962 ------ 

963 AmbiguousDatasetError 

964 Raised if ``ref.id`` is `None`. 

965 """ 

966 table = self._tables.dataset_location 

967 result = self._db.query( 

968 sqlalchemy.sql.select( 

969 [table.columns.datastore_name] 

970 ).where( 

971 table.columns.dataset_id == ref.id 

972 ) 

973 ).fetchall() 

974 return {r["datastore_name"] for r in result} 

975 

976 @transactional 

977 def getTrashedDatasets(self, datastoreName: str) -> Set[FakeDatasetRef]: 

978 """Retrieve all the dataset ref IDs that are in the trash 

979 associated with the specified datastore. 

980 

981 Parameters 

982 ---------- 

983 datastoreName : `str` 

984 The relevant datastore name to use. 

985 

986 Returns 

987 ------- 

988 ids : `set` of `FakeDatasetRef` 

989 The IDs of datasets that can be safely removed from this datastore. 

990 Can be empty. 

991 """ 

992 table = self._tables.dataset_location_trash 

993 result = self._db.query( 

994 sqlalchemy.sql.select( 

995 [table.columns.dataset_id] 

996 ).where( 

997 table.columns.datastore_name == datastoreName 

998 ) 

999 ).fetchall() 

1000 return {FakeDatasetRef(r["dataset_id"]) for r in result} 

1001 

1002 @transactional 

1003 def emptyDatasetLocationsTrash(self, datastoreName: str, refs: Iterable[FakeDatasetRef]) -> None: 

1004 """Remove datastore location associated with these datasets from trash. 

1005 

1006 Typically used by `Datastore` when a dataset is removed. 

1007 

1008 Parameters 

1009 ---------- 

1010 datastoreName : `str` 

1011 Name of this `Datastore`. 

1012 refs : iterable of `FakeDatasetRef` 

1013 The dataset IDs to be removed. 

1014 

1015 Raises 

1016 ------ 

1017 AmbiguousDatasetError 

1018 Raised if ``ref.id`` is `None`. 

1019 """ 

1020 if not refs: 

1021 return 

1022 self._db.delete( 

1023 self._tables.dataset_location_trash, 

1024 ["dataset_id", "datastore_name"], 

1025 *[{"dataset_id": ref.id, "datastore_name": datastoreName} for ref in refs] 

1026 ) 

1027 

1028 @transactional 

1029 def removeDatasetLocation(self, datastoreName: str, refs: Iterable[DatasetRef]) -> None: 

1030 """Remove datastore location associated with this dataset. 

1031 

1032 Typically used by `Datastore` when a dataset is removed. 

1033 

1034 Parameters 

1035 ---------- 

1036 datastoreName : `str` 

1037 Name of this `Datastore`. 

1038 refs : iterable of `DatasetRef` 

1039 References to the datasets for which information is to be removed. 

1040 

1041 Raises 

1042 ------ 

1043 AmbiguousDatasetError 

1044 Raised if ``ref.id`` is `None`. 

1045 """ 

1046 if not refs: 

1047 return 

1048 self._db.delete( 

1049 self._tables.dataset_location, 

1050 ["dataset_id", "datastore_name"], 

1051 *[{"dataset_id": ref.getCheckedId(), "datastore_name": datastoreName} for ref in refs] 

1052 ) 

1053 

1054 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

1055 records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds): 

1056 """Expand a dimension-based data ID to include additional information. 

1057 

1058 Parameters 

1059 ---------- 

1060 dataId : `DataCoordinate` or `dict`, optional 

1061 Data ID to be expanded; augmented and overridden by ``kwds``. 

1062 graph : `DimensionGraph`, optional 

1063 Set of dimensions for the expanded ID. If `None`, the dimensions 

1064 will be inferred from the keys of ``dataId`` and ``kwds``. 

1065 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph`` 

1066 are silently ignored, providing a way to extract and expand a 

1067 subset of a data ID. 

1068 records : mapping [`DimensionElement`, `DimensionRecord`], optional 

1069 Dimension record data to use before querying the database for that 

1070 data. 

1071 **kwds 

1072 Additional keywords are treated like additional key-value pairs for 

1073 ``dataId``, extending and overriding it. 

1074 

1075 Returns 

1076 ------- 

1077 expanded : `ExpandedDataCoordinate` 

1078 A data ID that includes full metadata for all of the dimensions it 

1079 identifies. 

1080 """ 

1081 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds) 

1082 if isinstance(standardized, ExpandedDataCoordinate): 

1083 return standardized 

1084 elif isinstance(dataId, ExpandedDataCoordinate): 

1085 records = dict(records) if records is not None else {} 

1086 records.update(dataId.records) 

1087 else: 

1088 records = dict(records) if records is not None else {} 

1089 keys = dict(standardized) 

1090 regions = [] 

1091 timespans = [] 

1092 for element in standardized.graph.primaryKeyTraversalOrder: 

1093 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

1094 if record is ...: 

1095 storage = self._dimensions[element] 

1096 record = storage.fetch(keys) 

1097 records[element] = record 

1098 if record is not None: 

1099 for d in element.implied: 

1100 value = getattr(record, d.name) 

1101 if keys.setdefault(d, value) != value: 

1102 raise InconsistentDataIdError(f"Data ID {standardized} has {d.name}={keys[d]!r}, " 

1103 f"but {element.name} implies {d.name}={value!r}.") 

1104 if element in standardized.graph.spatial and record.region is not None: 

1105 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions): 

1106 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} " 

1107 f"is disjoint with those for other elements.") 

1108 regions.append(record.region) 

1109 if element in standardized.graph.temporal: 

1110 if any(not record.timespan.overlaps(t) for t in timespans): 

1111 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}" 

1112 f" is disjoint with those for other elements.") 

1113 timespans.append(record.timespan) 

1114 else: 

1115 if element in standardized.graph.required: 

1116 raise LookupError( 

1117 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

1118 ) 

1119 if element.alwaysJoin: 

1120 raise InconsistentDataIdError( 

1121 f"Could not fetch record for element {element.name} via keys {keys}, ", 

1122 f"but it is marked alwaysJoin=True; this means one or more dimensions are not " 

1123 f"related." 

1124 ) 

1125 records.update((d, None) for d in element.implied) 

1126 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records) 

1127 

1128 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]: 

1129 """Compare the keys and values of a pair of data IDs for consistency. 

1130 

1131 See `ConsistentDataIds` for more information. 

1132 

1133 Parameters 

1134 ---------- 

1135 a : `dict` or `DataCoordinate` 

1136 First data ID to be compared. 

1137 b : `dict` or `DataCoordinate` 

1138 Second data ID to be compared. 

1139 

1140 Returns 

1141 ------- 

1142 relationship : `ConsistentDataIds` or `None` 

1143 Relationship information. This is not `None` and coerces to 

1144 `True` in boolean contexts if and only if the data IDs are 

1145 consistent in terms of all common key-value pairs, all many-to-many 

1146 join tables, and all spatial and temporal relationships. 

1147 """ 

1148 a = DataCoordinate.standardize(a, universe=self.dimensions) 

1149 b = DataCoordinate.standardize(b, universe=self.dimensions) 

1150 aFull = getattr(a, "full", None) 

1151 bFull = getattr(b, "full", None) 

1152 aBest = aFull if aFull is not None else a 

1153 bBest = bFull if bFull is not None else b 

1154 jointKeys = aBest.keys() & bBest.keys() 

1155 # If any common values are not equal, we know they are inconsistent. 

1156 if any(aBest[k] != bBest[k] for k in jointKeys): 

1157 return None 

1158 # If the graphs are equal, we know the data IDs are. 

1159 if a.graph == b.graph: 

1160 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys)) 

1161 # Result is still inconclusive. Try to expand a data ID containing 

1162 # keys from both; that will fail if they are inconsistent. 

1163 # First, if either input was already an ExpandedDataCoordinate, extract 

1164 # its records so we don't have to query for them. 

1165 records = {} 

1166 if hasattr(a, "records"): 

1167 records.update(a.records) 

1168 if hasattr(b, "records"): 

1169 records.update(b.records) 

1170 try: 

1171 self.expandDataId({**a, **b}, graph=(a.graph | b.graph), records=records) 

1172 except InconsistentDataIdError: 

1173 return None 

1174 # We know the answer is not `None`; time to figure out what it is. 

1175 return ConsistentDataIds( 

1176 contains=(a.graph >= b.graph), 

1177 within=(a.graph <= b.graph), 

1178 overlaps=bool(a.graph & b.graph), 

1179 ) 

1180 
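# Example (illustrative): comparing two data IDs for consistency, assuming the
# corresponding dimension records exist in the repository. Values are
# hypothetical; here the second data ID extends the first.
#
#     rel = registry.relateDataIds({"instrument": "HSC"},
#                                  {"instrument": "HSC", "detector": 16})
#     if rel:                # not None, so the data IDs are not inconsistent
#         print(rel.within)  # True: every key of the first ID is in the second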

1181 def insertDimensionData(self, element: Union[DimensionElement, str], 

1182 *data: Union[dict, DimensionRecord], 

1183 conform: bool = True): 

1184 """Insert one or more dimension records into the database. 

1185 

1186 Parameters 

1187 ---------- 

1188 element : `DimensionElement` or `str` 

1189 The `DimensionElement` or name thereof that identifies the table 

1190 records will be inserted into. 

1191 data : `dict` or `DimensionRecord` (variadic) 

1192 One or more records to insert. 

1193 conform : `bool`, optional 

1194 If `False` (`True` is default) perform no checking or conversions, 

1195 and assume that ``element`` is a `DimensionElement` instance and 

1196 ``data`` is a one or more `DimensionRecord` instances of the 

1197 appropriate subclass. 

1198 """ 

1199 if conform: 

1200 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement. 

1201 records = [element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row 

1202 for row in data] 

1203 else: 

1204 records = data 

1205 storage = self._dimensions[element] 

1206 storage.insert(*records) 

1207 

1208 def syncDimensionData(self, element: Union[DimensionElement, str], 

1209 row: Union[dict, DimensionRecord], 

1210 conform: bool = True) -> bool: 

1211 """Synchronize the given dimension record with the database, inserting 

1212 if it does not already exist and comparing values if it does. 

1213 

1214 Parameters 

1215 ---------- 

1216 element : `DimensionElement` or `str` 

1217 The `DimensionElement` or name thereof that identifies the table 

1218 records will be inserted into. 

1219 row : `dict` or `DimensionRecord` 

1220 The record to insert. 

1221 conform : `bool`, optional 

1222 If `False` (`True` is default) perform no checking or conversions, 

1223 and assume that ``element`` is a `DimensionElement` instance and 

1224 ``row`` is a `DimensionRecord` instance of the 

1225 appropriate subclass. 

1226 

1227 Returns 

1228 ------- 

1229 inserted : `bool` 

1230 `True` if a new row was inserted, `False` otherwise. 

1231 

1232 Raises 

1233 ------ 

1234 ConflictingDefinitionError 

1235 Raised if the record exists in the database (according to primary 

1236 key lookup) but is inconsistent with the given one. 

1237 

1238 Notes 

1239 ----- 

1240 This method cannot be called within transactions, as it needs to be 

1241 able to perform its own transaction to be concurrent. 

1242 """ 

1243 if conform: 

1244 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement. 

1245 record = element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row 

1246 else: 

1247 record = row 

1248 storage = self._dimensions[element] 

1249 return storage.sync(record) 

1250 
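# Example (hedged sketch): syncDimensionData inserts a record only if it is
# missing and reports whether it did so. The instrument name and field set are
# illustrative and depend on the configured dimension universe.
#
#     registry.syncDimensionData("instrument", {"name": "TestCam"})  # True
#     registry.syncDimensionData("instrument", {"name": "TestCam"})  # False
#     # A conflicting re-definition would raise ConflictingDefinitionError.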

1251 def queryDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]: 

1252 """Iterate over the dataset types whose names match an expression. 

1253 

1254 Parameters 

1255 ---------- 

1256 expression : `Any`, optional 

1257 An expression that fully or partially identifies the dataset types 

1258 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1259 `...` can be used to return all dataset types, and is the default. 

1260 See :ref:`daf_butler_dataset_type_expressions` for more 

1261 information. 

1262 

1263 Yields 

1264 ------ 

1265 datasetType : `DatasetType` 

1266 A `DatasetType` instance whose name matches ``expression``. 

1267 """ 

1268 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

1269 if wildcard is ...: 

1270 yield from self._datasets 

1271 return 

1272 done = set() 

1273 for name in wildcard.strings: 

1274 storage = self._datasets.find(name) 

1275 if storage is not None: 

1276 done.add(storage.datasetType) 

1277 yield storage.datasetType 

1278 if wildcard.patterns: 

1279 for datasetType in self._datasets: 

1280 if datasetType.name in done: 

1281 continue 

1282 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1283 yield datasetType 

1284 

1285 def queryCollections(self, expression: Any = ..., 

1286 datasetType: Optional[DatasetType] = None, 

1287 collectionType: Optional[CollectionType] = None, 

1288 flattenChains: bool = False, 

1289 includeChains: Optional[bool] = None) -> Iterator[str]: 

1290 """Iterate over the collections whose names match an expression. 

1291 

1292 Parameters 

1293 ---------- 

1294 expression : `Any`, optional 

1295 An expression that fully or partially identifies the collections 

1296 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1297 `...` can be used to return all collections, and is the default. 

1298 See :ref:`daf_butler_collection_expressions` for more 

1299 information. 

1300 datasetType : `DatasetType`, optional 

1301 If provided, only yield collections that should be searched for 

1302 this dataset type according to ``expression``. If this is 

1303 not provided, any dataset type restrictions in ``expression`` are 

1304 ignored. 

1305 collectionType : `CollectionType`, optional 

1306 If provided, only yield collections of this type. 

1307 flattenChains : `bool`, optional 

1308 If `True` (`False` is default), recursively yield the child 

1309 collections of matching `~CollectionType.CHAINED` collections. 

1310 includeChains : `bool`, optional 

1311 If `True`, yield records for matching `~CollectionType.CHAINED` 

1312 collections. Default is the opposite of ``flattenChains``: include 

1313 either CHAINED collections or their children, but not both. 

1314 

1315 Yields 

1316 ------ 

1317 collection : `str` 

1318 The name of a collection that matches ``expression``. 

1319 """ 

1320 query = CollectionQuery.fromExpression(expression) 

1321 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType, 

1322 flattenChains=flattenChains, includeChains=includeChains): 

1323 yield record.name 

1324 
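# Example (illustrative): iterating over collection names that match a regular
# expression, optionally restricted by type. The names are hypothetical.
#
#     import re
#     for name in registry.queryCollections(re.compile(r"HSC/runs/.*"),
#                                           collectionType=CollectionType.RUN):
#         print(name)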

1325 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder: 

1326 """Return a `QueryBuilder` instance capable of constructing and 

1327 managing more complex queries than those obtainable via `Registry` 

1328 interfaces. 

1329 

1330 This is an advanced interface; downstream code should prefer 

1331 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those 

1332 are sufficient. 

1333 

1334 Parameters 

1335 ---------- 

1336 summary : `QuerySummary` 

1337 Object describing and categorizing the full set of dimensions that 

1338 will be included in the query. 

1339 

1340 Returns 

1341 ------- 

1342 builder : `QueryBuilder` 

1343 Object that can be used to construct and perform advanced queries. 

1344 """ 

1345 return QueryBuilder(summary=summary, 

1346 collections=self._collections, 

1347 dimensions=self._dimensions, 

1348 datasets=self._datasets) 

1349 

1350 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1351 dataId: Optional[DataId] = None, 

1352 datasets: Any = None, 

1353 collections: Any = None, 

1354 where: Optional[str] = None, 

1355 expand: bool = True, 

1356 **kwds) -> Iterator[DataCoordinate]: 

1357 """Query for and iterate over data IDs matching user-provided criteria. 

1358 

1359 Parameters 

1360 ---------- 

1361 dimensions : `Dimension` or `str`, or iterable thereof 

1362 The dimensions of the data IDs to yield, as either `Dimension` 

1363 instances or `str`. Will be automatically expanded to a complete 

1364 `DimensionGraph`. 

1365 dataId : `dict` or `DataCoordinate`, optional 

1366 A data ID whose key-value pairs are used as equality constraints 

1367 in the query. 

1368 datasets : `Any`, optional 

1369 An expression that fully or partially identifies dataset types 

1370 that should constrain the yielded data IDs. For example, including 

1371 "raw" here would constrain the yielded ``instrument``, 

1372 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1373 those for which at least one "raw" dataset exists in 

1374 ``collections``. Allowed types include `DatasetType`, `str`, 

1375 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1376 expressions, `...` is not permitted - it doesn't make sense to 

1377 constrain data IDs on the existence of *all* datasets. 

1378 See :ref:`daf_butler_dataset_type_expressions` for more 

1379 information. 

1380 collections : `Any`, optional 

1381 An expression that fully or partially identifies the collections 

1382 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1383 thereof. `...` can be used to return all collections. Must be 

1384 provided if ``datasets`` is, and is ignored if it is not. See 

1385 :ref:`daf_butler_collection_expressions` for more information. 

1386 where : `str`, optional 

1387 A string expression similar to a SQL WHERE clause. May involve 

1388 any column of a dimension table or (as a shortcut for the primary 

1389 key column of a dimension table) dimension name. See 

1390 :ref:`daf_butler_dimension_expressions` for more information. 

1391 expand : `bool`, optional 

1392 If `True` (default) yield `ExpandedDataCoordinate` instead of 

1393 minimal `DataCoordinate` base-class instances. 

1394 kwds 

1395 Additional keyword arguments are forwarded to 

1396 `DataCoordinate.standardize` when processing the ``dataId`` 

1397 argument (and may be used to provide a constraining data ID even 

1398 when the ``dataId`` argument is `None`). 

1399 

1400 Yields 

1401 ------ 

1402 dataId : `DataCoordinate` 

1403 Data IDs matching the given query parameters. Order is 

1404 unspecified. 

1405 """ 

1406 dimensions = iterable(dimensions) 

1407 standardizedDataId = self.expandDataId(dataId, **kwds) 

1408 standardizedDatasetTypes = [] 

1409 requestedDimensionNames = set(self.dimensions.extract(dimensions).names) 

1410 if datasets is not None: 

1411 if collections is None: 

1412 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1413 for datasetType in self.queryDatasetTypes(datasets): 

1414 requestedDimensionNames.update(datasetType.dimensions.names) 

1415 standardizedDatasetTypes.append(datasetType) 

1416 # Preprocess collections expression in case the original included 

1417 # single-pass iterators (we'll want to use it multiple times 

1418 # below). 

1419 collections = CollectionQuery.fromExpression(collections) 

1420 

1421 summary = QuerySummary( 

1422 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1423 dataId=standardizedDataId, 

1424 expression=where, 

1425 ) 

1426 builder = self.makeQueryBuilder(summary) 

1427 for datasetType in standardizedDatasetTypes: 

1428 builder.joinDataset(datasetType, collections, isResult=False) 

1429 query = builder.finish() 

1430 predicate = query.predicate() 

1431 for row in self._db.query(query.sql): 

1432 if predicate(row): 

1433 result = query.extractDataId(row) 

1434 if expand: 

1435 yield self.expandDataId(result, records=standardizedDataId.records) 

1436 else: 

1437 yield result 

1438 
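# Example (hedged sketch): querying data IDs constrained by the existence of a
# dataset and a string expression. The dataset type, collection, and ``where``
# expression shown are illustrative only.
#
#     for dataId in registry.queryDimensions(["exposure", "detector"],
#                                            datasets="raw",
#                                            collections="HSC/raw/all",
#                                            where="instrument = 'HSC'"):
#         print(dataId)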

1439 def queryDatasets(self, datasetType: Any, *, 

1440 collections: Any, 

1441 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1442 dataId: Optional[DataId] = None, 

1443 where: Optional[str] = None, 

1444 deduplicate: bool = False, 

1445 expand: bool = True, 

1446 **kwds) -> Iterator[DatasetRef]: 

1447 """Query for and iterate over dataset references matching user-provided 

1448 criteria. 

1449 

1450 Parameters 

1451 ---------- 

1452 datasetType 

1453 An expression that fully or partially identifies the dataset types 

1454 to be queried. Allowed types include `DatasetType`, `str`, 

1455 `re.Pattern`, and iterables thereof. The special value `...` can 

1456 be used to query all dataset types. See 

1457 :ref:`daf_butler_dataset_type_expressions` for more information. 

1458 collections 

1459 An expression that fully or partially identifies the collections 

1460 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1461 thereof. `...` can be used to return all collections. See 

1462 :ref:`daf_butler_collection_expressions` for more information. 

1463 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1464 Dimensions to include in the query (in addition to those used 

1465 to identify the queried dataset type(s)), either to constrain 

1466 the resulting datasets to those for which a matching dimension 

1467 exists, or to relate the dataset type's dimensions to dimensions 

1468 referenced by the ``dataId`` or ``where`` arguments. 

1469 dataId : `dict` or `DataCoordinate`, optional 

1470 A data ID whose key-value pairs are used as equality constraints 

1471 in the query. 

1472 where : `str`, optional 

1473 A string expression similar to a SQL WHERE clause. May involve 

1474 any column of a dimension table or (as a shortcut for the primary 

1475 key column of a dimension table) dimension name. See 

1476 :ref:`daf_butler_dimension_expressions` for more information. 

1477 deduplicate : `bool`, optional 

1478 If `True` (`False` is default), for each result data ID, only 

1479 yield one `DatasetRef` of each `DatasetType`, from the first 

1480 collection in which a dataset of that dataset type appears 

1481 (according to the order of ``collections`` passed in). If `True`, 

1482 ``collections`` must not contain regular expressions and may not 

1483 be `...`. 

1484 expand : `bool`, optional 

1485 If `True` (default) attach `ExpandedDataCoordinate` instead of 

1486 minimal `DataCoordinate` base-class instances. 

1487 kwds 

1488 Additional keyword arguments are forwarded to 

1489 `DataCoordinate.standardize` when processing the ``dataId`` 

1490 argument (and may be used to provide a constraining data ID even 

1491 when the ``dataId`` argument is `None`). 

1492 

1493 Yields 

1494 ------ 

1495 ref : `DatasetRef` 

1496 Dataset references matching the given query criteria. These 

1497 are grouped by `DatasetType` if the query evaluates to multiple 

1498 dataset types, but order is otherwise unspecified. 

1499 

1500 Raises 

1501 ------ 

1502 TypeError 

1503 Raised when the arguments are incompatible, such as when a 

1504 collection wildcard is passed when ``deduplicate`` is `True`. 

1505 

1506 Notes 

1507 ----- 

1508 When multiple dataset types are queried in a single call, the 

1509 results of this operation are equivalent to querying for each dataset 

1510 type separately in turn, and no information about the relationships 

1511 between datasets of different types is included. In contexts where 

1512 that kind of information is important, the recommended pattern is to 

1513 use `queryDimensions` to first obtain data IDs (possibly with the 

1514 desired dataset types and collections passed as constraints to the 

1515 query), and then use multiple (generally much simpler) calls to 

1516 `queryDatasets` with the returned data IDs passed as constraints. 

1517 """ 

1518 # Standardize the collections expression. 

1519 if deduplicate: 

1520 collections = CollectionSearch.fromExpression(collections) 

1521 else: 

1522 collections = CollectionQuery.fromExpression(collections) 

1523 # Standardize and expand the data ID provided as a constraint. 

1524 standardizedDataId = self.expandDataId(dataId, **kwds) 

1525 # If the datasetType passed isn't actually a DatasetType, expand it 

1526 # (it could be an expression that yields multiple DatasetTypes) and 

1527 # recurse. 

1528 if not isinstance(datasetType, DatasetType): 

1529 for trueDatasetType in self.queryDatasetTypes(datasetType): 

1530 yield from self.queryDatasets(trueDatasetType, collections=collections, 

1531 dimensions=dimensions, dataId=standardizedDataId, 

1532 where=where, deduplicate=deduplicate, expand=expand) 

1533 return 

1534 # The full set of dimensions in the query is the combination of those 

1535 # needed for the DatasetType and those explicitly requested, if any. 

1536 requestedDimensionNames = set(datasetType.dimensions.names) 

1537 if dimensions is not None: 

1538 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1539 # Construct the summary structure needed to construct a QueryBuilder. 

1540 summary = QuerySummary( 

1541 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1542 dataId=standardizedDataId, 

1543 expression=where, 

1544 ) 

1545 builder = self.makeQueryBuilder(summary) 

1546 # Add the dataset subquery to the query, telling the QueryBuilder to 

1547 # include the rank of the selected collection in the results only if we 

1548 # need to deduplicate. Note that if any of the collections are 

1549 # actually wildcard expressions, and we've asked for deduplication, 

1550 # this will raise TypeError for us. 

1551 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate): 

1552 return 

1553 query = builder.finish() 

1554 predicate = query.predicate() 

1555 if not deduplicate: 

1556 # No need to de-duplicate across collections. 

1557 for row in self._db.query(query.sql): 

1558 if predicate(row): 

1559 dataId = query.extractDataId(row, graph=datasetType.dimensions) 

1560 if expand: 

1561 dataId = self.expandDataId(dataId, records=standardizedDataId.records) 

1562 yield query.extractDatasetRef(row, datasetType, dataId)[0] 

1563 else: 

1564 # For each data ID, yield only the DatasetRef with the lowest 

1565 # collection rank. 

1566 bestRefs = {} 

1567 bestRanks = {} 

1568 for row in self._db.query(query.sql): 

1569 if predicate(row): 

1570 ref, rank = query.extractDatasetRef(row, datasetType) 

1571 bestRank = bestRanks.get(ref.dataId, sys.maxsize) 

1572 if rank < bestRank: 

1573 bestRefs[ref.dataId] = ref 

1574 bestRanks[ref.dataId] = rank 

1575 # If caller requested expanded data IDs, we defer that until here 

1576 # so we do as little expansion as possible. 

1577 if expand: 

1578 for ref in bestRefs.values(): 

1579 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records) 

1580 yield ref.expanded(dataId) 

1581 else: 

1582 yield from bestRefs.values() 

1583 
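# Example (hedged sketch): querying datasets across several collections,
# keeping only the first match per data ID. All names and values are
# hypothetical.
#
#     for ref in registry.queryDatasets("calexp",
#                                       collections=["HSC/runs/ci",
#                                                    "HSC/runs/old"],
#                                       instrument="HSC", detector=16,
#                                       deduplicate=True):
#         print(ref.dataId, ref.run)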

1584 dimensions: DimensionUniverse 

1585 """The universe of all dimensions known to the registry 

1586 (`DimensionUniverse`). 

1587 """ 

1588 

1589 storageClasses: StorageClassFactory 

1590 """All storage classes known to the registry (`StorageClassFactory`). 

1591 """