Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "Registry", 

26) 

27 

28from collections import defaultdict 

29import contextlib 

30import logging 

31from typing import ( 

32 Any, 

33 Dict, 

34 Iterable, 

35 Iterator, 

36 List, 

37 Mapping, 

38 Optional, 

39 Set, 

40 TYPE_CHECKING, 

41 Union, 

42) 

43 

44import sqlalchemy 

45 

46from ..core import ( 

47 ButlerURI, 

48 Config, 

49 DataCoordinate, 

50 DataCoordinateIterable, 

51 DataId, 

52 DatasetAssociation, 

53 DatasetRef, 

54 DatasetType, 

55 ddl, 

56 Dimension, 

57 DimensionConfig, 

58 DimensionElement, 

59 DimensionGraph, 

60 DimensionRecord, 

61 DimensionUniverse, 

62 NamedKeyMapping, 

63 NameLookupMapping, 

64 StorageClassFactory, 

65 Timespan, 

66) 

67from . import queries 

68from ..core.utils import iterable, transactional 

69from ._config import RegistryConfig 

70from ._collectionType import CollectionType 

71from ._defaults import RegistryDefaults 

72from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

73from .managers import RegistryManagerTypes, RegistryManagerInstances 

74from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis 

75from .summaries import CollectionSummary 

76from .interfaces import ChainedCollectionRecord, RunRecord 

77 

78if TYPE_CHECKING: 78 ↛ 79line 78 didn't jump to line 79, because the condition on line 78 was never true

79 from .._butlerConfig import ButlerConfig 

80 from .interfaces import ( 

81 Database, 

82 DatastoreRegistryBridgeManager, 

83 ) 

84 

85 

86_LOG = logging.getLogger(__name__) 

87 

88# key for dimensions configuration in attributes table 

89_DIMENSIONS_ATTR = "config:dimensions.json" 

90 

91 

92class Registry: 

93 """Registry interface. 

94 

95 Parameters 

96 ---------- 

97 database : `Database` 

98 Database instance to store Registry. 

99 defaults : `RegistryDefaults`, optional 

100 Default collection search path and/or output `~CollectionType.RUN` 

101 collection. 

102 attributes : `type` 

103 Manager class implementing `ButlerAttributeManager`. 

104 opaque : `type` 

105 Manager class implementing `OpaqueTableStorageManager`. 

106 dimensions : `type` 

107 Manager class implementing `DimensionRecordStorageManager`. 

108 collections : `type` 

109 Manager class implementing `CollectionManager`. 

110 datasets : `type` 

111 Manager class implementing `DatasetRecordStorageManager`. 

112 datastoreBridges : `type` 

113 Manager class implementing `DatastoreRegistryBridgeManager`. 

114 dimensionConfig : `DimensionConfig`, optional 

115 Dimension universe configuration, only used when ``create`` is True. 

116 writeable : `bool`, optional 

117 If True then Registry will support write operations. 

118 create : `bool`, optional 

119 If True then database schema will be initialized, it must be empty 

120 before instantiating Registry. 

121 """ 

122 

123 defaultConfigFile: Optional[str] = None 

124 """Path to configuration defaults. Accessed within the ``configs`` resource 

125 or relative to a search path. Can be None if no defaults specified. 

126 """ 

127 

128 @classmethod 

129 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None, 

130 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

131 butlerRoot: Optional[str] = None) -> Registry: 

132 """Create registry database and return `Registry` instance. 

133 

134 This method initializes database contents, database must be empty 

135 prior to calling this method. 

136 

137 Parameters 

138 ---------- 

139 config : `RegistryConfig` or `str`, optional 

140 Registry configuration, if missing then default configuration will 

141 be loaded from registry.yaml. 

142 dimensionConfig : `DimensionConfig` or `str`, optional 

143 Dimensions configuration, if missing then default configuration 

144 will be loaded from dimensions.yaml. 

145 butlerRoot : `str`, optional 

146 Path to the repository root this `Registry` will manage. 

147 

148 Returns 

149 ------- 

150 registry : `Registry` 

151 A new `Registry` instance. 

152 """ 

153 if isinstance(config, str): 

154 config = RegistryConfig(config) 

155 elif config is None: 

156 config = RegistryConfig() 

157 elif not isinstance(config, RegistryConfig): 

158 raise TypeError(f"Incompatible Registry configuration type: {type(config)}") 

159 config.replaceRoot(butlerRoot) 

160 

161 if isinstance(dimensionConfig, str): 

162 dimensionConfig = DimensionConfig(config) 

163 elif dimensionConfig is None: 

164 dimensionConfig = DimensionConfig() 

165 elif not isinstance(dimensionConfig, DimensionConfig): 

166 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

167 

168 DatabaseClass = config.getDatabaseClass() 

169 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

170 namespace=config.get("namespace")) 

171 managerTypes = RegistryManagerTypes.fromConfig(config) 

172 managers = managerTypes.makeRepo(database, dimensionConfig) 

173 return cls(database, RegistryDefaults(), managers) 

174 

175 @classmethod 

176 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], 

177 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True, 

178 defaults: Optional[RegistryDefaults] = None) -> Registry: 

179 """Create `Registry` subclass instance from `config`. 

180 

181 Registry database must be inbitialized prior to calling this method. 

182 

183 Parameters 

184 ---------- 

185 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

186 Registry configuration 

187 butlerRoot : `str` or `ButlerURI`, optional 

188 Path to the repository root this `Registry` will manage. 

189 writeable : `bool`, optional 

190 If `True` (default) create a read-write connection to the database. 

191 defaults : `RegistryDefaults`, optional 

192 Default collection search path and/or output `~CollectionType.RUN` 

193 collection. 

194 

195 Returns 

196 ------- 

197 registry : `Registry` (subclass) 

198 A new `Registry` subclass instance. 

199 """ 

200 if not isinstance(config, RegistryConfig): 

201 if isinstance(config, str) or isinstance(config, Config): 

202 config = RegistryConfig(config) 

203 else: 

204 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

205 config.replaceRoot(butlerRoot) 

206 DatabaseClass = config.getDatabaseClass() 

207 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

208 namespace=config.get("namespace"), writeable=writeable) 

209 managerTypes = RegistryManagerTypes.fromConfig(config) 

210 managers = managerTypes.loadRepo(database) 

211 if defaults is None: 

212 defaults = RegistryDefaults() 

213 return cls(database, defaults, managers) 

214 

215 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances): 

216 self._db = database 

217 self._managers = managers 

218 self.storageClasses = StorageClassFactory() 

219 # Intentionally invoke property setter to initialize defaults. This 

220 # can only be done after most of the rest of Registry has already been 

221 # initialized, and must be done before the property getter is used. 

222 self.defaults = defaults 

223 

224 def __str__(self) -> str: 

225 return str(self._db) 

226 

227 def __repr__(self) -> str: 

228 return f"Registry({self._db!r}, {self.dimensions!r})" 

229 

230 def isWriteable(self) -> bool: 

231 """Return `True` if this registry allows write operations, and `False` 

232 otherwise. 

233 """ 

234 return self._db.isWriteable() 

235 

236 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry: 

237 """Create a new `Registry` backed by the same data repository and 

238 connection as this one, but independent defaults. 

239 

240 Parameters 

241 ---------- 

242 defaults : `RegistryDefaults`, optional 

243 Default collections and data ID values for the new registry. If 

244 not provided, ``self.defaults`` will be used (but future changes 

245 to either registry's defaults will not affect the other). 

246 

247 Returns 

248 ------- 

249 copy : `Registry` 

250 A new `Registry` instance with its own defaults. 

251 

252 Notes 

253 ----- 

254 Because the new registry shares a connection with the original, they 

255 also share transaction state (despite the fact that their `transaction` 

256 context manager methods do not reflect this), and must be used with 

257 care. 

258 """ 

259 if defaults is None: 

260 # No need to copy, because `RegistryDefaults` is immutable; we 

261 # effectively copy on write. 

262 defaults = self.defaults 

263 return Registry(self._db, defaults, self._managers) 

264 

265 @property 

266 def dimensions(self) -> DimensionUniverse: 

267 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

268 """ 

269 return self._managers.dimensions.universe 

270 

271 @property 

272 def defaults(self) -> RegistryDefaults: 

273 """Default collection search path and/or output `~CollectionType.RUN` 

274 collection (`RegistryDefaults`). 

275 

276 This is an immutable struct whose components may not be set 

277 individually, but the entire struct can be set by assigning to this 

278 property. 

279 """ 

280 return self._defaults 

281 

282 @defaults.setter 

283 def defaults(self, value: RegistryDefaults) -> None: 

284 if value.run is not None: 

285 self.registerRun(value.run) 

286 value.finish(self) 

287 self._defaults = value 

288 

289 def refresh(self) -> None: 

290 """Refresh all in-memory state by querying the database. 

291 

292 This may be necessary to enable querying for entities added by other 

293 `Registry` instances after this one was constructed. 

294 """ 

295 self._managers.refresh() 

296 

297 @contextlib.contextmanager 

298 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

299 """Return a context manager that represents a transaction. 

300 """ 

301 try: 

302 with self._db.transaction(savepoint=savepoint): 

303 yield 

304 except BaseException: 

305 # TODO: this clears the caches sometimes when we wouldn't actually 

306 # need to. Can we avoid that? 

307 self._managers.dimensions.clearCaches() 

308 raise 

309 

310 def resetConnectionPool(self) -> None: 

311 """Reset SQLAlchemy connection pool for registry database. 

312 

313 This operation is useful when using registry with fork-based 

314 multiprocessing. To use registry across fork boundary one has to make 

315 sure that there are no currently active connections (no session or 

316 transaction is in progress) and connection pool is reset using this 

317 method. This method should be called by the child process immediately 

318 after the fork. 

319 """ 

320 self._db._engine.dispose() 

321 

322 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

323 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

324 other data repository client. 

325 

326 Opaque table records can be added via `insertOpaqueData`, retrieved via 

327 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

328 

329 Parameters 

330 ---------- 

331 tableName : `str` 

332 Logical name of the opaque table. This may differ from the 

333 actual name used in the database by a prefix and/or suffix. 

334 spec : `ddl.TableSpec` 

335 Specification for the table to be added. 

336 """ 

337 self._managers.opaque.register(tableName, spec) 

338 

339 @transactional 

340 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

341 """Insert records into an opaque table. 

342 

343 Parameters 

344 ---------- 

345 tableName : `str` 

346 Logical name of the opaque table. Must match the name used in a 

347 previous call to `registerOpaqueTable`. 

348 data 

349 Each additional positional argument is a dictionary that represents 

350 a single row to be added. 

351 """ 

352 self._managers.opaque[tableName].insert(*data) 

353 

354 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

355 """Retrieve records from an opaque table. 

356 

357 Parameters 

358 ---------- 

359 tableName : `str` 

360 Logical name of the opaque table. Must match the name used in a 

361 previous call to `registerOpaqueTable`. 

362 where 

363 Additional keyword arguments are interpreted as equality 

364 constraints that restrict the returned rows (combined with AND); 

365 keyword arguments are column names and values are the values they 

366 must have. 

367 

368 Yields 

369 ------ 

370 row : `dict` 

371 A dictionary representing a single result row. 

372 """ 

373 yield from self._managers.opaque[tableName].fetch(**where) 

374 

375 @transactional 

376 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

377 """Remove records from an opaque table. 

378 

379 Parameters 

380 ---------- 

381 tableName : `str` 

382 Logical name of the opaque table. Must match the name used in a 

383 previous call to `registerOpaqueTable`. 

384 where 

385 Additional keyword arguments are interpreted as equality 

386 constraints that restrict the deleted rows (combined with AND); 

387 keyword arguments are column names and values are the values they 

388 must have. 

389 """ 

390 self._managers.opaque[tableName].delete(where.keys(), where) 

391 

392 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED, 

393 doc: Optional[str] = None) -> None: 

394 """Add a new collection if one with the given name does not exist. 

395 

396 Parameters 

397 ---------- 

398 name : `str` 

399 The name of the collection to create. 

400 type : `CollectionType` 

401 Enum value indicating the type of collection to create. 

402 doc : `str`, optional 

403 Documentation string for the collection. 

404 

405 Notes 

406 ----- 

407 This method cannot be called within transactions, as it needs to be 

408 able to perform its own transaction to be concurrent. 

409 """ 

410 self._managers.collections.register(name, type, doc=doc) 

411 

412 def getCollectionType(self, name: str) -> CollectionType: 

413 """Return an enumeration value indicating the type of the given 

414 collection. 

415 

416 Parameters 

417 ---------- 

418 name : `str` 

419 The name of the collection. 

420 

421 Returns 

422 ------- 

423 type : `CollectionType` 

424 Enum value indicating the type of this collection. 

425 

426 Raises 

427 ------ 

428 MissingCollectionError 

429 Raised if no collection with the given name exists. 

430 """ 

431 return self._managers.collections.find(name).type 

432 

433 def registerRun(self, name: str, doc: Optional[str] = None) -> None: 

434 """Add a new run if one with the given name does not exist. 

435 

436 Parameters 

437 ---------- 

438 name : `str` 

439 The name of the run to create. 

440 doc : `str`, optional 

441 Documentation string for the collection. 

442 

443 Notes 

444 ----- 

445 This method cannot be called within transactions, as it needs to be 

446 able to perform its own transaction to be concurrent. 

447 """ 

448 self._managers.collections.register(name, CollectionType.RUN, doc=doc) 

449 

450 @transactional 

451 def removeCollection(self, name: str) -> None: 

452 """Completely remove the given collection. 

453 

454 Parameters 

455 ---------- 

456 name : `str` 

457 The name of the collection to remove. 

458 

459 Raises 

460 ------ 

461 MissingCollectionError 

462 Raised if no collection with the given name exists. 

463 

464 Notes 

465 ----- 

466 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

467 in it are also fully removed. This requires that those datasets be 

468 removed (or at least trashed) from any datastores that hold them first. 

469 

470 A collection may not be deleted as long as it is referenced by a 

471 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

472 be deleted or redefined first. 

473 """ 

474 self._managers.collections.remove(name) 

475 

476 def getCollectionChain(self, parent: str) -> CollectionSearch: 

477 """Return the child collections in a `~CollectionType.CHAINED` 

478 collection. 

479 

480 Parameters 

481 ---------- 

482 parent : `str` 

483 Name of the chained collection. Must have already been added via 

484 a call to `Registry.registerCollection`. 

485 

486 Returns 

487 ------- 

488 children : `CollectionSearch` 

489 An object that defines the search path of the collection. 

490 See :ref:`daf_butler_collection_expressions` for more information. 

491 

492 Raises 

493 ------ 

494 MissingCollectionError 

495 Raised if ``parent`` does not exist in the `Registry`. 

496 TypeError 

497 Raised if ``parent`` does not correspond to a 

498 `~CollectionType.CHAINED` collection. 

499 """ 

500 record = self._managers.collections.find(parent) 

501 if record.type is not CollectionType.CHAINED: 

502 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

503 assert isinstance(record, ChainedCollectionRecord) 

504 return record.children 

505 

506 @transactional 

507 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None: 

508 """Define or redefine a `~CollectionType.CHAINED` collection. 

509 

510 Parameters 

511 ---------- 

512 parent : `str` 

513 Name of the chained collection. Must have already been added via 

514 a call to `Registry.registerCollection`. 

515 children : `Any` 

516 An expression defining an ordered search of child collections, 

517 generally an iterable of `str`; see 

518 :ref:`daf_butler_collection_expressions` for more information. 

519 flatten : `bool`, optional 

520 If `True` (`False` is default), recursively flatten out any nested 

521 `~CollectionType.CHAINED` collections in ``children`` first. 

522 

523 Raises 

524 ------ 

525 MissingCollectionError 

526 Raised when any of the given collections do not exist in the 

527 `Registry`. 

528 TypeError 

529 Raised if ``parent`` does not correspond to a 

530 `~CollectionType.CHAINED` collection. 

531 ValueError 

532 Raised if the given collections contains a cycle. 

533 """ 

534 record = self._managers.collections.find(parent) 

535 if record.type is not CollectionType.CHAINED: 

536 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

537 assert isinstance(record, ChainedCollectionRecord) 

538 children = CollectionSearch.fromExpression(children) 

539 if children != record.children or flatten: 

540 record.update(self._managers.collections, children, flatten=flatten) 

541 

542 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

543 """Retrieve the documentation string for a collection. 

544 

545 Parameters 

546 ---------- 

547 name : `str` 

548 Name of the collection. 

549 

550 Returns 

551 ------- 

552 docs : `str` or `None` 

553 Docstring for the collection with the given name. 

554 """ 

555 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key) 

556 

557 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

558 """Set the documentation string for a collection. 

559 

560 Parameters 

561 ---------- 

562 name : `str` 

563 Name of the collection. 

564 docs : `str` or `None` 

565 Docstring for the collection with the given name; will replace any 

566 existing docstring. Passing `None` will remove any existing 

567 docstring. 

568 """ 

569 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc) 

570 

571 def getCollectionSummary(self, collection: str) -> CollectionSummary: 

572 """Return a summary for the given collection. 

573 

574 Parameters 

575 ---------- 

576 collection : `str` 

577 Name of the collection for which a summary is to be retrieved. 

578 

579 Returns 

580 ------- 

581 summary : `CollectionSummary` 

582 Summary of the dataset types and governor dimension values in 

583 this collection. 

584 """ 

585 record = self._managers.collections.find(collection) 

586 return self._managers.datasets.getCollectionSummary(record) 

587 

588 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

589 """ 

590 Add a new `DatasetType` to the Registry. 

591 

592 It is not an error to register the same `DatasetType` twice. 

593 

594 Parameters 

595 ---------- 

596 datasetType : `DatasetType` 

597 The `DatasetType` to be added. 

598 

599 Returns 

600 ------- 

601 inserted : `bool` 

602 `True` if ``datasetType`` was inserted, `False` if an identical 

603 existing `DatsetType` was found. Note that in either case the 

604 DatasetType is guaranteed to be defined in the Registry 

605 consistently with the given definition. 

606 

607 Raises 

608 ------ 

609 ValueError 

610 Raised if the dimensions or storage class are invalid. 

611 ConflictingDefinitionError 

612 Raised if this DatasetType is already registered with a different 

613 definition. 

614 

615 Notes 

616 ----- 

617 This method cannot be called within transactions, as it needs to be 

618 able to perform its own transaction to be concurrent. 

619 """ 

620 _, inserted = self._managers.datasets.register(datasetType) 

621 return inserted 

622 

623 def removeDatasetType(self, name: str) -> None: 

624 """Remove the named `DatasetType` from the registry. 

625 

626 .. warning:: 

627 

628 Registry caches the dataset type definitions. This means that 

629 deleting the dataset type definition may result in unexpected 

630 behavior from other butler processes that are active that have 

631 not seen the deletion. 

632 

633 Parameters 

634 ---------- 

635 name : `str` 

636 Name of the type to be removed. 

637 

638 Raises 

639 ------ 

640 lsst.daf.butler.registry.OrphanedRecordError 

641 Raised if an attempt is made to remove the dataset type definition 

642 when there are already datasets associated with it. 

643 

644 Notes 

645 ----- 

646 If the dataset type is not registered the method will return without 

647 action. 

648 """ 

649 self._managers.datasets.remove(name) 

650 

651 def getDatasetType(self, name: str) -> DatasetType: 

652 """Get the `DatasetType`. 

653 

654 Parameters 

655 ---------- 

656 name : `str` 

657 Name of the type. 

658 

659 Returns 

660 ------- 

661 type : `DatasetType` 

662 The `DatasetType` associated with the given name. 

663 

664 Raises 

665 ------ 

666 KeyError 

667 Requested named DatasetType could not be found in registry. 

668 """ 

669 return self._managers.datasets[name].datasetType 

670 

671 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

672 collections: Any = None, timespan: Optional[Timespan] = None, 

673 **kwargs: Any) -> Optional[DatasetRef]: 

674 """Find a dataset given its `DatasetType` and data ID. 

675 

676 This can be used to obtain a `DatasetRef` that permits the dataset to 

677 be read from a `Datastore`. If the dataset is a component and can not 

678 be found using the provided dataset type, a dataset ref for the parent 

679 will be returned instead but with the correct dataset type. 

680 

681 Parameters 

682 ---------- 

683 datasetType : `DatasetType` or `str` 

684 A `DatasetType` or the name of one. 

685 dataId : `dict` or `DataCoordinate`, optional 

686 A `dict`-like object containing the `Dimension` links that identify 

687 the dataset within a collection. 

688 collections, optional. 

689 An expression that fully or partially identifies the collections to 

690 search for the dataset; see 

691 :ref:`daf_butler_collection_expressions` for more information. 

692 Defaults to ``self.defaults.collections``. 

693 timespan : `Timespan`, optional 

694 A timespan that the validity range of the dataset must overlap. 

695 If not provided, any `~CollectionType.CALIBRATION` collections 

696 matched by the ``collections`` argument will not be searched. 

697 **kwargs 

698 Additional keyword arguments passed to 

699 `DataCoordinate.standardize` to convert ``dataId`` to a true 

700 `DataCoordinate` or augment an existing one. 

701 

702 Returns 

703 ------- 

704 ref : `DatasetRef` 

705 A reference to the dataset, or `None` if no matching Dataset 

706 was found. 

707 

708 Raises 

709 ------ 

710 TypeError 

711 Raised if ``collections`` is `None` and 

712 ``self.defaults.collections`` is `None`. 

713 LookupError 

714 Raised if one or more data ID keys are missing. 

715 KeyError 

716 Raised if the dataset type does not exist. 

717 MissingCollectionError 

718 Raised if any of ``collections`` does not exist in the registry. 

719 

720 Notes 

721 ----- 

722 This method simply returns `None` and does not raise an exception even 

723 when the set of collections searched is intrinsically incompatible with 

724 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but 

725 only `~CollectionType.CALIBRATION` collections are being searched. 

726 This may make it harder to debug some lookup failures, but the behavior 

727 is intentional; we consider it more important that failed searches are 

728 reported consistently, regardless of the reason, and that adding 

729 additional collections that do not contain a match to the search path 

730 never changes the behavior. 

731 """ 

732 if isinstance(datasetType, DatasetType): 

733 storage = self._managers.datasets[datasetType.name] 

734 else: 

735 storage = self._managers.datasets[datasetType] 

736 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

737 universe=self.dimensions, defaults=self.defaults.dataId, 

738 **kwargs) 

739 if collections is None: 

740 if not self.defaults.collections: 

741 raise TypeError("No collections provided to findDataset, " 

742 "and no defaults from registry construction.") 

743 collections = self.defaults.collections 

744 else: 

745 collections = CollectionSearch.fromExpression(collections) 

746 for collectionRecord in collections.iter(self._managers.collections): 

747 if (collectionRecord.type is CollectionType.CALIBRATION 

748 and (not storage.datasetType.isCalibration() or timespan is None)): 

749 continue 

750 result = storage.find(collectionRecord, dataId, timespan=timespan) 

751 if result is not None: 

752 return result 

753 

754 return None 

755 

756 @transactional 

757 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

758 run: Optional[str] = None) -> List[DatasetRef]: 

759 """Insert one or more datasets into the `Registry` 

760 

761 This always adds new datasets; to associate existing datasets with 

762 a new collection, use ``associate``. 

763 

764 Parameters 

765 ---------- 

766 datasetType : `DatasetType` or `str` 

767 A `DatasetType` or the name of one. 

768 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

769 Dimension-based identifiers for the new datasets. 

770 run : `str`, optional 

771 The name of the run that produced the datasets. Defaults to 

772 ``self.defaults.run``. 

773 

774 Returns 

775 ------- 

776 refs : `list` of `DatasetRef` 

777 Resolved `DatasetRef` instances for all given data IDs (in the same 

778 order). 

779 

780 Raises 

781 ------ 

782 TypeError 

783 Raised if ``run`` is `None` and ``self.defaults.run`` is `None`. 

784 ConflictingDefinitionError 

785 If a dataset with the same dataset type and data ID as one of those 

786 given already exists in ``run``. 

787 MissingCollectionError 

788 Raised if ``run`` does not exist in the registry. 

789 """ 

790 if isinstance(datasetType, DatasetType): 

791 storage = self._managers.datasets.find(datasetType.name) 

792 if storage is None: 

793 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

794 else: 

795 storage = self._managers.datasets.find(datasetType) 

796 if storage is None: 

797 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

798 if run is None: 

799 if self.defaults.run is None: 

800 raise TypeError("No run provided to insertDatasets, " 

801 "and no default from registry construction.") 

802 run = self.defaults.run 

803 runRecord = self._managers.collections.find(run) 

804 if runRecord.type is not CollectionType.RUN: 

805 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

806 assert isinstance(runRecord, RunRecord) 

807 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

808 for dataId in dataIds] 

809 try: 

810 refs = list(storage.insert(runRecord, expandedDataIds)) 

811 except sqlalchemy.exc.IntegrityError as err: 

812 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

813 f"one or more datasets of type {storage.datasetType} into " 

814 f"collection '{run}'. " 

815 f"This probably means a dataset with the same data ID " 

816 f"and dataset type already exists, but it may also mean a " 

817 f"dimension row is missing.") from err 

818 return refs 

819 

820 def getDataset(self, id: int) -> Optional[DatasetRef]: 

821 """Retrieve a Dataset entry. 

822 

823 Parameters 

824 ---------- 

825 id : `int` 

826 The unique identifier for the dataset. 

827 

828 Returns 

829 ------- 

830 ref : `DatasetRef` or `None` 

831 A ref to the Dataset, or `None` if no matching Dataset 

832 was found. 

833 """ 

834 ref = self._managers.datasets.getDatasetRef(id) 

835 if ref is None: 

836 return None 

837 return ref 

838 

839 @transactional 

840 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

841 """Remove datasets from the Registry. 

842 

843 The datasets will be removed unconditionally from all collections, and 

844 any `Quantum` that consumed this dataset will instead be marked with 

845 having a NULL input. `Datastore` records will *not* be deleted; the 

846 caller is responsible for ensuring that the dataset has already been 

847 removed from all Datastores. 

848 

849 Parameters 

850 ---------- 

851 refs : `Iterable` of `DatasetRef` 

852 References to the datasets to be removed. Must include a valid 

853 ``id`` attribute, and should be considered invalidated upon return. 

854 

855 Raises 

856 ------ 

857 AmbiguousDatasetError 

858 Raised if any ``ref.id`` is `None`. 

859 OrphanedRecordError 

860 Raised if any dataset is still present in any `Datastore`. 

861 """ 

862 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

863 storage = self._managers.datasets.find(datasetType.name) 

864 assert storage is not None 

865 try: 

866 storage.delete(refsForType) 

867 except sqlalchemy.exc.IntegrityError as err: 

868 raise OrphanedRecordError("One or more datasets is still " 

869 "present in one or more Datastores.") from err 

870 

871 @transactional 

872 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

873 """Add existing datasets to a `~CollectionType.TAGGED` collection. 

874 

875 If a DatasetRef with the same exact integer ID is already in a 

876 collection nothing is changed. If a `DatasetRef` with the same 

877 `DatasetType` and data ID but with different integer ID 

878 exists in the collection, `ConflictingDefinitionError` is raised. 

879 

880 Parameters 

881 ---------- 

882 collection : `str` 

883 Indicates the collection the datasets should be associated with. 

884 refs : `Iterable` [ `DatasetRef` ] 

885 An iterable of resolved `DatasetRef` instances that already exist 

886 in this `Registry`. 

887 

888 Raises 

889 ------ 

890 ConflictingDefinitionError 

891 If a Dataset with the given `DatasetRef` already exists in the 

892 given collection. 

893 AmbiguousDatasetError 

894 Raised if ``any(ref.id is None for ref in refs)``. 

895 MissingCollectionError 

896 Raised if ``collection`` does not exist in the registry. 

897 TypeError 

898 Raise adding new datasets to the given ``collection`` is not 

899 allowed. 

900 """ 

901 collectionRecord = self._managers.collections.find(collection) 

902 if collectionRecord.type is not CollectionType.TAGGED: 

903 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

904 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

905 storage = self._managers.datasets.find(datasetType.name) 

906 assert storage is not None 

907 try: 

908 storage.associate(collectionRecord, refsForType) 

909 except sqlalchemy.exc.IntegrityError as err: 

910 raise ConflictingDefinitionError( 

911 f"Constraint violation while associating dataset of type {datasetType.name} with " 

912 f"collection {collection}. This probably means that one or more datasets with the same " 

913 f"dataset type and data ID already exist in the collection, but it may also indicate " 

914 f"that the datasets do not exist." 

915 ) from err 

916 

917 @transactional 

918 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

919 """Remove existing datasets from a `~CollectionType.TAGGED` collection. 

920 

921 ``collection`` and ``ref`` combinations that are not currently 

922 associated are silently ignored. 

923 

924 Parameters 

925 ---------- 

926 collection : `str` 

927 The collection the datasets should no longer be associated with. 

928 refs : `Iterable` [ `DatasetRef` ] 

929 An iterable of resolved `DatasetRef` instances that already exist 

930 in this `Registry`. 

931 

932 Raises 

933 ------ 

934 AmbiguousDatasetError 

935 Raised if any of the given dataset references is unresolved. 

936 MissingCollectionError 

937 Raised if ``collection`` does not exist in the registry. 

938 TypeError 

939 Raise adding new datasets to the given ``collection`` is not 

940 allowed. 

941 """ 

942 collectionRecord = self._managers.collections.find(collection) 

943 if collectionRecord.type is not CollectionType.TAGGED: 

944 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

945 "expected TAGGED.") 

946 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

947 storage = self._managers.datasets.find(datasetType.name) 

948 assert storage is not None 

949 storage.disassociate(collectionRecord, refsForType) 

950 

951 @transactional 

952 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

953 """Associate one or more datasets with a calibration collection and a 

954 validity range within it. 

955 

956 Parameters 

957 ---------- 

958 collection : `str` 

959 The name of an already-registered `~CollectionType.CALIBRATION` 

960 collection. 

961 refs : `Iterable` [ `DatasetRef` ] 

962 Datasets to be associated. 

963 timespan : `Timespan` 

964 The validity range for these datasets within the collection. 

965 

966 Raises 

967 ------ 

968 AmbiguousDatasetError 

969 Raised if any of the given `DatasetRef` instances is unresolved. 

970 ConflictingDefinitionError 

971 Raised if the collection already contains a different dataset with 

972 the same `DatasetType` and data ID and an overlapping validity 

973 range. 

974 TypeError 

975 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

976 collection or if one or more datasets are of a dataset type for 

977 which `DatasetType.isCalibration` returns `False`. 

978 """ 

979 collectionRecord = self._managers.collections.find(collection) 

980 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

981 storage = self._managers.datasets[datasetType.name] 

982 storage.certify(collectionRecord, refsForType, timespan) 

983 

984 @transactional 

985 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *, 

986 dataIds: Optional[Iterable[DataId]] = None) -> None: 

987 """Remove or adjust datasets to clear a validity range within a 

988 calibration collection. 

989 

990 Parameters 

991 ---------- 

992 collection : `str` 

993 The name of an already-registered `~CollectionType.CALIBRATION` 

994 collection. 

995 datasetType : `str` or `DatasetType` 

996 Name or `DatasetType` instance for the datasets to be decertified. 

997 timespan : `Timespan`, optional 

998 The validity range to remove datasets from within the collection. 

999 Datasets that overlap this range but are not contained by it will 

1000 have their validity ranges adjusted to not overlap it, which may 

1001 split a single dataset validity range into two. 

1002 dataIds : `Iterable` [ `DataId` ], optional 

1003 Data IDs that should be decertified within the given validity range 

1004 If `None`, all data IDs for ``self.datasetType`` will be 

1005 decertified. 

1006 

1007 Raises 

1008 ------ 

1009 TypeError 

1010 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

1011 collection or if ``datasetType.isCalibration() is False``. 

1012 """ 

1013 collectionRecord = self._managers.collections.find(collection) 

1014 if isinstance(datasetType, str): 

1015 storage = self._managers.datasets[datasetType] 

1016 else: 

1017 storage = self._managers.datasets[datasetType.name] 

1018 standardizedDataIds = None 

1019 if dataIds is not None: 

1020 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) 

1021 for d in dataIds] 

1022 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

1023 

1024 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

1025 """Return an object that allows a new `Datastore` instance to 

1026 communicate with this `Registry`. 

1027 

1028 Returns 

1029 ------- 

1030 manager : `DatastoreRegistryBridgeManager` 

1031 Object that mediates communication between this `Registry` and its 

1032 associated datastores. 

1033 """ 

1034 return self._managers.datastores 

1035 

1036 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

1037 """Retrieve datastore locations for a given dataset. 

1038 

1039 Parameters 

1040 ---------- 

1041 ref : `DatasetRef` 

1042 A reference to the dataset for which to retrieve storage 

1043 information. 

1044 

1045 Returns 

1046 ------- 

1047 datastores : `Iterable` [ `str` ] 

1048 All the matching datastores holding this dataset. 

1049 

1050 Raises 

1051 ------ 

1052 AmbiguousDatasetError 

1053 Raised if ``ref.id`` is `None`. 

1054 """ 

1055 return self._managers.datastores.findDatastores(ref) 

1056 

1057 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

1058 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

1059 withDefaults: bool = True, 

1060 **kwargs: Any) -> DataCoordinate: 

1061 """Expand a dimension-based data ID to include additional information. 

1062 

1063 Parameters 

1064 ---------- 

1065 dataId : `DataCoordinate` or `dict`, optional 

1066 Data ID to be expanded; augmented and overridden by ``kwds``. 

1067 graph : `DimensionGraph`, optional 

1068 Set of dimensions for the expanded ID. If `None`, the dimensions 

1069 will be inferred from the keys of ``dataId`` and ``kwds``. 

1070 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph`` 

1071 are silently ignored, providing a way to extract and expand a 

1072 subset of a data ID. 

1073 records : `Mapping` [`str`, `DimensionRecord`], optional 

1074 Dimension record data to use before querying the database for that 

1075 data, keyed by element name. 

1076 withDefaults : `bool`, optional 

1077 Utilize ``self.defaults.dataId`` to fill in missing governor 

1078 dimension key-value pairs. Defaults to `True` (i.e. defaults are 

1079 used). 

1080 **kwargs 

1081 Additional keywords are treated like additional key-value pairs for 

1082 ``dataId``, extending and overriding 

1083 

1084 Returns 

1085 ------- 

1086 expanded : `DataCoordinate` 

1087 A data ID that includes full metadata for all of the dimensions it 

1088 identifieds, i.e. guarantees that ``expanded.hasRecords()`` and 

1089 ``expanded.hasFull()`` both return `True`. 

1090 """ 

1091 if not withDefaults: 

1092 defaults = None 

1093 else: 

1094 defaults = self.defaults.dataId 

1095 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, 

1096 defaults=defaults, **kwargs) 

1097 if standardized.hasRecords(): 

1098 return standardized 

1099 if records is None: 

1100 records = {} 

1101 elif isinstance(records, NamedKeyMapping): 

1102 records = records.byName() 

1103 else: 

1104 records = dict(records) 

1105 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

1106 records.update(dataId.records.byName()) 

1107 keys = standardized.byName() 

1108 for element in standardized.graph.primaryKeyTraversalOrder: 

1109 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

1110 if record is ...: 

1111 if isinstance(element, Dimension) and keys.get(element.name) is None: 

1112 if element in standardized.graph.required: 

1113 raise LookupError( 

1114 f"No value or null value for required dimension {element.name}." 

1115 ) 

1116 keys[element.name] = None 

1117 record = None 

1118 else: 

1119 storage = self._managers.dimensions[element] 

1120 dataIdSet = DataCoordinateIterable.fromScalar( 

1121 DataCoordinate.standardize(keys, graph=element.graph) 

1122 ) 

1123 fetched = tuple(storage.fetch(dataIdSet)) 

1124 try: 

1125 (record,) = fetched 

1126 except ValueError: 

1127 record = None 

1128 records[element.name] = record 

1129 if record is not None: 

1130 for d in element.implied: 

1131 value = getattr(record, d.name) 

1132 if keys.setdefault(d.name, value) != value: 

1133 raise InconsistentDataIdError( 

1134 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

1135 f"but {element.name} implies {d.name}={value!r}." 

1136 ) 

1137 else: 

1138 if element in standardized.graph.required: 

1139 raise LookupError( 

1140 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

1141 ) 

1142 if element.alwaysJoin: 

1143 raise InconsistentDataIdError( 

1144 f"Could not fetch record for element {element.name} via keys {keys}, ", 

1145 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

1146 "related." 

1147 ) 

1148 for d in element.implied: 

1149 keys.setdefault(d.name, None) 

1150 records.setdefault(d.name, None) 

1151 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

1152 

1153 def insertDimensionData(self, element: Union[DimensionElement, str], 

1154 *data: Union[Mapping[str, Any], DimensionRecord], 

1155 conform: bool = True) -> None: 

1156 """Insert one or more dimension records into the database. 

1157 

1158 Parameters 

1159 ---------- 

1160 element : `DimensionElement` or `str` 

1161 The `DimensionElement` or name thereof that identifies the table 

1162 records will be inserted into. 

1163 data : `dict` or `DimensionRecord` (variadic) 

1164 One or more records to insert. 

1165 conform : `bool`, optional 

1166 If `False` (`True` is default) perform no checking or conversions, 

1167 and assume that ``element`` is a `DimensionElement` instance and 

1168 ``data`` is a one or more `DimensionRecord` instances of the 

1169 appropriate subclass. 

1170 """ 

1171 if conform: 

1172 if isinstance(element, str): 

1173 element = self.dimensions[element] 

1174 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1175 for row in data] 

1176 else: 

1177 # Ignore typing since caller said to trust them with conform=False. 

1178 records = data # type: ignore 

1179 storage = self._managers.dimensions[element] # type: ignore 

1180 storage.insert(*records) 

1181 

1182 def syncDimensionData(self, element: Union[DimensionElement, str], 

1183 row: Union[Mapping[str, Any], DimensionRecord], 

1184 conform: bool = True) -> bool: 

1185 """Synchronize the given dimension record with the database, inserting 

1186 if it does not already exist and comparing values if it does. 

1187 

1188 Parameters 

1189 ---------- 

1190 element : `DimensionElement` or `str` 

1191 The `DimensionElement` or name thereof that identifies the table 

1192 records will be inserted into. 

1193 row : `dict` or `DimensionRecord` 

1194 The record to insert. 

1195 conform : `bool`, optional 

1196 If `False` (`True` is default) perform no checking or conversions, 

1197 and assume that ``element`` is a `DimensionElement` instance and 

1198 ``data`` is a one or more `DimensionRecord` instances of the 

1199 appropriate subclass. 

1200 

1201 Returns 

1202 ------- 

1203 inserted : `bool` 

1204 `True` if a new row was inserted, `False` otherwise. 

1205 

1206 Raises 

1207 ------ 

1208 ConflictingDefinitionError 

1209 Raised if the record exists in the database (according to primary 

1210 key lookup) but is inconsistent with the given one. 

1211 """ 

1212 if conform: 

1213 if isinstance(element, str): 

1214 element = self.dimensions[element] 

1215 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1216 else: 

1217 # Ignore typing since caller said to trust them with conform=False. 

1218 record = row # type: ignore 

1219 storage = self._managers.dimensions[element] # type: ignore 

1220 return storage.sync(record) 

1221 

1222 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

1223 ) -> Iterator[DatasetType]: 

1224 """Iterate over the dataset types whose names match an expression. 

1225 

1226 Parameters 

1227 ---------- 

1228 expression : `Any`, optional 

1229 An expression that fully or partially identifies the dataset types 

1230 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1231 `...` can be used to return all dataset types, and is the default. 

1232 See :ref:`daf_butler_dataset_type_expressions` for more 

1233 information. 

1234 components : `bool`, optional 

1235 If `True`, apply all expression patterns to component dataset type 

1236 names as well. If `False`, never apply patterns to components. 

1237 If `None` (default), apply patterns to components only if their 

1238 parent datasets were not matched by the expression. 

1239 Fully-specified component datasets (`str` or `DatasetType` 

1240 instances) are always included. 

1241 

1242 Yields 

1243 ------ 

1244 datasetType : `DatasetType` 

1245 A `DatasetType` instance whose name matches ``expression``. 

1246 """ 

1247 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

1248 if wildcard is Ellipsis: 

1249 for datasetType in self._managers.datasets: 

1250 # The dataset type can no longer be a component 

1251 yield datasetType 

1252 if components: 

1253 # Automatically create the component dataset types 

1254 try: 

1255 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes() 

1256 except KeyError as err: 

1257 _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; " 

1258 "if it has components they will not be included in query results.") 

1259 else: 

1260 yield from componentsForDatasetType 

1261 return 

1262 done: Set[str] = set() 

1263 for name in wildcard.strings: 

1264 storage = self._managers.datasets.find(name) 

1265 if storage is not None: 

1266 done.add(storage.datasetType.name) 

1267 yield storage.datasetType 

1268 if wildcard.patterns: 

1269 # If components (the argument) is None, we'll save component 

1270 # dataset that we might want to match, but only if their parents 

1271 # didn't get included. 

1272 componentsForLater = [] 

1273 for registeredDatasetType in self._managers.datasets: 

1274 # Components are not stored in registry so expand them here 

1275 allDatasetTypes = [registeredDatasetType] 

1276 try: 

1277 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes()) 

1278 except KeyError as err: 

1279 _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; " 

1280 "if it has components they will not be included in query results.") 

1281 for datasetType in allDatasetTypes: 

1282 if datasetType.name in done: 

1283 continue 

1284 parentName, componentName = datasetType.nameAndComponent() 

1285 if componentName is not None and not components: 

1286 if components is None and parentName not in done: 

1287 componentsForLater.append(datasetType) 

1288 continue 

1289 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1290 done.add(datasetType.name) 

1291 yield datasetType 

1292 # Go back and try to match saved components. 

1293 for datasetType in componentsForLater: 

1294 parentName, _ = datasetType.nameAndComponent() 

1295 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1296 yield datasetType 

1297 

1298 def queryCollections(self, expression: Any = ..., 

1299 datasetType: Optional[DatasetType] = None, 

1300 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1301 flattenChains: bool = False, 

1302 includeChains: Optional[bool] = None) -> Iterator[str]: 

1303 """Iterate over the collections whose names match an expression. 

1304 

1305 Parameters 

1306 ---------- 

1307 expression : `Any`, optional 

1308 An expression that fully or partially identifies the collections 

1309 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1310 `...` can be used to return all collections, and is the default. 

1311 See :ref:`daf_butler_collection_expressions` for more 

1312 information. 

1313 datasetType : `DatasetType`, optional 

1314 If provided, only yield collections that may contain datasets of 

1315 this type. This is a conservative approximation in general; it may 

1316 yield collections that do not have any such datasets. 

1317 collectionTypes : `AbstractSet` [ `CollectionType` ], optional 

1318 If provided, only yield collections of these types. 

1319 flattenChains : `bool`, optional 

1320 If `True` (`False` is default), recursively yield the child 

1321 collections of matching `~CollectionType.CHAINED` collections. 

1322 includeChains : `bool`, optional 

1323 If `True`, yield records for matching `~CollectionType.CHAINED` 

1324 collections. Default is the opposite of ``flattenChains``: include 

1325 either CHAINED collections or their children, but not both. 

1326 

1327 Yields 

1328 ------ 

1329 collection : `str` 

1330 The name of a collection that matches ``expression``. 

1331 """ 

1332 # Right now the datasetTypes argument is completely ignored, but that 

1333 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

1334 # ticket will take care of that. 

1335 query = CollectionQuery.fromExpression(expression) 

1336 for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes), 

1337 flattenChains=flattenChains, includeChains=includeChains): 

1338 yield record.name 

1339 

1340 def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder: 

1341 """Return a `QueryBuilder` instance capable of constructing and 

1342 managing more complex queries than those obtainable via `Registry` 

1343 interfaces. 

1344 

1345 This is an advanced interface; downstream code should prefer 

1346 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

1347 are sufficient. 

1348 

1349 Parameters 

1350 ---------- 

1351 summary : `queries.QuerySummary` 

1352 Object describing and categorizing the full set of dimensions that 

1353 will be included in the query. 

1354 

1355 Returns 

1356 ------- 

1357 builder : `queries.QueryBuilder` 

1358 Object that can be used to construct and perform advanced queries. 

1359 """ 

1360 return queries.QueryBuilder( 

1361 summary, 

1362 queries.RegistryManagers( 

1363 collections=self._managers.collections, 

1364 dimensions=self._managers.dimensions, 

1365 datasets=self._managers.datasets, 

1366 TimespanReprClass=self._db.getTimespanRepresentation(), 

1367 ), 

1368 ) 

1369 

1370 def queryDatasets(self, datasetType: Any, *, 

1371 collections: Any = None, 

1372 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1373 dataId: Optional[DataId] = None, 

1374 where: Optional[str] = None, 

1375 findFirst: bool = False, 

1376 components: Optional[bool] = None, 

1377 bind: Optional[Mapping[str, Any]] = None, 

1378 check: bool = True, 

1379 **kwargs: Any) -> queries.DatasetQueryResults: 

1380 """Query for and iterate over dataset references matching user-provided 

1381 criteria. 

1382 

1383 Parameters 

1384 ---------- 

1385 datasetType 

1386 An expression that fully or partially identifies the dataset types 

1387 to be queried. Allowed types include `DatasetType`, `str`, 

1388 `re.Pattern`, and iterables thereof. The special value `...` can 

1389 be used to query all dataset types. See 

1390 :ref:`daf_butler_dataset_type_expressions` for more information. 

1391 collections: optional 

1392 An expression that fully or partially identifies the collections 

1393 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1394 thereof. `...` can be used to find datasets from all 

1395 `~CollectionType.RUN` collections (no other collections are 

1396 necessary, because all datasets are in a ``RUN`` collection). See 

1397 :ref:`daf_butler_collection_expressions` for more information. 

1398 If not provided, ``self.default.collections`` is used. 

1399 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1400 Dimensions to include in the query (in addition to those used 

1401 to identify the queried dataset type(s)), either to constrain 

1402 the resulting datasets to those for which a matching dimension 

1403 exists, or to relate the dataset type's dimensions to dimensions 

1404 referenced by the ``dataId`` or ``where`` arguments. 

1405 dataId : `dict` or `DataCoordinate`, optional 

1406 A data ID whose key-value pairs are used as equality constraints 

1407 in the query. 

1408 where : `str`, optional 

1409 A string expression similar to a SQL WHERE clause. May involve 

1410 any column of a dimension table or (as a shortcut for the primary 

1411 key column of a dimension table) dimension name. See 

1412 :ref:`daf_butler_dimension_expressions` for more information. 

1413 findFirst : `bool`, optional 

1414 If `True` (`False` is default), for each result data ID, only 

1415 yield one `DatasetRef` of each `DatasetType`, from the first 

1416 collection in which a dataset of that dataset type appears 

1417 (according to the order of ``collections`` passed in). If `True`, 

1418 ``collections`` must not contain regular expressions and may not 

1419 be `...`. 

1420 components : `bool`, optional 

1421 If `True`, apply all dataset expression patterns to component 

1422 dataset type names as well. If `False`, never apply patterns to 

1423 components. If `None` (default), apply patterns to components only 

1424 if their parent datasets were not matched by the expression. 

1425 Fully-specified component datasets (`str` or `DatasetType` 

1426 instances) are always included. 

1427 bind : `Mapping`, optional 

1428 Mapping containing literal values that should be injected into the 

1429 ``where`` expression, keyed by the identifiers they replace. 

1430 check : `bool`, optional 

1431 If `True` (default) check the query for consistency before 

1432 executing it. This may reject some valid queries that resemble 

1433 common mistakes (e.g. queries for visits without specifying an 

1434 instrument). 

1435 **kwargs 

1436 Additional keyword arguments are forwarded to 

1437 `DataCoordinate.standardize` when processing the ``dataId`` 

1438 argument (and may be used to provide a constraining data ID even 

1439 when the ``dataId`` argument is `None`). 

1440 

1441 Returns 

1442 ------- 

1443 refs : `queries.DatasetQueryResults` 

1444 Dataset references matching the given query criteria. 

1445 

1446 Raises 

1447 ------ 

1448 TypeError 

1449 Raised when the arguments are incompatible, such as when a 

1450 collection wildcard is passed when ``findFirst`` is `True`, or 

1451 when ``collections`` is `None` and``self.defaults.collections`` is 

1452 also `None`. 

1453 

1454 Notes 

1455 ----- 

1456 When multiple dataset types are queried in a single call, the 

1457 results of this operation are equivalent to querying for each dataset 

1458 type separately in turn, and no information about the relationships 

1459 between datasets of different types is included. In contexts where 

1460 that kind of information is important, the recommended pattern is to 

1461 use `queryDataIds` to first obtain data IDs (possibly with the 

1462 desired dataset types and collections passed as constraints to the 

1463 query), and then use multiple (generally much simpler) calls to 

1464 `queryDatasets` with the returned data IDs passed as constraints. 

1465 """ 

1466 # Standardize the collections expression. 

1467 if collections is None: 

1468 if not self.defaults.collections: 

1469 raise TypeError("No collections provided to findDataset, " 

1470 "and no defaults from registry construction.") 

1471 collections = self.defaults.collections 

1472 elif findFirst: 

1473 collections = CollectionSearch.fromExpression(collections) 

1474 else: 

1475 collections = CollectionQuery.fromExpression(collections) 

1476 # Standardize and expand the data ID provided as a constraint. 

1477 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1478 

1479 # We can only query directly if given a non-component DatasetType 

1480 # instance. If we were given an expression or str or a component 

1481 # DatasetType instance, we'll populate this dict, recurse, and return. 

1482 # If we already have a non-component DatasetType, it will remain None 

1483 # and we'll run the query directly. 

1484 composition: Optional[ 

1485 Dict[ 

1486 DatasetType, # parent dataset type 

1487 List[Optional[str]] # component name, or None for parent 

1488 ] 

1489 ] = None 

1490 if not isinstance(datasetType, DatasetType): 

1491 # We were given a dataset type expression (which may be as simple 

1492 # as a str). Loop over all matching datasets, delegating handling 

1493 # of the `components` argument to queryDatasetTypes, as we populate 

1494 # the composition dict. 

1495 composition = defaultdict(list) 

1496 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

1497 parentName, componentName = trueDatasetType.nameAndComponent() 

1498 if componentName is not None: 

1499 parentDatasetType = self.getDatasetType(parentName) 

1500 composition.setdefault(parentDatasetType, []).append(componentName) 

1501 else: 

1502 composition.setdefault(trueDatasetType, []).append(None) 

1503 elif datasetType.isComponent(): 

1504 # We were given a true DatasetType instance, but it's a component. 

1505 # the composition dict will have exactly one item. 

1506 parentName, componentName = datasetType.nameAndComponent() 

1507 parentDatasetType = self.getDatasetType(parentName) 

1508 composition = {parentDatasetType: [componentName]} 

1509 if composition is not None: 

1510 # We need to recurse. Do that once for each parent dataset type. 

1511 chain = [] 

1512 for parentDatasetType, componentNames in composition.items(): 

1513 parentResults = self.queryDatasets( 

1514 parentDatasetType, 

1515 collections=collections, 

1516 dimensions=dimensions, 

1517 dataId=standardizedDataId, 

1518 where=where, 

1519 findFirst=findFirst, 

1520 check=check, 

1521 ) 

1522 if isinstance(parentResults, queries.ParentDatasetQueryResults): 

1523 chain.append( 

1524 parentResults.withComponents(componentNames) 

1525 ) 

1526 else: 

1527 # Should only happen if we know there would be no results. 

1528 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \ 

1529 and not parentResults._chain 

1530 return queries.ChainedDatasetQueryResults(chain) 

1531 # If we get here, there's no need to recurse (or we are already 

1532 # recursing; there can only ever be one level of recursion). 

1533 

1534 # The full set of dimensions in the query is the combination of those 

1535 # needed for the DatasetType and those explicitly requested, if any. 

1536 requestedDimensionNames = set(datasetType.dimensions.names) 

1537 if dimensions is not None: 

1538 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1539 # Construct the summary structure needed to construct a QueryBuilder. 

1540 summary = queries.QuerySummary( 

1541 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1542 dataId=standardizedDataId, 

1543 expression=where, 

1544 bind=bind, 

1545 defaults=self.defaults.dataId, 

1546 check=check, 

1547 ) 

1548 builder = self.makeQueryBuilder(summary) 

1549 # Add the dataset subquery to the query, telling the QueryBuilder to 

1550 # include the rank of the selected collection in the results only if we 

1551 # need to findFirst. Note that if any of the collections are 

1552 # actually wildcard expressions, and we've asked for deduplication, 

1553 # this will raise TypeError for us. 

1554 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst): 

1555 return queries.ChainedDatasetQueryResults(()) 

1556 query = builder.finish() 

1557 return queries.ParentDatasetQueryResults(self._db, query, components=[None]) 

1558 

1559 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1560 dataId: Optional[DataId] = None, 

1561 datasets: Any = None, 

1562 collections: Any = None, 

1563 where: Optional[str] = None, 

1564 components: Optional[bool] = None, 

1565 bind: Optional[Mapping[str, Any]] = None, 

1566 check: bool = True, 

1567 **kwargs: Any) -> queries.DataCoordinateQueryResults: 

1568 """Query for data IDs matching user-provided criteria. 

1569 

1570 Parameters 

1571 ---------- 

1572 dimensions : `Dimension` or `str`, or iterable thereof 

1573 The dimensions of the data IDs to yield, as either `Dimension` 

1574 instances or `str`. Will be automatically expanded to a complete 

1575 `DimensionGraph`. 

1576 dataId : `dict` or `DataCoordinate`, optional 

1577 A data ID whose key-value pairs are used as equality constraints 

1578 in the query. 

1579 datasets : `Any`, optional 

1580 An expression that fully or partially identifies dataset types 

1581 that should constrain the yielded data IDs. For example, including 

1582 "raw" here would constrain the yielded ``instrument``, 

1583 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1584 those for which at least one "raw" dataset exists in 

1585 ``collections``. Allowed types include `DatasetType`, `str`, 

1586 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1587 expressions, ``...`` is not permitted - it doesn't make sense to 

1588 constrain data IDs on the existence of *all* datasets. 

1589 See :ref:`daf_butler_dataset_type_expressions` for more 

1590 information. 

1591 collections: `Any`, optional 

1592 An expression that fully or partially identifies the collections 

1593 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1594 thereof. `...` can be used to return all collections. Must be 

1595 provided if ``datasets`` is, and is ignored if it is not. See 

1596 :ref:`daf_butler_collection_expressions` for more information. 

1597 If not provided, ``self.default.collections`` is used. 

1598 where : `str`, optional 

1599 A string expression similar to a SQL WHERE clause. May involve 

1600 any column of a dimension table or (as a shortcut for the primary 

1601 key column of a dimension table) dimension name. See 

1602 :ref:`daf_butler_dimension_expressions` for more information. 

1603 components : `bool`, optional 

1604 If `True`, apply all dataset expression patterns to component 

1605 dataset type names as well. If `False`, never apply patterns to 

1606 components. If `None` (default), apply patterns to components only 

1607 if their parent datasets were not matched by the expression. 

1608 Fully-specified component datasets (`str` or `DatasetType` 

1609 instances) are always included. 

1610 bind : `Mapping`, optional 

1611 Mapping containing literal values that should be injected into the 

1612 ``where`` expression, keyed by the identifiers they replace. 

1613 check : `bool`, optional 

1614 If `True` (default) check the query for consistency before 

1615 executing it. This may reject some valid queries that resemble 

1616 common mistakes (e.g. queries for visits without specifying an 

1617 instrument). 

1618 **kwargs 

1619 Additional keyword arguments are forwarded to 

1620 `DataCoordinate.standardize` when processing the ``dataId`` 

1621 argument (and may be used to provide a constraining data ID even 

1622 when the ``dataId`` argument is `None`). 

1623 

1624 Returns 

1625 ------- 

1626 dataIds : `DataCoordinateQueryResults` 

1627 Data IDs matching the given query parameters. These are guaranteed 

1628 to identify all dimensions (`DataCoordinate.hasFull` returns 

1629 `True`), but will not contain `DimensionRecord` objects 

1630 (`DataCoordinate.hasRecords` returns `False`). Call 

1631 `DataCoordinateQueryResults.expanded` on the returned object to 

1632 fetch those (and consider using 

1633 `DataCoordinateQueryResults.materialize` on the returned object 

1634 first if the expected number of rows is very large). See 

1635 documentation for those methods for additional information. 

1636 

1637 Raises 

1638 ------ 

1639 TypeError 

1640 Raised if ``collections`` is `None`, ``self.defaults.collections`` 

1641 is `None`, and ``datasets`` is not `None`. 

1642 """ 

1643 dimensions = iterable(dimensions) 

1644 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1645 standardizedDatasetTypes = set() 

1646 requestedDimensions = self.dimensions.extract(dimensions) 

1647 queryDimensionNames = set(requestedDimensions.names) 

1648 if datasets is not None: 

1649 if collections is None: 

1650 if not self.defaults.collections: 

1651 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1652 collections = self.defaults.collections 

1653 else: 

1654 # Preprocess collections expression in case the original 

1655 # included single-pass iterators (we'll want to use it multiple 

1656 # times below). 

1657 collections = CollectionQuery.fromExpression(collections) 

1658 for datasetType in self.queryDatasetTypes(datasets, components=components): 

1659 queryDimensionNames.update(datasetType.dimensions.names) 

1660 # If any matched dataset type is a component, just operate on 

1661 # its parent instead, because Registry doesn't know anything 

1662 # about what components exist, and here (unlike queryDatasets) 

1663 # we don't care about returning them. 

1664 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

1665 if componentName is not None: 

1666 datasetType = self.getDatasetType(parentDatasetTypeName) 

1667 standardizedDatasetTypes.add(datasetType) 

1668 

1669 summary = queries.QuerySummary( 

1670 requested=DimensionGraph(self.dimensions, names=queryDimensionNames), 

1671 dataId=standardizedDataId, 

1672 expression=where, 

1673 bind=bind, 

1674 defaults=self.defaults.dataId, 

1675 check=check, 

1676 ) 

1677 builder = self.makeQueryBuilder(summary) 

1678 for datasetType in standardizedDatasetTypes: 

1679 builder.joinDataset(datasetType, collections, isResult=False) 

1680 query = builder.finish() 

1681 return queries.DataCoordinateQueryResults(self._db, query) 

1682 

1683 def queryDimensionRecords(self, element: Union[DimensionElement, str], *, 

1684 dataId: Optional[DataId] = None, 

1685 datasets: Any = None, 

1686 collections: Any = None, 

1687 where: Optional[str] = None, 

1688 components: Optional[bool] = None, 

1689 bind: Optional[Mapping[str, Any]] = None, 

1690 check: bool = True, 

1691 **kwargs: Any) -> Iterator[DimensionRecord]: 

1692 """Query for dimension information matching user-provided criteria. 

1693 

1694 Parameters 

1695 ---------- 

1696 element : `DimensionElement` or `str` 

1697 The dimension element to obtain records for. 

1698 dataId : `dict` or `DataCoordinate`, optional 

1699 A data ID whose key-value pairs are used as equality constraints 

1700 in the query. 

1701 datasets : `Any`, optional 

1702 An expression that fully or partially identifies dataset types 

1703 that should constrain the yielded records. See `queryDataIds` and 

1704 :ref:`daf_butler_dataset_type_expressions` for more information. 

1705 collections: `Any`, optional 

1706 An expression that fully or partially identifies the collections 

1707 to search for datasets. See `queryDataIds` and 

1708 :ref:`daf_butler_collection_expressions` for more information. 

1709 where : `str`, optional 

1710 A string expression similar to a SQL WHERE clause. See 

1711 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more 

1712 information. 

1713 components : `bool`, optional 

1714 Whether to apply dataset expressions to components as well. 

1715 See `queryDataIds` for more information. 

1716 bind : `Mapping`, optional 

1717 Mapping containing literal values that should be injected into the 

1718 ``where`` expression, keyed by the identifiers they replace. 

1719 check : `bool`, optional 

1720 If `True` (default) check the query for consistency before 

1721 executing it. This may reject some valid queries that resemble 

1722 common mistakes (e.g. queries for visits without specifying an 

1723 instrument). 

1724 **kwargs 

1725 Additional keyword arguments are forwarded to 

1726 `DataCoordinate.standardize` when processing the ``dataId`` 

1727 argument (and may be used to provide a constraining data ID even 

1728 when the ``dataId`` argument is `None`). 

1729 

1730 Returns 

1731 ------- 

1732 dataIds : `DataCoordinateQueryResults` 

1733 Data IDs matching the given query parameters. 

1734 """ 

1735 if not isinstance(element, DimensionElement): 

1736 try: 

1737 element = self.dimensions[element] 

1738 except KeyError as e: 

1739 raise KeyError(f"No such dimension '{element}', available dimensions: " 

1740 + str(self.dimensions.getStaticElements())) from e 

1741 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections, 

1742 where=where, components=components, bind=bind, check=check, **kwargs) 

1743 return iter(self._managers.dimensions[element].fetch(dataIds)) 

1744 

1745 def queryDatasetAssociations( 

1746 self, 

1747 datasetType: Union[str, DatasetType], 

1748 collections: Any = ..., 

1749 *, 

1750 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1751 flattenChains: bool = False, 

1752 ) -> Iterator[DatasetAssociation]: 

1753 """Iterate over dataset-collection combinations where the dataset is in 

1754 the collection. 

1755 

1756 This method is a temporary placeholder for better support for 

1757 assocation results in `queryDatasets`. It will probably be 

1758 removed in the future, and should be avoided in production code 

1759 whenever possible. 

1760 

1761 Parameters 

1762 ---------- 

1763 datasetType : `DatasetType` or `str` 

1764 A dataset type object or the name of one. 

1765 collections: `Any`, optional 

1766 An expression that fully or partially identifies the collections 

1767 to search for datasets. See `queryCollections` and 

1768 :ref:`daf_butler_collection_expressions` for more information. 

1769 If not provided, ``self.default.collections`` is used. 

1770 collectionTypes : `AbstractSet` [ `CollectionType` ], optional 

1771 If provided, only yield associations from collections of these 

1772 types. 

1773 flattenChains : `bool`, optional 

1774 If `True` (default) search in the children of 

1775 `~CollectionType.CHAINED` collections. If `False`, ``CHAINED`` 

1776 collections are ignored. 

1777 

1778 Yields 

1779 ------ 

1780 association : `DatasetAssociation` 

1781 Object representing the relationship beween a single dataset and 

1782 a single collection. 

1783 

1784 Raises 

1785 ------ 

1786 TypeError 

1787 Raised if ``collections`` is `None` and 

1788 ``self.defaults.collections`` is `None`. 

1789 """ 

1790 if collections is None: 

1791 if not self.defaults.collections: 

1792 raise TypeError("No collections provided to findDataset, " 

1793 "and no defaults from registry construction.") 

1794 collections = self.defaults.collections 

1795 else: 

1796 collections = CollectionQuery.fromExpression(collections) 

1797 TimespanReprClass = self._db.getTimespanRepresentation() 

1798 if isinstance(datasetType, str): 

1799 storage = self._managers.datasets[datasetType] 

1800 else: 

1801 storage = self._managers.datasets[datasetType.name] 

1802 for collectionRecord in collections.iter(self._managers.collections, 

1803 collectionTypes=frozenset(collectionTypes), 

1804 flattenChains=flattenChains): 

1805 query = storage.select(collectionRecord) 

1806 if query is None: 

1807 continue 

1808 for row in self._db.query(query.combine()): 

1809 dataId = DataCoordinate.fromRequiredValues( 

1810 storage.datasetType.dimensions, 

1811 tuple(row[name] for name in storage.datasetType.dimensions.required.names) 

1812 ) 

1813 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]] 

1814 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, 

1815 conform=False) 

1816 if collectionRecord.type is CollectionType.CALIBRATION: 

1817 timespan = TimespanReprClass.extract(row) 

1818 else: 

1819 timespan = None 

1820 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1821 

1822 storageClasses: StorageClassFactory 

1823 """All storage classes known to the registry (`StorageClassFactory`). 

1824 """