1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "Registry", 

26) 

27 

28from collections import defaultdict 

29import contextlib 

30import logging 

31from typing import ( 

32 Any, 

33 Dict, 

34 Iterable, 

35 Iterator, 

36 List, 

37 Mapping, 

38 Optional, 

39 Set, 

40 TYPE_CHECKING, 

41 Union, 

42) 

43 

44import sqlalchemy 

45 

46from ..core import ( 

47 ButlerURI, 

48 Config, 

49 DataCoordinate, 

50 DataCoordinateIterable, 

51 DataId, 

52 DatasetAssociation, 

53 DatasetRef, 

54 DatasetType, 

55 ddl, 

56 Dimension, 

57 DimensionConfig, 

58 DimensionElement, 

59 DimensionGraph, 

60 DimensionRecord, 

61 DimensionUniverse, 

62 NamedKeyMapping, 

63 NameLookupMapping, 

64 StorageClassFactory, 

65 Timespan, 

66) 

67from . import queries 

68from ..core.utils import iterable, transactional 

69from ._config import RegistryConfig 

70from ._collectionType import CollectionType 

71from ._defaults import RegistryDefaults 

72from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

73from .managers import RegistryManagerTypes, RegistryManagerInstances 

74from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis 

75from .summaries import CollectionSummary 

76from .interfaces import ChainedCollectionRecord, RunRecord 

77 

78if TYPE_CHECKING: 

79 from .._butlerConfig import ButlerConfig 

80 from .interfaces import ( 

81 Database, 

82 DatastoreRegistryBridgeManager, 

83 ) 

84 

85 

86_LOG = logging.getLogger(__name__) 

87 

88# key for dimensions configuration in attributes table 

89_DIMENSIONS_ATTR = "config:dimensions.json" 

90 

91 

92class Registry: 

93 """Registry interface. 

94 

95 Parameters 

96 ---------- 

97 database : `Database` 

98 Database instance to store Registry. 

99 defaults : `RegistryDefaults`, optional 

100 Default collection search path and/or output `~CollectionType.RUN` 

101 collection. 

102 attributes : `type` 

103 Manager class implementing `ButlerAttributeManager`. 

104 opaque : `type` 

105 Manager class implementing `OpaqueTableStorageManager`. 

106 dimensions : `type` 

107 Manager class implementing `DimensionRecordStorageManager`. 

108 collections : `type` 

109 Manager class implementing `CollectionManager`. 

110 datasets : `type` 

111 Manager class implementing `DatasetRecordStorageManager`. 

112 datastoreBridges : `type` 

113 Manager class implementing `DatastoreRegistryBridgeManager`. 

114 dimensionConfig : `DimensionConfig`, optional 

115 Dimension universe configuration, only used when ``create`` is True. 

116 writeable : `bool`, optional 

117 If True then Registry will support write operations. 

118 create : `bool`, optional 

119 If `True` then the database schema will be initialized; the database must be empty 

120 before instantiating Registry. 

121 """ 

122 

123 defaultConfigFile: Optional[str] = None 

124 """Path to configuration defaults. Accessed within the ``configs`` resource 

125 or relative to a search path. Can be None if no defaults specified. 

126 """ 

127 

128 @classmethod 

129 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None, 

130 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

131 butlerRoot: Optional[str] = None) -> Registry: 

132 """Create registry database and return `Registry` instance. 

133 

134 This method initializes database contents, database must be empty 

135 prior to calling this method. 

136 

137 Parameters 

138 ---------- 

139 config : `RegistryConfig` or `str`, optional 

140 Registry configuration, if missing then default configuration will 

141 be loaded from registry.yaml. 

142 dimensionConfig : `DimensionConfig` or `str`, optional 

143 Dimensions configuration, if missing then default configuration 

144 will be loaded from dimensions.yaml. 

145 butlerRoot : `str`, optional 

146 Path to the repository root this `Registry` will manage. 

147 

148 Returns 

149 ------- 

150 registry : `Registry` 

151 A new `Registry` instance. 
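
Examples
--------
A minimal sketch of bootstrapping a brand-new registry; the import
location, SQLite file name, and configuration values below are
illustrative assumptions, not part of this method's contract::

    from lsst.daf.butler.registry import Registry, RegistryConfig

    # Hypothetical SQLite-backed repository; the database must not
    # already contain registry tables.
    config = RegistryConfig({"db": "sqlite:///example_registry.sqlite3"})
    registry = Registry.createFromConfig(config)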

152 """ 

153 if isinstance(config, str): 

154 config = RegistryConfig(config) 

155 elif config is None: 

156 config = RegistryConfig() 

157 elif not isinstance(config, RegistryConfig): 

158 raise TypeError(f"Incompatible Registry configuration type: {type(config)}") 

159 config.replaceRoot(butlerRoot) 

160 

161 if isinstance(dimensionConfig, str): 

162 dimensionConfig = DimensionConfig(config) 

163 elif dimensionConfig is None: 

164 dimensionConfig = DimensionConfig() 

165 elif not isinstance(dimensionConfig, DimensionConfig): 

166 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

167 

168 DatabaseClass = config.getDatabaseClass() 

169 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

170 namespace=config.get("namespace")) 

171 managerTypes = RegistryManagerTypes.fromConfig(config) 

172 managers = managerTypes.makeRepo(database, dimensionConfig) 

173 return cls(database, RegistryDefaults(), managers) 

174 

175 @classmethod 

176 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], 

177 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True, 

178 defaults: Optional[RegistryDefaults] = None) -> Registry: 

179 """Create `Registry` subclass instance from `config`. 

180 

181 Registry database must be initialized prior to calling this method. 

182 

183 Parameters 

184 ---------- 

185 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

186 Registry configuration 

187 butlerRoot : `str` or `ButlerURI`, optional 

188 Path to the repository root this `Registry` will manage. 

189 writeable : `bool`, optional 

190 If `True` (default) create a read-write connection to the database. 

191 defaults : `RegistryDefaults`, optional 

192 Default collection search path and/or output `~CollectionType.RUN` 

193 collection. 

194 

195 Returns 

196 ------- 

197 registry : `Registry` (subclass) 

198 A new `Registry` subclass instance. 
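
Examples
--------
A sketch of connecting to an already-initialized repository read-only;
the configuration path is an illustrative assumption::

    # Configuration file describing an existing repository.
    registry = Registry.fromConfig("/path/to/repo/butler.yaml", writeable=False)
    print(registry.dimensions)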

199 """ 

200 if not isinstance(config, RegistryConfig): 

201 if isinstance(config, str) or isinstance(config, Config): 

202 config = RegistryConfig(config) 

203 else: 

204 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

205 config.replaceRoot(butlerRoot) 

206 DatabaseClass = config.getDatabaseClass() 

207 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

208 namespace=config.get("namespace"), writeable=writeable) 

209 managerTypes = RegistryManagerTypes.fromConfig(config) 

210 managers = managerTypes.loadRepo(database) 

211 if defaults is None: 

212 defaults = RegistryDefaults() 

213 return cls(database, defaults, managers) 

214 

215 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances): 

216 self._db = database 

217 self._managers = managers 

218 self.storageClasses = StorageClassFactory() 

219 # Intentionally invoke property setter to initialize defaults. This 

220 # can only be done after most of the rest of Registry has already been 

221 # initialized, and must be done before the property getter is used. 

222 self.defaults = defaults 

223 

224 def __str__(self) -> str: 

225 return str(self._db) 

226 

227 def __repr__(self) -> str: 

228 return f"Registry({self._db!r}, {self.dimensions!r})" 

229 

230 def isWriteable(self) -> bool: 

231 """Return `True` if this registry allows write operations, and `False` 

232 otherwise. 

233 """ 

234 return self._db.isWriteable() 

235 

236 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry: 

237 """Create a new `Registry` backed by the same data repository and 

238 connection as this one, but independent defaults. 

239 

240 Parameters 

241 ---------- 

242 defaults : `RegistryDefaults`, optional 

243 Default collections and data ID values for the new registry. If 

244 not provided, ``self.defaults`` will be used (but future changes 

245 to either registry's defaults will not affect the other). 

246 

247 Returns 

248 ------- 

249 copy : `Registry` 

250 A new `Registry` instance with its own defaults. 

251 

252 Notes 

253 ----- 

254 Because the new registry shares a connection with the original, they 

255 also share transaction state (despite the fact that their `transaction` 

256 context manager methods do not reflect this), and must be used with 

257 care. 

258 """ 

259 if defaults is None: 

260 # No need to copy, because `RegistryDefaults` is immutable; we 

261 # effectively copy on write. 

262 defaults = self.defaults 

263 return Registry(self._db, defaults, self._managers) 

264 

265 @property 

266 def dimensions(self) -> DimensionUniverse: 

267 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

268 """ 

269 return self._managers.dimensions.universe 

270 

271 @property 

272 def defaults(self) -> RegistryDefaults: 

273 """Default collection search path and/or output `~CollectionType.RUN` 

274 collection (`RegistryDefaults`). 

275 

276 This is an immutable struct whose components may not be set 

277 individually, but the entire struct can be set by assigning to this 

278 property. 

279 """ 

280 return self._defaults 

281 

282 @defaults.setter 

283 def defaults(self, value: RegistryDefaults) -> None: 

284 if value.run is not None: 

285 self.registerRun(value.run) 

286 value.finish(self) 

287 self._defaults = value 

288 

289 def refresh(self) -> None: 

290 """Refresh all in-memory state by querying the database. 

291 

292 This may be necessary to enable querying for entities added by other 

293 `Registry` instances after this one was constructed. 

294 """ 

295 self._managers.refresh() 

296 

297 @contextlib.contextmanager 

298 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

299 """Return a context manager that represents a transaction. 

300 """ 

301 try: 

302 with self._db.transaction(savepoint=savepoint): 

303 yield 

304 except BaseException: 

305 # TODO: this clears the caches sometimes when we wouldn't actually 

306 # need to. Can we avoid that? 

307 self._managers.dimensions.clearCaches() 

308 raise 

309 

310 def resetConnectionPool(self) -> None: 

311 """Reset SQLAlchemy connection pool for registry database. 

312 

313 This operation is useful when using registry with fork-based 

314 multiprocessing. To use registry across fork boundary one has to make 

315 sure that there are no currently active connections (no session or 

316 transaction is in progress) and connection pool is reset using this 

317 method. This method should be called by the child process immediately 

318 after the fork. 

319 """ 

320 self._db._engine.dispose() 

321 

322 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

323 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

324 other data repository client. 

325 

326 Opaque table records can be added via `insertOpaqueData`, retrieved via 

327 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

328 

329 Parameters 

330 ---------- 

331 tableName : `str` 

332 Logical name of the opaque table. This may differ from the 

333 actual name used in the database by a prefix and/or suffix. 

334 spec : `ddl.TableSpec` 

335 Specification for the table to be added. 
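
Examples
--------
A sketch of registering and using an opaque table; the table name and
column definitions are illustrative assumptions::

    import sqlalchemy
    from lsst.daf.butler.core import ddl

    spec = ddl.TableSpec(fields=[
        ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
        ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
    ])
    registry.registerOpaqueTable("example_datastore_records", spec)
    registry.insertOpaqueData("example_datastore_records",
                              {"dataset_id": 1, "path": "a/b.fits"})
    rows = list(registry.fetchOpaqueData("example_datastore_records", dataset_id=1))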

336 """ 

337 self._managers.opaque.register(tableName, spec) 

338 

339 @transactional 

340 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

341 """Insert records into an opaque table. 

342 

343 Parameters 

344 ---------- 

345 tableName : `str` 

346 Logical name of the opaque table. Must match the name used in a 

347 previous call to `registerOpaqueTable`. 

348 data 

349 Each additional positional argument is a dictionary that represents 

350 a single row to be added. 

351 """ 

352 self._managers.opaque[tableName].insert(*data) 

353 

354 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

355 """Retrieve records from an opaque table. 

356 

357 Parameters 

358 ---------- 

359 tableName : `str` 

360 Logical name of the opaque table. Must match the name used in a 

361 previous call to `registerOpaqueTable`. 

362 where 

363 Additional keyword arguments are interpreted as equality 

364 constraints that restrict the returned rows (combined with AND); 

365 keyword arguments are column names and values are the values they 

366 must have. 

367 

368 Yields 

369 ------ 

370 row : `dict` 

371 A dictionary representing a single result row. 

372 """ 

373 yield from self._managers.opaque[tableName].fetch(**where) 

374 

375 @transactional 

376 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

377 """Remove records from an opaque table. 

378 

379 Parameters 

380 ---------- 

381 tableName : `str` 

382 Logical name of the opaque table. Must match the name used in a 

383 previous call to `registerOpaqueTable`. 

384 where 

385 Additional keyword arguments are interpreted as equality 

386 constraints that restrict the deleted rows (combined with AND); 

387 keyword arguments are column names and values are the values they 

388 must have. 

389 """ 

390 self._managers.opaque[tableName].delete(**where) 

391 

392 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED, 

393 doc: Optional[str] = None) -> None: 

394 """Add a new collection if one with the given name does not exist. 

395 

396 Parameters 

397 ---------- 

398 name : `str` 

399 The name of the collection to create. 

400 type : `CollectionType` 

401 Enum value indicating the type of collection to create. 

402 doc : `str`, optional 

403 Documentation string for the collection. 

404 

405 Notes 

406 ----- 

407 This method cannot be called within transactions, as it needs to be 

408 able to perform its own transaction to be concurrent. 

409 """ 

410 self._managers.collections.register(name, type, doc=doc) 

411 

412 def getCollectionType(self, name: str) -> CollectionType: 

413 """Return an enumeration value indicating the type of the given 

414 collection. 

415 

416 Parameters 

417 ---------- 

418 name : `str` 

419 The name of the collection. 

420 

421 Returns 

422 ------- 

423 type : `CollectionType` 

424 Enum value indicating the type of this collection. 

425 

426 Raises 

427 ------ 

428 MissingCollectionError 

429 Raised if no collection with the given name exists. 

430 """ 

431 return self._managers.collections.find(name).type 

432 

433 def registerRun(self, name: str, doc: Optional[str] = None) -> None: 

434 """Add a new run if one with the given name does not exist. 

435 

436 Parameters 

437 ---------- 

438 name : `str` 

439 The name of the run to create. 

440 doc : `str`, optional 

441 Documentation string for the collection. 

442 

443 Notes 

444 ----- 

445 This method cannot be called within transactions, as it needs to be 

446 able to perform its own transaction to be concurrent. 

447 """ 

448 self._managers.collections.register(name, CollectionType.RUN, doc=doc) 

449 

450 @transactional 

451 def removeCollection(self, name: str) -> None: 

452 """Completely remove the given collection. 

453 

454 Parameters 

455 ---------- 

456 name : `str` 

457 The name of the collection to remove. 

458 

459 Raises 

460 ------ 

461 MissingCollectionError 

462 Raised if no collection with the given name exists. 

463 

464 Notes 

465 ----- 

466 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

467 in it are also fully removed. This requires that those datasets be 

468 removed (or at least trashed) from any datastores that hold them first. 

469 

470 A collection may not be deleted as long as it is referenced by a 

471 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

472 be deleted or redefined first. 

473 """ 

474 self._managers.collections.remove(name) 

475 

476 def getCollectionChain(self, parent: str) -> CollectionSearch: 

477 """Return the child collections in a `~CollectionType.CHAINED` 

478 collection. 

479 

480 Parameters 

481 ---------- 

482 parent : `str` 

483 Name of the chained collection. Must have already been added via 

484 a call to `Registry.registerCollection`. 

485 

486 Returns 

487 ------- 

488 children : `CollectionSearch` 

489 An object that defines the search path of the collection. 

490 See :ref:`daf_butler_collection_expressions` for more information. 

491 

492 Raises 

493 ------ 

494 MissingCollectionError 

495 Raised if ``parent`` does not exist in the `Registry`. 

496 TypeError 

497 Raised if ``parent`` does not correspond to a 

498 `~CollectionType.CHAINED` collection. 

499 """ 

500 record = self._managers.collections.find(parent) 

501 if record.type is not CollectionType.CHAINED: 

502 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

503 assert isinstance(record, ChainedCollectionRecord) 

504 return record.children 

505 

506 @transactional 

507 def setCollectionChain(self, parent: str, children: Any) -> None: 

508 """Define or redefine a `~CollectionType.CHAINED` collection. 

509 

510 Parameters 

511 ---------- 

512 parent : `str` 

513 Name of the chained collection. Must have already been added via 

514 a call to `Registry.registerCollection`. 

515 children : `Any` 

516 An expression defining an ordered search of child collections, 

517 generally an iterable of `str`; see 

518 :ref:`daf_butler_collection_expressions` for more information. 

519 

520 Raises 

521 ------ 

522 MissingCollectionError 

523 Raised when any of the given collections do not exist in the 

524 `Registry`. 

525 TypeError 

526 Raised if ``parent`` does not correspond to a 

527 `~CollectionType.CHAINED` collection. 

528 ValueError 

529 Raised if the given collections contain a cycle. 
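
Examples
--------
A sketch of building a chained collection from two existing runs; all
collection names are illustrative assumptions::

    from lsst.daf.butler import CollectionType

    registry.registerCollection("u/example/chain", CollectionType.CHAINED)
    # The child collections must already exist.
    registry.setCollectionChain("u/example/chain", ["u/example/run1", "u/example/run2"])
    print(list(registry.getCollectionChain("u/example/chain")))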

530 """ 

531 record = self._managers.collections.find(parent) 

532 if record.type is not CollectionType.CHAINED: 

533 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

534 assert isinstance(record, ChainedCollectionRecord) 

535 children = CollectionSearch.fromExpression(children) 

536 if children != record.children: 

537 record.update(self._managers.collections, children) 

538 

539 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

540 """Retrieve the documentation string for a collection. 

541 

542 Parameters 

543 ---------- 

544 collection : `str` 

545 Name of the collection. 

546 

547 Returns 

548 ------- 

549 docs : `str` or `None` 

550 Docstring for the collection with the given name. 

551 """ 

552 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key) 

553 

554 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

555 """Set the documentation string for a collection. 

556 

557 Parameters 

558 ---------- 

559 collection : `str` 

560 Name of the collection. 

561 doc : `str` or `None` 

562 Docstring for the collection with the given name; will replace any 

563 existing docstring. Passing `None` will remove any existing 

564 docstring. 

565 """ 

566 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc) 

567 

568 def getCollectionSummary(self, collection: str) -> CollectionSummary: 

569 """Return a summary for the given collection. 

570 

571 Parameters 

572 ---------- 

573 collection : `str` 

574 Name of the collection for which a summary is to be retrieved. 

575 

576 Returns 

577 ------- 

578 summary : `CollectionSummary` 

579 Summary of the dataset types and governor dimension values in 

580 this collection. 

581 """ 

582 record = self._managers.collections.find(collection) 

583 return self._managers.datasets.getCollectionSummary(record) 

584 

585 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

586 """ 

587 Add a new `DatasetType` to the Registry. 

588 

589 It is not an error to register the same `DatasetType` twice. 

590 

591 Parameters 

592 ---------- 

593 datasetType : `DatasetType` 

594 The `DatasetType` to be added. 

595 

596 Returns 

597 ------- 

598 inserted : `bool` 

599 `True` if ``datasetType`` was inserted, `False` if an identical 

600 existing `DatasetType` was found. Note that in either case the 

601 DatasetType is guaranteed to be defined in the Registry 

602 consistently with the given definition. 

603 

604 Raises 

605 ------ 

606 ValueError 

607 Raised if the dimensions or storage class are invalid. 

608 ConflictingDefinitionError 

609 Raised if this DatasetType is already registered with a different 

610 definition. 

611 

612 Notes 

613 ----- 

614 This method cannot be called within transactions, as it needs to be 

615 able to perform its own transaction to be concurrent. 

616 """ 

617 _, inserted = self._managers.datasets.register(datasetType) 

618 return inserted 

619 

620 def removeDatasetType(self, name: str) -> None: 

621 """Remove the named `DatasetType` from the registry. 

622 

623 .. warning:: 

624 

625 Registry caches the dataset type definitions. This means that 

626 deleting the dataset type definition may result in unexpected 

627 behavior from other butler processes that are active that have 

628 not seen the deletion. 

629 

630 Parameters 

631 ---------- 

632 name : `str` 

633 Name of the type to be removed. 

634 

635 Raises 

636 ------ 

637 lsst.daf.butler.registry.OrphanedRecordError 

638 Raised if an attempt is made to remove the dataset type definition 

639 when there are already datasets associated with it. 

640 

641 Notes 

642 ----- 

643 If the dataset type is not registered the method will return without 

644 action. 

645 """ 

646 self._managers.datasets.remove(name) 

647 

648 def getDatasetType(self, name: str) -> DatasetType: 

649 """Get the `DatasetType`. 

650 

651 Parameters 

652 ---------- 

653 name : `str` 

654 Name of the type. 

655 

656 Returns 

657 ------- 

658 type : `DatasetType` 

659 The `DatasetType` associated with the given name. 

660 

661 Raises 

662 ------ 

663 KeyError 

664 Requested named DatasetType could not be found in registry. 

665 """ 

666 return self._managers.datasets[name].datasetType 

667 

668 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

669 collections: Any = None, timespan: Optional[Timespan] = None, 

670 **kwargs: Any) -> Optional[DatasetRef]: 

671 """Find a dataset given its `DatasetType` and data ID. 

672 

673 This can be used to obtain a `DatasetRef` that permits the dataset to 

674 be read from a `Datastore`. If the dataset is a component and can not 

675 be found using the provided dataset type, a dataset ref for the parent 

676 will be returned instead but with the correct dataset type. 

677 

678 Parameters 

679 ---------- 

680 datasetType : `DatasetType` or `str` 

681 A `DatasetType` or the name of one. 

682 dataId : `dict` or `DataCoordinate`, optional 

683 A `dict`-like object containing the `Dimension` links that identify 

684 the dataset within a collection. 

685 collections : `Any`, optional 

686 An expression that fully or partially identifies the collections to 

687 search for the dataset; see 

688 :ref:`daf_butler_collection_expressions` for more information. 

689 Defaults to ``self.defaults.collections``. 

690 timespan : `Timespan`, optional 

691 A timespan that the validity range of the dataset must overlap. 

692 If not provided, any `~CollectionType.CALIBRATION` collections 

693 matched by the ``collections`` argument will not be searched. 

694 **kwargs 

695 Additional keyword arguments passed to 

696 `DataCoordinate.standardize` to convert ``dataId`` to a true 

697 `DataCoordinate` or augment an existing one. 

698 

699 Returns 

700 ------- 

701 ref : `DatasetRef` 

702 A reference to the dataset, or `None` if no matching Dataset 

703 was found. 

704 

705 Raises 

706 ------ 

707 TypeError 

708 Raised if ``collections`` is `None` and 

709 ``self.defaults.collections`` is `None`. 

710 LookupError 

711 Raised if one or more data ID keys are missing. 

712 KeyError 

713 Raised if the dataset type does not exist. 

714 MissingCollectionError 

715 Raised if any of ``collections`` does not exist in the registry. 

716 

717 Notes 

718 ----- 

719 This method simply returns `None` and does not raise an exception even 

720 when the set of collections searched is intrinsically incompatible with 

721 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but 

722 only `~CollectionType.CALIBRATION` collections are being searched. 

723 This may make it harder to debug some lookup failures, but the behavior 

724 is intentional; we consider it more important that failed searches are 

725 reported consistently, regardless of the reason, and that adding 

726 additional collections that do not contain a match to the search path 

727 never changes the behavior. 
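
Examples
--------
A sketch of looking up a single dataset by dataset type and data ID; the
dataset type, dimension values, and collection name are illustrative
assumptions::

    ref = registry.findDataset(
        "flat",
        instrument="DummyCam",
        detector=1,
        physical_filter="DummyCam-foo",
        collections="u/example/calib-run",
    )
    if ref is not None:
        print(ref.dataId, ref.run)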

728 """ 

729 if isinstance(datasetType, DatasetType): 

730 storage = self._managers.datasets[datasetType.name] 

731 else: 

732 storage = self._managers.datasets[datasetType] 

733 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

734 universe=self.dimensions, defaults=self.defaults.dataId, 

735 **kwargs) 

736 if collections is None: 

737 if not self.defaults.collections: 

738 raise TypeError("No collections provided to findDataset, " 

739 "and no defaults from registry construction.") 

740 collections = self.defaults.collections 

741 else: 

742 collections = CollectionSearch.fromExpression(collections) 

743 for collectionRecord in collections.iter(self._managers.collections): 

744 if (collectionRecord.type is CollectionType.CALIBRATION 

745 and (not storage.datasetType.isCalibration() or timespan is None)): 

746 continue 

747 result = storage.find(collectionRecord, dataId, timespan=timespan) 

748 if result is not None: 

749 return result 

750 

751 return None 

752 

753 @transactional 

754 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

755 run: Optional[str] = None) -> List[DatasetRef]: 

756 """Insert one or more datasets into the `Registry` 

757 

758 This always adds new datasets; to associate existing datasets with 

759 a new collection, use ``associate``. 

760 

761 Parameters 

762 ---------- 

763 datasetType : `DatasetType` or `str` 

764 A `DatasetType` or the name of one. 

765 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

766 Dimension-based identifiers for the new datasets. 

767 run : `str`, optional 

768 The name of the run that produced the datasets. Defaults to 

769 ``self.defaults.run``. 

770 

771 Returns 

772 ------- 

773 refs : `list` of `DatasetRef` 

774 Resolved `DatasetRef` instances for all given data IDs (in the same 

775 order). 

776 

777 Raises 

778 ------ 

779 TypeError 

780 Raised if ``run`` is `None` and ``self.defaults.run`` is `None`. 

781 ConflictingDefinitionError 

782 If a dataset with the same dataset type and data ID as one of those 

783 given already exists in ``run``. 

784 MissingCollectionError 

785 Raised if ``run`` does not exist in the registry. 
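
Examples
--------
A sketch of inserting a single dataset; the dataset type (assumed to be
already registered), dimension values, and run name are illustrative
assumptions::

    registry.registerRun("u/example/run")
    (ref,) = registry.insertDatasets(
        "raw",
        [{"instrument": "DummyCam", "exposure": 42, "detector": 1}],
        run="u/example/run",
    )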

786 """ 

787 if isinstance(datasetType, DatasetType): 

788 storage = self._managers.datasets.find(datasetType.name) 

789 if storage is None: 

790 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

791 else: 

792 storage = self._managers.datasets.find(datasetType) 

793 if storage is None: 

794 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

795 if run is None: 

796 if self.defaults.run is None: 

797 raise TypeError("No run provided to insertDatasets, " 

798 "and no default from registry construction.") 

799 run = self.defaults.run 

800 runRecord = self._managers.collections.find(run) 

801 if runRecord.type is not CollectionType.RUN: 

802 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

803 assert isinstance(runRecord, RunRecord) 

804 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

805 for dataId in dataIds] 

806 try: 

807 refs = list(storage.insert(runRecord, expandedDataIds)) 

808 except sqlalchemy.exc.IntegrityError as err: 

809 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

810 f"one or more datasets of type {storage.datasetType} into " 

811 f"collection '{run}'. " 

812 f"This probably means a dataset with the same data ID " 

813 f"and dataset type already exists, but it may also mean a " 

814 f"dimension row is missing.") from err 

815 return refs 

816 

817 def getDataset(self, id: int) -> Optional[DatasetRef]: 

818 """Retrieve a Dataset entry. 

819 

820 Parameters 

821 ---------- 

822 id : `int` 

823 The unique identifier for the dataset. 

824 

825 Returns 

826 ------- 

827 ref : `DatasetRef` or `None` 

828 A ref to the Dataset, or `None` if no matching Dataset 

829 was found. 

830 """ 

831 ref = self._managers.datasets.getDatasetRef(id) 

832 if ref is None: 

833 return None 

834 return ref 

835 

836 @transactional 

837 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

838 """Remove datasets from the Registry. 

839 

840 The datasets will be removed unconditionally from all collections, and 

841 any `Quantum` that consumed this dataset will instead be marked with 

842 having a NULL input. `Datastore` records will *not* be deleted; the 

843 caller is responsible for ensuring that the dataset has already been 

844 removed from all Datastores. 

845 

846 Parameters 

847 ---------- 

848 refs : `Iterable` of `DatasetRef` 

849 References to the datasets to be removed. Must include a valid 

850 ``id`` attribute, and should be considered invalidated upon return. 

851 

852 Raises 

853 ------ 

854 AmbiguousDatasetError 

855 Raised if any ``ref.id`` is `None`. 

856 OrphanedRecordError 

857 Raised if any dataset is still present in any `Datastore`. 

858 """ 

859 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

860 storage = self._managers.datasets.find(datasetType.name) 

861 assert storage is not None 

862 try: 

863 storage.delete(refsForType) 

864 except sqlalchemy.exc.IntegrityError as err: 

865 raise OrphanedRecordError("One or more datasets is still " 

866 "present in one or more Datastores.") from err 

867 

868 @transactional 

869 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

870 """Add existing datasets to a `~CollectionType.TAGGED` collection. 

871 

872 If a DatasetRef with the same exact integer ID is already in a 

873 collection nothing is changed. If a `DatasetRef` with the same 

874 `DatasetType` and data ID but with different integer ID 

875 exists in the collection, `ConflictingDefinitionError` is raised. 

876 

877 Parameters 

878 ---------- 

879 collection : `str` 

880 Indicates the collection the datasets should be associated with. 

881 refs : `Iterable` [ `DatasetRef` ] 

882 An iterable of resolved `DatasetRef` instances that already exist 

883 in this `Registry`. 

884 

885 Raises 

886 ------ 

887 ConflictingDefinitionError 

888 If a Dataset with the given `DatasetRef` already exists in the 

889 given collection. 

890 AmbiguousDatasetError 

891 Raised if ``any(ref.id is None for ref in refs)``. 

892 MissingCollectionError 

893 Raised if ``collection`` does not exist in the registry. 

894 TypeError 

895 Raised if adding new datasets to the given ``collection`` is not 

896 allowed. 

897 """ 

898 collectionRecord = self._managers.collections.find(collection) 

899 if collectionRecord.type is not CollectionType.TAGGED: 

900 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

901 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

902 storage = self._managers.datasets.find(datasetType.name) 

903 assert storage is not None 

904 try: 

905 storage.associate(collectionRecord, refsForType) 

906 except sqlalchemy.exc.IntegrityError as err: 

907 raise ConflictingDefinitionError( 

908 f"Constraint violation while associating dataset of type {datasetType.name} with " 

909 f"collection {collection}. This probably means that one or more datasets with the same " 

910 f"dataset type and data ID already exist in the collection, but it may also indicate " 

911 f"that the datasets do not exist." 

912 ) from err 

913 

914 @transactional 

915 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

916 """Remove existing datasets from a `~CollectionType.TAGGED` collection. 

917 

918 ``collection`` and ``ref`` combinations that are not currently 

919 associated are silently ignored. 

920 

921 Parameters 

922 ---------- 

923 collection : `str` 

924 The collection the datasets should no longer be associated with. 

925 refs : `Iterable` [ `DatasetRef` ] 

926 An iterable of resolved `DatasetRef` instances that already exist 

927 in this `Registry`. 

928 

929 Raises 

930 ------ 

931 AmbiguousDatasetError 

932 Raised if any of the given dataset references is unresolved. 

933 MissingCollectionError 

934 Raised if ``collection`` does not exist in the registry. 

935 TypeError 

936 Raised if removing datasets from the given ``collection`` is not 

937 allowed. 

938 """ 

939 collectionRecord = self._managers.collections.find(collection) 

940 if collectionRecord.type is not CollectionType.TAGGED: 

941 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

942 "expected TAGGED.") 

943 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

944 storage = self._managers.datasets.find(datasetType.name) 

945 assert storage is not None 

946 storage.disassociate(collectionRecord, refsForType) 

947 

948 @transactional 

949 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

950 """Associate one or more datasets with a calibration collection and a 

951 validity range within it. 

952 

953 Parameters 

954 ---------- 

955 collection : `str` 

956 The name of an already-registered `~CollectionType.CALIBRATION` 

957 collection. 

958 refs : `Iterable` [ `DatasetRef` ] 

959 Datasets to be associated. 

960 timespan : `Timespan` 

961 The validity range for these datasets within the collection. 

962 

963 Raises 

964 ------ 

965 AmbiguousDatasetError 

966 Raised if any of the given `DatasetRef` instances is unresolved. 

967 ConflictingDefinitionError 

968 Raised if the collection already contains a different dataset with 

969 the same `DatasetType` and data ID and an overlapping validity 

970 range. 

971 TypeError 

972 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

973 collection or if one or more datasets are of a dataset type for 

974 which `DatasetType.isCalibration` returns `False`. 
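
Examples
--------
A sketch of certifying already-ingested bias frames for January 2020; the
collection name, ``bias_refs`` (resolved `DatasetRef` instances), and
timestamps are illustrative assumptions::

    import astropy.time
    from lsst.daf.butler import CollectionType, Timespan

    begin = astropy.time.Time("2020-01-01T00:00:00", scale="tai")
    end = astropy.time.Time("2020-02-01T00:00:00", scale="tai")
    registry.registerCollection("u/example/calib", CollectionType.CALIBRATION)
    registry.certify("u/example/calib", bias_refs, Timespan(begin, end))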

975 """ 

976 collectionRecord = self._managers.collections.find(collection) 

977 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

978 storage = self._managers.datasets[datasetType.name] 

979 storage.certify(collectionRecord, refsForType, timespan) 

980 

981 @transactional 

982 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *, 

983 dataIds: Optional[Iterable[DataId]] = None) -> None: 

984 """Remove or adjust datasets to clear a validity range within a 

985 calibration collection. 

986 

987 Parameters 

988 ---------- 

989 collection : `str` 

990 The name of an already-registered `~CollectionType.CALIBRATION` 

991 collection. 

992 datasetType : `str` or `DatasetType` 

993 Name or `DatasetType` instance for the datasets to be decertified. 

994 timespan : `Timespan` 

995 The validity range to remove datasets from within the collection. 

996 Datasets that overlap this range but are not contained by it will 

997 have their validity ranges adjusted to not overlap it, which may 

998 split a single dataset validity range into two. 

999 dataIds : `Iterable` [ `DataId` ], optional 

1000 Data IDs that should be decertified within the given validity range. 

1001 If `None`, all data IDs for ``datasetType`` will be 

1002 decertified. 

1003 

1004 Raises 

1005 ------ 

1006 TypeError 

1007 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

1008 collection or if ``datasetType.isCalibration() is False``. 

1009 """ 

1010 collectionRecord = self._managers.collections.find(collection) 

1011 if isinstance(datasetType, str): 

1012 storage = self._managers.datasets[datasetType] 

1013 else: 

1014 storage = self._managers.datasets[datasetType.name] 

1015 standardizedDataIds = None 

1016 if dataIds is not None: 

1017 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) 

1018 for d in dataIds] 

1019 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

1020 

1021 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

1022 """Return an object that allows a new `Datastore` instance to 

1023 communicate with this `Registry`. 

1024 

1025 Returns 

1026 ------- 

1027 manager : `DatastoreRegistryBridgeManager` 

1028 Object that mediates communication between this `Registry` and its 

1029 associated datastores. 

1030 """ 

1031 return self._managers.datastores 

1032 

1033 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

1034 """Retrieve datastore locations for a given dataset. 

1035 

1036 Parameters 

1037 ---------- 

1038 ref : `DatasetRef` 

1039 A reference to the dataset for which to retrieve storage 

1040 information. 

1041 

1042 Returns 

1043 ------- 

1044 datastores : `Iterable` [ `str` ] 

1045 All the matching datastores holding this dataset. 

1046 

1047 Raises 

1048 ------ 

1049 AmbiguousDatasetError 

1050 Raised if ``ref.id`` is `None`. 

1051 """ 

1052 return self._managers.datastores.findDatastores(ref) 

1053 

1054 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

1055 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

1056 withDefaults: bool = True, 

1057 **kwargs: Any) -> DataCoordinate: 

1058 """Expand a dimension-based data ID to include additional information. 

1059 

1060 Parameters 

1061 ---------- 

1062 dataId : `DataCoordinate` or `dict`, optional 

1063 Data ID to be expanded; augmented and overridden by ``kwargs``. 

1064 graph : `DimensionGraph`, optional 

1065 Set of dimensions for the expanded ID. If `None`, the dimensions 

1066 will be inferred from the keys of ``dataId`` and ``kwargs``. 

1067 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph`` 

1068 are silently ignored, providing a way to extract and expand a 

1069 subset of a data ID. 

1070 records : `Mapping` [`str`, `DimensionRecord`], optional 

1071 Dimension record data to use before querying the database for that 

1072 data, keyed by element name. 

1073 withDefaults : `bool`, optional 

1074 Utilize ``self.defaults.dataId`` to fill in missing governor 

1075 dimension key-value pairs. Defaults to `True` (i.e. defaults are 

1076 used). 

1077 **kwargs 

1078 Additional keywords are treated like additional key-value pairs for 

1079 ``dataId``, extending and overriding it. 

1080 

1081 Returns 

1082 ------- 

1083 expanded : `DataCoordinate` 

1084 A data ID that includes full metadata for all of the dimensions it 

1085 identifies, i.e. guarantees that ``expanded.hasRecords()`` and 

1086 ``expanded.hasFull()`` both return `True`. 
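
Examples
--------
A sketch of expanding a minimal exposure data ID; the dimension values
are illustrative assumptions::

    data_id = registry.expandDataId(instrument="DummyCam", exposure=42, detector=1)
    assert data_id.hasRecords() and data_id.hasFull()
    # Dimension records (e.g. the exposure's timespan) are now attached.
    print(data_id.records["exposure"].timespan)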

1087 """ 

1088 if not withDefaults: 

1089 defaults = None 

1090 else: 

1091 defaults = self.defaults.dataId 

1092 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, 

1093 defaults=defaults, **kwargs) 

1094 if standardized.hasRecords(): 

1095 return standardized 

1096 if records is None: 

1097 records = {} 

1098 elif isinstance(records, NamedKeyMapping): 

1099 records = records.byName() 

1100 else: 

1101 records = dict(records) 

1102 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

1103 records.update(dataId.records.byName()) 

1104 keys = standardized.byName() 

1105 for element in standardized.graph.primaryKeyTraversalOrder: 

1106 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

1107 if record is ...: 

1108 if isinstance(element, Dimension) and keys.get(element.name) is None: 

1109 if element in standardized.graph.required: 

1110 raise LookupError( 

1111 f"No value or null value for required dimension {element.name}." 

1112 ) 

1113 keys[element.name] = None 

1114 record = None 

1115 else: 

1116 storage = self._managers.dimensions[element] 

1117 dataIdSet = DataCoordinateIterable.fromScalar( 

1118 DataCoordinate.standardize(keys, graph=element.graph) 

1119 ) 

1120 fetched = tuple(storage.fetch(dataIdSet)) 

1121 try: 

1122 (record,) = fetched 

1123 except ValueError: 

1124 record = None 

1125 records[element.name] = record 

1126 if record is not None: 

1127 for d in element.implied: 

1128 value = getattr(record, d.name) 

1129 if keys.setdefault(d.name, value) != value: 

1130 raise InconsistentDataIdError( 

1131 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

1132 f"but {element.name} implies {d.name}={value!r}." 

1133 ) 

1134 else: 

1135 if element in standardized.graph.required: 

1136 raise LookupError( 

1137 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

1138 ) 

1139 if element.alwaysJoin: 

1140 raise InconsistentDataIdError( 

1141 f"Could not fetch record for element {element.name} via keys {keys}, ", 

1142 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

1143 "related." 

1144 ) 

1145 for d in element.implied: 

1146 keys.setdefault(d.name, None) 

1147 records.setdefault(d.name, None) 

1148 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

1149 

1150 def insertDimensionData(self, element: Union[DimensionElement, str], 

1151 *data: Union[Mapping[str, Any], DimensionRecord], 

1152 conform: bool = True) -> None: 

1153 """Insert one or more dimension records into the database. 

1154 

1155 Parameters 

1156 ---------- 

1157 element : `DimensionElement` or `str` 

1158 The `DimensionElement` or name thereof that identifies the table 

1159 records will be inserted into. 

1160 data : `dict` or `DimensionRecord` (variadic) 

1161 One or more records to insert. 

1162 conform : `bool`, optional 

1163 If `False` (`True` is default) perform no checking or conversions, 

1164 and assume that ``element`` is a `DimensionElement` instance and 

1165 ``data`` is one or more `DimensionRecord` instances of the 

1166 appropriate subclass. 
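
Examples
--------
A sketch of inserting records for an instrument and one of its physical
filters; the values are illustrative assumptions and the exact field
names depend on the configured dimension universe::

    registry.insertDimensionData(
        "instrument",
        {"name": "DummyCam", "visit_max": 1024, "exposure_max": 1024, "detector_max": 4},
    )
    registry.insertDimensionData(
        "physical_filter",
        {"instrument": "DummyCam", "name": "DummyCam-foo", "band": "foo"},
    )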

1167 """ 

1168 if conform: 

1169 if isinstance(element, str): 

1170 element = self.dimensions[element] 

1171 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1172 for row in data] 

1173 else: 

1174 # Ignore typing since caller said to trust them with conform=False. 

1175 records = data # type: ignore 

1176 storage = self._managers.dimensions[element] # type: ignore 

1177 storage.insert(*records) 

1178 

1179 def syncDimensionData(self, element: Union[DimensionElement, str], 

1180 row: Union[Mapping[str, Any], DimensionRecord], 

1181 conform: bool = True) -> bool: 

1182 """Synchronize the given dimension record with the database, inserting 

1183 if it does not already exist and comparing values if it does. 

1184 

1185 Parameters 

1186 ---------- 

1187 element : `DimensionElement` or `str` 

1188 The `DimensionElement` or name thereof that identifies the table 

1189 records will be inserted into. 

1190 row : `dict` or `DimensionRecord` 

1191 The record to insert. 

1192 conform : `bool`, optional 

1193 If `False` (`True` is default) perform no checking or conversions, 

1194 and assume that ``element`` is a `DimensionElement` instance and 

1195 ``row`` is a `DimensionRecord` instance of the 

1196 appropriate subclass. 

1197 

1198 Returns 

1199 ------- 

1200 inserted : `bool` 

1201 `True` if a new row was inserted, `False` otherwise. 

1202 

1203 Raises 

1204 ------ 

1205 ConflictingDefinitionError 

1206 Raised if the record exists in the database (according to primary 

1207 key lookup) but is inconsistent with the given one. 

1208 """ 

1209 if conform: 

1210 if isinstance(element, str): 

1211 element = self.dimensions[element] 

1212 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1213 else: 

1214 # Ignore typing since caller said to trust them with conform=False. 

1215 record = row # type: ignore 

1216 storage = self._managers.dimensions[element] # type: ignore 

1217 return storage.sync(record) 

1218 

1219 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

1220 ) -> Iterator[DatasetType]: 

1221 """Iterate over the dataset types whose names match an expression. 

1222 

1223 Parameters 

1224 ---------- 

1225 expression : `Any`, optional 

1226 An expression that fully or partially identifies the dataset types 

1227 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1228 `...` can be used to return all dataset types, and is the default. 

1229 See :ref:`daf_butler_dataset_type_expressions` for more 

1230 information. 

1231 components : `bool`, optional 

1232 If `True`, apply all expression patterns to component dataset type 

1233 names as well. If `False`, never apply patterns to components. 

1234 If `None` (default), apply patterns to components only if their 

1235 parent datasets were not matched by the expression. 

1236 Fully-specified component datasets (`str` or `DatasetType` 

1237 instances) are always included. 

1238 

1239 Yields 

1240 ------ 

1241 datasetType : `DatasetType` 

1242 A `DatasetType` instance whose name matches ``expression``. 

1243 """ 

1244 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

1245 if wildcard is Ellipsis: 

1246 for datasetType in self._managers.datasets: 

1247 # The dataset type can no longer be a component 

1248 yield datasetType 

1249 if components: 

1250 # Automatically create the component dataset types 

1251 try: 

1252 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes() 

1253 except KeyError as err: 

1254 _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; " 

1255 "if it has components they will not be included in query results.") 

1256 else: 

1257 yield from componentsForDatasetType 

1258 return 

1259 done: Set[str] = set() 

1260 for name in wildcard.strings: 

1261 storage = self._managers.datasets.find(name) 

1262 if storage is not None: 

1263 done.add(storage.datasetType.name) 

1264 yield storage.datasetType 

1265 if wildcard.patterns: 

1266 # If components (the argument) is None, we'll save component 

1267 # datasets that we might want to match, but only if their parents 

1268 # didn't get included. 

1269 componentsForLater = [] 

1270 for registeredDatasetType in self._managers.datasets: 

1271 # Components are not stored in registry so expand them here 

1272 allDatasetTypes = [registeredDatasetType] 

1273 try: 

1274 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes()) 

1275 except KeyError as err: 

1276 _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; " 

1277 "if it has components they will not be included in query results.") 

1278 for datasetType in allDatasetTypes: 

1279 if datasetType.name in done: 

1280 continue 

1281 parentName, componentName = datasetType.nameAndComponent() 

1282 if componentName is not None and not components: 

1283 if components is None and parentName not in done: 

1284 componentsForLater.append(datasetType) 

1285 continue 

1286 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1287 done.add(datasetType.name) 

1288 yield datasetType 

1289 # Go back and try to match saved components. 

1290 for datasetType in componentsForLater: 

1291 parentName, _ = datasetType.nameAndComponent() 

1292 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1293 yield datasetType 

1294 

1295 def queryCollections(self, expression: Any = ..., 

1296 datasetType: Optional[DatasetType] = None, 

1297 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1298 flattenChains: bool = False, 

1299 includeChains: Optional[bool] = None) -> Iterator[str]: 

1300 """Iterate over the collections whose names match an expression. 

1301 

1302 Parameters 

1303 ---------- 

1304 expression : `Any`, optional 

1305 An expression that fully or partially identifies the collections 

1306 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1307 `...` can be used to return all collections, and is the default. 

1308 See :ref:`daf_butler_collection_expressions` for more 

1309 information. 

1310 datasetType : `DatasetType`, optional 

1311 If provided, only yield collections that may contain datasets of 

1312 this type. This is a conservative approximation in general; it may 

1313 yield collections that do not have any such datasets. 

1314 collectionTypes : `AbstractSet` [ `CollectionType` ], optional 

1315 If provided, only yield collections of these types. 

1316 flattenChains : `bool`, optional 

1317 If `True` (`False` is default), recursively yield the child 

1318 collections of matching `~CollectionType.CHAINED` collections. 

1319 includeChains : `bool`, optional 

1320 If `True`, yield records for matching `~CollectionType.CHAINED` 

1321 collections. Default is the opposite of ``flattenChains``: include 

1322 either CHAINED collections or their children, but not both. 

1323 

1324 Yields 

1325 ------ 

1326 collection : `str` 

1327 The name of a collection that matches ``expression``. 
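
Examples
--------
A sketch of listing RUN collections whose names match a pattern; the
pattern is an illustrative assumption::

    import re
    from lsst.daf.butler import CollectionType

    for name in registry.queryCollections(re.compile(r"u/example/.*"),
                                          collectionTypes={CollectionType.RUN}):
        print(name)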

1328 """ 

1329 # Right now the datasetTypes argument is completely ignored, but that 

1330 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

1331 # ticket will take care of that. 

1332 query = CollectionQuery.fromExpression(expression) 

1333 for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes), 

1334 flattenChains=flattenChains, includeChains=includeChains): 

1335 yield record.name 

1336 

1337 def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder: 

1338 """Return a `QueryBuilder` instance capable of constructing and 

1339 managing more complex queries than those obtainable via `Registry` 

1340 interfaces. 

1341 

1342 This is an advanced interface; downstream code should prefer 

1343 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

1344 are sufficient. 

1345 

1346 Parameters 

1347 ---------- 

1348 summary : `queries.QuerySummary` 

1349 Object describing and categorizing the full set of dimensions that 

1350 will be included in the query. 

1351 

1352 Returns 

1353 ------- 

1354 builder : `queries.QueryBuilder` 

1355 Object that can be used to construct and perform advanced queries. 

1356 """ 

1357 return queries.QueryBuilder( 

1358 summary, 

1359 queries.RegistryManagers( 

1360 collections=self._managers.collections, 

1361 dimensions=self._managers.dimensions, 

1362 datasets=self._managers.datasets, 

1363 TimespanReprClass=self._db.getTimespanRepresentation(), 

1364 ), 

1365 ) 

1366 

1367 def queryDatasets(self, datasetType: Any, *, 

1368 collections: Any = None, 

1369 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1370 dataId: Optional[DataId] = None, 

1371 where: Optional[str] = None, 

1372 findFirst: bool = False, 

1373 components: Optional[bool] = None, 

1374 bind: Optional[Mapping[str, Any]] = None, 

1375 check: bool = True, 

1376 **kwargs: Any) -> queries.DatasetQueryResults: 

1377 """Query for and iterate over dataset references matching user-provided 

1378 criteria. 

1379 

1380 Parameters 

1381 ---------- 

1382 datasetType 

1383 An expression that fully or partially identifies the dataset types 

1384 to be queried. Allowed types include `DatasetType`, `str`, 

1385 `re.Pattern`, and iterables thereof. The special value `...` can 

1386 be used to query all dataset types. See 

1387 :ref:`daf_butler_dataset_type_expressions` for more information. 

1388 collections : `Any`, optional 

1389 An expression that fully or partially identifies the collections 

1390 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1391 thereof. `...` can be used to find datasets from all 

1392 `~CollectionType.RUN` collections (no other collections are 

1393 necessary, because all datasets are in a ``RUN`` collection). See 

1394 :ref:`daf_butler_collection_expressions` for more information. 

1395 If not provided, ``self.default.collections`` is used. 

1396 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1397 Dimensions to include in the query (in addition to those used 

1398 to identify the queried dataset type(s)), either to constrain 

1399 the resulting datasets to those for which a matching dimension 

1400 exists, or to relate the dataset type's dimensions to dimensions 

1401 referenced by the ``dataId`` or ``where`` arguments. 

1402 dataId : `dict` or `DataCoordinate`, optional 

1403 A data ID whose key-value pairs are used as equality constraints 

1404 in the query. 

1405 where : `str`, optional 

1406 A string expression similar to a SQL WHERE clause. May involve 

1407 any column of a dimension table or (as a shortcut for the primary 

1408 key column of a dimension table) dimension name. See 

1409 :ref:`daf_butler_dimension_expressions` for more information. 

1410 findFirst : `bool`, optional 

1411 If `True` (`False` is default), for each result data ID, only 

1412 yield one `DatasetRef` of each `DatasetType`, from the first 

1413 collection in which a dataset of that dataset type appears 

1414 (according to the order of ``collections`` passed in). If `True`, 

1415 ``collections`` must not contain regular expressions and may not 

1416 be `...`. 

1417 components : `bool`, optional 

1418 If `True`, apply all dataset expression patterns to component 

1419 dataset type names as well. If `False`, never apply patterns to 

1420 components. If `None` (default), apply patterns to components only 

1421 if their parent datasets were not matched by the expression. 

1422 Fully-specified component datasets (`str` or `DatasetType` 

1423 instances) are always included. 

1424 bind : `Mapping`, optional 

1425 Mapping containing literal values that should be injected into the 

1426 ``where`` expression, keyed by the identifiers they replace. 

1427 check : `bool`, optional 

1428 If `True` (default) check the query for consistency before 

1429 executing it. This may reject some valid queries that resemble 

1430 common mistakes (e.g. queries for visits without specifying an 

1431 instrument). 

1432 **kwargs 

1433 Additional keyword arguments are forwarded to 

1434 `DataCoordinate.standardize` when processing the ``dataId`` 

1435 argument (and may be used to provide a constraining data ID even 

1436 when the ``dataId`` argument is `None`). 

1437 

1438 Returns 

1439 ------- 

1440 refs : `queries.DatasetQueryResults` 

1441 Dataset references matching the given query criteria. 

1442 

1443 Raises 

1444 ------ 

1445 TypeError 

1446 Raised when the arguments are incompatible, such as when a 

1447 collection wildcard is passed when ``findFirst`` is `True`, or 

1448 when ``collections`` is `None` and ``self.defaults.collections`` is 

1449 also `None`. 

1450 

1451 Notes 

1452 ----- 

1453 When multiple dataset types are queried in a single call, the 

1454 results of this operation are equivalent to querying for each dataset 

1455 type separately in turn, and no information about the relationships 

1456 between datasets of different types is included. In contexts where 

1457 that kind of information is important, the recommended pattern is to 

1458 use `queryDataIds` to first obtain data IDs (possibly with the 

1459 desired dataset types and collections passed as constraints to the 

1460 query), and then use multiple (generally much simpler) calls to 

1461 `queryDatasets` with the returned data IDs passed as constraints. 

1462 """ 

1463 # Standardize the collections expression. 

1464 if collections is None: 

1465 if not self.defaults.collections: 

1466 raise TypeError("No collections provided to queryDatasets, " 

1467 "and no defaults from registry construction.") 

1468 collections = self.defaults.collections 

1469 elif findFirst: 

1470 collections = CollectionSearch.fromExpression(collections) 

1471 else: 

1472 collections = CollectionQuery.fromExpression(collections) 

1473 # Standardize and expand the data ID provided as a constraint. 

1474 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1475 

1476 # We can only query directly if given a non-component DatasetType 

1477 # instance. If we were given an expression or str or a component 

1478 # DatasetType instance, we'll populate this dict, recurse, and return. 

1479 # If we already have a non-component DatasetType, it will remain None 

1480 # and we'll run the query directly. 

1481 composition: Optional[ 

1482 Dict[ 

1483 DatasetType, # parent dataset type 

1484 List[Optional[str]] # component name, or None for parent 

1485 ] 

1486 ] = None 

1487 if not isinstance(datasetType, DatasetType): 

1488 # We were given a dataset type expression (which may be as simple 

1489 # as a str). Loop over all matching datasets, delegating handling 

1490 # of the `components` argument to queryDatasetTypes, as we populate 

1491 # the composition dict. 

1492 composition = defaultdict(list) 

1493 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

1494 parentName, componentName = trueDatasetType.nameAndComponent() 

1495 if componentName is not None: 

1496 parentDatasetType = self.getDatasetType(parentName) 

1497 composition.setdefault(parentDatasetType, []).append(componentName) 

1498 else: 

1499 composition.setdefault(trueDatasetType, []).append(None) 

1500 elif datasetType.isComponent(): 

1501 # We were given a true DatasetType instance, but it's a component. 

1502 # The composition dict will have exactly one item. 

1503 parentName, componentName = datasetType.nameAndComponent() 

1504 parentDatasetType = self.getDatasetType(parentName) 

1505 composition = {parentDatasetType: [componentName]} 

1506 if composition is not None: 

1507 # We need to recurse. Do that once for each parent dataset type. 

1508 chain = [] 

1509 for parentDatasetType, componentNames in composition.items(): 

1510 parentResults = self.queryDatasets( 

1511 parentDatasetType, 

1512 collections=collections, 

1513 dimensions=dimensions, 

1514 dataId=standardizedDataId, 

1515 where=where, 

1516 findFirst=findFirst, 

1517 check=check, 

1518 ) 

1519 if isinstance(parentResults, queries.ParentDatasetQueryResults): 

1520 chain.append( 

1521 parentResults.withComponents(componentNames) 

1522 ) 

1523 else: 

1524 # Should only happen if we know there would be no results. 

1525 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \ 

1526 and not parentResults._chain 

1527 return queries.ChainedDatasetQueryResults(chain) 

1528 # If we get here, there's no need to recurse (or we are already 

1529 # recursing; there can only ever be one level of recursion). 

1530 

1531 # The full set of dimensions in the query is the combination of those 

1532 # needed for the DatasetType and those explicitly requested, if any. 

1533 requestedDimensionNames = set(datasetType.dimensions.names) 

1534 if dimensions is not None: 

1535 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1536 # Construct the summary structure needed to construct a QueryBuilder. 

1537 summary = queries.QuerySummary( 

1538 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1539 dataId=standardizedDataId, 

1540 expression=where, 

1541 bind=bind, 

1542 defaults=self.defaults.dataId, 

1543 check=check, 

1544 ) 

1545 builder = self.makeQueryBuilder(summary) 

1546 # Add the dataset subquery to the query, telling the QueryBuilder to 

1547 # include the rank of the selected collection in the results only if we 

1548 # need to do a find-first search. Note that if any of the collections 

1549 # are actually wildcard expressions and ``findFirst`` is `True`, this 

1550 # will raise TypeError for us. 

1551 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst): 

1552 return queries.ChainedDatasetQueryResults(()) 

1553 query = builder.finish() 

1554 return queries.ParentDatasetQueryResults(self._db, query, components=[None]) 

1555 
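# Illustrative usage sketch (not part of the original listing): how a caller
# might invoke `queryDatasets` given an existing `Registry` instance named
# ``registry``, following the signature documented above. The dataset type
# name "calexp", the collection "HSC/runs/example", and the visit value are
# hypothetical placeholders; the ``bind`` mapping supplies the literal for
# the ``my_visit`` identifier used in ``where``.
#
#     refs = registry.queryDatasets(
#         "calexp",
#         collections="HSC/runs/example",
#         where="instrument = 'HSC' AND visit = my_visit",
#         bind={"my_visit": 903334},
#         findFirst=True,
#     )
#     for ref in refs:
#         print(ref.datasetType.name, ref.dataId, ref.run)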

1556 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1557 dataId: Optional[DataId] = None, 

1558 datasets: Any = None, 

1559 collections: Any = None, 

1560 where: Optional[str] = None, 

1561 components: Optional[bool] = None, 

1562 bind: Optional[Mapping[str, Any]] = None, 

1563 check: bool = True, 

1564 **kwargs: Any) -> queries.DataCoordinateQueryResults: 

1565 """Query for data IDs matching user-provided criteria. 

1566 

1567 Parameters 

1568 ---------- 

1569 dimensions : `Dimension` or `str`, or iterable thereof 

1570 The dimensions of the data IDs to yield, as either `Dimension` 

1571 instances or `str`. Will be automatically expanded to a complete 

1572 `DimensionGraph`. 

1573 dataId : `dict` or `DataCoordinate`, optional 

1574 A data ID whose key-value pairs are used as equality constraints 

1575 in the query. 

1576 datasets : `Any`, optional 

1577 An expression that fully or partially identifies dataset types 

1578 that should constrain the yielded data IDs. For example, including 

1579 "raw" here would constrain the yielded ``instrument``, 

1580 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1581 those for which at least one "raw" dataset exists in 

1582 ``collections``. Allowed types include `DatasetType`, `str`, 

1583 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1584 expressions, ``...`` is not permitted - it doesn't make sense to 

1585 constrain data IDs on the existence of *all* datasets. 

1586 See :ref:`daf_butler_dataset_type_expressions` for more 

1587 information. 

1588 collections : `Any`, optional 

1589 An expression that fully or partially identifies the collections 

1590 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1591 thereof. `...` can be used to search all collections. Must be 

1592 provided if ``datasets`` is, and is ignored if it is not. See 

1593 :ref:`daf_butler_collection_expressions` for more information. 

1594 If not provided, ``self.defaults.collections`` is used. 

1595 where : `str`, optional 

1596 A string expression similar to a SQL WHERE clause. May involve 

1597 any column of a dimension table or (as a shortcut for the primary 

1598 key column of a dimension table) dimension name. See 

1599 :ref:`daf_butler_dimension_expressions` for more information. 

1600 components : `bool`, optional 

1601 If `True`, apply all dataset expression patterns to component 

1602 dataset type names as well. If `False`, never apply patterns to 

1603 components. If `None` (default), apply patterns to components only 

1604 if their parent datasets were not matched by the expression. 

1605 Fully-specified component datasets (`str` or `DatasetType` 

1606 instances) are always included. 

1607 bind : `Mapping`, optional 

1608 Mapping containing literal values that should be injected into the 

1609 ``where`` expression, keyed by the identifiers they replace. 

1610 check : `bool`, optional 

1611 If `True` (default) check the query for consistency before 

1612 executing it. This may reject some valid queries that resemble 

1613 common mistakes (e.g. queries for visits without specifying an 

1614 instrument). 

1615 **kwargs 

1616 Additional keyword arguments are forwarded to 

1617 `DataCoordinate.standardize` when processing the ``dataId`` 

1618 argument (and may be used to provide a constraining data ID even 

1619 when the ``dataId`` argument is `None`). 

1620 

1621 Returns 

1622 ------- 

1623 dataIds : `DataCoordinateQueryResults` 

1624 Data IDs matching the given query parameters. These are guaranteed 

1625 to identify all dimensions (`DataCoordinate.hasFull` returns 

1626 `True`), but will not contain `DimensionRecord` objects 

1627 (`DataCoordinate.hasRecords` returns `False`). Call 

1628 `DataCoordinateQueryResults.expanded` on the returned object to 

1629 fetch those (and consider using 

1630 `DataCoordinateQueryResults.materialize` on the returned object 

1631 first if the expected number of rows is very large). See 

1632 documentation for those methods for additional information. 

1633 

1634 Raises 

1635 ------ 

1636 TypeError 

1637 Raised if ``collections`` is `None`, ``self.defaults.collections`` 

1638 is `None`, and ``datasets`` is not `None`. 

1639 """ 

1640 dimensions = iterable(dimensions) 

1641 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1642 standardizedDatasetTypes = set() 

1643 requestedDimensions = self.dimensions.extract(dimensions) 

1644 queryDimensionNames = set(requestedDimensions.names) 

1645 if datasets is not None: 

1646 if collections is None: 

1647 if not self.defaults.collections: 

1648 raise TypeError("Cannot pass 'datasets' without 'collections'; no default collections are defined.") 

1649 collections = self.defaults.collections 

1650 else: 

1651 # Preprocess collections expression in case the original 

1652 # included single-pass iterators (we'll want to use it multiple 

1653 # times below). 

1654 collections = CollectionQuery.fromExpression(collections) 

1655 for datasetType in self.queryDatasetTypes(datasets, components=components): 

1656 queryDimensionNames.update(datasetType.dimensions.names) 

1657 # If any matched dataset type is a component, just operate on 

1658 # its parent instead, because Registry doesn't know anything 

1659 # about what components exist, and here (unlike queryDatasets) 

1660 # we don't care about returning them. 

1661 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

1662 if componentName is not None: 

1663 datasetType = self.getDatasetType(parentDatasetTypeName) 

1664 standardizedDatasetTypes.add(datasetType) 

1665 

1666 summary = queries.QuerySummary( 

1667 requested=DimensionGraph(self.dimensions, names=queryDimensionNames), 

1668 dataId=standardizedDataId, 

1669 expression=where, 

1670 bind=bind, 

1671 defaults=self.defaults.dataId, 

1672 check=check, 

1673 ) 

1674 builder = self.makeQueryBuilder(summary) 

1675 for datasetType in standardizedDatasetTypes: 

1676 builder.joinDataset(datasetType, collections, isResult=False) 

1677 query = builder.finish() 

1678 return queries.DataCoordinateQueryResults(self._db, query) 

1679 
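# Illustrative usage sketch (not part of the original listing): requesting
# data IDs for a set of dimensions, constrained by the existence of a
# dataset, using an existing `Registry` instance named ``registry``. The
# dataset type "raw", the collection "HSC/raw/all", and the instrument value
# are hypothetical placeholders; ``expanded()`` is the results method
# documented in the Returns section above.
#
#     dataIds = registry.queryDataIds(
#         ["exposure", "detector"],
#         datasets="raw",
#         collections="HSC/raw/all",
#         where="instrument = 'HSC'",
#     )
#     for dataId in dataIds.expanded():
#         print(dataId)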

1680 def queryDimensionRecords(self, element: Union[DimensionElement, str], *, 

1681 dataId: Optional[DataId] = None, 

1682 datasets: Any = None, 

1683 collections: Any = None, 

1684 where: Optional[str] = None, 

1685 components: Optional[bool] = None, 

1686 bind: Optional[Mapping[str, Any]] = None, 

1687 check: bool = True, 

1688 **kwargs: Any) -> Iterator[DimensionRecord]: 

1689 """Query for dimension information matching user-provided criteria. 

1690 

1691 Parameters 

1692 ---------- 

1693 element : `DimensionElement` or `str` 

1694 The dimension element to obtain records for. 

1695 dataId : `dict` or `DataCoordinate`, optional 

1696 A data ID whose key-value pairs are used as equality constraints 

1697 in the query. 

1698 datasets : `Any`, optional 

1699 An expression that fully or partially identifies dataset types 

1700 that should constrain the yielded records. See `queryDataIds` and 

1701 :ref:`daf_butler_dataset_type_expressions` for more information. 

1702 collections : `Any`, optional 

1703 An expression that fully or partially identifies the collections 

1704 to search for datasets. See `queryDataIds` and 

1705 :ref:`daf_butler_collection_expressions` for more information. 

1706 where : `str`, optional 

1707 A string expression similar to a SQL WHERE clause. See 

1708 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more 

1709 information. 

1710 components : `bool`, optional 

1711 Whether to apply dataset expressions to components as well. 

1712 See `queryDataIds` for more information. 

1713 bind : `Mapping`, optional 

1714 Mapping containing literal values that should be injected into the 

1715 ``where`` expression, keyed by the identifiers they replace. 

1716 check : `bool`, optional 

1717 If `True` (default) check the query for consistency before 

1718 executing it. This may reject some valid queries that resemble 

1719 common mistakes (e.g. queries for visits without specifying an 

1720 instrument). 

1721 **kwargs 

1722 Additional keyword arguments are forwarded to 

1723 `DataCoordinate.standardize` when processing the ``dataId`` 

1724 argument (and may be used to provide a constraining data ID even 

1725 when the ``dataId`` argument is `None`). 

1726 

1727 Returns 

1728 ------- 

1729 records : `Iterator` [ `DimensionRecord` ] 

1730 Dimension records matching the given query parameters. 

1731 """ 

1732 if not isinstance(element, DimensionElement): 

1733 try: 

1734 element = self.dimensions[element] 

1735 except KeyError as e: 

1736 raise KeyError(f"No such dimension '{element}', available dimensions: " 

1737 + str(self.dimensions.getStaticElements())) from e 

1738 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections, 

1739 where=where, components=components, bind=bind, check=check, **kwargs) 

1740 return iter(self._managers.dimensions[element].fetch(dataIds)) 

1741 
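# Illustrative usage sketch (not part of the original listing): fetching
# dimension records for a single element via an existing `Registry` instance
# named ``registry``. The element name "detector" and the instrument value
# are hypothetical placeholders.
#
#     for record in registry.queryDimensionRecords(
#         "detector",
#         where="instrument = 'HSC'",
#     ):
#         print(record)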

1742 def queryDatasetAssociations( 

1743 self, 

1744 datasetType: Union[str, DatasetType], 

1745 collections: Any = None, 

1746 *, 

1747 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1748 flattenChains: bool = False, 

1749 ) -> Iterator[DatasetAssociation]: 

1750 """Iterate over dataset-collection combinations where the dataset is in 

1751 the collection. 

1752 

1753 This method is a temporary placeholder for better support for 

1754 association results in `queryDatasets`. It will probably be 

1755 removed in the future, and should be avoided in production code 

1756 whenever possible. 

1757 

1758 Parameters 

1759 ---------- 

1760 datasetType : `DatasetType` or `str` 

1761 A dataset type object or the name of one. 

1762 collections : `Any`, optional 

1763 An expression that fully or partially identifies the collections 

1764 to search for datasets. See `queryCollections` and 

1765 :ref:`daf_butler_collection_expressions` for more information. 

1766 If not provided, ``self.defaults.collections`` is used. 

1767 collectionTypes : `AbstractSet` [ `CollectionType` ], optional 

1768 If provided, only yield associations from collections of these 

1769 types. 

1770 flattenChains : `bool`, optional 

1771 If `True`, search in the children of 

1772 `~CollectionType.CHAINED` collections. If `False` (default), ``CHAINED`` 

1773 collections are ignored. 

1774 

1775 Yields 

1776 ------ 

1777 association : `DatasetAssociation` 

1778 Object representing the relationship between a single dataset and 

1779 a single collection. 

1780 

1781 Raises 

1782 ------ 

1783 TypeError 

1784 Raised if ``collections`` is `None` and 

1785 ``self.defaults.collections`` is `None`. 

1786 """ 

1787 if collections is None: 

1788 if not self.defaults.collections: 

1789 raise TypeError("No collections provided to queryDatasetAssociations, " 

1790 "and no defaults from registry construction.") 

1791 collections = self.defaults.collections 

1792 else: 

1793 collections = CollectionQuery.fromExpression(collections) 

1794 TimespanReprClass = self._db.getTimespanRepresentation() 

1795 if isinstance(datasetType, str): 

1796 storage = self._managers.datasets[datasetType] 

1797 else: 

1798 storage = self._managers.datasets[datasetType.name] 

1799 for collectionRecord in collections.iter(self._managers.collections, 

1800 collectionTypes=frozenset(collectionTypes), 

1801 flattenChains=flattenChains): 

1802 query = storage.select(collectionRecord) 

1803 if query is None: 

1804 continue 

1805 for row in self._db.query(query.combine()): 

1806 dataId = DataCoordinate.fromRequiredValues( 

1807 storage.datasetType.dimensions, 

1808 tuple(row[name] for name in storage.datasetType.dimensions.required.names) 

1809 ) 

1810 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]] 

1811 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, 

1812 conform=False) 

1813 if collectionRecord.type is CollectionType.CALIBRATION: 

1814 timespan = TimespanReprClass.extract(row) 

1815 else: 

1816 timespan = None 

1817 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1818 
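# Illustrative usage sketch (not part of the original listing): iterating
# over the collections that contain datasets of a given type, including the
# validity timespan for calibration collections, via an existing `Registry`
# instance named ``registry``. The dataset type name "bias" is a
# hypothetical placeholder; ``...`` searches all collections.
#
#     for assoc in registry.queryDatasetAssociations(
#         "bias",
#         collections=...,
#         collectionTypes={CollectionType.CALIBRATION},
#     ):
#         print(assoc.collection, assoc.ref.dataId, assoc.timespan)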

1819 storageClasses: StorageClassFactory 

1820 """All storage classes known to the registry (`StorageClassFactory`). 

1821 """