
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "Registry", 

26) 

27 

28from collections import defaultdict 

29import contextlib 

30import logging 

31from typing import ( 

32 Any, 

33 Dict, 

34 Iterable, 

35 Iterator, 

36 List, 

37 Mapping, 

38 Optional, 

39 Set, 

40 TYPE_CHECKING, 

41 Union, 

42) 

43 

44import sqlalchemy 

45 

46from ..core import ( 

47 ButlerURI, 

48 Config, 

49 DataCoordinate, 

50 DataCoordinateIterable, 

51 DataId, 

52 DatasetAssociation, 

53 DatasetRef, 

54 DatasetType, 

55 ddl, 

56 Dimension, 

57 DimensionConfig, 

58 DimensionElement, 

59 DimensionGraph, 

60 DimensionRecord, 

61 DimensionUniverse, 

62 NamedKeyMapping, 

63 NameLookupMapping, 

64 StorageClassFactory, 

65 Timespan, 

66) 

67from . import queries 

68from ..core.utils import iterable, transactional 

69from ._config import RegistryConfig 

70from ._collectionType import CollectionType 

71from ._defaults import RegistryDefaults 

72from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

73from .managers import RegistryManagerTypes, RegistryManagerInstances 

74from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis 

75from .summaries import CollectionSummary 

76from .interfaces import ChainedCollectionRecord, RunRecord 

77 

78if TYPE_CHECKING:

79 from .._butlerConfig import ButlerConfig 

80 from .interfaces import ( 

81 Database, 

82 DatastoreRegistryBridgeManager, 

83 ) 

84 

85 

86_LOG = logging.getLogger(__name__) 

87 

88# key for dimensions configuration in attributes table 

89_DIMENSIONS_ATTR = "config:dimensions.json" 

90 

91 

92class Registry: 

93 """Registry interface. 

94 

95 Parameters 

96 ---------- 

97 database : `Database` 

98 Database instance to store Registry. 

99 defaults : `RegistryDefaults`, optional 

100 Default collection search path and/or output `~CollectionType.RUN` 

101 collection. 

102 attributes : `type` 

103 Manager class implementing `ButlerAttributeManager`. 

104 opaque : `type` 

105 Manager class implementing `OpaqueTableStorageManager`. 

106 dimensions : `type` 

107 Manager class implementing `DimensionRecordStorageManager`. 

108 collections : `type` 

109 Manager class implementing `CollectionManager`. 

110 datasets : `type` 

111 Manager class implementing `DatasetRecordStorageManager`. 

112 datastoreBridges : `type` 

113 Manager class implementing `DatastoreRegistryBridgeManager`. 

114 dimensionConfig : `DimensionConfig`, optional 

115 Dimension universe configuration, only used when ``create`` is True. 

116 writeable : `bool`, optional 

117 If True then Registry will support write operations. 

118 create : `bool`, optional 

119 If `True` then the database schema will be initialized; the database must

120 be empty before instantiating Registry.

121 """ 

122 

123 defaultConfigFile: Optional[str] = None 

124 """Path to configuration defaults. Accessed within the ``configs`` resource 

125 or relative to a search path. Can be None if no defaults specified. 

126 """ 

127 

128 @classmethod 

129 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None, 

130 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

131 butlerRoot: Optional[str] = None) -> Registry: 

132 """Create registry database and return `Registry` instance. 

133 

134 This method initializes database contents; the database must be empty

135 prior to calling this method.

136 

137 Parameters 

138 ---------- 

139 config : `RegistryConfig` or `str`, optional 

140 Registry configuration; if missing, the default configuration will

141 be loaded from ``registry.yaml``.

142 dimensionConfig : `DimensionConfig` or `str`, optional 

143 Dimensions configuration; if missing, the default configuration

144 will be loaded from ``dimensions.yaml``.

145 butlerRoot : `str`, optional 

146 Path to the repository root this `Registry` will manage. 

147 

148 Returns 

149 ------- 

150 registry : `Registry` 

151 A new `Registry` instance. 
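
Examples
--------
A minimal, illustrative sketch (not part of the original source); the
repository root and configuration file paths are hypothetical::

    registry = Registry.createFromConfig(
        "/repo/registry.yaml",
        dimensionConfig="/repo/dimensions.yaml",
        butlerRoot="/repo",
    )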

152 """ 

153 if isinstance(config, str): 

154 config = RegistryConfig(config) 

155 elif config is None: 

156 config = RegistryConfig() 

157 elif not isinstance(config, RegistryConfig): 

158 raise TypeError(f"Incompatible Registry configuration type: {type(config)}") 

159 config.replaceRoot(butlerRoot) 

160 

161 if isinstance(dimensionConfig, str): 

162 dimensionConfig = DimensionConfig(dimensionConfig)

163 elif dimensionConfig is None: 

164 dimensionConfig = DimensionConfig() 

165 elif not isinstance(dimensionConfig, DimensionConfig): 

166 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

167 

168 DatabaseClass = config.getDatabaseClass() 

169 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

170 namespace=config.get("namespace")) 

171 managerTypes = RegistryManagerTypes.fromConfig(config) 

172 managers = managerTypes.makeRepo(database, dimensionConfig) 

173 return cls(database, RegistryDefaults(), managers) 

174 

175 @classmethod 

176 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], 

177 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True, 

178 defaults: Optional[RegistryDefaults] = None) -> Registry: 

179 """Create `Registry` subclass instance from `config`. 

180 

181 Registry database must be initialized prior to calling this method.

182 

183 Parameters 

184 ---------- 

185 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

186 Registry configuration.

187 butlerRoot : `str` or `ButlerURI`, optional 

188 Path to the repository root this `Registry` will manage. 

189 writeable : `bool`, optional 

190 If `True` (default) create a read-write connection to the database. 

191 defaults : `RegistryDefaults`, optional 

192 Default collection search path and/or output `~CollectionType.RUN` 

193 collection. 

194 

195 Returns 

196 ------- 

197 registry : `Registry` (subclass) 

198 A new `Registry` subclass instance. 
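
Examples
--------
Illustrative only (not part of the original source); the configuration path
is hypothetical::

    registry = Registry.fromConfig("/repo/butler.yaml", writeable=False)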

199 """ 

200 if not isinstance(config, RegistryConfig): 

201 if isinstance(config, str) or isinstance(config, Config): 

202 config = RegistryConfig(config) 

203 else: 

204 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

205 config.replaceRoot(butlerRoot) 

206 DatabaseClass = config.getDatabaseClass() 

207 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

208 namespace=config.get("namespace"), writeable=writeable) 

209 managerTypes = RegistryManagerTypes.fromConfig(config) 

210 managers = managerTypes.loadRepo(database) 

211 if defaults is None: 

212 defaults = RegistryDefaults() 

213 return cls(database, defaults, managers) 

214 

215 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances): 

216 self._db = database 

217 self._managers = managers 

218 self.storageClasses = StorageClassFactory() 

219 # Intentionally invoke property setter to initialize defaults. This 

220 # can only be done after most of the rest of Registry has already been 

221 # initialized, and must be done before the property getter is used. 

222 self.defaults = defaults 

223 

224 def __str__(self) -> str: 

225 return str(self._db) 

226 

227 def __repr__(self) -> str: 

228 return f"Registry({self._db!r}, {self.dimensions!r})" 

229 

230 def isWriteable(self) -> bool: 

231 """Return `True` if this registry allows write operations, and `False` 

232 otherwise. 

233 """ 

234 return self._db.isWriteable() 

235 

236 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry: 

237 """Create a new `Registry` backed by the same data repository and 

238 connection as this one, but independent defaults. 

239 

240 Parameters 

241 ---------- 

242 defaults : `RegistryDefaults`, optional 

243 Default collections and data ID values for the new registry. If 

244 not provided, ``self.defaults`` will be used (but future changes 

245 to either registry's defaults will not affect the other). 

246 

247 Returns 

248 ------- 

249 copy : `Registry` 

250 A new `Registry` instance with its own defaults. 

251 

252 Notes 

253 ----- 

254 Because the new registry shares a connection with the original, they 

255 also share transaction state (despite the fact that their `transaction` 

256 context manager methods do not reflect this), and must be used with 

257 care. 

258 """ 

259 if defaults is None: 

260 # No need to copy, because `RegistryDefaults` is immutable; we 

261 # effectively copy on write. 

262 defaults = self.defaults 

263 return Registry(self._db, defaults, self._managers) 

264 

265 @property 

266 def dimensions(self) -> DimensionUniverse: 

267 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

268 """ 

269 return self._managers.dimensions.universe 

270 

271 @property 

272 def defaults(self) -> RegistryDefaults: 

273 """Default collection search path and/or output `~CollectionType.RUN` 

274 collection (`RegistryDefaults`). 

275 

276 This is an immutable struct whose components may not be set 

277 individually, but the entire struct can be set by assigning to this 

278 property. 

279 """ 

280 return self._defaults 

281 

282 @defaults.setter 

283 def defaults(self, value: RegistryDefaults) -> None: 

284 if value.run is not None: 

285 self.registerRun(value.run) 

286 value.finish(self) 

287 self._defaults = value 

288 

289 def refresh(self) -> None: 

290 """Refresh all in-memory state by querying the database. 

291 

292 This may be necessary to enable querying for entities added by other 

293 `Registry` instances after this one was constructed. 

294 """ 

295 self._managers.refresh() 

296 

297 @contextlib.contextmanager 

298 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

299 """Return a context manager that represents a transaction. 

300 """ 

301 try: 

302 with self._db.transaction(savepoint=savepoint): 

303 yield 

304 except BaseException: 

305 # TODO: this clears the caches sometimes when we wouldn't actually 

306 # need to. Can we avoid that? 

307 self._managers.dimensions.clearCaches() 

308 raise 

309 

310 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

311 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

312 other data repository client. 

313 

314 Opaque table records can be added via `insertOpaqueData`, retrieved via 

315 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

316 

317 Parameters 

318 ---------- 

319 tableName : `str` 

320 Logical name of the opaque table. This may differ from the 

321 actual name used in the database by a prefix and/or suffix. 

322 spec : `ddl.TableSpec` 

323 Specification for the table to be added. 
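
Examples
--------
A sketch of the intended workflow (not part of the original source); ``spec``
is assumed to be a `ddl.TableSpec` built elsewhere, and the table and column
names are hypothetical::

    registry.registerOpaqueTable("my_datastore_records", spec)
    registry.insertOpaqueData(
        "my_datastore_records", {"dataset_id": 42, "path": "a/b/c.fits"}
    )
    rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=42))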

324 """ 

325 self._managers.opaque.register(tableName, spec) 

326 

327 @transactional 

328 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

329 """Insert records into an opaque table. 

330 

331 Parameters 

332 ---------- 

333 tableName : `str` 

334 Logical name of the opaque table. Must match the name used in a 

335 previous call to `registerOpaqueTable`. 

336 data 

337 Each additional positional argument is a dictionary that represents 

338 a single row to be added. 

339 """ 

340 self._managers.opaque[tableName].insert(*data) 

341 

342 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

343 """Retrieve records from an opaque table. 

344 

345 Parameters 

346 ---------- 

347 tableName : `str` 

348 Logical name of the opaque table. Must match the name used in a 

349 previous call to `registerOpaqueTable`. 

350 where 

351 Additional keyword arguments are interpreted as equality 

352 constraints that restrict the returned rows (combined with AND); 

353 keyword arguments are column names and values are the values they 

354 must have. 

355 

356 Yields 

357 ------ 

358 row : `dict` 

359 A dictionary representing a single result row. 

360 """ 

361 yield from self._managers.opaque[tableName].fetch(**where) 

362 

363 @transactional 

364 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

365 """Remove records from an opaque table. 

366 

367 Parameters 

368 ---------- 

369 tableName : `str` 

370 Logical name of the opaque table. Must match the name used in a 

371 previous call to `registerOpaqueTable`. 

372 where 

373 Additional keyword arguments are interpreted as equality 

374 constraints that restrict the deleted rows (combined with AND); 

375 keyword arguments are column names and values are the values they 

376 must have. 

377 """ 

378 self._managers.opaque[tableName].delete(**where) 

379 

380 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED, 

381 doc: Optional[str] = None) -> None: 

382 """Add a new collection if one with the given name does not exist. 

383 

384 Parameters 

385 ---------- 

386 name : `str` 

387 The name of the collection to create. 

388 type : `CollectionType` 

389 Enum value indicating the type of collection to create. 

390 doc : `str`, optional 

391 Documentation string for the collection. 

392 

393 Notes 

394 ----- 

395 This method cannot be called within transactions, as it needs to be 

396 able to perform its own transaction to be concurrent. 

397 """ 

398 self._managers.collections.register(name, type, doc=doc) 

399 

400 def getCollectionType(self, name: str) -> CollectionType: 

401 """Return an enumeration value indicating the type of the given 

402 collection. 

403 

404 Parameters 

405 ---------- 

406 name : `str` 

407 The name of the collection. 

408 

409 Returns 

410 ------- 

411 type : `CollectionType` 

412 Enum value indicating the type of this collection. 

413 

414 Raises 

415 ------ 

416 MissingCollectionError 

417 Raised if no collection with the given name exists. 

418 """ 

419 return self._managers.collections.find(name).type 

420 

421 def registerRun(self, name: str, doc: Optional[str] = None) -> None: 

422 """Add a new run if one with the given name does not exist. 

423 

424 Parameters 

425 ---------- 

426 name : `str` 

427 The name of the run to create. 

428 doc : `str`, optional 

429 Documentation string for the collection. 

430 

431 Notes 

432 ----- 

433 This method cannot be called within transactions, as it needs to be 

434 able to perform its own transaction to be concurrent. 

435 """ 

436 self._managers.collections.register(name, CollectionType.RUN, doc=doc) 

437 

438 @transactional 

439 def removeCollection(self, name: str) -> None: 

440 """Completely remove the given collection. 

441 

442 Parameters 

443 ---------- 

444 name : `str` 

445 The name of the collection to remove. 

446 

447 Raises 

448 ------ 

449 MissingCollectionError 

450 Raised if no collection with the given name exists. 

451 

452 Notes 

453 ----- 

454 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

455 in it are also fully removed. This requires that those datasets be 

456 removed (or at least trashed) from any datastores that hold them first. 

457 

458 A collection may not be deleted as long as it is referenced by a 

459 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

460 be deleted or redefined first. 

461 """ 

462 self._managers.collections.remove(name) 

463 

464 def getCollectionChain(self, parent: str) -> CollectionSearch: 

465 """Return the child collections in a `~CollectionType.CHAINED` 

466 collection. 

467 

468 Parameters 

469 ---------- 

470 parent : `str` 

471 Name of the chained collection. Must have already been added via 

472 a call to `Registry.registerCollection`. 

473 

474 Returns 

475 ------- 

476 children : `CollectionSearch` 

477 An object that defines the search path of the collection. 

478 See :ref:`daf_butler_collection_expressions` for more information. 

479 

480 Raises 

481 ------ 

482 MissingCollectionError 

483 Raised if ``parent`` does not exist in the `Registry`. 

484 TypeError 

485 Raised if ``parent`` does not correspond to a 

486 `~CollectionType.CHAINED` collection. 

487 """ 

488 record = self._managers.collections.find(parent) 

489 if record.type is not CollectionType.CHAINED: 

490 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

491 assert isinstance(record, ChainedCollectionRecord) 

492 return record.children 

493 

494 @transactional 

495 def setCollectionChain(self, parent: str, children: Any) -> None: 

496 """Define or redefine a `~CollectionType.CHAINED` collection. 

497 

498 Parameters 

499 ---------- 

500 parent : `str` 

501 Name of the chained collection. Must have already been added via 

502 a call to `Registry.registerCollection`. 

503 children : `Any` 

504 An expression defining an ordered search of child collections, 

505 generally an iterable of `str`; see 

506 :ref:`daf_butler_collection_expressions` for more information. 

507 

508 Raises 

509 ------ 

510 MissingCollectionError 

511 Raised when any of the given collections do not exist in the 

512 `Registry`. 

513 TypeError 

514 Raised if ``parent`` does not correspond to a 

515 `~CollectionType.CHAINED` collection. 

516 ValueError 

517 Raised if the given collections contain a cycle.
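
Examples
--------
Illustrative only; all collection names are hypothetical::

    registry.registerRun("DummyCam/runs/1")
    registry.registerRun("DummyCam/runs/2")
    registry.registerCollection("DummyCam/defaults", CollectionType.CHAINED)
    registry.setCollectionChain("DummyCam/defaults",
                                ["DummyCam/runs/2", "DummyCam/runs/1"])
    children = registry.getCollectionChain("DummyCam/defaults")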

518 """ 

519 record = self._managers.collections.find(parent) 

520 if record.type is not CollectionType.CHAINED: 

521 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

522 assert isinstance(record, ChainedCollectionRecord) 

523 children = CollectionSearch.fromExpression(children) 

524 if children != record.children: 

525 record.update(self._managers.collections, children) 

526 

527 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

528 """Retrieve the documentation string for a collection. 

529 

530 Parameters 

531 ---------- 

532 collection : `str`

533 Name of the collection. 

534 

535 Returns 

536 ------- 

537 docs : `str` or `None` 

538 Docstring for the collection with the given name. 

539 """ 

540 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key) 

541 

542 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

543 """Set the documentation string for a collection. 

544 

545 Parameters 

546 ---------- 

547 collection : `str`

548 Name of the collection.

549 doc : `str` or `None`

550 Docstring for the collection with the given name; will replace any 

551 existing docstring. Passing `None` will remove any existing 

552 docstring. 

553 """ 

554 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc) 

555 

556 def getCollectionSummary(self, collection: str) -> CollectionSummary: 

557 """Return a summary for the given collection. 

558 

559 Parameters 

560 ---------- 

561 collection : `str` 

562 Name of the collection for which a summary is to be retrieved. 

563 

564 Returns 

565 ------- 

566 summary : `CollectionSummary` 

567 Summary of the dataset types and governor dimension values in 

568 this collection. 

569 """ 

570 record = self._managers.collections.find(collection) 

571 return self._managers.datasets.getCollectionSummary(record) 

572 

573 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

574 """ 

575 Add a new `DatasetType` to the Registry. 

576 

577 It is not an error to register the same `DatasetType` twice. 

578 

579 Parameters 

580 ---------- 

581 datasetType : `DatasetType` 

582 The `DatasetType` to be added. 

583 

584 Returns 

585 ------- 

586 inserted : `bool` 

587 `True` if ``datasetType`` was inserted, `False` if an identical 

588 existing `DatasetType` was found. Note that in either case the

589 DatasetType is guaranteed to be defined in the Registry 

590 consistently with the given definition. 

591 

592 Raises 

593 ------ 

594 ValueError 

595 Raised if the dimensions or storage class are invalid. 

596 ConflictingDefinitionError 

597 Raised if this DatasetType is already registered with a different 

598 definition. 

599 

600 Notes 

601 ----- 

602 This method cannot be called within transactions, as it needs to be 

603 able to perform its own transaction to be concurrent. 
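
Examples
--------
Illustrative only (not part of the original source); the dimension names and
storage class are assumptions about the repository's configuration::

    datasetType = DatasetType(
        "calexp",
        dimensions=["instrument", "visit", "detector"],
        storageClass="ExposureF",
        universe=registry.dimensions,
    )
    registry.registerDatasetType(datasetType)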

604 """ 

605 _, inserted = self._managers.datasets.register(datasetType) 

606 return inserted 

607 

608 def removeDatasetType(self, name: str) -> None: 

609 """Remove the named `DatasetType` from the registry. 

610 

611 .. warning:: 

612 

613 Registry caches the dataset type definitions. This means that 

614 deleting the dataset type definition may result in unexpected 

615 behavior from other active butler processes that have not

616 seen the deletion.

617 

618 Parameters 

619 ---------- 

620 name : `str` 

621 Name of the type to be removed. 

622 

623 Raises 

624 ------ 

625 lsst.daf.butler.registry.OrphanedRecordError 

626 Raised if an attempt is made to remove the dataset type definition 

627 when there are already datasets associated with it. 

628 

629 Notes 

630 ----- 

631 If the dataset type is not registered the method will return without 

632 action. 

633 """ 

634 self._managers.datasets.remove(name) 

635 

636 def getDatasetType(self, name: str) -> DatasetType: 

637 """Get the `DatasetType`. 

638 

639 Parameters 

640 ---------- 

641 name : `str` 

642 Name of the type. 

643 

644 Returns 

645 ------- 

646 type : `DatasetType` 

647 The `DatasetType` associated with the given name. 

648 

649 Raises 

650 ------ 

651 KeyError 

652 Raised if the requested named DatasetType could not be found in the registry.

653 """ 

654 return self._managers.datasets[name].datasetType 

655 

656 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

657 collections: Any = None, timespan: Optional[Timespan] = None, 

658 **kwargs: Any) -> Optional[DatasetRef]: 

659 """Find a dataset given its `DatasetType` and data ID. 

660 

661 This can be used to obtain a `DatasetRef` that permits the dataset to 

662 be read from a `Datastore`. If the dataset is a component and can not 

663 be found using the provided dataset type, a dataset ref for the parent 

664 will be returned instead but with the correct dataset type. 

665 

666 Parameters 

667 ---------- 

668 datasetType : `DatasetType` or `str` 

669 A `DatasetType` or the name of one. 

670 dataId : `dict` or `DataCoordinate`, optional 

671 A `dict`-like object containing the `Dimension` links that identify 

672 the dataset within a collection. 

673 collections : optional

674 An expression that fully or partially identifies the collections to 

675 search for the dataset; see 

676 :ref:`daf_butler_collection_expressions` for more information. 

677 Defaults to ``self.defaults.collections``. 

678 timespan : `Timespan`, optional 

679 A timespan that the validity range of the dataset must overlap. 

680 If not provided, any `~CollectionType.CALIBRATION` collections 

681 matched by the ``collections`` argument will not be searched. 

682 **kwargs 

683 Additional keyword arguments passed to 

684 `DataCoordinate.standardize` to convert ``dataId`` to a true 

685 `DataCoordinate` or augment an existing one. 

686 

687 Returns 

688 ------- 

689 ref : `DatasetRef` 

690 A reference to the dataset, or `None` if no matching Dataset 

691 was found. 

692 

693 Raises 

694 ------ 

695 TypeError 

696 Raised if ``collections`` is `None` and 

697 ``self.defaults.collections`` is `None`. 

698 LookupError 

699 Raised if one or more data ID keys are missing. 

700 KeyError 

701 Raised if the dataset type does not exist. 

702 MissingCollectionError 

703 Raised if any of ``collections`` does not exist in the registry. 

704 

705 Notes 

706 ----- 

707 This method simply returns `None` and does not raise an exception even 

708 when the set of collections searched is intrinsically incompatible with 

709 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but 

710 only `~CollectionType.CALIBRATION` collections are being searched. 

711 This may make it harder to debug some lookup failures, but the behavior 

712 is intentional; we consider it more important that failed searches are 

713 reported consistently, regardless of the reason, and that adding 

714 additional collections that do not contain a match to the search path 

715 never changes the behavior. 
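
Examples
--------
Illustrative only; the dataset type, dimension values, and collection name
are hypothetical and depend on the repository contents::

    ref = registry.findDataset(
        "calexp",
        instrument="DummyCam", visit=1, detector=1,
        collections=["DummyCam/runs/1"],
    )
    if ref is not None:
        print(ref.dataId)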

716 """ 

717 if isinstance(datasetType, DatasetType): 

718 storage = self._managers.datasets[datasetType.name] 

719 else: 

720 storage = self._managers.datasets[datasetType] 

721 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

722 universe=self.dimensions, defaults=self.defaults.dataId, 

723 **kwargs) 

724 if collections is None: 

725 if not self.defaults.collections: 

726 raise TypeError("No collections provided to findDataset, " 

727 "and no defaults from registry construction.") 

728 collections = self.defaults.collections 

729 else: 

730 collections = CollectionSearch.fromExpression(collections) 

731 for collectionRecord in collections.iter(self._managers.collections): 

732 if (collectionRecord.type is CollectionType.CALIBRATION 

733 and (not storage.datasetType.isCalibration() or timespan is None)): 

734 continue 

735 result = storage.find(collectionRecord, dataId, timespan=timespan) 

736 if result is not None: 

737 return result 

738 

739 return None 

740 

741 @transactional 

742 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

743 run: Optional[str] = None) -> List[DatasetRef]: 

744 """Insert one or more datasets into the `Registry` 

745 

746 This always adds new datasets; to associate existing datasets with 

747 a new collection, use ``associate``. 

748 

749 Parameters 

750 ---------- 

751 datasetType : `DatasetType` or `str` 

752 A `DatasetType` or the name of one. 

753 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

754 Dimension-based identifiers for the new datasets. 

755 run : `str`, optional 

756 The name of the run that produced the datasets. Defaults to 

757 ``self.defaults.run``. 

758 

759 Returns 

760 ------- 

761 refs : `list` of `DatasetRef` 

762 Resolved `DatasetRef` instances for all given data IDs (in the same 

763 order). 

764 

765 Raises 

766 ------ 

767 TypeError 

768 Raised if ``run`` is `None` and ``self.defaults.run`` is `None`. 

769 ConflictingDefinitionError 

770 If a dataset with the same dataset type and data ID as one of those 

771 given already exists in ``run``. 

772 MissingCollectionError 

773 Raised if ``run`` does not exist in the registry. 
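
Examples
--------
Illustrative only; the dataset type, dimensions, and run name are
hypothetical, and the referenced dimension records must already exist::

    refs = registry.insertDatasets(
        "raw",
        dataIds=[{"instrument": "DummyCam", "exposure": 1, "detector": 1}],
        run="DummyCam/raw/all",
    )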

774 """ 

775 if isinstance(datasetType, DatasetType): 

776 storage = self._managers.datasets.find(datasetType.name) 

777 if storage is None: 

778 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

779 else: 

780 storage = self._managers.datasets.find(datasetType) 

781 if storage is None: 

782 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

783 if run is None: 

784 if self.defaults.run is None: 

785 raise TypeError("No run provided to insertDatasets, " 

786 "and no default from registry construction.") 

787 run = self.defaults.run 

788 runRecord = self._managers.collections.find(run) 

789 if runRecord.type is not CollectionType.RUN: 

790 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

791 assert isinstance(runRecord, RunRecord) 

792 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

793 for dataId in dataIds] 

794 try: 

795 refs = list(storage.insert(runRecord, expandedDataIds)) 

796 except sqlalchemy.exc.IntegrityError as err: 

797 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

798 f"one or more datasets of type {storage.datasetType} into " 

799 f"collection '{run}'. " 

800 f"This probably means a dataset with the same data ID " 

801 f"and dataset type already exists, but it may also mean a " 

802 f"dimension row is missing.") from err 

803 return refs 

804 

805 def getDataset(self, id: int) -> Optional[DatasetRef]: 

806 """Retrieve a Dataset entry. 

807 

808 Parameters 

809 ---------- 

810 id : `int` 

811 The unique identifier for the dataset. 

812 

813 Returns 

814 ------- 

815 ref : `DatasetRef` or `None` 

816 A ref to the Dataset, or `None` if no matching Dataset 

817 was found. 

818 """ 

819 ref = self._managers.datasets.getDatasetRef(id) 

820 if ref is None: 

821 return None 

822 return ref 

823 

824 @transactional 

825 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

826 """Remove datasets from the Registry. 

827 

828 The datasets will be removed unconditionally from all collections, and 

829 any `Quantum` that consumed this dataset will instead be marked with 

830 having a NULL input. `Datastore` records will *not* be deleted; the 

831 caller is responsible for ensuring that the dataset has already been 

832 removed from all Datastores. 

833 

834 Parameters 

835 ---------- 

836 refs : `Iterable` of `DatasetRef` 

837 References to the datasets to be removed. Must include a valid 

838 ``id`` attribute, and should be considered invalidated upon return. 

839 

840 Raises 

841 ------ 

842 AmbiguousDatasetError 

843 Raised if any ``ref.id`` is `None`. 

844 OrphanedRecordError 

845 Raised if any dataset is still present in any `Datastore`. 

846 """ 

847 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

848 storage = self._managers.datasets.find(datasetType.name) 

849 assert storage is not None 

850 try: 

851 storage.delete(refsForType) 

852 except sqlalchemy.exc.IntegrityError as err: 

853 raise OrphanedRecordError("One or more datasets is still " 

854 "present in one or more Datastores.") from err 

855 

856 @transactional 

857 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

858 """Add existing datasets to a `~CollectionType.TAGGED` collection. 

859 

860 If a DatasetRef with the same exact integer ID is already in a 

861 collection nothing is changed. If a `DatasetRef` with the same 

862 `DatasetType` and data ID but with different integer ID 

863 exists in the collection, `ConflictingDefinitionError` is raised. 

864 

865 Parameters 

866 ---------- 

867 collection : `str` 

868 Indicates the collection the datasets should be associated with. 

869 refs : `Iterable` [ `DatasetRef` ] 

870 An iterable of resolved `DatasetRef` instances that already exist 

871 in this `Registry`. 

872 

873 Raises 

874 ------ 

875 ConflictingDefinitionError 

876 If a Dataset with the given `DatasetRef` already exists in the 

877 given collection. 

878 AmbiguousDatasetError 

879 Raised if ``any(ref.id is None for ref in refs)``. 

880 MissingCollectionError 

881 Raised if ``collection`` does not exist in the registry. 

882 TypeError 

883 Raised if adding new datasets to the given ``collection`` is not

884 allowed.
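
Examples
--------
Illustrative only; ``refs`` is assumed to be an iterable of resolved
`DatasetRef` instances and the collection name is hypothetical::

    registry.registerCollection("u/someone/best-seeing", CollectionType.TAGGED)
    registry.associate("u/someone/best-seeing", refs)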

885 """ 

886 collectionRecord = self._managers.collections.find(collection) 

887 if collectionRecord.type is not CollectionType.TAGGED: 

888 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

889 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

890 storage = self._managers.datasets.find(datasetType.name) 

891 assert storage is not None 

892 try: 

893 storage.associate(collectionRecord, refsForType) 

894 except sqlalchemy.exc.IntegrityError as err: 

895 raise ConflictingDefinitionError( 

896 f"Constraint violation while associating dataset of type {datasetType.name} with " 

897 f"collection {collection}. This probably means that one or more datasets with the same " 

898 f"dataset type and data ID already exist in the collection, but it may also indicate " 

899 f"that the datasets do not exist." 

900 ) from err 

901 

902 @transactional 

903 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

904 """Remove existing datasets from a `~CollectionType.TAGGED` collection. 

905 

906 ``collection`` and ``ref`` combinations that are not currently 

907 associated are silently ignored. 

908 

909 Parameters 

910 ---------- 

911 collection : `str` 

912 The collection the datasets should no longer be associated with. 

913 refs : `Iterable` [ `DatasetRef` ] 

914 An iterable of resolved `DatasetRef` instances that already exist 

915 in this `Registry`. 

916 

917 Raises 

918 ------ 

919 AmbiguousDatasetError 

920 Raised if any of the given dataset references is unresolved. 

921 MissingCollectionError 

922 Raised if ``collection`` does not exist in the registry. 

923 TypeError 

924 Raised if removing datasets from the given ``collection`` is not

925 allowed.

926 """ 

927 collectionRecord = self._managers.collections.find(collection) 

928 if collectionRecord.type is not CollectionType.TAGGED: 

929 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

930 "expected TAGGED.") 

931 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

932 storage = self._managers.datasets.find(datasetType.name) 

933 assert storage is not None 

934 storage.disassociate(collectionRecord, refsForType) 

935 

936 @transactional 

937 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

938 """Associate one or more datasets with a calibration collection and a 

939 validity range within it. 

940 

941 Parameters 

942 ---------- 

943 collection : `str` 

944 The name of an already-registered `~CollectionType.CALIBRATION` 

945 collection. 

946 refs : `Iterable` [ `DatasetRef` ] 

947 Datasets to be associated. 

948 timespan : `Timespan` 

949 The validity range for these datasets within the collection. 

950 

951 Raises 

952 ------ 

953 AmbiguousDatasetError 

954 Raised if any of the given `DatasetRef` instances is unresolved. 

955 ConflictingDefinitionError 

956 Raised if the collection already contains a different dataset with 

957 the same `DatasetType` and data ID and an overlapping validity 

958 range. 

959 TypeError 

960 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

961 collection or if one or more datasets are of a dataset type for 

962 which `DatasetType.isCalibration` returns `False`. 
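
Examples
--------
Illustrative only; ``flat_refs`` is assumed to be an iterable of resolved
calibration `DatasetRef` instances, and the unbounded `Timespan` constructor
call shown is an assumption::

    registry.registerCollection("DummyCam/calib", CollectionType.CALIBRATION)
    registry.certify("DummyCam/calib", flat_refs, Timespan(begin=None, end=None))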

963 """ 

964 collectionRecord = self._managers.collections.find(collection) 

965 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

966 storage = self._managers.datasets[datasetType.name] 

967 storage.certify(collectionRecord, refsForType, timespan) 

968 

969 @transactional 

970 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *, 

971 dataIds: Optional[Iterable[DataId]] = None) -> None: 

972 """Remove or adjust datasets to clear a validity range within a 

973 calibration collection. 

974 

975 Parameters 

976 ---------- 

977 collection : `str` 

978 The name of an already-registered `~CollectionType.CALIBRATION` 

979 collection. 

980 datasetType : `str` or `DatasetType` 

981 Name or `DatasetType` instance for the datasets to be decertified. 

982 timespan : `Timespan`

983 The validity range to remove datasets from within the collection. 

984 Datasets that overlap this range but are not contained by it will 

985 have their validity ranges adjusted to not overlap it, which may 

986 split a single dataset validity range into two. 

987 dataIds : `Iterable` [ `DataId` ], optional 

988 Data IDs that should be decertified within the given validity range.

989 If `None`, all data IDs for the given ``datasetType`` will be

990 decertified.

991 

992 Raises 

993 ------ 

994 TypeError 

995 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

996 collection or if ``datasetType.isCalibration() is False``. 

997 """ 

998 collectionRecord = self._managers.collections.find(collection) 

999 if isinstance(datasetType, str): 

1000 storage = self._managers.datasets[datasetType] 

1001 else: 

1002 storage = self._managers.datasets[datasetType.name] 

1003 standardizedDataIds = None 

1004 if dataIds is not None: 

1005 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) 

1006 for d in dataIds] 

1007 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

1008 

1009 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

1010 """Return an object that allows a new `Datastore` instance to 

1011 communicate with this `Registry`. 

1012 

1013 Returns 

1014 ------- 

1015 manager : `DatastoreRegistryBridgeManager` 

1016 Object that mediates communication between this `Registry` and its 

1017 associated datastores. 

1018 """ 

1019 return self._managers.datastores 

1020 

1021 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

1022 """Retrieve datastore locations for a given dataset. 

1023 

1024 Parameters 

1025 ---------- 

1026 ref : `DatasetRef` 

1027 A reference to the dataset for which to retrieve storage 

1028 information. 

1029 

1030 Returns 

1031 ------- 

1032 datastores : `Iterable` [ `str` ] 

1033 All the matching datastores holding this dataset. 

1034 

1035 Raises 

1036 ------ 

1037 AmbiguousDatasetError 

1038 Raised if ``ref.id`` is `None`. 

1039 """ 

1040 return self._managers.datastores.findDatastores(ref) 

1041 

1042 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

1043 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

1044 withDefaults: bool = True, 

1045 **kwargs: Any) -> DataCoordinate: 

1046 """Expand a dimension-based data ID to include additional information. 

1047 

1048 Parameters 

1049 ---------- 

1050 dataId : `DataCoordinate` or `dict`, optional 

1051 Data ID to be expanded; augmented and overridden by ``kwargs``.

1052 graph : `DimensionGraph`, optional 

1053 Set of dimensions for the expanded ID. If `None`, the dimensions 

1054 will be inferred from the keys of ``dataId`` and ``kwargs``.

1055 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph``

1056 are silently ignored, providing a way to extract and expand a 

1057 subset of a data ID. 

1058 records : `Mapping` [`str`, `DimensionRecord`], optional 

1059 Dimension record data to use before querying the database for that 

1060 data, keyed by element name. 

1061 withDefaults : `bool`, optional 

1062 Utilize ``self.defaults.dataId`` to fill in missing governor 

1063 dimension key-value pairs. Defaults to `True` (i.e. defaults are 

1064 used). 

1065 **kwargs 

1066 Additional keywords are treated like additional key-value pairs for 

1067 ``dataId``, extending and overriding it.

1068 

1069 Returns 

1070 ------- 

1071 expanded : `DataCoordinate` 

1072 A data ID that includes full metadata for all of the dimensions it 

1073 identifies, i.e. guarantees that ``expanded.hasRecords()`` and

1074 ``expanded.hasFull()`` both return `True`. 
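
Examples
--------
Illustrative only; the dimension names and values are hypothetical::

    expanded = registry.expandDataId({"instrument": "DummyCam", "exposure": 1})
    assert expanded.hasRecords() and expanded.hasFull()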

1075 """ 

1076 if not withDefaults: 

1077 defaults = None 

1078 else: 

1079 defaults = self.defaults.dataId 

1080 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, 

1081 defaults=defaults, **kwargs) 

1082 if standardized.hasRecords(): 

1083 return standardized 

1084 if records is None: 

1085 records = {} 

1086 elif isinstance(records, NamedKeyMapping): 

1087 records = records.byName() 

1088 else: 

1089 records = dict(records) 

1090 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

1091 records.update(dataId.records.byName()) 

1092 keys = standardized.byName() 

1093 for element in standardized.graph.primaryKeyTraversalOrder: 

1094 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

1095 if record is ...: 

1096 if isinstance(element, Dimension) and keys.get(element.name) is None: 

1097 if element in standardized.graph.required: 

1098 raise LookupError( 

1099 f"No value or null value for required dimension {element.name}." 

1100 ) 

1101 keys[element.name] = None 

1102 record = None 

1103 else: 

1104 storage = self._managers.dimensions[element] 

1105 dataIdSet = DataCoordinateIterable.fromScalar( 

1106 DataCoordinate.standardize(keys, graph=element.graph) 

1107 ) 

1108 fetched = tuple(storage.fetch(dataIdSet)) 

1109 try: 

1110 (record,) = fetched 

1111 except ValueError: 

1112 record = None 

1113 records[element.name] = record 

1114 if record is not None: 

1115 for d in element.implied: 

1116 value = getattr(record, d.name) 

1117 if keys.setdefault(d.name, value) != value: 

1118 raise InconsistentDataIdError( 

1119 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

1120 f"but {element.name} implies {d.name}={value!r}." 

1121 ) 

1122 else: 

1123 if element in standardized.graph.required: 

1124 raise LookupError( 

1125 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

1126 ) 

1127 if element.alwaysJoin: 

1128 raise InconsistentDataIdError( 

1129 f"Could not fetch record for element {element.name} via keys {keys}, ", 

1130 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

1131 "related." 

1132 ) 

1133 for d in element.implied: 

1134 keys.setdefault(d.name, None) 

1135 records.setdefault(d.name, None) 

1136 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

1137 

1138 def insertDimensionData(self, element: Union[DimensionElement, str], 

1139 *data: Union[Mapping[str, Any], DimensionRecord], 

1140 conform: bool = True) -> None: 

1141 """Insert one or more dimension records into the database. 

1142 

1143 Parameters 

1144 ---------- 

1145 element : `DimensionElement` or `str` 

1146 The `DimensionElement` or name thereof that identifies the table 

1147 records will be inserted into. 

1148 data : `dict` or `DimensionRecord` (variadic) 

1149 One or more records to insert. 

1150 conform : `bool`, optional 

1151 If `False` (`True` is default) perform no checking or conversions, 

1152 and assume that ``element`` is a `DimensionElement` instance and 

1153 ``data`` is one or more `DimensionRecord` instances of the

1154 appropriate subclass. 
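
Examples
--------
Illustrative only; the record fields shown are assumptions that depend on the
repository's dimension configuration::

    registry.insertDimensionData("instrument", {"name": "DummyCam"})
    registry.insertDimensionData(
        "detector", {"instrument": "DummyCam", "id": 1, "full_name": "one"}
    )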

1155 """ 

1156 if conform: 

1157 if isinstance(element, str): 

1158 element = self.dimensions[element] 

1159 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1160 for row in data] 

1161 else: 

1162 # Ignore typing since caller said to trust them with conform=False. 

1163 records = data # type: ignore 

1164 storage = self._managers.dimensions[element] # type: ignore 

1165 storage.insert(*records) 

1166 

1167 def syncDimensionData(self, element: Union[DimensionElement, str], 

1168 row: Union[Mapping[str, Any], DimensionRecord], 

1169 conform: bool = True) -> bool: 

1170 """Synchronize the given dimension record with the database, inserting 

1171 if it does not already exist and comparing values if it does. 

1172 

1173 Parameters 

1174 ---------- 

1175 element : `DimensionElement` or `str` 

1176 The `DimensionElement` or name thereof that identifies the table 

1177 records will be inserted into. 

1178 row : `dict` or `DimensionRecord` 

1179 The record to insert. 

1180 conform : `bool`, optional 

1181 If `False` (`True` is default) perform no checking or conversions, 

1182 and assume that ``element`` is a `DimensionElement` instance and 

1183 ``row`` is a `DimensionRecord` instance of the

1184 appropriate subclass.

1185 

1186 Returns 

1187 ------- 

1188 inserted : `bool` 

1189 `True` if a new row was inserted, `False` otherwise. 

1190 

1191 Raises 

1192 ------ 

1193 ConflictingDefinitionError 

1194 Raised if the record exists in the database (according to primary 

1195 key lookup) but is inconsistent with the given one. 

1196 """ 

1197 if conform: 

1198 if isinstance(element, str): 

1199 element = self.dimensions[element] 

1200 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1201 else: 

1202 # Ignore typing since caller said to trust them with conform=False. 

1203 record = row # type: ignore 

1204 storage = self._managers.dimensions[element] # type: ignore 

1205 return storage.sync(record) 

1206 

1207 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

1208 ) -> Iterator[DatasetType]: 

1209 """Iterate over the dataset types whose names match an expression. 

1210 

1211 Parameters 

1212 ---------- 

1213 expression : `Any`, optional 

1214 An expression that fully or partially identifies the dataset types 

1215 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1216 `...` can be used to return all dataset types, and is the default. 

1217 See :ref:`daf_butler_dataset_type_expressions` for more 

1218 information. 

1219 components : `bool`, optional 

1220 If `True`, apply all expression patterns to component dataset type 

1221 names as well. If `False`, never apply patterns to components. 

1222 If `None` (default), apply patterns to components only if their 

1223 parent datasets were not matched by the expression. 

1224 Fully-specified component datasets (`str` or `DatasetType` 

1225 instances) are always included. 

1226 

1227 Yields 

1228 ------ 

1229 datasetType : `DatasetType` 

1230 A `DatasetType` instance whose name matches ``expression``. 
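
Examples
--------
Illustrative only::

    import re

    # All dataset types whose names start with "calexp", plus their components.
    types = list(registry.queryDatasetTypes(re.compile(r"calexp.*"),
                                            components=True))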

1231 """ 

1232 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

1233 if wildcard is Ellipsis: 

1234 for datasetType in self._managers.datasets: 

1235 # The dataset type can no longer be a component 

1236 yield datasetType 

1237 if components: 

1238 # Automatically create the component dataset types 

1239 try: 

1240 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes() 

1241 except KeyError as err: 

1242 _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; " 

1243 "if it has components they will not be included in query results.") 

1244 else: 

1245 yield from componentsForDatasetType 

1246 return 

1247 done: Set[str] = set() 

1248 for name in wildcard.strings: 

1249 storage = self._managers.datasets.find(name) 

1250 if storage is not None: 

1251 done.add(storage.datasetType.name) 

1252 yield storage.datasetType 

1253 if wildcard.patterns: 

1254 # If components (the argument) is None, we'll save component 

1255 # dataset that we might want to match, but only if their parents 

1256 # didn't get included. 

1257 componentsForLater = [] 

1258 for registeredDatasetType in self._managers.datasets: 

1259 # Components are not stored in registry so expand them here 

1260 allDatasetTypes = [registeredDatasetType] 

1261 try: 

1262 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes()) 

1263 except KeyError as err: 

1264 _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; " 

1265 "if it has components they will not be included in query results.") 

1266 for datasetType in allDatasetTypes: 

1267 if datasetType.name in done: 

1268 continue 

1269 parentName, componentName = datasetType.nameAndComponent() 

1270 if componentName is not None and not components: 

1271 if components is None and parentName not in done: 

1272 componentsForLater.append(datasetType) 

1273 continue 

1274 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1275 done.add(datasetType.name) 

1276 yield datasetType 

1277 # Go back and try to match saved components. 

1278 for datasetType in componentsForLater: 

1279 parentName, _ = datasetType.nameAndComponent() 

1280 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1281 yield datasetType 

1282 

1283 def queryCollections(self, expression: Any = ..., 

1284 datasetType: Optional[DatasetType] = None, 

1285 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1286 flattenChains: bool = False, 

1287 includeChains: Optional[bool] = None) -> Iterator[str]: 

1288 """Iterate over the collections whose names match an expression. 

1289 

1290 Parameters 

1291 ---------- 

1292 expression : `Any`, optional 

1293 An expression that fully or partially identifies the collections 

1294 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1295 `...` can be used to return all collections, and is the default. 

1296 See :ref:`daf_butler_collection_expressions` for more 

1297 information. 

1298 datasetType : `DatasetType`, optional 

1299 If provided, only yield collections that may contain datasets of 

1300 this type. This is a conservative approximation in general; it may 

1301 yield collections that do not have any such datasets. 

1302 collectionTypes : `AbstractSet` [ `CollectionType` ], optional 

1303 If provided, only yield collections of these types. 

1304 flattenChains : `bool`, optional 

1305 If `True` (`False` is default), recursively yield the child 

1306 collections of matching `~CollectionType.CHAINED` collections. 

1307 includeChains : `bool`, optional 

1308 If `True`, yield records for matching `~CollectionType.CHAINED` 

1309 collections. Default is the opposite of ``flattenChains``: include 

1310 either CHAINED collections or their children, but not both. 

1311 

1312 Yields 

1313 ------ 

1314 collection : `str` 

1315 The name of a collection that matches ``expression``. 
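
Examples
--------
Illustrative only; the collection name pattern is hypothetical::

    import re

    # All RUN collections whose names start with "DummyCam/".
    runs = list(registry.queryCollections(
        re.compile(r"DummyCam/.*"),
        collectionTypes={CollectionType.RUN},
    ))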

1316 """ 

1317 # Right now the datasetTypes argument is completely ignored, but that 

1318 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

1319 # ticket will take care of that. 

1320 query = CollectionQuery.fromExpression(expression) 

1321 for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes), 

1322 flattenChains=flattenChains, includeChains=includeChains): 

1323 yield record.name 

1324 

1325 def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder: 

1326 """Return a `QueryBuilder` instance capable of constructing and 

1327 managing more complex queries than those obtainable via `Registry` 

1328 interfaces. 

1329 

1330 This is an advanced interface; downstream code should prefer 

1331 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

1332 are sufficient. 

1333 

1334 Parameters 

1335 ---------- 

1336 summary : `queries.QuerySummary` 

1337 Object describing and categorizing the full set of dimensions that 

1338 will be included in the query. 

1339 

1340 Returns 

1341 ------- 

1342 builder : `queries.QueryBuilder` 

1343 Object that can be used to construct and perform advanced queries. 

1344 """ 

1345 return queries.QueryBuilder( 

1346 summary, 

1347 queries.RegistryManagers( 

1348 collections=self._managers.collections, 

1349 dimensions=self._managers.dimensions, 

1350 datasets=self._managers.datasets, 

1351 TimespanReprClass=self._db.getTimespanRepresentation(), 

1352 ), 

1353 ) 

1354 

1355 def queryDatasets(self, datasetType: Any, *, 

1356 collections: Any = None, 

1357 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1358 dataId: Optional[DataId] = None, 

1359 where: Optional[str] = None, 

1360 findFirst: bool = False, 

1361 components: Optional[bool] = None, 

1362 bind: Optional[Mapping[str, Any]] = None, 

1363 check: bool = True, 

1364 **kwargs: Any) -> queries.DatasetQueryResults: 

1365 """Query for and iterate over dataset references matching user-provided 

1366 criteria. 

1367 

1368 Parameters 

1369 ---------- 

1370 datasetType 

1371 An expression that fully or partially identifies the dataset types 

1372 to be queried. Allowed types include `DatasetType`, `str`, 

1373 `re.Pattern`, and iterables thereof. The special value `...` can 

1374 be used to query all dataset types. See 

1375 :ref:`daf_butler_dataset_type_expressions` for more information. 

1376 collections : optional

1377 An expression that fully or partially identifies the collections 

1378 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1379 thereof. `...` can be used to find datasets from all 

1380 `~CollectionType.RUN` collections (no other collections are 

1381 necessary, because all datasets are in a ``RUN`` collection). See 

1382 :ref:`daf_butler_collection_expressions` for more information. 

1383 If not provided, ``self.default.collections`` is used. 

1384 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1385 Dimensions to include in the query (in addition to those used 

1386 to identify the queried dataset type(s)), either to constrain 

1387 the resulting datasets to those for which a matching dimension 

1388 exists, or to relate the dataset type's dimensions to dimensions 

1389 referenced by the ``dataId`` or ``where`` arguments. 

1390 dataId : `dict` or `DataCoordinate`, optional 

1391 A data ID whose key-value pairs are used as equality constraints 

1392 in the query. 

1393 where : `str`, optional 

1394 A string expression similar to a SQL WHERE clause. May involve 

1395 any column of a dimension table or (as a shortcut for the primary 

1396 key column of a dimension table) dimension name. See 

1397 :ref:`daf_butler_dimension_expressions` for more information. 

1398 findFirst : `bool`, optional 

1399 If `True` (`False` is default), for each result data ID, only 

1400 yield one `DatasetRef` of each `DatasetType`, from the first 

1401 collection in which a dataset of that dataset type appears 

1402 (according to the order of ``collections`` passed in). If `True`, 

1403 ``collections`` must not contain regular expressions and may not 

1404 be `...`. 

1405 components : `bool`, optional 

1406 If `True`, apply all dataset expression patterns to component 

1407 dataset type names as well. If `False`, never apply patterns to 

1408 components. If `None` (default), apply patterns to components only 

1409 if their parent datasets were not matched by the expression. 

1410 Fully-specified component datasets (`str` or `DatasetType` 

1411 instances) are always included. 

1412 bind : `Mapping`, optional 

1413 Mapping containing literal values that should be injected into the 

1414 ``where`` expression, keyed by the identifiers they replace. 

1415 check : `bool`, optional 

1416 If `True` (default) check the query for consistency before 

1417 executing it. This may reject some valid queries that resemble 

1418 common mistakes (e.g. queries for visits without specifying an 

1419 instrument). 

1420 **kwargs 

1421 Additional keyword arguments are forwarded to 

1422 `DataCoordinate.standardize` when processing the ``dataId`` 

1423 argument (and may be used to provide a constraining data ID even 

1424 when the ``dataId`` argument is `None`). 

1425 

1426 Returns 

1427 ------- 

1428 refs : `queries.DatasetQueryResults` 

1429 Dataset references matching the given query criteria. 

1430 

1431 Raises 

1432 ------ 

1433 TypeError 

1434 Raised when the arguments are incompatible, such as when a 

1435 collection wildcard is passed when ``findFirst`` is `True`, or 

1436 when ``collections`` is `None` and ``self.defaults.collections`` is 

1437 also `None`. 

1438 

1439 Notes 

1440 ----- 

1441 When multiple dataset types are queried in a single call, the 

1442 results of this operation are equivalent to querying for each dataset 

1443 type separately in turn, and no information about the relationships 

1444 between datasets of different types is included. In contexts where 

1445 that kind of information is important, the recommended pattern is to 

1446 use `queryDataIds` to first obtain data IDs (possibly with the 

1447 desired dataset types and collections passed as constraints to the 

1448 query), and then use multiple (generally much simpler) calls to 

1449 `queryDatasets` with the returned data IDs passed as constraints. 
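
Examples
--------
As an illustrative sketch only (``registry`` is a `Registry` instance;
the "calexp" dataset type, the "HSC/runs/RC2" collection, and the data
ID values in the ``where`` string are hypothetical names, not defined
by this API), a find-first search constrained by a ``where`` expression
might look like::

    refs = registry.queryDatasets(
        "calexp",
        collections=["HSC/runs/RC2"],
        where="instrument='HSC' AND visit=903334 AND detector=42",
        findFirst=True,
    )
    for ref in refs:
        print(ref.datasetType.name, ref.dataId, ref.run)

Note that ``findFirst=True`` is only permitted here because
``collections`` is an explicit list rather than a wildcard expression.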

1450 """ 

1451 # Standardize the collections expression. 

1452 if collections is None: 

1453 if not self.defaults.collections: 

1454 raise TypeError("No collections provided to queryDatasets, " 

1455 "and no defaults from registry construction.") 

1456 collections = self.defaults.collections 

1457 elif findFirst: 

1458 collections = CollectionSearch.fromExpression(collections) 

1459 else: 

1460 collections = CollectionQuery.fromExpression(collections) 

1461 # Standardize and expand the data ID provided as a constraint. 

1462 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1463 

1464 # We can only query directly if given a non-component DatasetType 

1465 # instance. If we were given an expression or str or a component 

1466 # DatasetType instance, we'll populate this dict, recurse, and return. 

1467 # If we already have a non-component DatasetType, it will remain None 

1468 # and we'll run the query directly. 

1469 composition: Optional[ 

1470 Dict[ 

1471 DatasetType, # parent dataset type 

1472 List[Optional[str]] # component name, or None for parent 

1473 ] 

1474 ] = None 
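
# As an illustration (hypothetical dataset type names): querying for the
# component "calexp.wcs" yields composition = {<calexp>: ["wcs"]}, while an
# expression that matches the parent "calexp" and, with ``components=True``,
# its "psf" component yields composition = {<calexp>: [None, "psf"]}; None
# stands in for the parent dataset type itself.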

1475 if not isinstance(datasetType, DatasetType): 

1476 # We were given a dataset type expression (which may be as simple 

1477 # as a str). Loop over all matching datasets, delegating handling 

1478 # of the `components` argument to queryDatasetTypes, as we populate 

1479 # the composition dict. 

1480 composition = defaultdict(list) 

1481 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

1482 parentName, componentName = trueDatasetType.nameAndComponent() 

1483 if componentName is not None: 

1484 parentDatasetType = self.getDatasetType(parentName) 

1485 composition[parentDatasetType].append(componentName) 

1486 else: 

1487 composition[trueDatasetType].append(None) 

1488 elif datasetType.isComponent(): 

1489 # We were given a true DatasetType instance, but it's a component. 

1490 # the composition dict will have exactly one item. 

1491 parentName, componentName = datasetType.nameAndComponent() 

1492 parentDatasetType = self.getDatasetType(parentName) 

1493 composition = {parentDatasetType: [componentName]} 

1494 if composition is not None: 

1495 # We need to recurse. Do that once for each parent dataset type. 

1496 chain = [] 

1497 for parentDatasetType, componentNames in composition.items(): 

1498 parentResults = self.queryDatasets( 

1499 parentDatasetType, 

1500 collections=collections, 

1501 dimensions=dimensions, 

1502 dataId=standardizedDataId, 

1503 where=where, 

1504 findFirst=findFirst, 

1505 check=check, 

1506 ) 

1507 if isinstance(parentResults, queries.ParentDatasetQueryResults): 

1508 chain.append( 

1509 parentResults.withComponents(componentNames) 

1510 ) 

1511 else: 

1512 # Should only happen if we know there would be no results. 

1513 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \ 

1514 and not parentResults._chain 

1515 return queries.ChainedDatasetQueryResults(chain) 

1516 # If we get here, there's no need to recurse (or we are already 

1517 # recursing; there can only ever be one level of recursion). 

1518 

1519 # The full set of dimensions in the query is the combination of those 

1520 # needed for the DatasetType and those explicitly requested, if any. 

1521 requestedDimensionNames = set(datasetType.dimensions.names) 

1522 if dimensions is not None: 

1523 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1524 # Construct the summary structure needed to construct a QueryBuilder. 

1525 summary = queries.QuerySummary( 

1526 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1527 dataId=standardizedDataId, 

1528 expression=where, 

1529 bind=bind, 

1530 defaults=self.defaults.dataId, 

1531 check=check, 

1532 ) 

1533 builder = self.makeQueryBuilder(summary) 

1534 # Add the dataset subquery to the query, telling the QueryBuilder to 

1535 # include the rank of the selected collection in the results only if we 

1536 need a find-first search. Note that if any of the collections are 

1537 actually wildcard expressions and we've asked for a find-first search, 

1538 # this will raise TypeError for us. 

1539 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst): 

1540 return queries.ChainedDatasetQueryResults(()) 

1541 query = builder.finish() 

1542 return queries.ParentDatasetQueryResults(self._db, query, components=[None]) 

1543 

1544 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1545 dataId: Optional[DataId] = None, 

1546 datasets: Any = None, 

1547 collections: Any = None, 

1548 where: Optional[str] = None, 

1549 components: Optional[bool] = None, 

1550 bind: Optional[Mapping[str, Any]] = None, 

1551 check: bool = True, 

1552 **kwargs: Any) -> queries.DataCoordinateQueryResults: 

1553 """Query for data IDs matching user-provided criteria. 

1554 

1555 Parameters 

1556 ---------- 

1557 dimensions : `Dimension` or `str`, or iterable thereof 

1558 The dimensions of the data IDs to yield, as either `Dimension` 

1559 instances or `str`. Will be automatically expanded to a complete 

1560 `DimensionGraph`. 

1561 dataId : `dict` or `DataCoordinate`, optional 

1562 A data ID whose key-value pairs are used as equality constraints 

1563 in the query. 

1564 datasets : `Any`, optional 

1565 An expression that fully or partially identifies dataset types 

1566 that should constrain the yielded data IDs. For example, including 

1567 "raw" here would constrain the yielded ``instrument``, 

1568 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1569 those for which at least one "raw" dataset exists in 

1570 ``collections``. Allowed types include `DatasetType`, `str`, 

1571 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1572 expressions, ``...`` is not permitted - it doesn't make sense to 

1573 constrain data IDs on the existence of *all* datasets. 

1574 See :ref:`daf_butler_dataset_type_expressions` for more 

1575 information. 

1576 collections : `Any`, optional 

1577 An expression that fully or partially identifies the collections 

1578 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1579 thereof. `...` can be used to search all collections. Must be 

1580 provided if ``datasets`` is, and is ignored if it is not. See 

1581 :ref:`daf_butler_collection_expressions` for more information. 

1582 If not provided, ``self.defaults.collections`` is used. 

1583 where : `str`, optional 

1584 A string expression similar to a SQL WHERE clause. May involve 

1585 any column of a dimension table or (as a shortcut for the primary 

1586 key column of a dimension table) dimension name. See 

1587 :ref:`daf_butler_dimension_expressions` for more information. 

1588 components : `bool`, optional 

1589 If `True`, apply all dataset expression patterns to component 

1590 dataset type names as well. If `False`, never apply patterns to 

1591 components. If `None` (default), apply patterns to components only 

1592 if their parent datasets were not matched by the expression. 

1593 Fully-specified component datasets (`str` or `DatasetType` 

1594 instances) are always included. 

1595 bind : `Mapping`, optional 

1596 Mapping containing literal values that should be injected into the 

1597 ``where`` expression, keyed by the identifiers they replace. 

1598 check : `bool`, optional 

1599 If `True` (default) check the query for consistency before 

1600 executing it. This may reject some valid queries that resemble 

1601 common mistakes (e.g. queries for visits without specifying an 

1602 instrument). 

1603 **kwargs 

1604 Additional keyword arguments are forwarded to 

1605 `DataCoordinate.standardize` when processing the ``dataId`` 

1606 argument (and may be used to provide a constraining data ID even 

1607 when the ``dataId`` argument is `None`). 

1608 

1609 Returns 

1610 ------- 

1611 dataIds : `DataCoordinateQueryResults` 

1612 Data IDs matching the given query parameters. These are guaranteed 

1613 to identify all dimensions (`DataCoordinate.hasFull` returns 

1614 `True`), but will not contain `DimensionRecord` objects 

1615 (`DataCoordinate.hasRecords` returns `False`). Call 

1616 `DataCoordinateQueryResults.expanded` on the returned object to 

1617 fetch those (and consider using 

1618 `DataCoordinateQueryResults.materialize` on the returned object 

1619 first if the expected number of rows is very large). See 

1620 documentation for those methods for additional information. 

1621 

1622 Raises 

1623 ------ 

1624 TypeError 

1625 Raised if ``collections`` is `None`, ``self.defaults.collections`` 

1626 is `None`, and ``datasets`` is not `None`. 
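
Examples
--------
A rough sketch under assumptions (the "raw" dataset type, the
"HSC/raw/all" collection, and the ``where`` values are hypothetical
names), showing the pattern suggested in the return-value description:
materialize a potentially large result set, then expand it to attach
dimension records::

    dataIds = registry.queryDataIds(
        ["exposure", "detector"],
        datasets="raw",
        collections="HSC/raw/all",
        where="instrument='HSC' AND exposure.observation_type='science'",
    )
    with dataIds.materialize() as materialized:
        for dataId in materialized.expanded():
            print(dataId)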

1627 """ 

1628 dimensions = iterable(dimensions) 

1629 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1630 standardizedDatasetTypes = set() 

1631 requestedDimensions = self.dimensions.extract(dimensions) 

1632 queryDimensionNames = set(requestedDimensions.names) 

1633 if datasets is not None: 

1634 if collections is None: 

1635 if not self.defaults.collections: 

1636 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1637 collections = self.defaults.collections 

1638 else: 

1639 # Preprocess collections expression in case the original 

1640 # included single-pass iterators (we'll want to use it multiple 

1641 # times below). 

1642 collections = CollectionQuery.fromExpression(collections) 

1643 for datasetType in self.queryDatasetTypes(datasets, components=components): 

1644 queryDimensionNames.update(datasetType.dimensions.names) 

1645 # If any matched dataset type is a component, just operate on 

1646 # its parent instead, because Registry doesn't know anything 

1647 # about what components exist, and here (unlike queryDatasets) 

1648 # we don't care about returning them. 

1649 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

1650 if componentName is not None: 

1651 datasetType = self.getDatasetType(parentDatasetTypeName) 

1652 standardizedDatasetTypes.add(datasetType) 

1653 

1654 summary = queries.QuerySummary( 

1655 requested=DimensionGraph(self.dimensions, names=queryDimensionNames), 

1656 dataId=standardizedDataId, 

1657 expression=where, 

1658 bind=bind, 

1659 defaults=self.defaults.dataId, 

1660 check=check, 

1661 ) 

1662 builder = self.makeQueryBuilder(summary) 

1663 for datasetType in standardizedDatasetTypes: 

1664 builder.joinDataset(datasetType, collections, isResult=False) 

1665 query = builder.finish() 

1666 return queries.DataCoordinateQueryResults(self._db, query) 

1667 

1668 def queryDimensionRecords(self, element: Union[DimensionElement, str], *, 

1669 dataId: Optional[DataId] = None, 

1670 datasets: Any = None, 

1671 collections: Any = None, 

1672 where: Optional[str] = None, 

1673 components: Optional[bool] = None, 

1674 bind: Optional[Mapping[str, Any]] = None, 

1675 check: bool = True, 

1676 **kwargs: Any) -> Iterator[DimensionRecord]: 

1677 """Query for dimension information matching user-provided criteria. 

1678 

1679 Parameters 

1680 ---------- 

1681 element : `DimensionElement` or `str` 

1682 The dimension element to obtain records for. 

1683 dataId : `dict` or `DataCoordinate`, optional 

1684 A data ID whose key-value pairs are used as equality constraints 

1685 in the query. 

1686 datasets : `Any`, optional 

1687 An expression that fully or partially identifies dataset types 

1688 that should constrain the yielded records. See `queryDataIds` and 

1689 :ref:`daf_butler_dataset_type_expressions` for more information. 

1690 collections : `Any`, optional 

1691 An expression that fully or partially identifies the collections 

1692 to search for datasets. See `queryDataIds` and 

1693 :ref:`daf_butler_collection_expressions` for more information. 

1694 where : `str`, optional 

1695 A string expression similar to a SQL WHERE clause. See 

1696 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more 

1697 information. 

1698 components : `bool`, optional 

1699 Whether to apply dataset expressions to components as well. 

1700 See `queryDataIds` for more information. 

1701 bind : `Mapping`, optional 

1702 Mapping containing literal values that should be injected into the 

1703 ``where`` expression, keyed by the identifiers they replace. 

1704 check : `bool`, optional 

1705 If `True` (default) check the query for consistency before 

1706 executing it. This may reject some valid queries that resemble 

1707 common mistakes (e.g. queries for visits without specifying an 

1708 instrument). 

1709 **kwargs 

1710 Additional keyword arguments are forwarded to 

1711 `DataCoordinate.standardize` when processing the ``dataId`` 

1712 argument (and may be used to provide a constraining data ID even 

1713 when the ``dataId`` argument is `None`). 

1714 

1715 Returns 

1716 ------- 

1717 records : `Iterator` [ `DimensionRecord` ] 

1718 Dimension records matching the given query parameters. 
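
Examples
--------
A minimal sketch (the "HSC" instrument value is hypothetical, and the
available record fields depend on the dimension configuration)::

    for record in registry.queryDimensionRecords(
            "detector", dataId={"instrument": "HSC"}):
        print(record.toDict())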

1719 """ 

1720 if not isinstance(element, DimensionElement): 

1721 element = self.dimensions[element] 

1722 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections, 

1723 where=where, components=components, bind=bind, check=check, **kwargs) 

1724 return iter(self._managers.dimensions[element].fetch(dataIds)) 

1725 

1726 def queryDatasetAssociations( 

1727 self, 

1728 datasetType: Union[str, DatasetType], 

1729 collections: Any = ..., 

1730 *, 

1731 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1732 flattenChains: bool = False, 

1733 ) -> Iterator[DatasetAssociation]: 

1734 """Iterate over dataset-collection combinations where the dataset is in 

1735 the collection. 

1736 

1737 This method is a temporary placeholder for better support for 

1738 association results in `queryDatasets`. It will probably be 

1739 removed in the future, and should be avoided in production code 

1740 whenever possible. 

1741 

1742 Parameters 

1743 ---------- 

1744 datasetType : `DatasetType` or `str` 

1745 A dataset type object or the name of one. 

1746 collections : `Any`, optional 

1747 An expression that fully or partially identifies the collections 

1748 to search for datasets. See `queryCollections` and 

1749 :ref:`daf_butler_collection_expressions` for more information. 

1750 If not provided, ``self.defaults.collections`` is used. 

1751 collectionTypes : `AbstractSet` [ `CollectionType` ], optional 

1752 If provided, only yield associations from collections of these 

1753 types. 

1754 flattenChains : `bool`, optional 

1755 If `True`, search in the children of 

1756 `~CollectionType.CHAINED` collections. If `False` (default), ``CHAINED`` 

1757 collections are ignored. 

1758 

1759 Yields 

1760 ------ 

1761 association : `DatasetAssociation` 

1762 Object representing the relationship between a single dataset and 

1763 a single collection. 

1764 

1765 Raises 

1766 ------ 

1767 TypeError 

1768 Raised if ``collections`` is `None` and 

1769 ``self.defaults.collections`` is `None`. 
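
Examples
--------
A sketch of listing the calibration collections that contain a "bias"
dataset type (the dataset type name is hypothetical; the attributes
used are those of `DatasetAssociation`)::

    for assoc in registry.queryDatasetAssociations(
            "bias",
            collectionTypes={CollectionType.CALIBRATION},
            flattenChains=True):
        print(assoc.collection, assoc.ref.dataId, assoc.timespan)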

1770 """ 

1771 if collections is None: 

1772 if not self.defaults.collections: 

1773 raise TypeError("No collections provided to queryDatasetAssociations, " 

1774 "and no defaults from registry construction.") 

1775 collections = self.defaults.collections 

1776 else: 

1777 collections = CollectionQuery.fromExpression(collections) 

1778 TimespanReprClass = self._db.getTimespanRepresentation() 

1779 if isinstance(datasetType, str): 

1780 storage = self._managers.datasets[datasetType] 

1781 else: 

1782 storage = self._managers.datasets[datasetType.name] 

1783 for collectionRecord in collections.iter(self._managers.collections, 

1784 collectionTypes=frozenset(collectionTypes), 

1785 flattenChains=flattenChains): 

1786 query = storage.select(collectionRecord) 

1787 if query is None: 

1788 continue 

1789 for row in self._db.query(query.combine()): 

1790 dataId = DataCoordinate.fromRequiredValues( 

1791 storage.datasetType.dimensions, 

1792 tuple(row[name] for name in storage.datasetType.dimensions.required.names) 

1793 ) 

1794 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]] 

1795 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, 

1796 conform=False) 

1797 if collectionRecord.type is CollectionType.CALIBRATION: 

1798 timespan = TimespanReprClass.extract(row) 

1799 else: 

1800 timespan = None 

1801 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1802 

1803 storageClasses: StorageClassFactory 

1804 """All storage classes known to the registry (`StorageClassFactory`). 

1805 """