1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "Registry", 

26) 

27 

28from collections import defaultdict 

29import contextlib 

30import logging 

31from typing import ( 

32 Any, 

33 Dict, 

34 Iterable, 

35 Iterator, 

36 List, 

37 Mapping, 

38 Optional, 

39 Set, 

40 Type, 

41 TYPE_CHECKING, 

42 Union, 

43) 

44 

45import sqlalchemy 

46 

47from lsst.utils import doImport 

48from ..core import ( 

49 ButlerURI, 

50 Config, 

51 DataCoordinate, 

52 DataCoordinateIterable, 

53 DataId, 

54 DatasetAssociation, 

55 DatasetRef, 

56 DatasetType, 

57 ddl, 

58 Dimension, 

59 DimensionConfig, 

60 DimensionElement, 

61 DimensionGraph, 

62 DimensionRecord, 

63 DimensionUniverse, 

64 NamedKeyMapping, 

65 NameLookupMapping, 

66 StorageClassFactory, 

67 Timespan, 

68) 

69from . import queries 

70from ..core.utils import iterable, transactional 

71from ._config import RegistryConfig 

72from ._collectionType import CollectionType 

73from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

74from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis 

75from .interfaces import ChainedCollectionRecord, RunRecord 

76from .versions import ButlerVersionsManager, DigestMismatchError 

77 

78if TYPE_CHECKING: 

79 from .._butlerConfig import ButlerConfig 

80 from .interfaces import ( 

81 ButlerAttributeManager, 

82 CollectionManager, 

83 Database, 

84 OpaqueTableStorageManager, 

85 DimensionRecordStorageManager, 

86 DatasetRecordStorageManager, 

87 DatastoreRegistryBridgeManager, 

88 ) 

89 

90 

91_LOG = logging.getLogger(__name__) 

92 

93# key for dimensions configuration in attributes table 

94_DIMENSIONS_ATTR = "config:dimensions.json" 

95 

96 

97class Registry: 

98 """Registry interface. 

99 

100 Parameters 

101 ---------- 

102 database : `Database` 

103 Database instance in which the Registry data is stored. 

104 attributes : `type` 

105 Manager class implementing `ButlerAttributeManager`. 

106 opaque : `type` 

107 Manager class implementing `OpaqueTableStorageManager`. 

108 dimensions : `type` 

109 Manager class implementing `DimensionRecordStorageManager`. 

110 collections : `type` 

111 Manager class implementing `CollectionManager`. 

112 datasets : `type` 

113 Manager class implementing `DatasetRecordStorageManager`. 

114 datastoreBridges : `type` 

115 Manager class implementing `DatastoreRegistryBridgeManager`. 

116 dimensionConfig : `DimensionConfig`, optional 

117 Dimension universe configuration; only used when ``create`` is `True`. 

118 writeable : `bool`, optional 

119 If True then Registry will support write operations. 

120 create : `bool`, optional 

121 If True then the database schema will be initialized; the database 

122 must be empty before instantiating Registry. 

123 """ 

124 

125 defaultConfigFile: Optional[str] = None 

126 """Path to configuration defaults. Accessed within the ``configs`` resource 

127 or relative to a search path. Can be None if no defaults specified. 

128 """ 

129 

130 @classmethod 

131 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None, 

132 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

133 butlerRoot: Optional[str] = None) -> Registry: 

134 """Create registry database and return `Registry` instance. 

135 

136 This method initializes the database contents; the database must be empty 

137 prior to calling this method. 

138 

139 Parameters 

140 ---------- 

141 config : `RegistryConfig` or `str`, optional 

142 Registry configuration; if missing, the default configuration will 

143 be loaded from registry.yaml. 

144 dimensionConfig : `DimensionConfig` or `str`, optional 

145 Dimensions configuration; if missing, the default configuration 

146 will be loaded from dimensions.yaml. 

147 butlerRoot : `str`, optional 

148 Path to the repository root this `Registry` will manage. 

149 

150 Returns 

151 ------- 

152 registry : `Registry` 

153 A new `Registry` instance. 

154 """ 

155 if isinstance(config, str): 

156 config = RegistryConfig(config) 

157 elif config is None: 

158 config = RegistryConfig() 

159 elif not isinstance(config, RegistryConfig): 

160 raise TypeError(f"Incompatible Registry configuration type: {type(config)}") 

161 config.replaceRoot(butlerRoot) 

162 

163 if isinstance(dimensionConfig, str): 

164 dimensionConfig = DimensionConfig(dimensionConfig) 

165 elif dimensionConfig is None: 

166 dimensionConfig = DimensionConfig() 

167 elif not isinstance(dimensionConfig, DimensionConfig): 

168 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

169 

170 DatabaseClass = config.getDatabaseClass() 

171 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

172 namespace=config.get("namespace")) 

173 attributes = doImport(config["managers", "attributes"]) 

174 opaque = doImport(config["managers", "opaque"]) 

175 dimensions = doImport(config["managers", "dimensions"]) 

176 collections = doImport(config["managers", "collections"]) 

177 datasets = doImport(config["managers", "datasets"]) 

178 datastoreBridges = doImport(config["managers", "datastores"]) 

179 

180 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque, 

181 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges, 

182 dimensionConfig=dimensionConfig, create=True) 

183 

184 @classmethod 

185 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], 

186 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True) -> Registry: 

187 """Create `Registry` subclass instance from `config`. 

188 

189 Registry database must be initialized prior to calling this method. 

190 

191 Parameters 

192 ---------- 

193 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

194 Registry configuration 

195 butlerRoot : `str` or `ButlerURI`, optional 

196 Path to the repository root this `Registry` will manage. 

197 writeable : `bool`, optional 

198 If `True` (default) create a read-write connection to the database. 

199 

200 Returns 

201 ------- 

202 registry : `Registry` (subclass) 

203 A new `Registry` subclass instance. 
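
Examples
--------
A minimal sketch, assuming an existing repository; the configuration
path is hypothetical:

>>> from lsst.daf.butler import ButlerConfig  # doctest: +SKIP
>>> config = ButlerConfig("/path/to/repo/butler.yaml")  # doctest: +SKIP
>>> registry = Registry.fromConfig(config, writeable=False)  # doctest: +SKIP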

204 """ 

205 if not isinstance(config, RegistryConfig): 

206 if isinstance(config, str) or isinstance(config, Config): 

207 config = RegistryConfig(config) 

208 else: 

209 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

210 config.replaceRoot(butlerRoot) 

211 DatabaseClass = config.getDatabaseClass() 

212 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

213 namespace=config.get("namespace"), writeable=writeable) 

214 attributes = doImport(config["managers", "attributes"]) 

215 opaque = doImport(config["managers", "opaque"]) 

216 dimensions = doImport(config["managers", "dimensions"]) 

217 collections = doImport(config["managers", "collections"]) 

218 datasets = doImport(config["managers", "datasets"]) 

219 datastoreBridges = doImport(config["managers", "datastores"]) 

220 

221 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque, 

222 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges, 

223 dimensionConfig=None, writeable=writeable) 

224 

225 def __init__(self, database: Database, *, 

226 attributes: Type[ButlerAttributeManager], 

227 opaque: Type[OpaqueTableStorageManager], 

228 dimensions: Type[DimensionRecordStorageManager], 

229 collections: Type[CollectionManager], 

230 datasets: Type[DatasetRecordStorageManager], 

231 datastoreBridges: Type[DatastoreRegistryBridgeManager], 

232 dimensionConfig: Optional[DimensionConfig] = None, 

233 writeable: bool = True, 

234 create: bool = False): 

235 self._db = database 

236 self.storageClasses = StorageClassFactory() 

237 

238 # With existing registry we have to read dimensions config from 

239 # database before we initialize all other managers. 

240 if dimensionConfig is None: 

241 assert not create, "missing DimensionConfig when create=True" 

242 with self._db.declareStaticTables(create=False) as context: 

243 self._attributes = attributes.initialize(self._db, context) 

244 

245 versions = ButlerVersionsManager( 

246 self._attributes, 

247 dict(attributes=self._attributes) 

248 ) 

249 # verify that configured versions are compatible with schema 

250 versions.checkManagersConfig() 

251 versions.checkManagersVersions(writeable) 

252 

253 # get serialized as a string from database 

254 dimensionsString = self._attributes.get(_DIMENSIONS_ATTR) 

255 if dimensionsString is not None: 

256 dimensionConfig = DimensionConfig(Config.fromString(dimensionsString, format="json")) 

257 else: 

258 raise LookupError(f"Registry attribute {_DIMENSIONS_ATTR} is missing from database") 

259 

260 # make universe 

261 universe = DimensionUniverse(dimensionConfig) 

262 

263 with self._db.declareStaticTables(create=create) as context: 

264 self._attributes = attributes.initialize(self._db, context) 

265 self._dimensions = dimensions.initialize(self._db, context, universe=universe) 

266 self._collections = collections.initialize(self._db, context, dimensions=self._dimensions) 

267 self._datasets = datasets.initialize(self._db, context, 

268 collections=self._collections, 

269 dimensions=self._dimensions) 

270 self._opaque = opaque.initialize(self._db, context) 

271 self._datastoreBridges = datastoreBridges.initialize(self._db, context, 

272 opaque=self._opaque, 

273 datasets=datasets, 

274 universe=self._dimensions.universe) 

275 versions = ButlerVersionsManager( 

276 self._attributes, 

277 dict( 

278 attributes=self._attributes, 

279 opaque=self._opaque, 

280 dimensions=self._dimensions, 

281 collections=self._collections, 

282 datasets=self._datasets, 

283 datastores=self._datastoreBridges, 

284 ) 

285 ) 

286 # store managers and their versions in attributes table 

287 context.addInitializer(lambda db: versions.storeManagersConfig()) 

288 context.addInitializer(lambda db: versions.storeManagersVersions()) 

289 # dump universe config as json into attributes (faster than YAML) 

290 json = dimensionConfig.dump(format="json") 

291 if json is not None: 

292 # Convert Optional[str] to str for mypy 

293 json_str = json 

294 context.addInitializer( 

295 lambda db: self._attributes.set(_DIMENSIONS_ATTR, json_str) 

296 ) 

297 else: 

298 raise RuntimeError("Unexpectedly failed to serialize DimensionConfig to JSON") 

299 

300 if not create: 

301 # verify that configured versions are compatible with schema 

302 versions.checkManagersConfig() 

303 versions.checkManagersVersions(writeable) 

304 try: 

305 versions.checkManagersDigests() 

306 except DigestMismatchError as exc: 

307 # potentially digest mismatch is a serious error but during 

308 # development it could be benign, treat this as warning for 

309 # now. 

310 _LOG.warning(f"Registry schema digest mismatch: {exc}") 

311 

312 self._dimensions.refresh() 

313 self._collections.refresh() 

314 self._datasets.refresh() 

315 

316 def __str__(self) -> str: 

317 return str(self._db) 

318 

319 def __repr__(self) -> str: 

320 return f"Registry({self._db!r}, {self.dimensions!r})" 

321 

322 def isWriteable(self) -> bool: 

323 """Return `True` if this registry allows write operations, and `False` 

324 otherwise. 

325 """ 

326 return self._db.isWriteable() 

327 

328 @property 

329 def dimensions(self) -> DimensionUniverse: 

330 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

331 """ 

332 return self._dimensions.universe 

333 

334 @contextlib.contextmanager 

335 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

336 """Return a context manager that represents a transaction. 
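
Examples
--------
An illustrative sketch of grouping registry writes so they are rolled
back together on error; ``registry``, the collection name, and
``refs`` are hypothetical:

>>> with registry.transaction(savepoint=True):  # doctest: +SKIP
...     registry.associate("my/tagged", refs)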

337 """ 

338 try: 

339 with self._db.transaction(savepoint=savepoint): 

340 yield 

341 except BaseException: 

342 # TODO: this clears the caches sometimes when we wouldn't actually 

343 # need to. Can we avoid that? 

344 self._dimensions.clearCaches() 

345 raise 

346 

347 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

348 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

349 other data repository client. 

350 

351 Opaque table records can be added via `insertOpaqueData`, retrieved via 

352 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

353 

354 Parameters 

355 ---------- 

356 tableName : `str` 

357 Logical name of the opaque table. This may differ from the 

358 actual name used in the database by a prefix and/or suffix. 

359 spec : `ddl.TableSpec` 

360 Specification for the table to be added. 

361 """ 

362 self._opaque.register(tableName, spec) 

363 

364 @transactional 

365 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

366 """Insert records into an opaque table. 

367 

368 Parameters 

369 ---------- 

370 tableName : `str` 

371 Logical name of the opaque table. Must match the name used in a 

372 previous call to `registerOpaqueTable`. 

373 data 

374 Each additional positional argument is a dictionary that represents 

375 a single row to be added. 

376 """ 

377 self._opaque[tableName].insert(*data) 

378 

379 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

380 """Retrieve records from an opaque table. 

381 

382 Parameters 

383 ---------- 

384 tableName : `str` 

385 Logical name of the opaque table. Must match the name used in a 

386 previous call to `registerOpaqueTable`. 

387 where 

388 Additional keyword arguments are interpreted as equality 

389 constraints that restrict the returned rows (combined with AND); 

390 keyword arguments are column names and values are the values they 

391 must have. 

392 

393 Yields 

394 ------ 

395 row : `dict` 

396 A dictionary representing a single result row. 

397 """ 

398 yield from self._opaque[tableName].fetch(**where) 

399 

400 @transactional 

401 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

402 """Remove records from an opaque table. 

403 

404 Parameters 

405 ---------- 

406 tableName : `str` 

407 Logical name of the opaque table. Must match the name used in a 

408 previous call to `registerOpaqueTable`. 

409 where 

410 Additional keyword arguments are interpreted as equality 

411 constraints that restrict the deleted rows (combined with AND); 

412 keyword arguments are column names and values are the values they 

413 must have. 

414 """ 

415 self._opaque[tableName].delete(**where) 

416 

417 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED, 

418 doc: Optional[str] = None) -> None: 

419 """Add a new collection if one with the given name does not exist. 

420 

421 Parameters 

422 ---------- 

423 name : `str` 

424 The name of the collection to create. 

425 type : `CollectionType` 

426 Enum value indicating the type of collection to create. 

427 doc : `str`, optional 

428 Documentation string for the collection. 

429 

430 Notes 

431 ----- 

432 This method cannot be called within transactions, as it needs to be 

433 able to perform its own transaction to be concurrent. 
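
Examples
--------
An illustrative sketch; the collection name is hypothetical:

>>> registry.registerCollection("calib/picked", CollectionType.TAGGED,
...                             doc="Hand-picked calibration datasets.")  # doctest: +SKIP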

434 """ 

435 self._collections.register(name, type, doc=doc) 

436 

437 def getCollectionType(self, name: str) -> CollectionType: 

438 """Return an enumeration value indicating the type of the given 

439 collection. 

440 

441 Parameters 

442 ---------- 

443 name : `str` 

444 The name of the collection. 

445 

446 Returns 

447 ------- 

448 type : `CollectionType` 

449 Enum value indicating the type of this collection. 

450 

451 Raises 

452 ------ 

453 MissingCollectionError 

454 Raised if no collection with the given name exists. 

455 """ 

456 return self._collections.find(name).type 

457 

458 def registerRun(self, name: str, doc: Optional[str] = None) -> None: 

459 """Add a new run if one with the given name does not exist. 

460 

461 Parameters 

462 ---------- 

463 name : `str` 

464 The name of the run to create. 

465 doc : `str`, optional 

466 Documentation string for the collection. 

467 

468 Notes 

469 ----- 

470 This method cannot be called within transactions, as it needs to be 

471 able to perform its own transaction to be concurrent. 

472 """ 

473 self._collections.register(name, CollectionType.RUN, doc=doc) 

474 

475 @transactional 

476 def removeCollection(self, name: str) -> None: 

477 """Completely remove the given collection. 

478 

479 Parameters 

480 ---------- 

481 name : `str` 

482 The name of the collection to remove. 

483 

484 Raises 

485 ------ 

486 MissingCollectionError 

487 Raised if no collection with the given name exists. 

488 

489 Notes 

490 ----- 

491 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

492 in it are also fully removed. This requires that those datasets be 

493 removed (or at least trashed) from any datastores that hold them first. 

494 

495 A collection may not be deleted as long as it is referenced by a 

496 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

497 be deleted or redefined first. 

498 """ 

499 self._collections.remove(name) 

500 

501 def getCollectionChain(self, parent: str) -> CollectionSearch: 

502 """Return the child collections in a `~CollectionType.CHAINED` 

503 collection. 

504 

505 Parameters 

506 ---------- 

507 parent : `str` 

508 Name of the chained collection. Must have already been added via 

509 a call to `Registry.registerCollection`. 

510 

511 Returns 

512 ------- 

513 children : `CollectionSearch` 

514 An object that defines the search path of the collection. 

515 See :ref:`daf_butler_collection_expressions` for more information. 

516 

517 Raises 

518 ------ 

519 MissingCollectionError 

520 Raised if ``parent`` does not exist in the `Registry`. 

521 TypeError 

522 Raised if ``parent`` does not correspond to a 

523 `~CollectionType.CHAINED` collection. 

524 """ 

525 record = self._collections.find(parent) 

526 if record.type is not CollectionType.CHAINED: 

527 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

528 assert isinstance(record, ChainedCollectionRecord) 

529 return record.children 

530 

531 @transactional 

532 def setCollectionChain(self, parent: str, children: Any) -> None: 

533 """Define or redefine a `~CollectionType.CHAINED` collection. 

534 

535 Parameters 

536 ---------- 

537 parent : `str` 

538 Name of the chained collection. Must have already been added via 

539 a call to `Registry.registerCollection`. 

540 children : `Any` 

541 An expression defining an ordered search of child collections, 

542 generally an iterable of `str`; see 

543 :ref:`daf_butler_collection_expressions` for more information. 

544 

545 Raises 

546 ------ 

547 MissingCollectionError 

548 Raised when any of the given collections do not exist in the 

549 `Registry`. 

550 TypeError 

551 Raised if ``parent`` does not correspond to a 

552 `~CollectionType.CHAINED` collection. 

553 ValueError 

554 Raised if the given collections contain a cycle. 
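
Examples
--------
A minimal sketch; the collection names are hypothetical and must
already be registered:

>>> registry.registerCollection("shared/chain", CollectionType.CHAINED)  # doctest: +SKIP
>>> registry.setCollectionChain("shared/chain", ["run/a", "run/b"])  # doctest: +SKIP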

555 """ 

556 record = self._collections.find(parent) 

557 if record.type is not CollectionType.CHAINED: 

558 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

559 assert isinstance(record, ChainedCollectionRecord) 

560 children = CollectionSearch.fromExpression(children) 

561 if children != record.children: 

562 record.update(self._collections, children) 

563 

564 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

565 """Retrieve the documentation string for a collection. 

566 

567 Parameters 

568 ---------- 

569 collection : `str` 

570 Name of the collection. 

571 

572 Returns 

573 ------- 

574 docs : `str` or `None` 

575 Docstring for the collection with the given name. 

576 """ 

577 return self._collections.getDocumentation(self._collections.find(collection).key) 

578 

579 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

580 """Set the documentation string for a collection. 

581 

582 Parameters 

583 ---------- 

584 collection : `str` 

585 Name of the collection. 

586 doc : `str` or `None` 

587 Docstring for the collection with the given name; will replace any 

588 existing docstring. Passing `None` will remove any existing 

589 docstring. 

590 """ 

591 self._collections.setDocumentation(self._collections.find(collection).key, doc) 

592 

593 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

594 """ 

595 Add a new `DatasetType` to the Registry. 

596 

597 It is not an error to register the same `DatasetType` twice. 

598 

599 Parameters 

600 ---------- 

601 datasetType : `DatasetType` 

602 The `DatasetType` to be added. 

603 

604 Returns 

605 ------- 

606 inserted : `bool` 

607 `True` if ``datasetType`` was inserted, `False` if an identical 

608 existing `DatasetType` was found. Note that in either case the 

609 DatasetType is guaranteed to be defined in the Registry 

610 consistently with the given definition. 

611 

612 Raises 

613 ------ 

614 ValueError 

615 Raised if the dimensions or storage class are invalid. 

616 ConflictingDefinitionError 

617 Raised if this DatasetType is already registered with a different 

618 definition. 

619 

620 Notes 

621 ----- 

622 This method cannot be called within transactions, as it needs to be 

623 able to perform its own transaction to be concurrent. 

624 """ 

625 _, inserted = self._datasets.register(datasetType) 

626 return inserted 

627 

628 def removeDatasetType(self, name: str) -> None: 

629 """Remove the named `DatasetType` from the registry. 

630 

631 .. warning:: 

632 

633 Registry caches the dataset type definitions. This means that 

634 deleting the dataset type definition may result in unexpected 

635 behavior from other butler processes that are active that have 

636 not seen the deletion. 

637 

638 Parameters 

639 ---------- 

640 name : `str` 

641 Name of the type to be removed. 

642 

643 Raises 

644 ------ 

645 lsst.daf.butler.registry.OrphanedRecordError 

646 Raised if an attempt is made to remove the dataset type definition 

647 when there are already datasets associated with it. 

648 

649 Notes 

650 ----- 

651 If the dataset type is not registered the method will return without 

652 action. 

653 """ 

654 self._datasets.remove(name) 

655 

656 def getDatasetType(self, name: str) -> DatasetType: 

657 """Get the `DatasetType`. 

658 

659 Parameters 

660 ---------- 

661 name : `str` 

662 Name of the type. 

663 

664 Returns 

665 ------- 

666 type : `DatasetType` 

667 The `DatasetType` associated with the given name. 

668 

669 Raises 

670 ------ 

671 KeyError 

672 Raised if the named DatasetType could not be found in the registry. 

673 """ 

674 return self._datasets[name].datasetType 

675 

676 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

677 collections: Any, timespan: Optional[Timespan] = None, 

678 **kwargs: Any) -> Optional[DatasetRef]: 

679 """Find a dataset given its `DatasetType` and data ID. 

680 

681 This can be used to obtain a `DatasetRef` that permits the dataset to 

682 be read from a `Datastore`. If the dataset is a component and can not 

683 be found using the provided dataset type, a dataset ref for the parent 

684 will be returned instead but with the correct dataset type. 

685 

686 Parameters 

687 ---------- 

688 datasetType : `DatasetType` or `str` 

689 A `DatasetType` or the name of one. 

690 dataId : `dict` or `DataCoordinate`, optional 

691 A `dict`-like object containing the `Dimension` links that identify 

692 the dataset within a collection. 

693 collections 

694 An expression that fully or partially identifies the collections to 

695 search for the dataset; see 

696 :ref:`daf_butler_collection_expressions` for more information. 

697 timespan : `Timespan`, optional 

698 A timespan that the validity range of the dataset must overlap. 

699 If not provided, any `~CollectionType.CALIBRATION` collections 

700 matched by the ``collections`` argument will not be searched. 

701 **kwargs 

702 Additional keyword arguments passed to 

703 `DataCoordinate.standardize` to convert ``dataId`` to a true 

704 `DataCoordinate` or augment an existing one. 

705 

706 Returns 

707 ------- 

708 ref : `DatasetRef` 

709 A reference to the dataset, or `None` if no matching Dataset 

710 was found. 

711 

712 Raises 

713 ------ 

714 LookupError 

715 Raised if one or more data ID keys are missing. 

716 KeyError 

717 Raised if the dataset type does not exist. 

718 MissingCollectionError 

719 Raised if any of ``collections`` does not exist in the registry. 

720 

721 Notes 

722 ----- 

723 This method simply returns `None` and does not raise an exception even 

724 when the set of collections searched is intrinsically incompatible with 

725 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but 

726 only `~CollectionType.CALIBRATION` collections are being searched. 

727 This may make it harder to debug some lookup failures, but the behavior 

728 is intentional; we consider it more important that failed searches are 

729 reported consistently, regardless of the reason, and that adding 

730 additional collections that do not contain a match to the search path 

731 never changes the behavior. 
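
Examples
--------
An illustrative sketch; the dataset type name, data ID values, and
collection name are all hypothetical:

>>> ref = registry.findDataset("calexp", instrument="HSC", visit=903334,
...                            detector=42, collections="HSC/runs/test")  # doctest: +SKIP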

732 """ 

733 if isinstance(datasetType, DatasetType): 

734 storage = self._datasets[datasetType.name] 

735 else: 

736 storage = self._datasets[datasetType] 

737 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

738 universe=self.dimensions, **kwargs) 

739 collections = CollectionSearch.fromExpression(collections) 

740 for collectionRecord in collections.iter(self._collections): 

741 if (collectionRecord.type is CollectionType.CALIBRATION 

742 and (not storage.datasetType.isCalibration() or timespan is None)): 

743 continue 

744 result = storage.find(collectionRecord, dataId, timespan=timespan) 

745 if result is not None: 

746 return result 

747 

748 return None 

749 

750 @transactional 

751 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

752 run: str) -> List[DatasetRef]: 

753 """Insert one or more datasets into the `Registry` 

754 

755 This always adds new datasets; to associate existing datasets with 

756 a new collection, use ``associate``. 

757 

758 Parameters 

759 ---------- 

760 datasetType : `DatasetType` or `str` 

761 A `DatasetType` or the name of one. 

762 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

763 Dimension-based identifiers for the new datasets. 

764 run : `str` 

765 The name of the run that produced the datasets. 

766 

767 Returns 

768 ------- 

769 refs : `list` of `DatasetRef` 

770 Resolved `DatasetRef` instances for all given data IDs (in the same 

771 order). 

772 

773 Raises 

774 ------ 

775 ConflictingDefinitionError 

776 If a dataset with the same dataset type and data ID as one of those 

777 given already exists in ``run``. 

778 MissingCollectionError 

779 Raised if ``run`` does not exist in the registry. 
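
Examples
--------
A minimal sketch; the dataset type, data ID, and run names are
hypothetical and must already be registered:

>>> refs = registry.insertDatasets(
...     "raw", dataIds=[{"instrument": "HSC", "exposure": 903334, "detector": 42}],
...     run="HSC/raw/all")  # doctest: +SKIP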

780 """ 

781 if isinstance(datasetType, DatasetType): 

782 storage = self._datasets.find(datasetType.name) 

783 if storage is None: 

784 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

785 else: 

786 storage = self._datasets.find(datasetType) 

787 if storage is None: 

788 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

789 runRecord = self._collections.find(run) 

790 if runRecord.type is not CollectionType.RUN: 

791 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

792 assert isinstance(runRecord, RunRecord) 

793 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

794 for dataId in dataIds] 

795 try: 

796 refs = list(storage.insert(runRecord, expandedDataIds)) 

797 except sqlalchemy.exc.IntegrityError as err: 

798 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

799 f"one or more datasets of type {storage.datasetType} into " 

800 f"collection '{run}'. " 

801 f"This probably means a dataset with the same data ID " 

802 f"and dataset type already exists, but it may also mean a " 

803 f"dimension row is missing.") from err 

804 return refs 

805 

806 def getDataset(self, id: int) -> Optional[DatasetRef]: 

807 """Retrieve a Dataset entry. 

808 

809 Parameters 

810 ---------- 

811 id : `int` 

812 The unique identifier for the dataset. 

813 

814 Returns 

815 ------- 

816 ref : `DatasetRef` or `None` 

817 A ref to the Dataset, or `None` if no matching Dataset 

818 was found. 

819 """ 

820 ref = self._datasets.getDatasetRef(id) 

821 if ref is None: 

822 return None 

823 return ref 

824 

825 @transactional 

826 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

827 """Remove datasets from the Registry. 

828 

829 The datasets will be removed unconditionally from all collections, and 

830 any `Quantum` that consumed this dataset will instead be marked with 

831 having a NULL input. `Datastore` records will *not* be deleted; the 

832 caller is responsible for ensuring that the dataset has already been 

833 removed from all Datastores. 

834 

835 Parameters 

836 ---------- 

837 refs : `Iterable` of `DatasetRef` 

838 References to the datasets to be removed. Must include a valid 

839 ``id`` attribute, and should be considered invalidated upon return. 

840 

841 Raises 

842 ------ 

843 AmbiguousDatasetError 

844 Raised if any ``ref.id`` is `None`. 

845 OrphanedRecordError 

846 Raised if any dataset is still present in any `Datastore`. 

847 """ 

848 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

849 storage = self._datasets.find(datasetType.name) 

850 assert storage is not None 

851 try: 

852 storage.delete(refsForType) 

853 except sqlalchemy.exc.IntegrityError as err: 

854 raise OrphanedRecordError("One or more datasets is still " 

855 "present in one or more Datastores.") from err 

856 

857 @transactional 

858 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

859 """Add existing datasets to a `~CollectionType.TAGGED` collection. 

860 

861 If a DatasetRef with the same exact integer ID is already in a 

862 collection nothing is changed. If a `DatasetRef` with the same 

863 `DatasetType` and data ID but with different integer ID 

864 exists in the collection, `ConflictingDefinitionError` is raised. 

865 

866 Parameters 

867 ---------- 

868 collection : `str` 

869 Indicates the collection the datasets should be associated with. 

870 refs : `Iterable` [ `DatasetRef` ] 

871 An iterable of resolved `DatasetRef` instances that already exist 

872 in this `Registry`. 

873 

874 Raises 

875 ------ 

876 ConflictingDefinitionError 

877 If a Dataset with the given `DatasetRef` already exists in the 

878 given collection. 

879 AmbiguousDatasetError 

880 Raised if ``any(ref.id is None for ref in refs)``. 

881 MissingCollectionError 

882 Raised if ``collection`` does not exist in the registry. 

883 TypeError 

884 Raised if adding new datasets to the given ``collection`` is not 

885 allowed. 

886 """ 

887 collectionRecord = self._collections.find(collection) 

888 if collectionRecord.type is not CollectionType.TAGGED: 

889 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

890 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

891 storage = self._datasets.find(datasetType.name) 

892 assert storage is not None 

893 try: 

894 storage.associate(collectionRecord, refsForType) 

895 except sqlalchemy.exc.IntegrityError as err: 

896 raise ConflictingDefinitionError( 

897 f"Constraint violation while associating dataset of type {datasetType.name} with " 

898 f"collection {collection}. This probably means that one or more datasets with the same " 

899 f"dataset type and data ID already exist in the collection, but it may also indicate " 

900 f"that the datasets do not exist." 

901 ) from err 

902 

903 @transactional 

904 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

905 """Remove existing datasets from a `~CollectionType.TAGGED` collection. 

906 

907 ``collection`` and ``ref`` combinations that are not currently 

908 associated are silently ignored. 

909 

910 Parameters 

911 ---------- 

912 collection : `str` 

913 The collection the datasets should no longer be associated with. 

914 refs : `Iterable` [ `DatasetRef` ] 

915 An iterable of resolved `DatasetRef` instances that already exist 

916 in this `Registry`. 

917 

918 Raises 

919 ------ 

920 AmbiguousDatasetError 

921 Raised if any of the given dataset references is unresolved. 

922 MissingCollectionError 

923 Raised if ``collection`` does not exist in the registry. 

924 TypeError 

925 Raised if removing datasets from the given ``collection`` is not 

926 allowed. 

927 """ 

928 collectionRecord = self._collections.find(collection) 

929 if collectionRecord.type is not CollectionType.TAGGED: 

930 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

931 "expected TAGGED.") 

932 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

933 storage = self._datasets.find(datasetType.name) 

934 assert storage is not None 

935 storage.disassociate(collectionRecord, refsForType) 

936 

937 @transactional 

938 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

939 """Associate one or more datasets with a calibration collection and a 

940 validity range within it. 

941 

942 Parameters 

943 ---------- 

944 collection : `str` 

945 The name of an already-registered `~CollectionType.CALIBRATION` 

946 collection. 

947 refs : `Iterable` [ `DatasetRef` ] 

948 Datasets to be associated. 

949 timespan : `Timespan` 

950 The validity range for these datasets within the collection. 

951 

952 Raises 

953 ------ 

954 AmbiguousDatasetError 

955 Raised if any of the given `DatasetRef` instances is unresolved. 

956 ConflictingDefinitionError 

957 Raised if the collection already contains a different dataset with 

958 the same `DatasetType` and data ID and an overlapping validity 

959 range. 

960 TypeError 

961 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

962 collection or if one or more datasets are of a dataset type for 

963 which `DatasetType.isCalibration` returns `False`. 
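
Examples
--------
An illustrative sketch; the collection name, ``bias_refs``, and the
timespan bounds are hypothetical:

>>> registry.certify("HSC/calib", bias_refs,
...                  Timespan(begin=t_start, end=t_end))  # doctest: +SKIP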

964 """ 

965 collectionRecord = self._collections.find(collection) 

966 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

967 storage = self._datasets[datasetType.name] 

968 storage.certify(collectionRecord, refsForType, timespan) 

969 

970 @transactional 

971 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *, 

972 dataIds: Optional[Iterable[DataId]] = None) -> None: 

973 """Remove or adjust datasets to clear a validity range within a 

974 calibration collection. 

975 

976 Parameters 

977 ---------- 

978 collection : `str` 

979 The name of an already-registered `~CollectionType.CALIBRATION` 

980 collection. 

981 datasetType : `str` or `DatasetType` 

982 Name or `DatasetType` instance for the datasets to be decertified. 

983 timespan : `Timespan` 

984 The validity range to remove datasets from within the collection. 

985 Datasets that overlap this range but are not contained by it will 

986 have their validity ranges adjusted to not overlap it, which may 

987 split a single dataset validity range into two. 

988 dataIds : `Iterable` [ `DataId` ], optional 

989 Data IDs that should be decertified within the given validity range. 

990 If `None`, all data IDs for ``datasetType`` will be 

991 decertified. 

992 

993 Raises 

994 ------ 

995 TypeError 

996 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

997 collection or if ``datasetType.isCalibration() is False``. 

998 """ 

999 collectionRecord = self._collections.find(collection) 

1000 if isinstance(datasetType, str): 

1001 storage = self._datasets[datasetType] 

1002 else: 

1003 storage = self._datasets[datasetType.name] 

1004 standardizedDataIds = None 

1005 if dataIds is not None: 

1006 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) 

1007 for d in dataIds] 

1008 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

1009 

1010 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

1011 """Return an object that allows a new `Datastore` instance to 

1012 communicate with this `Registry`. 

1013 

1014 Returns 

1015 ------- 

1016 manager : `DatastoreRegistryBridgeManager` 

1017 Object that mediates communication between this `Registry` and its 

1018 associated datastores. 

1019 """ 

1020 return self._datastoreBridges 

1021 

1022 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

1023 """Retrieve datastore locations for a given dataset. 

1024 

1025 Parameters 

1026 ---------- 

1027 ref : `DatasetRef` 

1028 A reference to the dataset for which to retrieve storage 

1029 information. 

1030 

1031 Returns 

1032 ------- 

1033 datastores : `Iterable` [ `str` ] 

1034 All the matching datastores holding this dataset. 

1035 

1036 Raises 

1037 ------ 

1038 AmbiguousDatasetError 

1039 Raised if ``ref.id`` is `None`. 

1040 """ 

1041 return self._datastoreBridges.findDatastores(ref) 

1042 

1043 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

1044 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

1045 **kwargs: Any) -> DataCoordinate: 

1046 """Expand a dimension-based data ID to include additional information. 

1047 

1048 Parameters 

1049 ---------- 

1050 dataId : `DataCoordinate` or `dict`, optional 

1051 Data ID to be expanded; augmented and overridden by ``kwargs``. 

1052 graph : `DimensionGraph`, optional 

1053 Set of dimensions for the expanded ID. If `None`, the dimensions 

1054 will be inferred from the keys of ``dataId`` and ``kwargs``. 

1055 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph`` 

1056 are silently ignored, providing a way to extract and expand a 

1057 subset of a data ID. 

1058 records : `Mapping` [`str`, `DimensionRecord`], optional 

1059 Dimension record data to use before querying the database for that 

1060 data, keyed by element name. 

1061 **kwargs 

1062 Additional keywords are treated like additional key-value pairs for 

1063 ``dataId``, extending and overriding it. 

1064 

1065 Returns 

1066 ------- 

1067 expanded : `DataCoordinate` 

1068 A data ID that includes full metadata for all of the dimensions it 

1069 identifies, i.e. guarantees that ``expanded.hasRecords()`` and 

1070 ``expanded.hasFull()`` both return `True`. 
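
Examples
--------
An illustrative sketch; the dimension names and values are
hypothetical:

>>> dataId = registry.expandDataId(instrument="HSC", detector=42)  # doctest: +SKIP
>>> dataId.hasRecords(), dataId.hasFull()  # doctest: +SKIP
(True, True)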

1071 """ 

1072 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs) 

1073 if standardized.hasRecords(): 

1074 return standardized 

1075 if records is None: 

1076 records = {} 

1077 elif isinstance(records, NamedKeyMapping): 

1078 records = records.byName() 

1079 else: 

1080 records = dict(records) 

1081 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

1082 records.update(dataId.records.byName()) 

1083 keys = standardized.byName() 

1084 for element in standardized.graph.primaryKeyTraversalOrder: 

1085 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

1086 if record is ...: 

1087 if isinstance(element, Dimension) and keys.get(element.name) is None: 

1088 if element in standardized.graph.required: 

1089 raise LookupError( 

1090 f"No value or null value for required dimension {element.name}." 

1091 ) 

1092 keys[element.name] = None 

1093 record = None 

1094 else: 

1095 storage = self._dimensions[element] 

1096 dataIdSet = DataCoordinateIterable.fromScalar( 

1097 DataCoordinate.standardize(keys, graph=element.graph) 

1098 ) 

1099 fetched = tuple(storage.fetch(dataIdSet)) 

1100 try: 

1101 (record,) = fetched 

1102 except ValueError: 

1103 record = None 

1104 records[element.name] = record 

1105 if record is not None: 

1106 for d in element.implied: 

1107 value = getattr(record, d.name) 

1108 if keys.setdefault(d.name, value) != value: 

1109 raise InconsistentDataIdError( 

1110 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

1111 f"but {element.name} implies {d.name}={value!r}." 

1112 ) 

1113 else: 

1114 if element in standardized.graph.required: 

1115 raise LookupError( 

1116 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

1117 ) 

1118 if element.alwaysJoin: 

1119 raise InconsistentDataIdError( 

1120 f"Could not fetch record for element {element.name} via keys {keys}, " 

1121 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

1122 "related." 

1123 ) 

1124 for d in element.implied: 

1125 keys.setdefault(d.name, None) 

1126 records.setdefault(d.name, None) 

1127 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

1128 

1129 def insertDimensionData(self, element: Union[DimensionElement, str], 

1130 *data: Union[Mapping[str, Any], DimensionRecord], 

1131 conform: bool = True) -> None: 

1132 """Insert one or more dimension records into the database. 

1133 

1134 Parameters 

1135 ---------- 

1136 element : `DimensionElement` or `str` 

1137 The `DimensionElement` or name thereof that identifies the table 

1138 records will be inserted into. 

1139 data : `dict` or `DimensionRecord` (variadic) 

1140 One or more records to insert. 

1141 conform : `bool`, optional 

1142 If `False` (`True` is default) perform no checking or conversions, 

1143 and assume that ``element`` is a `DimensionElement` instance and 

1144 ``data`` is one or more `DimensionRecord` instances of the 

1145 appropriate subclass. 
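
Examples
--------
A minimal sketch; the element name and record fields shown are
hypothetical and depend on the configured dimension universe:

>>> registry.insertDimensionData(
...     "instrument", {"name": "HSC", "detector_max": 200})  # doctest: +SKIP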

1146 """ 

1147 if conform: 

1148 if isinstance(element, str): 

1149 element = self.dimensions[element] 

1150 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1151 for row in data] 

1152 else: 

1153 # Ignore typing since caller said to trust them with conform=False. 

1154 records = data # type: ignore 

1155 storage = self._dimensions[element] # type: ignore 

1156 storage.insert(*records) 

1157 

1158 def syncDimensionData(self, element: Union[DimensionElement, str], 

1159 row: Union[Mapping[str, Any], DimensionRecord], 

1160 conform: bool = True) -> bool: 

1161 """Synchronize the given dimension record with the database, inserting 

1162 if it does not already exist and comparing values if it does. 

1163 

1164 Parameters 

1165 ---------- 

1166 element : `DimensionElement` or `str` 

1167 The `DimensionElement` or name thereof that identifies the table 

1168 records will be inserted into. 

1169 row : `dict` or `DimensionRecord` 

1170 The record to insert. 

1171 conform : `bool`, optional 

1172 If `False` (`True` is default) perform no checking or conversions, 

1173 and assume that ``element`` is a `DimensionElement` instance and 

1174 ``row`` is a `DimensionRecord` instance of the 

1175 appropriate subclass. 

1176 

1177 Returns 

1178 ------- 

1179 inserted : `bool` 

1180 `True` if a new row was inserted, `False` otherwise. 

1181 

1182 Raises 

1183 ------ 

1184 ConflictingDefinitionError 

1185 Raised if the record exists in the database (according to primary 

1186 key lookup) but is inconsistent with the given one. 

1187 """ 

1188 if conform: 

1189 if isinstance(element, str): 

1190 element = self.dimensions[element] 

1191 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1192 else: 

1193 # Ignore typing since caller said to trust them with conform=False. 

1194 record = row # type: ignore 

1195 storage = self._dimensions[element] # type: ignore 

1196 return storage.sync(record) 

1197 

1198 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

1199 ) -> Iterator[DatasetType]: 

1200 """Iterate over the dataset types whose names match an expression. 

1201 

1202 Parameters 

1203 ---------- 

1204 expression : `Any`, optional 

1205 An expression that fully or partially identifies the dataset types 

1206 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1207 `...` can be used to return all dataset types, and is the default. 

1208 See :ref:`daf_butler_dataset_type_expressions` for more 

1209 information. 

1210 components : `bool`, optional 

1211 If `True`, apply all expression patterns to component dataset type 

1212 names as well. If `False`, never apply patterns to components. 

1213 If `None` (default), apply patterns to components only if their 

1214 parent datasets were not matched by the expression. 

1215 Fully-specified component datasets (`str` or `DatasetType` 

1216 instances) are always included. 

1217 

1218 Yields 

1219 ------ 

1220 datasetType : `DatasetType` 

1221 A `DatasetType` instance whose name matches ``expression``. 

1222 """ 

1223 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

1224 if wildcard is Ellipsis: 

1225 for datasetType in self._datasets: 

1226 # The dataset type can no longer be a component 

1227 yield datasetType 

1228 if components and datasetType.isComposite(): 

1229 # Automatically create the component dataset types 

1230 for component in datasetType.makeAllComponentDatasetTypes(): 

1231 yield component 

1232 return 

1233 done: Set[str] = set() 

1234 for name in wildcard.strings: 

1235 storage = self._datasets.find(name) 

1236 if storage is not None: 

1237 done.add(storage.datasetType.name) 

1238 yield storage.datasetType 

1239 if wildcard.patterns: 

1240 # If components (the argument) is None, we'll save component 

1241 # datasets that we might want to match, but only if their parents 

1242 # didn't get included. 

1243 componentsForLater = [] 

1244 for registeredDatasetType in self._datasets: 

1245 # Components are not stored in registry so expand them here 

1246 allDatasetTypes = [registeredDatasetType] \ 

1247 + registeredDatasetType.makeAllComponentDatasetTypes() 

1248 for datasetType in allDatasetTypes: 

1249 if datasetType.name in done: 

1250 continue 

1251 parentName, componentName = datasetType.nameAndComponent() 

1252 if componentName is not None and not components: 

1253 if components is None and parentName not in done: 

1254 componentsForLater.append(datasetType) 

1255 continue 

1256 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1257 done.add(datasetType.name) 

1258 yield datasetType 

1259 # Go back and try to match saved components. 

1260 for datasetType in componentsForLater: 

1261 parentName, _ = datasetType.nameAndComponent() 

1262 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1263 yield datasetType 

1264 

1265 def queryCollections(self, expression: Any = ..., 

1266 datasetType: Optional[DatasetType] = None, 

1267 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1268 flattenChains: bool = False, 

1269 includeChains: Optional[bool] = None) -> Iterator[str]: 

1270 """Iterate over the collections whose names match an expression. 

1271 

1272 Parameters 

1273 ---------- 

1274 expression : `Any`, optional 

1275 An expression that fully or partially identifies the collections 

1276 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1277 `...` can be used to return all collections, and is the default. 

1278 See :ref:`daf_butler_collection_expressions` for more 

1279 information. 

1280 datasetType : `DatasetType`, optional 

1281 If provided, only yield collections that may contain datasets of 

1282 this type. This is a conservative approximation in general; it may 

1283 yield collections that do not have any such datasets. 

1284 collectionTypes : `AbstractSet` [ `CollectionType` ], optional 

1285 If provided, only yield collections of these types. 

1286 flattenChains : `bool`, optional 

1287 If `True` (`False` is default), recursively yield the child 

1288 collections of matching `~CollectionType.CHAINED` collections. 

1289 includeChains : `bool`, optional 

1290 If `True`, yield records for matching `~CollectionType.CHAINED` 

1291 collections. Default is the opposite of ``flattenChains``: include 

1292 either CHAINED collections or their children, but not both. 

1293 

1294 Yields 

1295 ------ 

1296 collection : `str` 

1297 The name of a collection that matches ``expression``. 
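
Examples
--------
An illustrative sketch; the regular expression is hypothetical:

>>> import re
>>> for name in registry.queryCollections(  # doctest: +SKIP
...         re.compile("HSC/calib/.*"),
...         collectionTypes={CollectionType.CALIBRATION}):
...     print(name)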

1298 """ 

1299 # Right now the datasetTypes argument is completely ignored, but that 

1300 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

1301 # ticket will take care of that. 

1302 query = CollectionQuery.fromExpression(expression) 

1303 for record in query.iter(self._collections, collectionTypes=frozenset(collectionTypes), 

1304 flattenChains=flattenChains, includeChains=includeChains): 

1305 yield record.name 

1306 

1307 def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder: 

1308 """Return a `QueryBuilder` instance capable of constructing and 

1309 managing more complex queries than those obtainable via `Registry` 

1310 interfaces. 

1311 

1312 This is an advanced interface; downstream code should prefer 

1313 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

1314 are sufficient. 

1315 

1316 Parameters 

1317 ---------- 

1318 summary : `queries.QuerySummary` 

1319 Object describing and categorizing the full set of dimensions that 

1320 will be included in the query. 

1321 

1322 Returns 

1323 ------- 

1324 builder : `queries.QueryBuilder` 

1325 Object that can be used to construct and perform advanced queries. 

1326 """ 

1327 return queries.QueryBuilder( 

1328 summary, 

1329 queries.RegistryManagers( 

1330 collections=self._collections, 

1331 dimensions=self._dimensions, 

1332 datasets=self._datasets 

1333 ) 

1334 ) 

1335 

1336 def queryDatasets(self, datasetType: Any, *, 

1337 collections: Any, 

1338 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1339 dataId: Optional[DataId] = None, 

1340 where: Optional[str] = None, 

1341 findFirst: bool = False, 

1342 components: Optional[bool] = None, 

1343 check: bool = True, 

1344 **kwargs: Any) -> queries.DatasetQueryResults: 

1345 """Query for and iterate over dataset references matching user-provided 

1346 criteria. 

1347 

1348 Parameters 

1349 ---------- 

1350 datasetType 

1351 An expression that fully or partially identifies the dataset types 

1352 to be queried. Allowed types include `DatasetType`, `str`, 

1353 `re.Pattern`, and iterables thereof. The special value `...` can 

1354 be used to query all dataset types. See 

1355 :ref:`daf_butler_dataset_type_expressions` for more information. 

1356 collections 

1357 An expression that fully or partially identifies the collections 

1358 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1359 thereof. `...` can be used to find datasets from all 

1360 `~CollectionType.RUN` collections (no other collections are 

1361 necessary, because all datasets are in a ``RUN`` collection). See 

1362 :ref:`daf_butler_collection_expressions` for more information. 

1363 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1364 Dimensions to include in the query (in addition to those used 

1365 to identify the queried dataset type(s)), either to constrain 

1366 the resulting datasets to those for which a matching dimension 

1367 exists, or to relate the dataset type's dimensions to dimensions 

1368 referenced by the ``dataId`` or ``where`` arguments. 

1369 dataId : `dict` or `DataCoordinate`, optional 

1370 A data ID whose key-value pairs are used as equality constraints 

1371 in the query. 

1372 where : `str`, optional 

1373 A string expression similar to a SQL WHERE clause. May involve 

1374 any column of a dimension table or (as a shortcut for the primary 

1375 key column of a dimension table) a dimension name. See 

1376 :ref:`daf_butler_dimension_expressions` for more information. 

1377 findFirst : `bool`, optional 

1378 If `True` (`False` is default), for each result data ID, only 

1379 yield one `DatasetRef` of each `DatasetType`, from the first 

1380 collection in which a dataset of that dataset type appears 

1381 (according to the order of ``collections`` passed in). If `True`, 

1382 ``collections`` must not contain regular expressions and may not 

1383 be `...`. 

1384 components : `bool`, optional 

1385 If `True`, apply all dataset expression patterns to component 

1386 dataset type names as well. If `False`, never apply patterns to 

1387 components. If `None` (default), apply patterns to components only 

1388 if their parent datasets were not matched by the expression. 

1389 Fully-specified component datasets (`str` or `DatasetType` 

1390 instances) are always included. 

1391 check : `bool`, optional 

1392 If `True` (default) check the query for consistency before 

1393 executing it. This may reject some valid queries that resemble 

1394 common mistakes (e.g. queries for visits without specifying an 

1395 instrument). 

1396 **kwargs 

1397 Additional keyword arguments are forwarded to 

1398 `DataCoordinate.standardize` when processing the ``dataId`` 

1399 argument (and may be used to provide a constraining data ID even 

1400 when the ``dataId`` argument is `None`). 

1401 

1402 Returns 

1403 ------- 

1404 refs : `queries.DatasetQueryResults` 

1405 Dataset references matching the given query criteria. 

1406 

1407 Raises 

1408 ------ 

1409 TypeError 

1410 Raised when the arguments are incompatible, such as when a 

1411 collection wildcard is passed when ``findFirst`` is `True`. 

1412 

1413 Notes 

1414 ----- 

1415 When multiple dataset types are queried in a single call, the 

1416 results of this operation are equivalent to querying for each dataset 

1417 type separately in turn, and no information about the relationships 

1418 between datasets of different types is included. In contexts where 

1419 that kind of information is important, the recommended pattern is to 

1420 use `queryDataIds` to first obtain data IDs (possibly with the 

1421 desired dataset types and collections passed as constraints to the 

1422 query), and then use multiple (generally much simpler) calls to 

1423 `queryDatasets` with the returned data IDs passed as constraints. 
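
Examples
--------
A minimal sketch; the dataset type name and ``where`` expression are
hypothetical:

>>> refs = registry.queryDatasets("calexp", collections=...,
...                               where="instrument='HSC' AND visit=903334")  # doctest: +SKIP
>>> for ref in refs:  # doctest: +SKIP
...     print(ref.dataId)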

1424 """ 

1425 # Standardize the collections expression. 

1426 if findFirst: 

1427 collections = CollectionSearch.fromExpression(collections) 

1428 else: 

1429 collections = CollectionQuery.fromExpression(collections) 

1430 # Standardize and expand the data ID provided as a constraint. 

1431 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1432 

1433 # We can only query directly if given a non-component DatasetType 

1434 # instance. If we were given an expression or str or a component 

1435 # DatasetType instance, we'll populate this dict, recurse, and return. 

1436 # If we already have a non-component DatasetType, it will remain None 

1437 # and we'll run the query directly. 

1438 composition: Optional[ 

1439 Dict[ 

1440 DatasetType, # parent dataset type 

1441 List[Optional[str]] # component name, or None for parent 

1442 ] 

1443 ] = None 
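# For illustration only (hypothetical dataset types): after the branches
# below run, composition might look like
#     {calexp: [None, "psf"], deepCoadd: ["wcs"]}
# i.e. query the calexp parent and its psf component, plus the wcs
# component of deepCoadd.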

1444 if not isinstance(datasetType, DatasetType): 

1445 # We were given a dataset type expression (which may be as simple 

1446 # as a str). Loop over all matching dataset types, delegating handling 

1447 # of the `components` argument to queryDatasetTypes, as we populate 

1448 # the composition dict. 

1449 composition = defaultdict(list) 

1450 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

1451 parentName, componentName = trueDatasetType.nameAndComponent() 

1452 if componentName is not None: 

1453 parentDatasetType = self.getDatasetType(parentName) 

1454 composition[parentDatasetType].append(componentName) 

1455 else: 

1456 composition[trueDatasetType].append(None) 

1457 elif datasetType.isComponent(): 

1458 # We were given a true DatasetType instance, but it's a component. 

1459 # The composition dict will have exactly one item. 

1460 parentName, componentName = datasetType.nameAndComponent() 

1461 parentDatasetType = self.getDatasetType(parentName) 

1462 composition = {parentDatasetType: [componentName]} 

1463 if composition is not None: 

1464 # We need to recurse. Do that once for each parent dataset type. 

1465 chain = [] 

1466 for parentDatasetType, componentNames in composition.items(): 

1467 parentResults = self.queryDatasets( 

1468 parentDatasetType, 

1469 collections=collections, 

1470 dimensions=dimensions, 

1471 dataId=standardizedDataId, 

1472 where=where, 

1473 findFirst=findFirst, 

1474 check=check, 

1475 ) 

1476 if isinstance(parentResults, queries.ParentDatasetQueryResults): 

1477 chain.append( 

1478 parentResults.withComponents(componentNames) 

1479 ) 

1480 else: 

1481 # Should only happen if we know there would be no results. 

1482 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \ 

1483 and not parentResults._chain 

1484 return queries.ChainedDatasetQueryResults(chain) 

1485 # If we get here, there's no need to recurse (or we are already 

1486 # recursing; there can only ever be one level of recursion). 

1487 

1488 # The full set of dimensions in the query is the combination of those 

1489 # needed for the DatasetType and those explicitly requested, if any. 

1490 requestedDimensionNames = set(datasetType.dimensions.names) 

1491 if dimensions is not None: 

1492 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1493 # Construct the summary structure needed to construct a QueryBuilder. 

1494 summary = queries.QuerySummary( 

1495 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1496 dataId=standardizedDataId, 

1497 expression=where, 

1498 check=check, 

1499 ) 

1500 builder = self.makeQueryBuilder(summary) 

1501 # Add the dataset subquery to the query, telling the QueryBuilder to 

1502 # include the rank of the selected collection in the results only if we 

1503 # need a find-first search. Note that if any of the collections 

1504 # are actually wildcard expressions and we have asked for a 

1505 # find-first search, this will raise TypeError for us. 

1506 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst): 

1507 return queries.ChainedDatasetQueryResults(()) 

1508 query = builder.finish() 

1509 return queries.ParentDatasetQueryResults(self._db, query, components=[None]) 

1510 

1511 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1512 dataId: Optional[DataId] = None, 

1513 datasets: Any = None, 

1514 collections: Any = None, 

1515 where: Optional[str] = None, 

1516 components: Optional[bool] = None, 

1517 check: bool = True, 

1518 **kwargs: Any) -> queries.DataCoordinateQueryResults: 

1519 """Query for data IDs matching user-provided criteria. 

1520 

1521 Parameters 

1522 ---------- 

1523 dimensions : `Dimension` or `str`, or iterable thereof 

1524 The dimensions of the data IDs to yield, as either `Dimension` 

1525 instances or `str`. Will be automatically expanded to a complete 

1526 `DimensionGraph`. 

1527 dataId : `dict` or `DataCoordinate`, optional 

1528 A data ID whose key-value pairs are used as equality constraints 

1529 in the query. 

1530 datasets : `Any`, optional 

1531 An expression that fully or partially identifies dataset types 

1532 that should constrain the yielded data IDs. For example, including 

1533 "raw" here would constrain the yielded ``instrument``, 

1534 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1535 those for which at least one "raw" dataset exists in 

1536 ``collections``. Allowed types include `DatasetType`, `str`, 

1537 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1538 expressions, ``...`` is not permitted - it doesn't make sense to 

1539 constrain data IDs on the existence of *all* datasets. 

1540 See :ref:`daf_butler_dataset_type_expressions` for more 

1541 information. 

1542 collections : `Any`, optional 

1543 An expression that fully or partially identifies the collections 

1544 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1545 thereof. `...` can be used to return all collections. Must be 

1546 provided if ``datasets`` is, and is ignored if it is not. See 

1547 :ref:`daf_butler_collection_expressions` for more information. 

1548 where : `str`, optional 

1549 A string expression similar to a SQL WHERE clause. May involve 

1550 any column of a dimension table or (as a shortcut for the primary 

1551 key column of a dimension table) a dimension name. See 

1552 :ref:`daf_butler_dimension_expressions` for more information. 

1553 components : `bool`, optional 

1554 If `True`, apply all dataset expression patterns to component 

1555 dataset type names as well. If `False`, never apply patterns to 

1556 components. If `None` (default), apply patterns to components only 

1557 if their parent datasets were not matched by the expression. 

1558 Fully-specified component datasets (`str` or `DatasetType` 

1559 instances) are always included. 

1560 check : `bool`, optional 

1561 If `True` (default) check the query for consistency before 

1562 executing it. This may reject some valid queries that resemble 

1563 common mistakes (e.g. queries for visits without specifying an 

1564 instrument). 

1565 **kwargs 

1566 Additional keyword arguments are forwarded to 

1567 `DataCoordinate.standardize` when processing the ``dataId`` 

1568 argument (and may be used to provide a constraining data ID even 

1569 when the ``dataId`` argument is `None`). 

1570 

1571 Returns 

1572 ------- 

1573 dataIds : `DataCoordinateQueryResults` 

1574 Data IDs matching the given query parameters. These are guaranteed 

1575 to identify all dimensions (`DataCoordinate.hasFull` returns 

1576 `True`), but will not contain `DimensionRecord` objects 

1577 (`DataCoordinate.hasRecords` returns `False`). Call 

1578 `DataCoordinateQueryResults.expanded` on the returned object to 

1579 fetch those (and consider using 

1580 `DataCoordinateQueryResults.materialize` on the returned object 

1581 first if the expected number of rows is very large). See 

1582 documentation for those methods for additional information. 
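
Examples
--------
A minimal usage sketch, assuming a `Registry` instance ``registry`` and a
hypothetical repository with "raw" datasets in an ``HSC/raw/all``
collection; the collection name and ``where`` expression are illustrative
only::

    dataIds = registry.queryDataIds(
        ["exposure", "detector"],
        datasets="raw",
        collections="HSC/raw/all",
        where="instrument = 'HSC' AND detector = 50",
    )
    for dataId in dataIds:
        print(dataId["exposure"], dataId["detector"])

    # Attach dimension records when they are needed (see the Returns
    # section above):
    expandedIds = dataIds.expanded()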

1583 """ 

1584 dimensions = iterable(dimensions) 

1585 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1586 standardizedDatasetTypes = set() 

1587 requestedDimensions = self.dimensions.extract(dimensions) 

1588 queryDimensionNames = set(requestedDimensions.names) 

1589 if datasets is not None: 

1590 if collections is None: 

1591 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1592 for datasetType in self.queryDatasetTypes(datasets, components=components): 

1593 queryDimensionNames.update(datasetType.dimensions.names) 

1594 # If any matched dataset type is a component, just operate on 

1595 # its parent instead, because Registry doesn't know anything 

1596 # about what components exist, and here (unlike queryDatasets) 

1597 # we don't care about returning them. 

1598 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

1599 if componentName is not None: 

1600 datasetType = self.getDatasetType(parentDatasetTypeName) 

1601 standardizedDatasetTypes.add(datasetType) 

1602 # Preprocess collections expression in case the original included 

1603 # single-pass iterators (we'll want to use it multiple times 

1604 # below). 

1605 collections = CollectionQuery.fromExpression(collections) 

1606 

1607 summary = queries.QuerySummary( 

1608 requested=DimensionGraph(self.dimensions, names=queryDimensionNames), 

1609 dataId=standardizedDataId, 

1610 expression=where, 

1611 check=check, 

1612 ) 

1613 builder = self.makeQueryBuilder(summary) 

1614 for datasetType in standardizedDatasetTypes: 

1615 builder.joinDataset(datasetType, collections, isResult=False) 

1616 query = builder.finish() 

1617 return queries.DataCoordinateQueryResults(self._db, query) 

1618 

1619 def queryDimensionRecords(self, element: Union[DimensionElement, str], *, 

1620 dataId: Optional[DataId] = None, 

1621 datasets: Any = None, 

1622 collections: Any = None, 

1623 where: Optional[str] = None, 

1624 components: Optional[bool] = None, 

1625 check: bool = True, 

1626 **kwargs: Any) -> Iterator[DimensionRecord]: 

1627 """Query for dimension information matching user-provided criteria. 

1628 

1629 Parameters 

1630 ---------- 

1631 element : `DimensionElement` or `str` 

1632 The dimension element to obtain records for, or the name of one. 

1633 dataId : `dict` or `DataCoordinate`, optional 

1634 A data ID whose key-value pairs are used as equality constraints 

1635 in the query. 

1636 datasets : `Any`, optional 

1637 An expression that fully or partially identifies dataset types 

1638 that should constrain the yielded records. See `queryDataIds` and 

1639 :ref:`daf_butler_dataset_type_expressions` for more information. 

1640 collections : `Any`, optional 

1641 An expression that fully or partially identifies the collections 

1642 to search for datasets. See `queryDataIds` and 

1643 :ref:`daf_butler_collection_expressions` for more information. 

1644 where : `str`, optional 

1645 A string expression similar to a SQL WHERE clause. See 

1646 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more 

1647 information. 

1648 components : `bool`, optional 

1649 Whether to apply dataset expressions to components as well. 

1650 See `queryDataIds` for more information. 

1651 check : `bool`, optional 

1652 If `True` (default) check the query for consistency before 

1653 executing it. This may reject some valid queries that resemble 

1654 common mistakes (e.g. queries for visits without specifying an 

1655 instrument). 

1656 **kwargs 

1657 Additional keyword arguments are forwarded to 

1658 `DataCoordinate.standardize` when processing the ``dataId`` 

1659 argument (and may be used to provide a constraining data ID even 

1660 when the ``dataId`` argument is `None`). 

1661 

1662 Returns 

1663 ------- 

1664 dimensionRecords : `~collections.abc.Iterator` [ `DimensionRecord` ] 

1665 Dimension records matching the given query parameters. 
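
Examples
--------
A minimal usage sketch, assuming a `Registry` instance ``registry`` for a
repository in which a hypothetical "HSC" instrument has been registered;
the fields printed below depend on the repository's dimension
configuration::

    for record in registry.queryDimensionRecords(
        "detector",
        dataId={"instrument": "HSC"},
    ):
        print(record.id, record.full_name)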

1666 """ 

1667 if not isinstance(element, DimensionElement): 

1668 element = self.dimensions[element] 

1669 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections, 

1670 where=where, components=components, check=check, **kwargs) 

1671 return iter(self._dimensions[element].fetch(dataIds)) 

1672 

1673 def queryDatasetAssociations( 

1674 self, 

1675 datasetType: Union[str, DatasetType], 

1676 collections: Any = ..., 

1677 *, 

1678 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1679 flattenChains: bool = False, 

1680 ) -> Iterator[DatasetAssociation]: 

1681 """Iterate over dataset-collection combinations where the dataset is in 

1682 the collection. 

1683 

1684 This method is a temporary placeholder for better support for 

1685 association results in `queryDatasets`. It will probably be 

1686 removed in the future, and should be avoided in production code 

1687 whenever possible. 

1688 

1689 Parameters 

1690 ---------- 

1691 datasetType : `DatasetType` or `str` 

1692 A dataset type object or the name of one. 

1693 collections : `Any`, optional 

1694 An expression that fully or partially identifies the collections 

1695 to search for datasets. See `queryCollections` and 

1696 :ref:`daf_butler_collection_expressions` for more information. 

1697 collectionTypes : `~collections.abc.Iterable` [ `CollectionType` ], optional 

1698 If provided, only yield associations from collections of these 

1699 types. 

1700 flattenChains : `bool`, optional 

1701 If `True` (`False` is default), search in the children of 

1702 `~CollectionType.CHAINED` collections. If `False`, ``CHAINED`` 

1703 collections are ignored. 

1704 

1705 Yields 

1706 ------ 

1707 association : `DatasetAssociation` 

1708 Object representing the relationship between a single dataset and 

1709 a single collection. 
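
Examples
--------
A minimal usage sketch, assuming a `Registry` instance ``registry`` and a
hypothetical ``calibs`` `~CollectionType.CALIBRATION` collection
containing "bias" datasets; the dataset type and collection names are
illustrative only::

    from lsst.daf.butler import CollectionType

    for assoc in registry.queryDatasetAssociations(
        "bias",
        collections="calibs",
        collectionTypes={CollectionType.CALIBRATION},
    ):
        print(assoc.collection, assoc.ref.dataId, assoc.timespan)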

1710 """ 

1711 collections = CollectionQuery.fromExpression(collections) 

1712 tsRepr = self._db.getTimespanRepresentation() 

1713 if isinstance(datasetType, str): 

1714 storage = self._datasets[datasetType] 

1715 else: 

1716 storage = self._datasets[datasetType.name] 

1717 for collectionRecord in collections.iter(self._collections, 

1718 collectionTypes=frozenset(collectionTypes), 

1719 flattenChains=flattenChains): 

1720 query = storage.select(collectionRecord) 

1721 if query is None: 

1722 continue 

1723 for row in self._db.query(query.combine()): 

1724 dataId = DataCoordinate.fromRequiredValues( 

1725 storage.datasetType.dimensions, 

1726 tuple(row[name] for name in storage.datasetType.dimensions.required.names) 

1727 ) 

1728 runRecord = self._collections[row[self._collections.getRunForeignKeyName()]] 

1729 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, 

1730 conform=False) 

1731 if collectionRecord.type is CollectionType.CALIBRATION: 

1732 timespan = tsRepr.extract(row) 

1733 else: 

1734 timespan = None 

1735 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1736 

1737 storageClasses: StorageClassFactory 

1738 """All storage classes known to the registry (`StorageClassFactory`). 

1739 """