
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "Registry", 

26) 

27 

28from collections import defaultdict 

29import contextlib 

30import logging 

31from typing import ( 

32 Any, 

33 Dict, 

34 Iterable, 

35 Iterator, 

36 List, 

37 Mapping, 

38 Optional, 

39 Set, 

40 Type, 

41 TYPE_CHECKING, 

42 Union, 

43) 

44 

45import sqlalchemy 

46 

47from ..core import ( 

48 Config, 

49 DataCoordinate, 

50 DataCoordinateIterable, 

51 DataId, 

52 DatasetAssociation, 

53 DatasetRef, 

54 DatasetType, 

55 ddl, 

56 Dimension, 

57 DimensionConfig, 

58 DimensionElement, 

59 DimensionGraph, 

60 DimensionRecord, 

61 DimensionUniverse, 

62 NamedKeyMapping, 

63 NameLookupMapping, 

64 StorageClassFactory, 

65 Timespan, 

66) 

67from . import queries 

68from ..core.utils import doImport, iterable, transactional 

69from ._config import RegistryConfig 

70from ._collectionType import CollectionType 

71from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

72from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis 

73from .interfaces import ChainedCollectionRecord, RunRecord 

74from .versions import ButlerVersionsManager, DigestMismatchError 

75 

76if TYPE_CHECKING: 

77 from ..butlerConfig import ButlerConfig 

78 from .interfaces import ( 

79 ButlerAttributeManager, 

80 CollectionManager, 

81 Database, 

82 OpaqueTableStorageManager, 

83 DimensionRecordStorageManager, 

84 DatasetRecordStorageManager, 

85 DatastoreRegistryBridgeManager, 

86 ) 

87 

88 

89_LOG = logging.getLogger(__name__) 

90 

91# key for dimensions configuration in attributes table 

92_DIMENSIONS_ATTR = "config:dimensions.json" 

93 

94 

95class Registry: 

96 """Registry interface. 

97 

98 Parameters 

99 ---------- 

100 database : `Database` 

101 Database instance used to store Registry data. 

102 attributes : `type` 

103 Manager class implementing `ButlerAttributeManager`. 

104 opaque : `type` 

105 Manager class implementing `OpaqueTableStorageManager`. 

106 dimensions : `type` 

107 Manager class implementing `DimensionRecordStorageManager`. 

108 collections : `type` 

109 Manager class implementing `CollectionManager`. 

110 datasets : `type` 

111 Manager class implementing `DatasetRecordStorageManager`. 

112 datastoreBridges : `type` 

113 Manager class implementing `DatastoreRegistryBridgeManager`. 

114 dimensionConfig : `DimensionConfig`, optional 

115 Dimension universe configuration, only used when ``create`` is True. 

116 writeable : `bool`, optional 

117 If True then Registry will support write operations. 

118 create : `bool`, optional 

119 If True then the database schema will be initialized; the database 

120 must be empty before instantiating Registry. 

121 """ 

122 

123 defaultConfigFile: Optional[str] = None 

124 """Path to configuration defaults. Accessed within the ``configs`` resource 

125 or relative to a search path. Can be None if no defaults specified. 

126 """ 

127 

128 @classmethod 

129 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None, 

130 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

131 butlerRoot: Optional[str] = None) -> Registry: 

132 """Create registry database and return `Registry` instance. 

133 

134 This method initializes the database contents; the database must be 

135 empty prior to calling this method. 

136 

137 Parameters 

138 ---------- 

139 config : `RegistryConfig` or `str`, optional 

140 Registry configuration; if missing, the default configuration will 

141 be loaded from registry.yaml. 

142 dimensionConfig : `DimensionConfig` or `str`, optional 

143 Dimensions configuration; if missing, the default configuration 

144 will be loaded from dimensions.yaml. 

145 butlerRoot : `str`, optional 

146 Path to the repository root this `Registry` will manage. 

147 

148 Returns 

149 ------- 

150 registry : `Registry` 

151 A new `Registry` instance. 

152 """ 

153 if isinstance(config, str): 

154 config = RegistryConfig(config) 

155 elif config is None: 

156 config = RegistryConfig() 

157 elif not isinstance(config, RegistryConfig): 

158 raise TypeError(f"Incompatible Registry configuration type: {type(config)}") 

159 config.replaceRoot(butlerRoot) 

160 

161 if isinstance(dimensionConfig, str): 

162 dimensionConfig = DimensionConfig(dimensionConfig) 

163 elif dimensionConfig is None: 

164 dimensionConfig = DimensionConfig() 

165 elif not isinstance(dimensionConfig, DimensionConfig): 

166 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

167 

168 DatabaseClass = config.getDatabaseClass() 

169 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

170 namespace=config.get("namespace")) 

171 attributes = doImport(config["managers", "attributes"]) 

172 opaque = doImport(config["managers", "opaque"]) 

173 dimensions = doImport(config["managers", "dimensions"]) 

174 collections = doImport(config["managers", "collections"]) 

175 datasets = doImport(config["managers", "datasets"]) 

176 datastoreBridges = doImport(config["managers", "datastores"]) 

177 

178 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque, 

179 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges, 

180 dimensionConfig=dimensionConfig, create=True) 

181 
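# Illustrative sketch only (not part of the original source): one way to create
# a new, empty registry with `createFromConfig`. The "db" configuration key and
# the in-memory SQLite connection string are assumptions, not guarantees of this
# module.
#
#     config = RegistryConfig()
#     config["db"] = "sqlite:///:memory:"
#     registry = Registry.createFromConfig(config)  # initializes an empty schema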

182 @classmethod 

183 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], 

184 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry: 

185 """Create `Registry` subclass instance from `config`. 

186 

187 Registry database must be initialized prior to calling this method. 

188 

189 Parameters 

190 ---------- 

191 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

192 Registry configuration. 

193 butlerRoot : `str`, optional 

194 Path to the repository root this `Registry` will manage. 

195 writeable : `bool`, optional 

196 If `True` (default) create a read-write connection to the database. 

197 

198 Returns 

199 ------- 

200 registry : `Registry` (subclass) 

201 A new `Registry` subclass instance. 

202 """ 

203 if not isinstance(config, RegistryConfig): 

204 if isinstance(config, str) or isinstance(config, Config): 

205 config = RegistryConfig(config) 

206 else: 

207 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

208 config.replaceRoot(butlerRoot) 

209 DatabaseClass = config.getDatabaseClass() 

210 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

211 namespace=config.get("namespace"), writeable=writeable) 

212 attributes = doImport(config["managers", "attributes"]) 

213 opaque = doImport(config["managers", "opaque"]) 

214 dimensions = doImport(config["managers", "dimensions"]) 

215 collections = doImport(config["managers", "collections"]) 

216 datasets = doImport(config["managers", "datasets"]) 

217 datastoreBridges = doImport(config["managers", "datastores"]) 

218 

219 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque, 

220 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges, 

221 dimensionConfig=None, writeable=writeable) 

222 
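# Illustrative sketch only (not part of the original source): opening an
# existing repository with `fromConfig` for read-only access. The path
# "repo/butler.yaml" is a hypothetical example.
#
#     registry = Registry.fromConfig("repo/butler.yaml", writeable=False)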

223 def __init__(self, database: Database, *, 

224 attributes: Type[ButlerAttributeManager], 

225 opaque: Type[OpaqueTableStorageManager], 

226 dimensions: Type[DimensionRecordStorageManager], 

227 collections: Type[CollectionManager], 

228 datasets: Type[DatasetRecordStorageManager], 

229 datastoreBridges: Type[DatastoreRegistryBridgeManager], 

230 dimensionConfig: Optional[DimensionConfig] = None, 

231 writeable: bool = True, 

232 create: bool = False): 

233 self._db = database 

234 self.storageClasses = StorageClassFactory() 

235 

236 # With existing registry we have to read dimensions config from 

237 # database before we initialize all other managers. 

238 if dimensionConfig is None: 

239 assert not create, "missing DimensionConfig when create=True" 

240 with self._db.declareStaticTables(create=False) as context: 

241 self._attributes = attributes.initialize(self._db, context) 

242 

243 versions = ButlerVersionsManager( 

244 self._attributes, 

245 dict(attributes=self._attributes) 

246 ) 

247 # verify that configured versions are compatible with schema 

248 versions.checkManagersConfig() 

249 versions.checkManagersVersions(writeable) 

250 

251 # get serialized as a string from database 

252 dimensionsString = self._attributes.get(_DIMENSIONS_ATTR) 

253 if dimensionsString is not None: 

254 dimensionConfig = DimensionConfig(Config.fromString(dimensionsString, format="json")) 

255 else: 

256 raise LookupError(f"Registry attribute {_DIMENSIONS_ATTR} is missing from database") 

257 

258 # make universe 

259 universe = DimensionUniverse(dimensionConfig) 

260 

261 with self._db.declareStaticTables(create=create) as context: 

262 self._attributes = attributes.initialize(self._db, context) 

263 self._dimensions = dimensions.initialize(self._db, context, universe=universe) 

264 self._collections = collections.initialize(self._db, context, dimensions=self._dimensions) 

265 self._datasets = datasets.initialize(self._db, context, 

266 collections=self._collections, 

267 dimensions=self._dimensions) 

268 self._opaque = opaque.initialize(self._db, context) 

269 self._datastoreBridges = datastoreBridges.initialize(self._db, context, 

270 opaque=self._opaque, 

271 datasets=datasets, 

272 universe=self._dimensions.universe) 

273 versions = ButlerVersionsManager( 

274 self._attributes, 

275 dict( 

276 attributes=self._attributes, 

277 opaque=self._opaque, 

278 dimensions=self._dimensions, 

279 collections=self._collections, 

280 datasets=self._datasets, 

281 datastores=self._datastoreBridges, 

282 ) 

283 ) 

284 # store managers and their versions in attributes table 

285 context.addInitializer(lambda db: versions.storeManagersConfig()) 

286 context.addInitializer(lambda db: versions.storeManagersVersions()) 

287 # dump universe config as json into attributes (faster than YAML) 

288 json = dimensionConfig.dump(format="json") 

289 if json is not None: 

290 # Convert Optional[str] to str for mypy 

291 json_str = json 

292 context.addInitializer( 

293 lambda db: self._attributes.set(_DIMENSIONS_ATTR, json_str) 

294 ) 

295 else: 

296 raise RuntimeError("Unexpectedly failed to serialize DimensionConfig to JSON") 

297 

298 if not create: 

299 # verify that configured versions are compatible with schema 

300 versions.checkManagersConfig() 

301 versions.checkManagersVersions(writeable) 

302 try: 

303 versions.checkManagersDigests() 

304 except DigestMismatchError as exc: 

305 # A digest mismatch is potentially a serious error, but during 

306 # development it can be benign; treat it as a warning for 

307 # now. 

308 _LOG.warning(f"Registry schema digest mismatch: {exc}") 

309 

310 self._dimensions.refresh() 

311 self._collections.refresh() 

312 self._datasets.refresh() 

313 

314 def __str__(self) -> str: 

315 return str(self._db) 

316 

317 def __repr__(self) -> str: 

318 return f"Registry({self._db!r}, {self.dimensions!r})" 

319 

320 def isWriteable(self) -> bool: 

321 """Return `True` if this registry allows write operations, and `False` 

322 otherwise. 

323 """ 

324 return self._db.isWriteable() 

325 

326 @property 

327 def dimensions(self) -> DimensionUniverse: 

328 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

329 """ 

330 return self._dimensions.universe 

331 

332 @contextlib.contextmanager 

333 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

334 """Return a context manager that represents a transaction. 

335 """ 

336 try: 

337 with self._db.transaction(savepoint=savepoint): 

338 yield 

339 except BaseException: 

340 # TODO: this clears the caches sometimes when we wouldn't actually 

341 # need to. Can we avoid that? 

342 self._dimensions.clearCaches() 

343 raise 

344 

345 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

346 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

347 other data repository client. 

348 

349 Opaque table records can be added via `insertOpaqueData`, retrieved via 

350 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

351 

352 Parameters 

353 ---------- 

354 tableName : `str` 

355 Logical name of the opaque table. This may differ from the 

356 actual name used in the database by a prefix and/or suffix. 

357 spec : `ddl.TableSpec` 

358 Specification for the table to be added. 

359 """ 

360 self._opaque.register(tableName, spec) 

361 

362 @transactional 

363 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

364 """Insert records into an opaque table. 

365 

366 Parameters 

367 ---------- 

368 tableName : `str` 

369 Logical name of the opaque table. Must match the name used in a 

370 previous call to `registerOpaqueTable`. 

371 data 

372 Each additional positional argument is a dictionary that represents 

373 a single row to be added. 

374 """ 

375 self._opaque[tableName].insert(*data) 

376 

377 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

378 """Retrieve records from an opaque table. 

379 

380 Parameters 

381 ---------- 

382 tableName : `str` 

383 Logical name of the opaque table. Must match the name used in a 

384 previous call to `registerOpaqueTable`. 

385 where 

386 Additional keyword arguments are interpreted as equality 

387 constraints that restrict the returned rows (combined with AND); 

388 keyword arguments are column names and values are the values they 

389 must have. 

390 

391 Yields 

392 ------ 

393 row : `dict` 

394 A dictionary representing a single result row. 

395 """ 

396 yield from self._opaque[tableName].fetch(**where) 

397 

398 @transactional 

399 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

400 """Remove records from an opaque table. 

401 

402 Parameters 

403 ---------- 

404 tableName : `str` 

405 Logical name of the opaque table. Must match the name used in a 

406 previous call to `registerOpaqueTable`. 

407 where 

408 Additional keyword arguments are interpreted as equality 

409 constraints that restrict the deleted rows (combined with AND); 

410 keyword arguments are column names and values are the values they 

411 must have. 

412 """ 

413 self._opaque[tableName].delete(**where) 

414 

415 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED, 

416 doc: Optional[str] = None) -> None: 

417 """Add a new collection if one with the given name does not exist. 

418 

419 Parameters 

420 ---------- 

421 name : `str` 

422 The name of the collection to create. 

423 type : `CollectionType` 

424 Enum value indicating the type of collection to create. 

425 doc : `str`, optional 

426 Documentation string for the collection. 

427 

428 Notes 

429 ----- 

430 This method cannot be called within transactions, as it needs to be 

431 able to perform its own transaction to be concurrent. 

432 """ 

433 self._collections.register(name, type, doc=doc) 

434 

435 def getCollectionType(self, name: str) -> CollectionType: 

436 """Return an enumeration value indicating the type of the given 

437 collection. 

438 

439 Parameters 

440 ---------- 

441 name : `str` 

442 The name of the collection. 

443 

444 Returns 

445 ------- 

446 type : `CollectionType` 

447 Enum value indicating the type of this collection. 

448 

449 Raises 

450 ------ 

451 MissingCollectionError 

452 Raised if no collection with the given name exists. 

453 """ 

454 return self._collections.find(name).type 

455 

456 def registerRun(self, name: str, doc: Optional[str] = None) -> None: 

457 """Add a new run if one with the given name does not exist. 

458 

459 Parameters 

460 ---------- 

461 name : `str` 

462 The name of the run to create. 

463 doc : `str`, optional 

464 Documentation string for the collection. 

465 

466 Notes 

467 ----- 

468 This method cannot be called within transactions, as it needs to be 

469 able to perform its own transaction to be concurrent. 

470 """ 

471 self._collections.register(name, CollectionType.RUN, doc=doc) 

472 
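# Illustrative sketch only (not part of the original source): registering a
# RUN collection and a TAGGED collection. The collection names are hypothetical.
#
#     registry.registerRun("HSC/runs/my-processing")
#     registry.registerCollection("my-tagged-selection", CollectionType.TAGGED)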

473 @transactional 

474 def removeCollection(self, name: str) -> None: 

475 """Completely remove the given collection. 

476 

477 Parameters 

478 ---------- 

479 name : `str` 

480 The name of the collection to remove. 

481 

482 Raises 

483 ------ 

484 MissingCollectionError 

485 Raised if no collection with the given name exists. 

486 

487 Notes 

488 ----- 

489 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

490 in it are also fully removed. This requires that those datasets be 

491 removed (or at least trashed) from any datastores that hold them first. 

492 

493 A collection may not be deleted as long as it is referenced by a 

494 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

495 be deleted or redefined first. 

496 """ 

497 self._collections.remove(name) 

498 

499 def getCollectionChain(self, parent: str) -> CollectionSearch: 

500 """Return the child collections in a `~CollectionType.CHAINED` 

501 collection. 

502 

503 Parameters 

504 ---------- 

505 parent : `str` 

506 Name of the chained collection. Must have already been added via 

507 a call to `Registry.registerCollection`. 

508 

509 Returns 

510 ------- 

511 children : `CollectionSearch` 

512 An object that defines the search path of the collection. 

513 See :ref:`daf_butler_collection_expressions` for more information. 

514 

515 Raises 

516 ------ 

517 MissingCollectionError 

518 Raised if ``parent`` does not exist in the `Registry`. 

519 TypeError 

520 Raised if ``parent`` does not correspond to a 

521 `~CollectionType.CHAINED` collection. 

522 """ 

523 record = self._collections.find(parent) 

524 if record.type is not CollectionType.CHAINED: 

525 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

526 assert isinstance(record, ChainedCollectionRecord) 

527 return record.children 

528 

529 @transactional 

530 def setCollectionChain(self, parent: str, children: Any) -> None: 

531 """Define or redefine a `~CollectionType.CHAINED` collection. 

532 

533 Parameters 

534 ---------- 

535 parent : `str` 

536 Name of the chained collection. Must have already been added via 

537 a call to `Registry.registerCollection`. 

538 children : `Any` 

539 An expression defining an ordered search of child collections, 

540 generally an iterable of `str`; see 

541 :ref:`daf_butler_collection_expressions` for more information. 

542 

543 Raises 

544 ------ 

545 MissingCollectionError 

546 Raised when any of the given collections do not exist in the 

547 `Registry`. 

548 TypeError 

549 Raised if ``parent`` does not correspond to a 

550 `~CollectionType.CHAINED` collection. 

551 ValueError 

552 Raised if the given collections contains a cycle. 

553 """ 

554 record = self._collections.find(parent) 

555 if record.type is not CollectionType.CHAINED: 

556 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

557 assert isinstance(record, ChainedCollectionRecord) 

558 children = CollectionSearch.fromExpression(children) 

559 if children != record.children: 

560 record.update(self._collections, children) 

561 

562 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

563 """Retrieve the documentation string for a collection. 

564 

565 Parameters 

566 ---------- 

567 collection : `str` 

568 Name of the collection. 

569 

570 Returns 

571 ------- 

572 docs : `str` or `None` 

573 Docstring for the collection with the given name. 

574 """ 

575 return self._collections.getDocumentation(self._collections.find(collection).key) 

576 

577 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

578 """Set the documentation string for a collection. 

579 

580 Parameters 

581 ---------- 

582 collection : `str` 

583 Name of the collection. 

584 doc : `str` or `None` 

585 Docstring for the collection with the given name; will replace any 

586 existing docstring. Passing `None` will remove any existing 

587 docstring. 

588 """ 

589 self._collections.setDocumentation(self._collections.find(collection).key, doc) 

590 

591 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

592 """ 

593 Add a new `DatasetType` to the Registry. 

594 

595 It is not an error to register the same `DatasetType` twice. 

596 

597 Parameters 

598 ---------- 

599 datasetType : `DatasetType` 

600 The `DatasetType` to be added. 

601 

602 Returns 

603 ------- 

604 inserted : `bool` 

605 `True` if ``datasetType`` was inserted, `False` if an identical 

606 existing `DatasetType` was found. Note that in either case the 

607 DatasetType is guaranteed to be defined in the Registry 

608 consistently with the given definition. 

609 

610 Raises 

611 ------ 

612 ValueError 

613 Raised if the dimensions or storage class are invalid. 

614 ConflictingDefinitionError 

615 Raised if this DatasetType is already registered with a different 

616 definition. 

617 

618 Notes 

619 ----- 

620 This method cannot be called within transactions, as it needs to be 

621 able to perform its own transaction to be concurrent. 

622 """ 

623 _, inserted = self._datasets.register(datasetType) 

624 return inserted 

625 

626 def removeDatasetType(self, name: str) -> None: 

627 """Remove the named `DatasetType` from the registry. 

628 

629 .. warning:: 

630 

631 Registry caches the dataset type definitions. This means that 

632 deleting the dataset type definition may result in unexpected 

633 behavior from other butler processes that are active that have 

634 not seen the deletion. 

635 

636 Parameters 

637 ---------- 

638 name : `str` 

639 Name of the type to be removed. 

640 

641 Raises 

642 ------ 

643 lsst.daf.butler.registry.OrphanedRecordError 

644 Raised if an attempt is made to remove the dataset type definition 

645 when there are already datasets associated with it. 

646 

647 Notes 

648 ----- 

649 If the dataset type is not registered the method will return without 

650 action. 

651 """ 

652 self._datasets.remove(name) 

653 

654 def getDatasetType(self, name: str) -> DatasetType: 

655 """Get the `DatasetType`. 

656 

657 Parameters 

658 ---------- 

659 name : `str` 

660 Name of the type. 

661 

662 Returns 

663 ------- 

664 type : `DatasetType` 

665 The `DatasetType` associated with the given name. 

666 

667 Raises 

668 ------ 

669 KeyError 

670 Raised if the requested named DatasetType could not be found in registry. 

671 """ 

672 return self._datasets[name].datasetType 

673 

674 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

675 collections: Any, timespan: Optional[Timespan] = None, 

676 **kwargs: Any) -> Optional[DatasetRef]: 

677 """Find a dataset given its `DatasetType` and data ID. 

678 

679 This can be used to obtain a `DatasetRef` that permits the dataset to 

680 be read from a `Datastore`. If the dataset is a component and can not 

681 be found using the provided dataset type, a dataset ref for the parent 

682 will be returned instead but with the correct dataset type. 

683 

684 Parameters 

685 ---------- 

686 datasetType : `DatasetType` or `str` 

687 A `DatasetType` or the name of one. 

688 dataId : `dict` or `DataCoordinate`, optional 

689 A `dict`-like object containing the `Dimension` links that identify 

690 the dataset within a collection. 

691 collections 

692 An expression that fully or partially identifies the collections to 

693 search for the dataset; see 

694 :ref:`daf_butler_collection_expressions` for more information. 

695 timespan : `Timespan`, optional 

696 A timespan that the validity range of the dataset must overlap. 

697 If not provided, any `~CollectionType.CALIBRATION` collections 

698 matched by the ``collections`` argument will not be searched. 

699 **kwargs 

700 Additional keyword arguments passed to 

701 `DataCoordinate.standardize` to convert ``dataId`` to a true 

702 `DataCoordinate` or augment an existing one. 

703 

704 Returns 

705 ------- 

706 ref : `DatasetRef` 

707 A reference to the dataset, or `None` if no matching Dataset 

708 was found. 

709 

710 Raises 

711 ------ 

712 LookupError 

713 Raised if one or more data ID keys are missing. 

714 KeyError 

715 Raised if the dataset type does not exist. 

716 MissingCollectionError 

717 Raised if any of ``collections`` does not exist in the registry. 

718 

719 Notes 

720 ----- 

721 This method simply returns `None` and does not raise an exception even 

722 when the set of collections searched is intrinsically incompatible with 

723 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but 

724 only `~CollectionType.CALIBRATION` collections are being searched. 

725 This may make it harder to debug some lookup failures, but the behavior 

726 is intentional; we consider it more important that failed searches are 

727 reported consistently, regardless of the reason, and that adding 

728 additional collections that do not contain a match to the search path 

729 never changes the behavior. 

730 """ 

731 if isinstance(datasetType, DatasetType): 

732 storage = self._datasets[datasetType.name] 

733 else: 

734 storage = self._datasets[datasetType] 

735 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

736 universe=self.dimensions, **kwargs) 

737 collections = CollectionSearch.fromExpression(collections) 

738 for collectionRecord in collections.iter(self._collections): 

739 if (collectionRecord.type is CollectionType.CALIBRATION 

740 and (not storage.datasetType.isCalibration() or timespan is None)): 

741 continue 

742 result = storage.find(collectionRecord, dataId, timespan=timespan) 

743 if result is not None: 

744 return result 

745 

746 return None 

747 
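# Illustrative sketch only (not part of the original source): looking up a
# single dataset with `findDataset`. The dataset type name "raw", the dimension
# values, and the collection name are assumptions.
#
#     ref = registry.findDataset(
#         "raw",
#         instrument="HSC", detector=50, exposure=12345,
#         collections="HSC/raw/all",
#     )
#     if ref is not None:
#         print(ref.id, ref.dataId)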

748 @transactional 

749 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

750 run: str) -> List[DatasetRef]: 

751 """Insert one or more datasets into the `Registry` 

752 

753 This always adds new datasets; to associate existing datasets with 

754 a new collection, use ``associate``. 

755 

756 Parameters 

757 ---------- 

758 datasetType : `DatasetType` or `str` 

759 A `DatasetType` or the name of one. 

760 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

761 Dimension-based identifiers for the new datasets. 

762 run : `str` 

763 The name of the run that produced the datasets. 

764 

765 Returns 

766 ------- 

767 refs : `list` of `DatasetRef` 

768 Resolved `DatasetRef` instances for all given data IDs (in the same 

769 order). 

770 

771 Raises 

772 ------ 

773 ConflictingDefinitionError 

774 If a dataset with the same dataset type and data ID as one of those 

775 given already exists in ``run``. 

776 MissingCollectionError 

777 Raised if ``run`` does not exist in the registry. 

778 """ 

779 if isinstance(datasetType, DatasetType): 

780 storage = self._datasets.find(datasetType.name) 

781 if storage is None: 

782 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

783 else: 

784 storage = self._datasets.find(datasetType) 

785 if storage is None: 

786 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

787 runRecord = self._collections.find(run) 

788 if runRecord.type is not CollectionType.RUN: 

789 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

790 assert isinstance(runRecord, RunRecord) 

791 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

792 for dataId in dataIds] 

793 try: 

794 refs = list(storage.insert(runRecord, expandedDataIds)) 

795 except sqlalchemy.exc.IntegrityError as err: 

796 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

797 f"one or more datasets of type {storage.datasetType} into " 

798 f"collection '{run}'. " 

799 f"This probably means a dataset with the same data ID " 

800 f"and dataset type already exists, but it may also mean a " 

801 f"dimension row is missing.") from err 

802 return refs 

803 
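# Illustrative sketch only (not part of the original source): registering a
# dataset type and inserting one dataset into a RUN collection with
# `insertDatasets`. The dataset type name, dimensions, storage class, run name,
# and data ID values are assumptions; the referenced dimension records must
# already exist in the registry.
#
#     datasetType = DatasetType("my_catalog", dimensions=["instrument", "visit"],
#                               storageClass="DataFrame",
#                               universe=registry.dimensions)
#     registry.registerDatasetType(datasetType)
#     registry.registerRun("my_run")
#     (ref,) = registry.insertDatasets(datasetType,
#                                      [{"instrument": "HSC", "visit": 1}],
#                                      run="my_run")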

804 def getDataset(self, id: int) -> Optional[DatasetRef]: 

805 """Retrieve a Dataset entry. 

806 

807 Parameters 

808 ---------- 

809 id : `int` 

810 The unique identifier for the dataset. 

811 

812 Returns 

813 ------- 

814 ref : `DatasetRef` or `None` 

815 A ref to the Dataset, or `None` if no matching Dataset 

816 was found. 

817 """ 

818 ref = self._datasets.getDatasetRef(id) 

819 if ref is None: 

820 return None 

821 return ref 

822 

823 @transactional 

824 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

825 """Remove datasets from the Registry. 

826 

827 The datasets will be removed unconditionally from all collections, and 

828 any `Quantum` that consumed this dataset will instead be marked as 

829 having a NULL input. `Datastore` records will *not* be deleted; the 

830 caller is responsible for ensuring that the dataset has already been 

831 removed from all Datastores. 

832 

833 Parameters 

834 ---------- 

835 refs : `Iterable` of `DatasetRef` 

836 References to the datasets to be removed. Must include a valid 

837 ``id`` attribute, and should be considered invalidated upon return. 

838 

839 Raises 

840 ------ 

841 AmbiguousDatasetError 

842 Raised if any ``ref.id`` is `None`. 

843 OrphanedRecordError 

844 Raised if any dataset is still present in any `Datastore`. 

845 """ 

846 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

847 storage = self._datasets.find(datasetType.name) 

848 assert storage is not None 

849 try: 

850 storage.delete(refsForType) 

851 except sqlalchemy.exc.IntegrityError as err: 

852 raise OrphanedRecordError("One or more datasets is still " 

853 "present in one or more Datastores.") from err 

854 

855 @transactional 

856 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

857 """Add existing datasets to a `~CollectionType.TAGGED` collection. 

858 

859 If a `DatasetRef` with the exact same integer ID is already in the 

860 collection, nothing is changed. If a `DatasetRef` with the same 

861 `DatasetType` and data ID but with different integer ID 

862 exists in the collection, `ConflictingDefinitionError` is raised. 

863 

864 Parameters 

865 ---------- 

866 collection : `str` 

867 Indicates the collection the datasets should be associated with. 

868 refs : `Iterable` [ `DatasetRef` ] 

869 An iterable of resolved `DatasetRef` instances that already exist 

870 in this `Registry`. 

871 

872 Raises 

873 ------ 

874 ConflictingDefinitionError 

875 If a Dataset with the given `DatasetRef` already exists in the 

876 given collection. 

877 AmbiguousDatasetError 

878 Raised if ``any(ref.id is None for ref in refs)``. 

879 MissingCollectionError 

880 Raised if ``collection`` does not exist in the registry. 

881 TypeError 

882 Raised if adding new datasets to the given ``collection`` is not 

883 allowed. 

884 """ 

885 collectionRecord = self._collections.find(collection) 

886 if collectionRecord.type is not CollectionType.TAGGED: 

887 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

888 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

889 storage = self._datasets.find(datasetType.name) 

890 assert storage is not None 

891 try: 

892 storage.associate(collectionRecord, refsForType) 

893 except sqlalchemy.exc.IntegrityError as err: 

894 raise ConflictingDefinitionError( 

895 f"Constraint violation while associating dataset of type {datasetType.name} with " 

896 f"collection {collection}. This probably means that one or more datasets with the same " 

897 f"dataset type and data ID already exist in the collection, but it may also indicate " 

898 f"that the datasets do not exist." 

899 ) from err 

900 

901 @transactional 

902 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

903 """Remove existing datasets from a `~CollectionType.TAGGED` collection. 

904 

905 ``collection`` and ``ref`` combinations that are not currently 

906 associated are silently ignored. 

907 

908 Parameters 

909 ---------- 

910 collection : `str` 

911 The collection the datasets should no longer be associated with. 

912 refs : `Iterable` [ `DatasetRef` ] 

913 An iterable of resolved `DatasetRef` instances that already exist 

914 in this `Registry`. 

915 

916 Raises 

917 ------ 

918 AmbiguousDatasetError 

919 Raised if any of the given dataset references is unresolved. 

920 MissingCollectionError 

921 Raised if ``collection`` does not exist in the registry. 

922 TypeError 

923 Raised if removing datasets from the given ``collection`` is not 

924 allowed. 

925 """ 

926 collectionRecord = self._collections.find(collection) 

927 if collectionRecord.type is not CollectionType.TAGGED: 

928 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

929 "expected TAGGED.") 

930 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

931 storage = self._datasets.find(datasetType.name) 

932 assert storage is not None 

933 storage.disassociate(collectionRecord, refsForType) 

934 

935 @transactional 

936 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

937 """Associate one or more datasets with a calibration collection and a 

938 validity range within it. 

939 

940 Parameters 

941 ---------- 

942 collection : `str` 

943 The name of an already-registered `~CollectionType.CALIBRATION` 

944 collection. 

945 refs : `Iterable` [ `DatasetRef` ] 

946 Datasets to be associated. 

947 timespan : `Timespan` 

948 The validity range for these datasets within the collection. 

949 

950 Raises 

951 ------ 

952 AmbiguousDatasetError 

953 Raised if any of the given `DatasetRef` instances is unresolved. 

954 ConflictingDefinitionError 

955 Raised if the collection already contains a different dataset with 

956 the same `DatasetType` and data ID and an overlapping validity 

957 range. 

958 TypeError 

959 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

960 collection or if one or more datasets are of a dataset type for 

961 which `DatasetType.isCalibration` returns `False`. 

962 """ 

963 collectionRecord = self._collections.find(collection) 

964 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

965 storage = self._datasets[datasetType.name] 

966 storage.certify(collectionRecord, refsForType, timespan) 

967 

968 @transactional 

969 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *, 

970 dataIds: Optional[Iterable[DataId]] = None) -> None: 

971 """Remove or adjust datasets to clear a validity range within a 

972 calibration collection. 

973 

974 Parameters 

975 ---------- 

976 collection : `str` 

977 The name of an already-registered `~CollectionType.CALIBRATION` 

978 collection. 

979 datasetType : `str` or `DatasetType` 

980 Name or `DatasetType` instance for the datasets to be decertified. 

981 timespan : `Timespan` 

982 The validity range to remove datasets from within the collection. 

983 Datasets that overlap this range but are not contained by it will 

984 have their validity ranges adjusted to not overlap it, which may 

985 split a single dataset validity range into two. 

986 dataIds : `Iterable` [ `DataId` ], optional 

987 Data IDs that should be decertified within the given validity range. 

988 If `None`, all data IDs for ``datasetType`` will be 

989 decertified. 

990 

991 Raises 

992 ------ 

993 TypeError 

994 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

995 collection or if ``datasetType.isCalibration() is False``. 

996 """ 

997 collectionRecord = self._collections.find(collection) 

998 if isinstance(datasetType, str): 

999 storage = self._datasets[datasetType] 

1000 else: 

1001 storage = self._datasets[datasetType.name] 

1002 standardizedDataIds = None 

1003 if dataIds is not None: 

1004 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) 

1005 for d in dataIds] 

1006 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

1007 

1008 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

1009 """Return an object that allows a new `Datastore` instance to 

1010 communicate with this `Registry`. 

1011 

1012 Returns 

1013 ------- 

1014 manager : `DatastoreRegistryBridgeManager` 

1015 Object that mediates communication between this `Registry` and its 

1016 associated datastores. 

1017 """ 

1018 return self._datastoreBridges 

1019 

1020 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

1021 """Retrieve datastore locations for a given dataset. 

1022 

1023 Parameters 

1024 ---------- 

1025 ref : `DatasetRef` 

1026 A reference to the dataset for which to retrieve storage 

1027 information. 

1028 

1029 Returns 

1030 ------- 

1031 datastores : `Iterable` [ `str` ] 

1032 All the matching datastores holding this dataset. 

1033 

1034 Raises 

1035 ------ 

1036 AmbiguousDatasetError 

1037 Raised if ``ref.id`` is `None`. 

1038 """ 

1039 return self._datastoreBridges.findDatastores(ref) 

1040 

1041 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

1042 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

1043 **kwargs: Any) -> DataCoordinate: 

1044 """Expand a dimension-based data ID to include additional information. 

1045 

1046 Parameters 

1047 ---------- 

1048 dataId : `DataCoordinate` or `dict`, optional 

1049 Data ID to be expanded; augmented and overridden by ``kwargs``. 

1050 graph : `DimensionGraph`, optional 

1051 Set of dimensions for the expanded ID. If `None`, the dimensions 

1052 will be inferred from the keys of ``dataId`` and ``kwargs``. 

1053 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph`` 

1054 are silently ignored, providing a way to extract and expand a 

1055 subset of a data ID. 

1056 records : `Mapping` [`str`, `DimensionRecord`], optional 

1057 Dimension record data to use before querying the database for that 

1058 data, keyed by element name. 

1059 **kwargs 

1060 Additional keywords are treated like additional key-value pairs for 

1061 ``dataId``, extending and overriding its values. 

1062 

1063 Returns 

1064 ------- 

1065 expanded : `DataCoordinate` 

1066 A data ID that includes full metadata for all of the dimensions it 

1067 identifies, i.e. guarantees that ``expanded.hasRecords()`` and 

1068 ``expanded.hasFull()`` both return `True`. 

1069 """ 

1070 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs) 

1071 if standardized.hasRecords(): 

1072 return standardized 

1073 if records is None: 

1074 records = {} 

1075 elif isinstance(records, NamedKeyMapping): 

1076 records = records.byName() 

1077 else: 

1078 records = dict(records) 

1079 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

1080 records.update(dataId.records.byName()) 

1081 keys = standardized.byName() 

1082 for element in standardized.graph.primaryKeyTraversalOrder: 

1083 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

1084 if record is ...: 

1085 if isinstance(element, Dimension) and keys.get(element.name) is None: 

1086 if element in standardized.graph.required: 

1087 raise LookupError( 

1088 f"No value or null value for required dimension {element.name}." 

1089 ) 

1090 keys[element.name] = None 

1091 record = None 

1092 else: 

1093 storage = self._dimensions[element] 

1094 dataIdSet = DataCoordinateIterable.fromScalar( 

1095 DataCoordinate.standardize(keys, graph=element.graph) 

1096 ) 

1097 fetched = tuple(storage.fetch(dataIdSet)) 

1098 try: 

1099 (record,) = fetched 

1100 except ValueError: 

1101 record = None 

1102 records[element.name] = record 

1103 if record is not None: 

1104 for d in element.implied: 

1105 value = getattr(record, d.name) 

1106 if keys.setdefault(d.name, value) != value: 

1107 raise InconsistentDataIdError( 

1108 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

1109 f"but {element.name} implies {d.name}={value!r}." 

1110 ) 

1111 else: 

1112 if element in standardized.graph.required: 

1113 raise LookupError( 

1114 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

1115 ) 

1116 if element.alwaysJoin: 

1117 raise InconsistentDataIdError( 

1118 f"Could not fetch record for element {element.name} via keys {keys}, ", 

1119 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

1120 "related." 

1121 ) 

1122 for d in element.implied: 

1123 keys.setdefault(d.name, None) 

1124 records.setdefault(d.name, None) 

1125 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

1126 
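# Illustrative sketch only (not part of the original source): expanding a
# minimal data ID with `expandDataId` so that dimension records and implied
# values are attached. The dimension values are hypothetical and must already
# exist in the registry's dimension tables.
#
#     dataId = registry.expandDataId(instrument="HSC", exposure=12345)
#     assert dataId.hasFull() and dataId.hasRecords()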

1127 def insertDimensionData(self, element: Union[DimensionElement, str], 

1128 *data: Union[Mapping[str, Any], DimensionRecord], 

1129 conform: bool = True) -> None: 

1130 """Insert one or more dimension records into the database. 

1131 

1132 Parameters 

1133 ---------- 

1134 element : `DimensionElement` or `str` 

1135 The `DimensionElement` or name thereof that identifies the table 

1136 records will be inserted into. 

1137 data : `dict` or `DimensionRecord` (variadic) 

1138 One or more records to insert. 

1139 conform : `bool`, optional 

1140 If `False` (`True` is default) perform no checking or conversions, 

1141 and assume that ``element`` is a `DimensionElement` instance and 

1142 ``data`` contains one or more `DimensionRecord` instances of the 

1143 appropriate subclass. 

1144 """ 

1145 if conform: 

1146 if isinstance(element, str): 

1147 element = self.dimensions[element] 

1148 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1149 for row in data] 

1150 else: 

1151 # Ignore typing since caller said to trust them with conform=False. 

1152 records = data # type: ignore 

1153 storage = self._dimensions[element] # type: ignore 

1154 storage.insert(*records) 

1155 
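# Illustrative sketch only (not part of the original source): inserting a
# dimension record with `insertDimensionData`. The field names shown for the
# "instrument" element are assumptions about the configured dimension universe.
#
#     registry.insertDimensionData(
#         "instrument",
#         {"name": "HSC", "visit_max": 99999999, "exposure_max": 99999999,
#          "detector_max": 200},
#     )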

1156 def syncDimensionData(self, element: Union[DimensionElement, str], 

1157 row: Union[Mapping[str, Any], DimensionRecord], 

1158 conform: bool = True) -> bool: 

1159 """Synchronize the given dimension record with the database, inserting 

1160 if it does not already exist and comparing values if it does. 

1161 

1162 Parameters 

1163 ---------- 

1164 element : `DimensionElement` or `str` 

1165 The `DimensionElement` or name thereof that identifies the table 

1166 records will be inserted into. 

1167 row : `dict` or `DimensionRecord` 

1168 The record to insert. 

1169 conform : `bool`, optional 

1170 If `False` (`True` is default) perform no checking or conversions, 

1171 and assume that ``element`` is a `DimensionElement` instance and 

1172 ``row`` is a `DimensionRecord` instance of the appropriate 

1173 subclass. 

1174 

1175 Returns 

1176 ------- 

1177 inserted : `bool` 

1178 `True` if a new row was inserted, `False` otherwise. 

1179 

1180 Raises 

1181 ------ 

1182 ConflictingDefinitionError 

1183 Raised if the record exists in the database (according to primary 

1184 key lookup) but is inconsistent with the given one. 

1185 """ 

1186 if conform: 

1187 if isinstance(element, str): 

1188 element = self.dimensions[element] 

1189 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1190 else: 

1191 # Ignore typing since caller said to trust them with conform=False. 

1192 record = row # type: ignore 

1193 storage = self._dimensions[element] # type: ignore 

1194 return storage.sync(record) 

1195 

1196 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

1197 ) -> Iterator[DatasetType]: 

1198 """Iterate over the dataset types whose names match an expression. 

1199 

1200 Parameters 

1201 ---------- 

1202 expression : `Any`, optional 

1203 An expression that fully or partially identifies the dataset types 

1204 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1205 `...` can be used to return all dataset types, and is the default. 

1206 See :ref:`daf_butler_dataset_type_expressions` for more 

1207 information. 

1208 components : `bool`, optional 

1209 If `True`, apply all expression patterns to component dataset type 

1210 names as well. If `False`, never apply patterns to components. 

1211 If `None` (default), apply patterns to components only if their 

1212 parent datasets were not matched by the expression. 

1213 Fully-specified component datasets (`str` or `DatasetType` 

1214 instances) are always included. 

1215 

1216 Yields 

1217 ------ 

1218 datasetType : `DatasetType` 

1219 A `DatasetType` instance whose name matches ``expression``. 

1220 """ 

1221 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

1222 if wildcard is Ellipsis: 

1223 for datasetType in self._datasets: 

1224 # The dataset type can no longer be a component 

1225 yield datasetType 

1226 if components and datasetType.isComposite(): 

1227 # Automatically create the component dataset types 

1228 for component in datasetType.makeAllComponentDatasetTypes(): 

1229 yield component 

1230 return 

1231 done: Set[str] = set() 

1232 for name in wildcard.strings: 

1233 storage = self._datasets.find(name) 

1234 if storage is not None: 

1235 done.add(storage.datasetType.name) 

1236 yield storage.datasetType 

1237 if wildcard.patterns: 

1238 # If components (the argument) is None, we'll save component 

1239 # datasets that we might want to match, but only if their parents 

1240 # didn't get included. 

1241 componentsForLater = [] 

1242 for registeredDatasetType in self._datasets: 

1243 # Components are not stored in registry so expand them here 

1244 allDatasetTypes = [registeredDatasetType] \ 

1245 + registeredDatasetType.makeAllComponentDatasetTypes() 

1246 for datasetType in allDatasetTypes: 

1247 if datasetType.name in done: 

1248 continue 

1249 parentName, componentName = datasetType.nameAndComponent() 

1250 if componentName is not None and not components: 

1251 if components is None and parentName not in done: 

1252 componentsForLater.append(datasetType) 

1253 continue 

1254 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1255 done.add(datasetType.name) 

1256 yield datasetType 

1257 # Go back and try to match saved components. 

1258 for datasetType in componentsForLater: 

1259 parentName, _ = datasetType.nameAndComponent() 

1260 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1261 yield datasetType 

1262 

1263 def queryCollections(self, expression: Any = ..., 

1264 datasetType: Optional[DatasetType] = None, 

1265 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1266 flattenChains: bool = False, 

1267 includeChains: Optional[bool] = None) -> Iterator[str]: 

1268 """Iterate over the collections whose names match an expression. 

1269 

1270 Parameters 

1271 ---------- 

1272 expression : `Any`, optional 

1273 An expression that fully or partially identifies the collections 

1274 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1275 `...` can be used to return all collections, and is the default. 

1276 See :ref:`daf_butler_collection_expressions` for more 

1277 information. 

1278 datasetType : `DatasetType`, optional 

1279 If provided, only yield collections that may contain datasets of 

1280 this type. This is a conservative approximation in general; it may 

1281 yield collections that do not have any such datasets. 

1282 collectionTypes : `AbstractSet` [ `CollectionType` ], optional 

1283 If provided, only yield collections of these types. 

1284 flattenChains : `bool`, optional 

1285 If `True` (`False` is default), recursively yield the child 

1286 collections of matching `~CollectionType.CHAINED` collections. 

1287 includeChains : `bool`, optional 

1288 If `True`, yield records for matching `~CollectionType.CHAINED` 

1289 collections. Default is the opposite of ``flattenChains``: include 

1290 either CHAINED collections or their children, but not both. 

1291 

1292 Yields 

1293 ------ 

1294 collection : `str` 

1295 The name of a collection that matches ``expression``. 

1296 """ 

1297 # Right now the datasetType argument is completely ignored, but that 

1298 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

1299 # ticket will take care of that. 

1300 query = CollectionQuery.fromExpression(expression) 

1301 for record in query.iter(self._collections, collectionTypes=frozenset(collectionTypes), 

1302 flattenChains=flattenChains, includeChains=includeChains): 

1303 yield record.name 

1304 
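# Illustrative sketch only (not part of the original source): listing RUN
# collections whose names match a regular expression via `queryCollections`.
# The pattern is hypothetical.
#
#     import re
#     runs = list(registry.queryCollections(re.compile(r"HSC/runs/.+"),
#                                           collectionTypes={CollectionType.RUN}))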

1305 def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder: 

1306 """Return a `QueryBuilder` instance capable of constructing and 

1307 managing more complex queries than those obtainable via `Registry` 

1308 interfaces. 

1309 

1310 This is an advanced interface; downstream code should prefer 

1311 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

1312 are sufficient. 

1313 

1314 Parameters 

1315 ---------- 

1316 summary : `queries.QuerySummary` 

1317 Object describing and categorizing the full set of dimensions that 

1318 will be included in the query. 

1319 

1320 Returns 

1321 ------- 

1322 builder : `queries.QueryBuilder` 

1323 Object that can be used to construct and perform advanced queries. 

1324 """ 

1325 return queries.QueryBuilder( 

1326 summary, 

1327 queries.RegistryManagers( 

1328 collections=self._collections, 

1329 dimensions=self._dimensions, 

1330 datasets=self._datasets 

1331 ) 

1332 ) 

1333 

1334 def queryDatasets(self, datasetType: Any, *, 

1335 collections: Any, 

1336 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1337 dataId: Optional[DataId] = None, 

1338 where: Optional[str] = None, 

1339 findFirst: bool = False, 

1340 components: Optional[bool] = None, 

1341 **kwargs: Any) -> queries.DatasetQueryResults: 

1342 """Query for and iterate over dataset references matching user-provided 

1343 criteria. 

1344 

1345 Parameters 

1346 ---------- 

1347 datasetType 

1348 An expression that fully or partially identifies the dataset types 

1349 to be queried. Allowed types include `DatasetType`, `str`, 

1350 `re.Pattern`, and iterables thereof. The special value `...` can 

1351 be used to query all dataset types. See 

1352 :ref:`daf_butler_dataset_type_expressions` for more information. 

1353 collections 

1354 An expression that fully or partially identifies the collections 

1355 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1356 thereof. `...` can be used to find datasets from all 

1357 `~CollectionType.RUN` collections (no other collections are 

1358 necessary, because all datasets are in a ``RUN`` collection). See 

1359 :ref:`daf_butler_collection_expressions` for more information. 

1360 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1361 Dimensions to include in the query (in addition to those used 

1362 to identify the queried dataset type(s)), either to constrain 

1363 the resulting datasets to those for which a matching dimension 

1364 exists, or to relate the dataset type's dimensions to dimensions 

1365 referenced by the ``dataId`` or ``where`` arguments. 

1366 dataId : `dict` or `DataCoordinate`, optional 

1367 A data ID whose key-value pairs are used as equality constraints 

1368 in the query. 

1369 where : `str`, optional 

1370 A string expression similar to a SQL WHERE clause. May involve 

1371 any column of a dimension table or (as a shortcut for the primary 

1372 key column of a dimension table) dimension name. See 

1373 :ref:`daf_butler_dimension_expressions` for more information. 

1374 findFirst : `bool`, optional 

1375 If `True` (`False` is default), for each result data ID, only 

1376 yield one `DatasetRef` of each `DatasetType`, from the first 

1377 collection in which a dataset of that dataset type appears 

1378 (according to the order of ``collections`` passed in). If `True`, 

1379 ``collections`` must not contain regular expressions and may not 

1380 be `...`. 

1381 components : `bool`, optional 

1382 If `True`, apply all dataset expression patterns to component 

1383 dataset type names as well. If `False`, never apply patterns to 

1384 components. If `None` (default), apply patterns to components only 

1385 if their parent datasets were not matched by the expression. 

1386 Fully-specified component datasets (`str` or `DatasetType` 

1387 instances) are always included. 

1388 **kwargs 

1389 Additional keyword arguments are forwarded to 

1390 `DataCoordinate.standardize` when processing the ``dataId`` 

1391 argument (and may be used to provide a constraining data ID even 

1392 when the ``dataId`` argument is `None`). 

1393 

1394 Returns 

1395 ------- 

1396 refs : `queries.DatasetQueryResults` 

1397 Dataset references matching the given query criteria. 

1398 

1399 Raises 

1400 ------ 

1401 TypeError 

1402 Raised when the arguments are incompatible, such as when a 

1403 collection wildcard is passed when ``findFirst`` is `True`. 

1404 

1405 Notes 

1406 ----- 

1407 When multiple dataset types are queried in a single call, the 

1408 results of this operation are equivalent to querying for each dataset 

1409 type separately in turn, and no information about the relationships 

1410 between datasets of different types is included. In contexts where 

1411 that kind of information is important, the recommended pattern is to 

1412 use `queryDataIds` to first obtain data IDs (possibly with the 

1413 desired dataset types and collections passed as constraints to the 

1414 query), and then use multiple (generally much simpler) calls to 

1415 `queryDatasets` with the returned data IDs passed as constraints. 

1416 """ 

1417 # Standardize the collections expression. 

1418 if findFirst: 

1419 collections = CollectionSearch.fromExpression(collections) 

1420 else: 

1421 collections = CollectionQuery.fromExpression(collections) 

1422 # Standardize and expand the data ID provided as a constraint. 

1423 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1424 

1425 # We can only query directly if given a non-component DatasetType 

1426 # instance. If we were given an expression or str or a component 

1427 # DatasetType instance, we'll populate this dict, recurse, and return. 

1428 # If we already have a non-component DatasetType, it will remain None 

1429 # and we'll run the query directly. 
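# For example (dataset type names here are purely illustrative), an
# expression matching "calexp.wcs" and "deepCoadd" would produce a
# composition dict like {<calexp DatasetType>: ["wcs"],
# <deepCoadd DatasetType>: [None]}.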

1430 composition: Optional[ 

1431 Dict[ 

1432 DatasetType, # parent dataset type 

1433 List[Optional[str]] # component name, or None for parent 

1434 ] 

1435 ] = None 

1436 if not isinstance(datasetType, DatasetType): 

1437 # We were given a dataset type expression (which may be as simple 

1438 # as a str). Loop over all matching datasets, delegating handling 

1439 # of the `components` argument to queryDatasetTypes, as we populate 

1440 # the composition dict. 

1441 composition = defaultdict(list) 

1442 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

1443 parentName, componentName = trueDatasetType.nameAndComponent() 

1444 if componentName is not None: 

1445 parentDatasetType = self.getDatasetType(parentName) 

1446 composition[parentDatasetType].append(componentName) 

1447 else: 

1448 composition[trueDatasetType].append(None) 

1449 elif datasetType.isComponent(): 

1450 # We were given a true DatasetType instance, but it's a component. 

1451 # The composition dict will have exactly one item. 

1452 parentName, componentName = datasetType.nameAndComponent() 

1453 parentDatasetType = self.getDatasetType(parentName) 

1454 composition = {parentDatasetType: [componentName]} 

1455 if composition is not None: 

1456 # We need to recurse. Do that once for each parent dataset type. 

1457 chain = [] 

1458 for parentDatasetType, componentNames in composition.items(): 

1459 parentResults = self.queryDatasets( 

1460 parentDatasetType, 

1461 collections=collections, 

1462 dimensions=dimensions, 

1463 dataId=standardizedDataId, 

1464 where=where, 

1465 findFirst=findFirst 

1466 ) 

1467 if isinstance(parentResults, queries.ParentDatasetQueryResults): 

1468 chain.append( 

1469 parentResults.withComponents(componentNames) 

1470 ) 

1471 else: 

1472 # Should only happen if we know there would be no results. 

1473 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \ 

1474 and not parentResults._chain 

1475 return queries.ChainedDatasetQueryResults(chain) 

1476 # If we get here, there's no need to recurse (or we are already 

1477 # recursing; there can only ever be one level of recursion). 

1478 

1479 # The full set of dimensions in the query is the combination of those 

1480 # needed for the DatasetType and those explicitly requested, if any. 

1481 requestedDimensionNames = set(datasetType.dimensions.names) 

1482 if dimensions is not None: 

1483 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1484 # Construct the summary structure needed to construct a QueryBuilder. 

1485 summary = queries.QuerySummary( 

1486 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1487 dataId=standardizedDataId, 

1488 expression=where, 

1489 ) 

1490 builder = self.makeQueryBuilder(summary) 

1491 # Add the dataset subquery to the query, telling the QueryBuilder to 

1492 # include the rank of the selected collection in the results only if we 

1493 # need findFirst. Note that if any of the collections are 

1494 # actually wildcard expressions and findFirst is True, 

1495 # this will raise TypeError for us. 

1496 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst): 

1497 return queries.ChainedDatasetQueryResults(()) 

1498 query = builder.finish() 

1499 return queries.ParentDatasetQueryResults(self._db, query, components=[None]) 

1500 

1501 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1502 dataId: Optional[DataId] = None, 

1503 datasets: Any = None, 

1504 collections: Any = None, 

1505 where: Optional[str] = None, 

1506 components: Optional[bool] = None, 

1507 **kwargs: Any) -> queries.DataCoordinateQueryResults: 

1508 """Query for data IDs matching user-provided criteria. 

1509 

1510 Parameters 

1511 ---------- 

1512 dimensions : `Dimension` or `str`, or iterable thereof 

1513 The dimensions of the data IDs to yield, as either `Dimension` 

1514 instances or `str`. Will be automatically expanded to a complete 

1515 `DimensionGraph`. 

1516 dataId : `dict` or `DataCoordinate`, optional 

1517 A data ID whose key-value pairs are used as equality constraints 

1518 in the query. 

1519 datasets : `Any`, optional 

1520 An expression that fully or partially identifies dataset types 

1521 that should constrain the yielded data IDs. For example, including 

1522 "raw" here would constrain the yielded ``instrument``, 

1523 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1524 those for which at least one "raw" dataset exists in 

1525 ``collections``. Allowed types include `DatasetType`, `str`, 

1526 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1527 expressions, ``...`` is not permitted - it doesn't make sense to 

1528 constrain data IDs on the existence of *all* datasets. 

1529 See :ref:`daf_butler_dataset_type_expressions` for more 

1530 information. 

1531 collections : `Any`, optional 

1532 An expression that fully or partially identifies the collections 

1533 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1534 thereof. `...` can be used to return all collections. Must be 

1535 provided if ``datasets`` is, and is ignored if it is not. See 

1536 :ref:`daf_butler_collection_expressions` for more information. 

1537 where : `str`, optional 

1538 A string expression similar to a SQL WHERE clause. May involve 

1539 any column of a dimension table or (as a shortcut for the primary 

1540 key column of a dimension table) a dimension name. See 

1541 :ref:`daf_butler_dimension_expressions` for more information. 

1542 components : `bool`, optional 

1543 If `True`, apply all dataset expression patterns to component 

1544 dataset type names as well. If `False`, never apply patterns to 

1545 components. If `None` (default), apply patterns to components only 

1546 if their parent datasets were not matched by the expression. 

1547 Fully-specified component datasets (`str` or `DatasetType` 

1548 instances) are always included. 

1549 **kwargs 

1550 Additional keyword arguments are forwarded to 

1551 `DataCoordinate.standardize` when processing the ``dataId`` 

1552 argument (and may be used to provide a constraining data ID even 

1553 when the ``dataId`` argument is `None`). 

1554 

1555 Returns 

1556 ------- 

1557 dataIds : `DataCoordinateQueryResults` 

1558 Data IDs matching the given query parameters. These are guaranteed 

1559 to identify all dimensions (`DataCoordinate.hasFull` returns 

1560 `True`), but will not contain `DimensionRecord` objects 

1561 (`DataCoordinate.hasRecords` returns `False`). Call 

1562 `DataCoordinateQueryResults.expanded` on the returned object to 

1563 fetch those (and consider using 

1564 `DataCoordinateQueryResults.materialize` on the returned object 

1565 first if the expected number of rows is very large). See 

1566 documentation for those methods for additional information. 
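
Examples
--------
An illustrative sketch; the "raw" dataset type, the collection name,
and the detector value are assumptions::

    dataIds = registry.queryDataIds(
        ["exposure", "detector"],
        datasets="raw",
        collections="HSC/raw/all",
        where="detector = 10",
    )
    for dataId in dataIds.expanded():
        print(dataId)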

1567 """ 

1568 dimensions = iterable(dimensions) 

1569 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1570 standardizedDatasetTypes = set() 

1571 requestedDimensions = self.dimensions.extract(dimensions) 

1572 queryDimensionNames = set(requestedDimensions.names) 

1573 if datasets is not None: 

1574 if collections is None: 

1575 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1576 for datasetType in self.queryDatasetTypes(datasets, components=components): 

1577 queryDimensionNames.update(datasetType.dimensions.names) 

1578 # If any matched dataset type is a component, just operate on 

1579 # its parent instead, because Registry doesn't know anything 

1580 # about what components exist, and here (unlike queryDatasets) 

1581 # we don't care about returning them. 

1582 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

1583 if componentName is not None: 

1584 datasetType = self.getDatasetType(parentDatasetTypeName) 

1585 standardizedDatasetTypes.add(datasetType) 

1586 # Preprocess collections expression in case the original included 

1587 # single-pass iterators (we'll want to use it multiple times 

1588 # below). 

1589 collections = CollectionQuery.fromExpression(collections) 

1590 

1591 summary = queries.QuerySummary( 

1592 requested=DimensionGraph(self.dimensions, names=queryDimensionNames), 

1593 dataId=standardizedDataId, 

1594 expression=where, 

1595 ) 

1596 builder = self.makeQueryBuilder(summary) 

1597 for datasetType in standardizedDatasetTypes: 

1598 builder.joinDataset(datasetType, collections, isResult=False) 

1599 query = builder.finish() 

1600 return queries.DataCoordinateQueryResults(self._db, query) 

1601 

1602 def queryDimensionRecords(self, element: Union[DimensionElement, str], *, 

1603 dataId: Optional[DataId] = None, 

1604 datasets: Any = None, 

1605 collections: Any = None, 

1606 where: Optional[str] = None, 

1607 components: Optional[bool] = None, 

1608 **kwargs: Any) -> Iterator[DimensionRecord]: 

1609 """Query for dimension information matching user-provided criteria. 

1610 

1611 Parameters 

1612 ---------- 

1613 element : `DimensionElement` or `str` 

1614 The dimension element, or the name of one, to obtain records for. 

1615 dataId : `dict` or `DataCoordinate`, optional 

1616 A data ID whose key-value pairs are used as equality constraints 

1617 in the query. 

1618 datasets : `Any`, optional 

1619 An expression that fully or partially identifies dataset types 

1620 that should constrain the yielded records. See `queryDataIds` and 

1621 :ref:`daf_butler_dataset_type_expressions` for more information. 

1622 collections : `Any`, optional 

1623 An expression that fully or partially identifies the collections 

1624 to search for datasets. See `queryDataIds` and 

1625 :ref:`daf_butler_collection_expressions` for more information. 

1626 where : `str`, optional 

1627 A string expression similar to a SQL WHERE clause. See 

1628 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more 

1629 information. 

1630 components : `bool`, optional 

1631 Whether to apply dataset expressions to components as well. 

1632 See `queryDataIds` for more information. 

1633 **kwargs 

1634 Additional keyword arguments are forwarded to 

1635 `DataCoordinate.standardize` when processing the ``dataId`` 

1636 argument (and may be used to provide a constraining data ID even 

1637 when the ``dataId`` argument is `None`). 

1638 

1639 Returns 

1640 ------- 

1641 records : `Iterator` [ `DimensionRecord` ] 

1642 Dimension records matching the given query parameters. 
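
Examples
--------
An illustrative sketch; the instrument name is an assumption::

    for record in registry.queryDimensionRecords(
        "detector", dataId={"instrument": "HSC"}
    ):
        print(record)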

1643 """ 

1644 if not isinstance(element, DimensionElement): 

1645 element = self.dimensions[element] 

1646 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections, 

1647 where=where, components=components, **kwargs) 

1648 return iter(self._dimensions[element].fetch(dataIds)) 

1649 

1650 def queryDatasetAssociations( 

1651 self, 

1652 datasetType: Union[str, DatasetType], 

1653 collections: Any = ..., 

1654 *, 

1655 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1656 flattenChains: bool = False, 

1657 ) -> Iterator[DatasetAssociation]: 

1658 """Iterate over dataset-collection combinations where the dataset is in 

1659 the collection. 

1660 

1661 This method is a temporary placeholder for better support for 

1662 association results in `queryDatasets`. It will probably be 

1663 removed in the future, and should be avoided in production code 

1664 whenever possible. 

1665 

1666 Parameters 

1667 ---------- 

1668 datasetType : `DatasetType` or `str` 

1669 A dataset type object or the name of one. 

1670 collections : `Any`, optional 

1671 An expression that fully or partially identifies the collections 

1672 to search for datasets. See `queryCollections` and 

1673 :ref:`daf_butler_collection_expressions` for more information. 

1674 collectionTypes : `Iterable` [ `CollectionType` ], optional 

1675 If provided, only yield associations from collections of these 

1676 types. 

1677 flattenChains : `bool`, optional 

1678 If `True`, search in the children of 

1679 `~CollectionType.CHAINED` collections. If `False` (default), 

1680 ``CHAINED`` collections are ignored. 

1681 

1682 Yields 

1683 ------ 

1684 association : `DatasetAssociation` 

1685 Object representing the relationship between a single dataset and 

1686 a single collection. 
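
Examples
--------
An illustrative sketch that restricts the search to
`~CollectionType.TAGGED` collections; the dataset type name is an
assumption::

    for association in registry.queryDatasetAssociations(
        "calexp",
        collectionTypes={CollectionType.TAGGED},
    ):
        print(association.collection, association.ref.dataId)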

1687 """ 

1688 collections = CollectionQuery.fromExpression(collections) 

1689 tsRepr = self._db.getTimespanRepresentation() 

1690 if isinstance(datasetType, str): 

1691 storage = self._datasets[datasetType] 

1692 else: 

1693 storage = self._datasets[datasetType.name] 

1694 for collectionRecord in collections.iter(self._collections, 

1695 collectionTypes=frozenset(collectionTypes), 

1696 flattenChains=flattenChains): 

1697 query = storage.select(collectionRecord) 

1698 if query is None: 

1699 continue 

1700 for row in self._db.query(query.combine()): 

1701 dataId = DataCoordinate.fromRequiredValues( 

1702 storage.datasetType.dimensions, 

1703 tuple(row[name] for name in storage.datasetType.dimensions.required.names) 

1704 ) 

1705 runRecord = self._collections[row[self._collections.getRunForeignKeyName()]] 

1706 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, 

1707 conform=False) 

1708 if collectionRecord.type is CollectionType.CALIBRATION: 

1709 timespan = tsRepr.extract(row) 

1710 else: 

1711 timespan = None 

1712 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1713 

1714 storageClasses: StorageClassFactory 

1715 """All storage classes known to the registry (`StorageClassFactory`). 

1716 """