1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = ("Butler", "ButlerValidationError")
29import os
30from collections import defaultdict
31import contextlib
32import logging
33from typing import (
34 Any,
35 ClassVar,
36 ContextManager,
37 Dict,
38 Iterable,
39 List,
40 Mapping,
41 MutableMapping,
42 Optional,
43 Set,
44 TextIO,
45 Tuple,
46 Union,
47)
49try:
50 import boto3
51except ImportError:
52 boto3 = None
54from lsst.utils import doImport
55from .core import (
56 AmbiguousDatasetError,
57 ButlerURI,
58 Config,
59 ConfigSubset,
60 DataCoordinate,
61 DataId,
62 DatasetRef,
63 DatasetType,
64 Datastore,
65 DimensionConfig,
66 FileDataset,
67 StorageClassFactory,
68 Timespan,
69 ValidationError,
70)
71from .core.repoRelocation import BUTLER_ROOT_TAG
72from .core.utils import transactional, getClassOf
73from ._deferredDatasetHandle import DeferredDatasetHandle
74from ._butlerConfig import ButlerConfig
75from .registry import Registry, RegistryConfig, CollectionType
76from .registry.wildcards import CollectionSearch
77from .transfers import RepoExportContext
79log = logging.getLogger(__name__)
82class ButlerValidationError(ValidationError):
83 """There is a problem with the Butler configuration."""
84 pass
87class Butler:
88 """Main entry point for the data access system.
90 Parameters
91 ----------
92 config : `ButlerConfig`, `Config` or `str`, optional
93 Configuration. Anything acceptable to the
94 `ButlerConfig` constructor. If a directory path
95 is given the configuration will be read from a ``butler.yaml`` file in
96 that location. If `None` is given default values will be used.
97 butler : `Butler`, optional
98 If provided, construct a new Butler that uses the same registry and
99 datastore as the given one, but with the given collection and run.
100 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
101 arguments.
102 collections : `Any`, optional
103 An expression specifying the collections to be searched (in order) when
104 reading datasets, and optionally dataset type restrictions on them.
105 This may be:
106 - a `str` collection name;
107 - a tuple of (collection name, *dataset type restriction*);
108 - an iterable of either of the above;
109 - a mapping from `str` to *dataset type restriction*.
111 See :ref:`daf_butler_collection_expressions` for more information,
112 including the definition of a *dataset type restriction*. All
113 collections must either already exist or be specified to be created
114 by other arguments.
115 run : `str`, optional
116 Name of the run datasets should be output to. If the run
117 does not exist, it will be created. If ``collections`` is `None`, it
118 will be set to ``[run]``. If this is not set (and ``writeable`` is
119 not set either), a read-only butler will be created.
120 tags : `Iterable` [ `str` ], optional
121 A list of `~CollectionType.TAGGED` collections that datasets should be
122 associated with in `put` or `ingest` and disassociated from in
123 `pruneDatasets`. If any of these collections does not exist, it will
124 be created.
125 chains : `Mapping` [ `str`, `Iterable` [ `str` ] ], optional
126 A mapping from the names of new `~CollectionType.CHAINED` collections
127 to an expression identifying their child collections (which takes the
128 same form as the ``collections`` argument). Chains may be nested only
129 if children precede their parents in this mapping.
130 searchPaths : `list` of `str`, optional
131 Directory paths to search when calculating the full Butler
132 configuration. Not used if the supplied config is already a
133 `ButlerConfig`.
134 writeable : `bool`, optional
135 Explicitly sets whether the butler supports write operations. If not
136 provided, a read-write butler is created if any of ``run``, ``tags``,
137 or ``chains`` is non-empty.
139 Examples
140 --------
141 While there are many ways to control exactly how a `Butler` interacts with
142 the collections in its `Registry`, the most common cases are still simple.
144 For a read-only `Butler` that searches one collection, do::
146 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
148 For a read-write `Butler` that writes to and reads from a
149 `~CollectionType.RUN` collection::
151 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
153 The `Butler` passed to a ``PipelineTask`` is often much more complex,
154 because we want to write to one `~CollectionType.RUN` collection but read
155 from several others (as well), while defining a new
156 `~CollectionType.CHAINED` collection that combines them all::
158 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
159 collections=["u/alice/DM-50000"],
160 chains={
161 "u/alice/DM-50000": ["u/alice/DM-50000/a",
162 "u/bob/DM-49998",
163 "raw/hsc"]
164 })
166 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``, but
167 they'll also be available from the chained collection ``u/alice/DM-50000``.
168 Datasets will be read first from that run (since it appears first in the
169 chain), and then from ``u/bob/DM-49998`` and finally ``raw/hsc``.
170 If ``u/alice/DM-50000`` had already been defined, the ``chains`` argument
171 would be unnecessary. We could also construct a butler that performs
172 exactly the same `put` and `get` operations without actually creating a
173 chained collection, just by passing multiple items in ``collections``::
175 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
176 collections=["u/alice/DM-50000/a",
177 "u/bob/DM-49998",
178 "raw/hsc"])
180 Finally, one can always create a `Butler` with no collections::
182 butler = Butler("/path/to/repo", writeable=True)
184 This can be extremely useful when you just want to use ``butler.registry``,
185 e.g. for inserting dimension data or managing collections, or when the
186 collections you want to use with the butler are not consistent.
187 Passing ``writeable`` explicitly here is only necessary if you want to be
188 able to make changes to the repo; usually the value for ``writeable``
189 can be guessed from the collection arguments provided, but it defaults to
190 `False` when there are no collection arguments.
191 """
192 def __init__(self, config: Union[Config, str, None] = None, *,
193 butler: Optional[Butler] = None,
194 collections: Any = None,
195 run: Optional[str] = None,
196 tags: Iterable[str] = (),
197 chains: Optional[Mapping[str, Any]] = None,
198 searchPaths: Optional[List[str]] = None,
199 writeable: Optional[bool] = None,
200 ):
201 # Transform any single-pass iterator into an actual sequence so we
202 # can see if it's empty.
203 self.tags = tuple(tags)
204 # Load registry, datastore, etc. from config or existing butler.
205 if butler is not None:
206 if config is not None or searchPaths is not None or writeable is not None:
207 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
208 "arguments with 'butler' argument.")
209 self.registry = butler.registry
210 self.datastore = butler.datastore
211 self.storageClasses = butler.storageClasses
212 self._config = butler._config
213 else:
214 self._config = ButlerConfig(config, searchPaths=searchPaths)
215 if "root" in self._config:
216 butlerRoot = self._config["root"]
217 else:
218 butlerRoot = self._config.configDir
219 if writeable is None:
220 writeable = run is not None or chains is not None or self.tags
221 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable)
222 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
223 butlerRoot=butlerRoot)
224 self.storageClasses = StorageClassFactory()
225 self.storageClasses.addFromConfig(self._config)
226 # Check the many collection arguments for consistency and create any
227 # needed collections that don't exist.
228 if collections is None:
229 if run is not None:
230 collections = (run,)
231 else:
232 collections = ()
233 self.collections = CollectionSearch.fromExpression(collections)
234 if chains is None:
235 chains = {}
236 self.run = run
237 if "run" in self._config or "collection" in self._config:
238 raise ValueError("Passing a run or collection via configuration is no longer supported.")
239 if self.run is not None:
240 self.registry.registerCollection(self.run, type=CollectionType.RUN)
241 for tag in self.tags:
242 self.registry.registerCollection(tag, type=CollectionType.TAGGED)
243 for parent, children in chains.items():
244 self.registry.registerCollection(parent, type=CollectionType.CHAINED)
245 self.registry.setCollectionChain(parent, children)
247 GENERATION: ClassVar[int] = 3
248 """This is a Generation 3 Butler.
250 This attribute may be removed in the future, once the Generation 2 Butler
251 interface has been fully retired; it should only be used in transitional
252 code.
253 """
255 @staticmethod
256 def makeRepo(root: str, config: Union[Config, str, None] = None,
257 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
258 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
259 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
260 """Create an empty data repository by adding a butler.yaml config
261 to a repository root directory.
263 Parameters
264 ----------
265 root : `str` or `ButlerURI`
266 Path or URI to the root location of the new repository. Will be
267 created if it does not exist.
268 config : `Config` or `str`, optional
269 Configuration to write to the repository, after setting any
270 root-dependent Registry or Datastore config options. Can not
271 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
272 configuration will be used. Root-dependent config options
273 specified in this config are overwritten if ``forceConfigRoot``
274 is `True`.
275 dimensionConfig : `Config` or `str`, optional
276 Configuration for dimensions, will be used to initialize registry
277 database.
278 standalone : `bool`
279 If `True`, write all expanded defaults, not just customized or
280 repository-specific settings.
281 This (mostly) decouples the repository from the default
282 configuration, insulating it from changes to the defaults (which
283 may be good or bad, depending on the nature of the changes).
284 Future *additions* to the defaults will still be picked up when
285 initializing `Butlers` to repos created with ``standalone=True``.
286 searchPaths : `list` of `str`, optional
287 Directory paths to search when calculating the full butler
288 configuration.
289 forceConfigRoot : `bool`, optional
290 If `False`, any values present in the supplied ``config`` that
291 would normally be reset are not overridden and will appear
292 directly in the output config. This allows non-standard overrides
293 of the root directory for a datastore or registry to be given.
294 If this parameter is `True` the values for ``root`` will be
295 forced into the resulting config if appropriate.
296 outfile : `str`, optional
297 If not-`None`, the output configuration will be written to this
298 location rather than into the repository itself. Can be a URI
299 string. Can refer to a directory that will be used to write
300 ``butler.yaml``.
301 overwrite : `bool`, optional
302 Create a new configuration file even if one already exists
303 in the specified output location. Default is to raise
304 an exception.
306 Returns
307 -------
308 config : `Config`
309 The updated `Config` instance written to the repo.
311 Raises
312 ------
313 ValueError
314 Raised if a ButlerConfig or ConfigSubset is passed instead of a
315 regular Config (as these subclasses would make it impossible to
316 support ``standalone=False``).
317 FileExistsError
318 Raised if the output config file already exists.
319 os.error
320 Raised if the directory does not exist, exists but is not a
321 directory, or cannot be created.
323 Notes
324 -----
325 Note that when ``standalone=False`` (the default), the configuration
326 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
327 construct the repository should also be used to construct any Butlers
328 to avoid configuration inconsistencies.
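Examples
--------
A minimal sketch; the repository path and run name below are
placeholders::

    # Write butler.yaml and create the registry tables at the given
    # root, then construct a Butler that writes to a new run there.
    Butler.makeRepo("/path/to/repo")
    butler = Butler("/path/to/repo", run="u/alice/ingest")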
329 """
330 if isinstance(config, (ButlerConfig, ConfigSubset)):
331 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
333 # Ensure that the root of the repository exists or can be made
334 uri = ButlerURI(root, forceDirectory=True)
335 uri.mkdir()
337 config = Config(config)
339 # If we are creating a new repo from scratch with relative roots,
340 # do not propagate an explicit root from the config file
341 if "root" in config:
342 del config["root"]
344 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
345 datastoreClass = doImport(full["datastore", "cls"])
346 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
348 # if key exists in given config, parse it, otherwise parse the defaults
349 # in the expanded config
350 if config.get(("registry", "db")):
351 registryConfig = RegistryConfig(config)
352 else:
353 registryConfig = RegistryConfig(full)
354 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
355 if defaultDatabaseUri is not None:
356 Config.updateParameters(RegistryConfig, config, full,
357 toUpdate={"db": defaultDatabaseUri},
358 overwrite=forceConfigRoot)
359 else:
360 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
361 overwrite=forceConfigRoot)
363 if standalone:
364 config.merge(full)
365 else:
366 # Always expand the registry.managers section into the per-repo
367 # config, because after the database schema is created, it's not
368 # allowed to change anymore. Note that in the standalone=True
369 # branch, _everything_ in the config is expanded, so there's no
370 # need to special case this.
371 Config.updateParameters(RegistryConfig, config, full, toCopy=("managers",), overwrite=False)
372 if outfile is not None:
373 # When writing to a separate location we must include
374 # the root of the butler repo in the config else it won't know
375 # where to look.
376 config["root"] = uri.geturl()
377 configURI = outfile
378 else:
379 configURI = uri
380 config.dumpToUri(configURI, overwrite=overwrite)
382 # Create Registry and populate tables
383 registryConfig = RegistryConfig(config.get("registry"))
384 dimensionConfig = DimensionConfig(dimensionConfig)
385 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)
387 return config
389 @classmethod
390 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
391 tags: Tuple[str, ...], writeable: bool) -> Butler:
392 """Callable used to unpickle a Butler.
394 We prefer not to use ``Butler.__init__`` directly so we can force some
395 of its many arguments to be keyword-only (note that ``__reduce__``
396 can only invoke callables with positional arguments).
398 Parameters
399 ----------
400 config : `ButlerConfig`
401 Butler configuration, already coerced into a true `ButlerConfig`
402 instance (and hence after any search paths for overrides have been
403 utilized).
404 collections : `CollectionSearch`
405 Names of collections to read from.
406 run : `str`, optional
407 Name of `~CollectionType.RUN` collection to write to.
408 tags : `tuple` [`str`]
409 Names of `~CollectionType.TAGGED` collections to associate with.
410 writeable : `bool`
411 Whether the Butler should support write operations.
413 Returns
414 -------
415 butler : `Butler`
416 A new `Butler` instance.
417 """
418 return cls(config=config, collections=collections, run=run, tags=tags, writeable=writeable)
420 def __reduce__(self):
421 """Support pickling.
422 """
423 return (Butler._unpickle, (self._config, self.collections, self.run, self.tags,
424 self.registry.isWriteable()))
426 def __str__(self):
427 return "Butler(collections={}, run={}, tags={}, datastore='{}', registry='{}')".format(
428 self.collections, self.run, self.tags, self.datastore, self.registry)
430 def isWriteable(self) -> bool:
431 """Return `True` if this `Butler` supports write operations.
432 """
433 return self.registry.isWriteable()
435 @contextlib.contextmanager
436 def transaction(self):
437 """Context manager supporting `Butler` transactions.
439 Transactions can be nested.
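Examples
--------
A minimal sketch; the dataset type names and ``dataId`` below are
placeholders for definitions that must already exist in the repository::

    # Either both puts are recorded or, if an exception is raised,
    # neither is.
    with butler.transaction():
        butler.put(exposure, "calexp", dataId)
        butler.put(background, "calexpBackground", dataId)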
440 """
441 with self.registry.transaction():
442 with self.datastore.transaction():
443 yield
445 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
446 dataId: Optional[DataId] = None, **kwds: Any) -> Tuple[DatasetType, DataId]:
447 """Standardize the arguments passed to several Butler APIs.
449 Parameters
450 ----------
451 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
452 When `DatasetRef` the `dataId` should be `None`.
453 Otherwise the `DatasetType` or name thereof.
454 dataId : `dict` or `DataCoordinate`
455 A `dict` of `Dimension` link name, value pairs that label the
456 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
457 should be provided as the first argument.
458 kwds
459 Additional keyword arguments used to augment or construct a
460 `DataCoordinate`. See `DataCoordinate.standardize`
461 parameters.
463 Returns
464 -------
465 datasetType : `DatasetType`
466 A `DatasetType` instance extracted from ``datasetRefOrType``.
467 dataId : `dict` or `DataId`, optional
468 Argument that can be used (along with ``kwds``) to construct a
469 `DataId`.
471 Notes
472 -----
473 Butler APIs that conceptually need a DatasetRef also allow passing a
474 `DatasetType` (or the name of one) and a `DataId` (or a dict and
475 keyword arguments that can be used to construct one) separately. This
476 method accepts those arguments and always returns a true `DatasetType`
477 and a `DataId` or `dict`.
479 Standardization of `dict` vs `DataId` is best handled by passing the
480 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
481 generally similarly flexible.
482 """
483 externalDatasetType = None
484 internalDatasetType = None
485 if isinstance(datasetRefOrType, DatasetRef):
486 if dataId is not None or kwds:
487 raise ValueError("DatasetRef given, cannot use dataId as well")
488 externalDatasetType = datasetRefOrType.datasetType
489 dataId = datasetRefOrType.dataId
490 else:
491 # Don't check whether DataId is provided, because Registry APIs
492 # can usually construct a better error message when it wasn't.
493 if isinstance(datasetRefOrType, DatasetType):
494 externalDatasetType = datasetRefOrType
495 else:
496 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
498 # Check that they are self-consistent
499 if externalDatasetType is not None:
500 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
501 if externalDatasetType != internalDatasetType:
502 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
503 f"registry definition ({internalDatasetType})")
505 return internalDatasetType, dataId
507 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
508 dataId: Optional[DataId] = None, *,
509 collections: Any = None,
510 allowUnresolved: bool = False,
511 **kwds: Any) -> DatasetRef:
512 """Shared logic for methods that start with a search for a dataset in
513 the registry.
515 Parameters
516 ----------
517 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
518 When `DatasetRef` the `dataId` should be `None`.
519 Otherwise the `DatasetType` or name thereof.
520 dataId : `dict` or `DataCoordinate`, optional
521 A `dict` of `Dimension` link name, value pairs that label the
522 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
523 should be provided as the first argument.
524 collections : Any, optional
525 Collections to be searched, overriding ``self.collections``.
526 Can be any of the types supported by the ``collections`` argument
527 to butler construction.
528 allowUnresolved : `bool`, optional
529 If `True`, return an unresolved `DatasetRef` if finding a resolved
530 one in the `Registry` fails. Defaults to `False`.
531 kwds
532 Additional keyword arguments used to augment or construct a
533 `DataId`. See `DataId` parameters.
535 Returns
536 -------
537 ref : `DatasetRef`
538 A reference to the dataset identified by the given arguments.
540 Raises
541 ------
542 LookupError
543 Raised if no matching dataset exists in the `Registry` (and
544 ``allowUnresolved is False``).
545 ValueError
546 Raised if a resolved `DatasetRef` was passed as an input, but it
547 differs from the one found in the registry.
548 TypeError
549 Raised if no collections were provided.
550 """
551 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
552 if isinstance(datasetRefOrType, DatasetRef):
553 idNumber = datasetRefOrType.id
554 else:
555 idNumber = None
556 timespan: Optional[Timespan] = None
558 # Process data ID keys that use dimension record information
559 # rather than primary key values.
560 newDataId: dict[Any, Any] = {}
561 byRecord: dict[Any, dict[str, Any]] = defaultdict(dict)
563 # if all the dataId comes from keyword parameters we do not need
564 # to do anything here because they can't be of the form
565 # exposure.obs_id because a "." is not allowed in a keyword parameter.
566 if dataId:
567 for k, v in dataId.items():
568 # If we have a Dimension we do not need to do anything
569 # because it cannot be a compound key.
570 if isinstance(k, str) and "." in k:
571 # Someone is using a more human-readable dataId
572 dimension, record = k.split(".", 1)
573 byRecord[dimension][record] = v
574 else:
575 newDataId[k] = v
577 if byRecord:
578 # Some record specifiers were found so we need to convert
579 # them to the Id form
580 for dimensionName, values in byRecord.items():
581 if dimensionName in newDataId:
582 log.warning("DataId specified explicit %s dimension value of %s in addition to"
583 " general record specifiers for it of %s. Ignoring record information.",
584 dimensionName, newDataId[dimensionName], str(values))
585 continue
587 # Build up a WHERE expression -- use single quotes
588 def quote(s):
589 if isinstance(s, str):
590 return f"'{s}'"
591 else:
592 return s
594 where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}"
595 for k, v in values.items())
597 # Hopefully we get a single record that matches
598 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
599 where=where, **kwds))
601 if len(records) != 1:
602 if len(records) > 1:
603 log.debug("Received %d records from constraints of %s", len(records), str(values))
604 for r in records:
605 log.debug("- %s", str(r))
606 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
607 f" uniquely constrained to a single dataset by {values}."
608 f" Got {len(records)} results.")
609 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
610 f" records when constrained by {values}")
612 # Get the primary key from the real dimension object
613 dimension = self.registry.dimensions[dimensionName]
614 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
616 # We have modified the dataId so need to switch to it
617 dataId = newDataId
619 if datasetType.isCalibration():
620 # Because this is a calibration dataset, first try to
621 # standardize the data ID without restricting the dimensions to
622 # those of the dataset type requested, because there may be extra
623 # dimensions that provide temporal information for a validity-range
624 # lookup.
625 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions, **kwds)
626 if dataId.graph.temporal:
627 dataId = self.registry.expandDataId(dataId)
628 timespan = dataId.timespan
629 else:
630 # Standardize the data ID to just the dimensions of the dataset
631 # type instead of letting registry.findDataset do it, so we get the
632 # result even if no dataset is found.
633 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions, **kwds)
634 if collections is None:
635 collections = self.collections
636 if not collections:
637 raise TypeError("No input collections provided.")
638 else:
639 collections = CollectionSearch.fromExpression(collections)
640 # Always lookup the DatasetRef, even if one is given, to ensure it is
641 # present in the current collection.
642 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
643 if ref is None:
644 if allowUnresolved:
645 return DatasetRef(datasetType, dataId)
646 else:
647 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
648 f"could not be found in collections {collections}.")
649 if idNumber is not None and idNumber != ref.id:
650 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
651 f"id ({ref.id}) in registry in collections {collections}.")
652 return ref
654 @transactional
655 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
656 dataId: Optional[DataId] = None, *,
657 run: Optional[str] = None,
658 tags: Optional[Iterable[str]] = None,
659 **kwds: Any) -> DatasetRef:
660 """Store and register a dataset.
662 Parameters
663 ----------
664 obj : `object`
665 The dataset.
666 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
667 When `DatasetRef` is provided, ``dataId`` should be `None`.
668 Otherwise the `DatasetType` or name thereof.
669 dataId : `dict` or `DataCoordinate`
670 A `dict` of `Dimension` link name, value pairs that label the
671 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
672 should be provided as the second argument.
673 run : `str`, optional
674 The name of the run the dataset should be added to, overriding
675 ``self.run``.
676 tags : `Iterable` [ `str` ], optional
677 The names of `~CollectionType.TAGGED` collections to associate
678 the dataset with, overriding ``self.tags``. These collections
679 must have already been added to the `Registry`.
680 kwds
681 Additional keyword arguments used to augment or construct a
682 `DataCoordinate`. See `DataCoordinate.standardize`
683 parameters.
685 Returns
686 -------
687 ref : `DatasetRef`
688 A reference to the stored dataset, updated with the correct id if
689 given.
691 Raises
692 ------
693 TypeError
694 Raised if the butler is read-only or if no run has been provided.
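Examples
--------
A minimal sketch; the dataset type name and data ID keys below are
placeholders that must match definitions in the repository::

    # Store ``exposure`` in this butler's run and obtain a resolved ref.
    ref = butler.put(exposure, "calexp",
                     instrument="HSC", visit=903334, detector=42)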
695 """
696 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
697 if not self.isWriteable():
698 raise TypeError("Butler is read-only.")
699 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
700 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
701 raise ValueError("DatasetRef must not be in registry, must have None id")
703 if run is None:
704 if self.run is None:
705 raise TypeError("No run provided.")
706 run = self.run
707 # No need to check type for run; first thing we do is
708 # insertDatasets, and that will check for us.
710 if tags is None:
711 tags = self.tags
712 else:
713 tags = tuple(tags)
714 for tag in tags:
715 # Check that these are tagged collections up front, because we want
716 # to avoid relying on Datastore transactionality to avoid modifying
717 # the repo if there's an error later.
718 collectionType = self.registry.getCollectionType(tag)
719 if collectionType is not CollectionType.TAGGED:
720 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
721 f"{collectionType.name}.")
723 # Add Registry Dataset entry.
724 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
725 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
727 # Add Datastore entry.
728 self.datastore.put(obj, ref)
730 for tag in tags:
731 self.registry.associate(tag, [ref])
733 return ref
735 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None):
736 """Retrieve a stored dataset.
738 Unlike `Butler.get`, this method allows datasets outside the Butler's
739 collection to be read as long as the `DatasetRef` that identifies them
740 can be obtained separately.
742 Parameters
743 ----------
744 ref : `DatasetRef`
745 Resolved reference to an already stored dataset.
746 parameters : `dict`
747 Additional StorageClass-defined options to control reading,
748 typically used to efficiently read only a subset of the dataset.
750 Returns
751 -------
752 obj : `object`
753 The dataset.
754 """
755 return self.datastore.get(ref, parameters=parameters)
757 def getDirectDeferred(self, ref: DatasetRef, *,
758 parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
759 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
760 from a resolved `DatasetRef`.
762 Parameters
763 ----------
764 ref : `DatasetRef`
765 Resolved reference to an already stored dataset.
766 parameters : `dict`
767 Additional StorageClass-defined options to control reading,
768 typically used to efficiently read only a subset of the dataset.
770 Returns
771 -------
772 obj : `DeferredDatasetHandle`
773 A handle which can be used to retrieve a dataset at a later time.
775 Raises
776 ------
777 AmbiguousDatasetError
778 Raised if ``ref.id is None``, i.e. the reference is unresolved.
779 """
780 if ref.id is None:
781 raise AmbiguousDatasetError(
782 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
783 )
784 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
786 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
787 dataId: Optional[DataId] = None, *,
788 parameters: Union[dict, None] = None,
789 collections: Any = None,
790 **kwds: Any) -> DeferredDatasetHandle:
791 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
792 after an immediate registry lookup.
794 Parameters
795 ----------
796 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
797 When `DatasetRef` the `dataId` should be `None`.
798 Otherwise the `DatasetType` or name thereof.
799 dataId : `dict` or `DataCoordinate`, optional
800 A `dict` of `Dimension` link name, value pairs that label the
801 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
802 should be provided as the first argument.
803 parameters : `dict`
804 Additional StorageClass-defined options to control reading,
805 typically used to efficiently read only a subset of the dataset.
806 collections : Any, optional
807 Collections to be searched, overriding ``self.collections``.
808 Can be any of the types supported by the ``collections`` argument
809 to butler construction.
810 kwds
811 Additional keyword arguments used to augment or construct a
812 `DataId`. See `DataId` parameters.
814 Returns
815 -------
816 obj : `DeferredDatasetHandle`
817 A handle which can be used to retrieve a dataset at a later time.
819 Raises
820 ------
821 LookupError
822 Raised if no matching dataset exists in the `Registry`.
824 ValueError
825 Raised if a resolved `DatasetRef` was passed as an input, but it
826 differs from the one found in the registry.
827 TypeError
828 Raised if no collections were provided.
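Examples
--------
A minimal sketch; the dataset type name and data ID are placeholders,
and the dataset is read back later via the handle's
`DeferredDatasetHandle.get` method::

    # Resolve the registry lookup now, defer the datastore read.
    handle = butler.getDeferred("calexp",
                                instrument="HSC", visit=903334, detector=42)
    exposure = handle.get()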
829 """
830 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
831 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
833 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
834 dataId: Optional[DataId] = None, *,
835 parameters: Optional[Dict[str, Any]] = None,
836 collections: Any = None,
837 **kwds: Any) -> Any:
838 """Retrieve a stored dataset.
840 Parameters
841 ----------
842 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
843 When `DatasetRef` the `dataId` should be `None`.
844 Otherwise the `DatasetType` or name thereof.
845 dataId : `dict` or `DataCoordinate`
846 A `dict` of `Dimension` link name, value pairs that label the
847 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
848 should be provided as the first argument.
849 parameters : `dict`
850 Additional StorageClass-defined options to control reading,
851 typically used to efficiently read only a subset of the dataset.
852 collections : Any, optional
853 Collections to be searched, overriding ``self.collections``.
854 Can be any of the types supported by the ``collections`` argument
855 to butler construction.
856 kwds
857 Additional keyword arguments used to augment or construct a
858 `DataCoordinate`. See `DataCoordinate.standardize`
859 parameters.
861 Returns
862 -------
863 obj : `object`
864 The dataset.
866 Raises
867 ------
868 ValueError
869 Raised if a resolved `DatasetRef` was passed as an input, but it
870 differs from the one found in the registry.
871 LookupError
872 Raised if no matching dataset exists in the `Registry`.
873 TypeError
874 Raised if no collections were provided.
876 Notes
877 -----
878 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
879 this method requires that the given data ID include temporal dimensions
880 beyond the dimensions of the dataset type itself, in order to find the
881 dataset with the appropriate validity range. For example, a "bias"
882 dataset with native dimensions ``{instrument, detector}`` could be
883 fetched with a ``{instrument, detector, exposure}`` data ID, because
884 ``exposure`` is a temporal dimension.
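Examples
--------
A minimal sketch; the dataset type name, data ID keys, and collection
name are placeholders::

    # Read from this butler's default collections.
    calexp = butler.get("calexp", instrument="HSC", visit=903334, detector=42)
    # Read the same dataset from an explicitly given collection instead.
    calexp = butler.get("calexp", instrument="HSC", visit=903334, detector=42,
                        collections=["u/alice/DM-50000"])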
885 """
886 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
887 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
888 return self.getDirect(ref, parameters=parameters)
890 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
891 dataId: Optional[DataId] = None, *,
892 predict: bool = False,
893 collections: Any = None,
894 run: Optional[str] = None,
895 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
896 """Returns the URIs associated with the dataset.
898 Parameters
899 ----------
900 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
901 When `DatasetRef` the `dataId` should be `None`.
902 Otherwise the `DatasetType` or name thereof.
903 dataId : `dict` or `DataCoordinate`
904 A `dict` of `Dimension` link name, value pairs that label the
905 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
906 should be provided as the first argument.
907 predict : `bool`
908 If `True`, allow URIs to be returned for datasets that have not
909 been written.
910 collections : Any, optional
911 Collections to be searched, overriding ``self.collections``.
912 Can be any of the types supported by the ``collections`` argument
913 to butler construction.
914 run : `str`, optional
915 Run to use for predictions, overriding ``self.run``.
916 kwds
917 Additional keyword arguments used to augment or construct a
918 `DataCoordinate`. See `DataCoordinate.standardize`
919 parameters.
921 Returns
922 -------
923 primary : `ButlerURI`
924 The URI to the primary artifact associated with this dataset.
925 If the dataset was disassembled within the datastore this
926 may be `None`.
927 components : `dict`
928 URIs to any components associated with the dataset artifact.
929 Can be empty if there are no components.
930 """
931 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
932 collections=collections, **kwds)
933 if ref.id is None: # only possible if predict is True
934 if run is None:
935 run = self.run
936 if run is None:
937 raise TypeError("Cannot predict location with run=None.")
938 # Lie about ID, because we can't guess it, and only
939 # Datastore.getURIs() will ever see it (and it doesn't use it).
940 ref = ref.resolved(id=0, run=run)
941 return self.datastore.getURIs(ref, predict)
943 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
944 dataId: Optional[DataId] = None, *,
945 predict: bool = False,
946 collections: Any = None,
947 run: Optional[str] = None,
948 **kwds: Any) -> ButlerURI:
949 """Return the URI to the Dataset.
951 Parameters
952 ----------
953 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
954 When `DatasetRef` the `dataId` should be `None`.
955 Otherwise the `DatasetType` or name thereof.
956 dataId : `dict` or `DataCoordinate`
957 A `dict` of `Dimension` link name, value pairs that label the
958 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
959 should be provided as the first argument.
960 predict : `bool`
961 If `True`, allow URIs to be returned for datasets that have not
962 been written.
963 collections : Any, optional
964 Collections to be searched, overriding ``self.collections``.
965 Can be any of the types supported by the ``collections`` argument
966 to butler construction.
967 run : `str`, optional
968 Run to use for predictions, overriding ``self.run``.
969 kwds
970 Additional keyword arguments used to augment or construct a
971 `DataCoordinate`. See `DataCoordinate.standardize`
972 parameters.
974 Returns
975 -------
976 uri : `ButlerURI`
977 URI pointing to the Dataset within the datastore. If the
978 Dataset does not exist in the datastore, and if ``predict`` is
979 `True`, the URI will be a prediction and will include a URI
980 fragment "#predicted".
981 If the datastore does not have entities that relate well
982 to the concept of a URI the returned URI string will be
983 descriptive. The returned URI is not guaranteed to be obtainable.
985 Raises
986 ------
987 LookupError
988 Raised if a URI is requested for a dataset that does not exist
989 and guessing is not allowed.
990 ValueError
991 Raised if a resolved `DatasetRef` was passed as an input, but it
992 differs from the one found in the registry.
993 TypeError
994 Raised if no collections were provided.
995 RuntimeError
996 Raised if a URI is requested for a dataset that consists of
997 multiple artifacts.
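Examples
--------
A minimal sketch; the dataset type name, data ID, and run name are
placeholders::

    # URI of a dataset that already exists in the datastore.
    uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=42)
    # Predicted URI of a dataset that has not been written yet.
    uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=42,
                        predict=True, run="u/alice/DM-50000/a")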
998 """
999 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
1000 collections=collections, run=run, **kwds)
1002 if primary is None or components:
1003 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1004 "Use Butler.getURIs() instead.")
1005 return primary
1007 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1008 dataId: Optional[DataId] = None, *,
1009 collections: Any = None,
1010 **kwds: Any) -> bool:
1011 """Return True if the Dataset is actually present in the Datastore.
1013 Parameters
1014 ----------
1015 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1016 When `DatasetRef` the `dataId` should be `None`.
1017 Otherwise the `DatasetType` or name thereof.
1018 dataId : `dict` or `DataCoordinate`
1019 A `dict` of `Dimension` link name, value pairs that label the
1020 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1021 should be provided as the first argument.
1022 collections : Any, optional
1023 Collections to be searched, overriding ``self.collections``.
1024 Can be any of the types supported by the ``collections`` argument
1025 to butler construction.
1026 kwds
1027 Additional keyword arguments used to augment or construct a
1028 `DataCoordinate`. See `DataCoordinate.standardize`
1029 parameters.
1031 Raises
1032 ------
1033 LookupError
1034 Raised if the dataset is not even present in the Registry.
1035 ValueError
1036 Raised if a resolved `DatasetRef` was passed as an input, but it
1037 differs from the one found in the registry.
1038 TypeError
1039 Raised if no collections were provided.
1040 """
1041 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1042 return self.datastore.exists(ref)
1044 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False):
1045 """Remove a collection and possibly prune datasets within it.
1047 Parameters
1048 ----------
1049 name : `str`
1050 Name of the collection to remove. If this is a
1051 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1052 datasets within the collection are not modified unless ``unstore``
1053 is `True`. If this is a `~CollectionType.RUN` collection,
1054 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1055 are fully removed from the data repository.
1056 purge : `bool`, optional
1057 If `True`, permit `~CollectionType.RUN` collections to be removed,
1058 fully removing datasets within them. Requires ``unstore=True`` as
1059 well, as an added precaution against accidental deletion. Must be
1060 `False` (default) if the collection is not a ``RUN``.
1061 unstore : `bool`, optional
1062 If `True`, remove all datasets in the collection from all
1063 datastores in which they appear.
1065 Raises
1066 ------
1067 TypeError
1068 Raised if the butler is read-only or arguments are mutually
1069 inconsistent.
1070 """
1071 # See pruneDatasets comments for more information about the logic here;
1072 # the cases are almost the same, but here we can rely on Registry to
1073 # take care of everything but Datastore deletion when we remove the
1074 # collection.
1075 if not self.isWriteable():
1076 raise TypeError("Butler is read-only.")
1077 if purge and not unstore:
1078 raise TypeError("Cannot pass purge=True without unstore=True.")
1079 collectionType = self.registry.getCollectionType(name)
1080 if collectionType is CollectionType.RUN and not purge:
1081 raise TypeError(f"Cannot prune RUN collection {name} without purge=True.")
1082 if collectionType is not CollectionType.RUN and purge:
1083 raise TypeError(f"Cannot prune {collectionType.name} collection {name} with purge=True.")
1084 with self.registry.transaction():
1085 if unstore:
1086 for ref in self.registry.queryDatasets(..., collections=name, findFirst=True):
1087 if self.datastore.exists(ref):
1088 self.datastore.trash(ref)
1089 self.registry.removeCollection(name)
1090 if unstore:
1091 # Point of no return for removing artifacts
1092 self.datastore.emptyTrash()
1094 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
1095 disassociate: bool = True,
1096 unstore: bool = False,
1097 tags: Optional[Iterable[str]] = None,
1098 purge: bool = False,
1099 run: Optional[str] = None):
1100 """Remove one or more datasets from a collection and/or storage.
1102 Parameters
1103 ----------
1104 refs : `~collections.abc.Iterable` of `DatasetRef`
1105 Datasets to prune. These must be "resolved" references (not just
1106 a `DatasetType` and data ID).
1107 disassociate : `bool`, optional
1108 Disassociate pruned datasets from ``self.tags`` (or the collections
1109 given via the ``tags`` argument).
1110 unstore : `bool`, optional
1111 If `True` (`False` is default) remove these datasets from all
1112 datastores known to this butler. Note that this will make it
1113 impossible to retrieve these datasets even via other collections.
1114 Datasets that are already not stored are ignored by this option.
1115 tags : `Iterable` [ `str` ], optional
1116 `~CollectionType.TAGGED` collections to disassociate the datasets
1117 from, overriding ``self.tags``. Ignored if ``disassociate`` is
1118 `False` or ``purge`` is `True`.
1119 purge : `bool`, optional
1120 If `True` (`False` is default), completely remove the dataset from
1121 the `Registry`. To prevent accidental deletions, ``purge`` may
1122 only be `True` if all of the following conditions are met:
1124 - All given datasets are in the given run;
1125 - ``disassociate`` is `True`;
1126 - ``unstore`` is `True`.
1128 This mode may remove provenance information from datasets other
1129 than those provided, and should be used with extreme care.
1130 run : `str`, optional
1131 `~CollectionType.RUN` collection to purge from, overriding
1132 ``self.run``. Ignored unless ``purge`` is `True`.
1134 Raises
1135 ------
1136 TypeError
1137 Raised if the butler is read-only, if no collection was provided,
1138 or the conditions for ``purge=True`` were not met.
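Examples
--------
A minimal sketch that fully removes every dataset in one run; the
dataset type and run names are placeholders::

    refs = butler.registry.queryDatasets("calexp",
                                         collections="u/alice/DM-50000/a")
    # purge=True requires unstore=True (disassociate already defaults
    # to True) and the run to purge from.
    butler.pruneDatasets(refs, purge=True, unstore=True,
                         run="u/alice/DM-50000/a")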
1139 """
1140 if not self.isWriteable():
1141 raise TypeError("Butler is read-only.")
1142 if purge:
1143 if not disassociate:
1144 raise TypeError("Cannot pass purge=True without disassociate=True.")
1145 if not unstore:
1146 raise TypeError("Cannot pass purge=True without unstore=True.")
1147 if run is None:
1148 run = self.run
1149 if run is None:
1150 raise TypeError("No run provided but purge=True.")
1151 collectionType = self.registry.getCollectionType(run)
1152 if collectionType is not CollectionType.RUN:
1153 raise TypeError(f"Cannot purge from collection '{run}' "
1154 f"of non-RUN type {collectionType.name}.")
1155 elif disassociate:
1156 if tags is None:
1157 tags = self.tags
1158 else:
1159 tags = tuple(tags)
1160 if not tags:
1161 raise TypeError("No tags provided but disassociate=True.")
1162 for tag in tags:
1163 collectionType = self.registry.getCollectionType(tag)
1164 if collectionType is not CollectionType.TAGGED:
1165 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1166 f"of non-TAGGED type {collectionType.name}.")
1167 # Transform possibly-single-pass iterable into something we can iterate
1168 # over multiple times.
1169 refs = list(refs)
1170 # Pruning a component of a DatasetRef makes no sense since registry
1171 # doesn't know about components and datastore might not store
1172 # components in a separate file
1173 for ref in refs:
1174 if ref.datasetType.component():
1175 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1176 # We don't need an unreliable Datastore transaction for this, because
1177 # we've been extra careful to ensure that Datastore.trash only involves
1178 # mutating the Registry (it can _look_ at Datastore-specific things,
1179 # but shouldn't change them), and hence all operations here are
1180 # Registry operations.
1181 with self.registry.transaction():
1182 if unstore:
1183 for ref in refs:
1184 # There is a difference between a concrete composite
1185 # and virtual composite. In a virtual composite the
1186 # datastore is never given the top level DatasetRef. In
1187 # the concrete composite the datastore knows all the
1188 # refs and will clean up itself if asked to remove the
1189 # parent ref. We can not check configuration for this
1190 # since we can not trust that the configuration is the
1191 # same. We therefore have to ask if the ref exists or
1192 # not. This is consistent with the fact that we want
1193 # to ignore already-removed-from-datastore datasets
1194 # anyway.
1195 if self.datastore.exists(ref):
1196 self.datastore.trash(ref)
1197 if purge:
1198 self.registry.removeDatasets(refs)
1199 elif disassociate:
1200 for tag in tags:
1201 self.registry.disassociate(tag, refs)
1202 # We've exited the Registry transaction, and apparently committed.
1203 # (if there was an exception, everything rolled back, and it's as if
1204 # nothing happened - and we never get here).
1205 # Datastore artifacts are not yet gone, but they're clearly marked
1206 # as trash, so if we fail to delete now because of (e.g.) filesystem
1207 # problems we can try again later, and if manual administrative
1208 # intervention is required, it's pretty clear what that should entail:
1209 # deleting everything on disk and in private Datastore tables that is
1210 # in the dataset_location_trash table.
1211 if unstore:
1212 # Point of no return for removing artifacts
1213 self.datastore.emptyTrash()
1215 @transactional
1216 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1217 tags: Optional[Iterable[str]] = None):
1218 """Store and register one or more datasets that already exist on disk.
1220 Parameters
1221 ----------
1222 datasets : `FileDataset`
1223 Each positional argument is a struct containing information about
1224 a file to be ingested, including its path (either absolute or
1225 relative to the datastore root, if applicable), a `DatasetRef`,
1226 and optionally a formatter class or its fully-qualified string
1227 name. If a formatter is not provided, the formatter that would be
1228 used for `put` is assumed. On successful return, all
1229 `FileDataset.ref` attributes will have their `DatasetRef.id`
1230 attribute populated and all `FileDataset.formatter` attributes will
1231 be set to the formatter class used. `FileDataset.path` attributes
1232 may be modified to put paths in whatever the datastore considers a
1233 standardized form.
1234 transfer : `str`, optional
1235 If not `None`, must be one of 'auto', 'move', 'copy', 'hardlink',
1236 'relsymlink' or 'symlink', indicating how to transfer the file.
1237 run : `str`, optional
1238 The name of the run ingested datasets should be added to,
1239 overriding ``self.run``.
1240 tags : `Iterable` [ `str` ], optional
1241 The names of `~CollectionType.TAGGED` collections to associate
1242 the dataset with, overriding ``self.tags``. These collections
1243 must have already been added to the `Registry`.
1245 Raises
1246 ------
1247 TypeError
1248 Raised if the butler is read-only or if no run was provided.
1249 NotImplementedError
1250 Raised if the `Datastore` does not support the given transfer mode.
1251 DatasetTypeNotSupportedError
1252 Raised if one or more files to be ingested have a dataset type that
1253 is not supported by the `Datastore`.
1254 FileNotFoundError
1255 Raised if one of the given files does not exist.
1256 FileExistsError
1257 Raised if transfer is not `None` but the (internal) location the
1258 file would be moved to is already occupied.
1260 Notes
1261 -----
1262 This operation is not fully exception safe: if a database operation
1263 fails, the given `FileDataset` instances may be only partially updated.
1265 It is atomic in terms of database operations (they will either all
1266 succeed or all fail) provided the database engine implements
1267 transactions correctly. It will attempt to be atomic in terms of
1268 filesystem operations as well, but this cannot be implemented
1269 rigorously for most datastores.
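Examples
--------
A minimal sketch; the dataset type name, data ID keys, and file path
are placeholders, and the dataset type must already be registered::

    datasetType = butler.registry.getDatasetType("raw")
    dataId = {"instrument": "HSC", "exposure": 903334, "detector": 42}
    dataset = FileDataset(path="/data/raw/file.fits",
                          refs=[DatasetRef(datasetType, dataId)])
    butler.ingest(dataset, transfer="symlink")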
1270 """
1271 if not self.isWriteable():
1272 raise TypeError("Butler is read-only.")
1273 if run is None:
1274 if self.run is None:
1275 raise TypeError("No run provided.")
1276 run = self.run
1277 # No need to check run type, since insertDatasets will do that
1278 # (safely) for us.
1279 if tags is None:
1280 tags = self.tags
1281 else:
1282 tags = tuple(tags)
1283 for tag in tags:
1284 # Check that these are tagged collections up front, because we want
1285 # to avoid relying on Datastore transactionality to avoid modifying
1286 # the repo if there's an error later.
1287 collectionType = self.registry.getCollectionType(tag)
1288 if collectionType is not CollectionType.TAGGED:
1289 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
1290 f"{collectionType.name}.")
1291 # Reorganize the inputs so they're grouped by DatasetType and then
1292 # data ID. We also include a list of DatasetRefs for each FileDataset
1293 # to hold the resolved DatasetRefs returned by the Registry, before
1294 # it's safe to swap them into FileDataset.refs.
1295 # Some type annotation aliases to make that clearer:
1296 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1297 GroupedData = MutableMapping[DatasetType, GroupForType]
1298 # The actual data structure:
1299 groupedData: GroupedData = defaultdict(dict)
1300 # And the nested loop that populates it:
1301 for dataset in datasets:
1302 # This list intentionally shared across the inner loop, since it's
1303 # associated with `dataset`.
1304 resolvedRefs = []
1305 for ref in dataset.refs:
1306 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1308 # Now we can bulk-insert into Registry for each DatasetType.
1309 allResolvedRefs = []
1310 for datasetType, groupForType in groupedData.items():
1311 refs = self.registry.insertDatasets(datasetType,
1312 dataIds=groupForType.keys(),
1313 run=run)
1314 # Append those resolved DatasetRefs to the new lists we set up for
1315 # them.
1316 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1317 resolvedRefs.append(ref)
1319 # Go back to the original FileDatasets to replace their refs with the
1320 # new resolved ones, and also build a big list of all refs.
1321 allResolvedRefs = []
1322 for groupForType in groupedData.values():
1323 for dataset, resolvedRefs in groupForType.values():
1324 dataset.refs = resolvedRefs
1325 allResolvedRefs.extend(resolvedRefs)
1327 # Bulk-associate everything with any tagged collections.
1328 for tag in tags:
1329 self.registry.associate(tag, allResolvedRefs)
1331 # Bulk-insert everything into Datastore.
1332 self.datastore.ingest(*datasets, transfer=transfer)
1334 @contextlib.contextmanager
1335 def export(self, *, directory: Optional[str] = None,
1336 filename: Optional[str] = None,
1337 format: Optional[str] = None,
1338 transfer: Optional[str] = None) -> ContextManager[RepoExportContext]:
1339 """Export datasets from the repository represented by this `Butler`.
1341 This method is a context manager that returns a helper object
1342 (`RepoExportContext`) that is used to indicate what information from
1343 the repository should be exported.
1345 Parameters
1346 ----------
1347 directory : `str`, optional
1348 Directory dataset files should be written to if ``transfer`` is not
1349 `None`.
1350 filename : `str`, optional
1351 Name for the file that will include database information associated
1352 with the exported datasets. If this is not an absolute path and
1353 ``directory`` is not `None`, it will be written to ``directory``
1354 instead of the current working directory. Defaults to
1355 "export.{format}".
1356 format : `str`, optional
1357 File format for the database information file. If `None`, the
1358 extension of ``filename`` will be used.
1359 transfer : `str`, optional
1360 Transfer mode passed to `Datastore.export`.
1362 Raises
1363 ------
1364 TypeError
1365 Raised if the set of arguments passed is inconsistent.
1367 Examples
1368 --------
1369 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1370 methods are used to provide the iterables over data IDs and/or datasets
1371 to be exported::
1373 with butler.export("exports.yaml") as export:
1374 # Export all flats, but none of the dimension element rows
1375 # (i.e. data ID information) associated with them.
1376 export.saveDatasets(butler.registry.queryDatasets("flat"),
1377 elements=())
1378 # Export all datasets that start with "deepCoadd_" and all of
1379 # their associated data ID information.
1380 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1381 """
1382 if directory is None and transfer is not None:
1383 raise TypeError("Cannot transfer without providing a directory.")
1384 if transfer == "move":
1385 raise TypeError("Transfer may not be 'move': export is read-only")
1386 if format is None:
1387 if filename is None:
1388 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1389 else:
1390 _, format = os.path.splitext(filename)
1391 elif filename is None:
1392 filename = f"export.{format}"
1393 if directory is not None:
1394 filename = os.path.join(directory, filename)
1395 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1396 with open(filename, 'w') as stream:
1397 backend = BackendClass(stream)
1398 try:
1399 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1400 directory=directory, transfer=transfer)
1401 yield helper
1402 except BaseException:
1403 raise
1404 else:
1405 helper._finish()
1407 def import_(self, *, directory: Optional[str] = None,
1408 filename: Union[str, TextIO, None] = None,
1409 format: Optional[str] = None,
1410 transfer: Optional[str] = None,
1411 skip_dimensions: Optional[Set] = None):
1412 """Import datasets exported from a different butler repository.
1414 Parameters
1415 ----------
1416 directory : `str`, optional
1417 Directory containing dataset files. If `None`, all file paths
1418 must be absolute.
1419 filename : `str` or `TextIO`, optional
1420 A stream or name of file that contains database information
1421 associated with the exported datasets. If this is a string (name) and
1422 is not an absolute path, does not exist in the current working
1423 directory, and ``directory`` is not `None`, it is assumed to be in
1424 ``directory``. Defaults to "export.{format}".
1425 format : `str`, optional
1426 File format for the database information file. If `None`, the
1427 extension of ``filename`` will be used.
1428 transfer : `str`, optional
1429 Transfer mode passed to `Datastore.ingest`.
1430 skip_dimensions : `set`, optional
1431 Names of dimensions that should be skipped and not imported.
1433 Raises
1434 ------
1435 TypeError
1436 Raised if the set of arguments passed is inconsistent, or if the
1437 butler is read-only.
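Examples
--------
A minimal sketch; the directory and file names are placeholders and
should point at the output of a previous `Butler.export` call::

    butler.import_(directory="/path/to/exports",
                   filename="export.yaml", transfer="symlink")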
1438 """
1439 if not self.isWriteable():
1440 raise TypeError("Butler is read-only.")
1441 if format is None:
1442 if filename is None:
1443 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1444 else:
1445 _, format = os.path.splitext(filename)
1446 elif filename is None:
1447 filename = f"export.{format}"
1448 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1449 filename = os.path.join(directory, filename)
1450 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1452 def doImport(importStream):
1453 backend = BackendClass(importStream, self.registry)
1454 backend.register()
1455 with self.transaction():
1456 backend.load(self.datastore, directory=directory, transfer=transfer,
1457 skip_dimensions=skip_dimensions)
1459 if isinstance(filename, str):
1460 with open(filename, "r") as stream:
1461 doImport(stream)
1462 else:
1463 doImport(filename)
1465 def validateConfiguration(self, logFailures: bool = False,
1466 datasetTypeNames: Optional[Iterable[str]] = None,
1467 ignore: Optional[Iterable[str]] = None):
1468 """Validate butler configuration.
1470 Checks that each `DatasetType` can be stored in the `Datastore`.
1472 Parameters
1473 ----------
1474 logFailures : `bool`, optional
1475 If `True`, output a log message for every validation error
1476 detected.
1477 datasetTypeNames : iterable of `str`, optional
1478 The `DatasetType` names that should be checked. This allows
1479 only a subset to be selected.
1480 ignore : iterable of `str`, optional
1481 Names of DatasetTypes to skip over. This can be used to skip
1482 known problems. If a named `DatasetType` corresponds to a
1483 composite, all components of that `DatasetType` will also be
1484 ignored.
1486 Raises
1487 ------
1488 ButlerValidationError
1489 Raised if there is some inconsistency with how this Butler
1490 is configured.
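Examples
--------
A minimal sketch; the dataset type names are placeholders::

    # Check every registered dataset type, logging each failure.
    butler.validateConfiguration(logFailures=True)
    # Restrict the check to a few known dataset types.
    butler.validateConfiguration(datasetTypeNames=["calexp", "raw"])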
1491 """
1492 if datasetTypeNames:
1493 entities = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1494 else:
1495 entities = list(self.registry.queryDatasetTypes())
1497 # filter out anything from the ignore list
1498 if ignore:
1499 ignore = set(ignore)
1500 entities = [e for e in entities if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1501 else:
1502 ignore = set()
1504 # Find all the registered instruments
1505 instruments = set(
1506 record.name for record in self.registry.queryDimensionRecords("instrument")
1507 )
1509 # For each datasetType that has an instrument dimension, create
1510 # a DatasetRef for each defined instrument
1511 datasetRefs = []
1513 for datasetType in entities:
1514 if "instrument" in datasetType.dimensions:
1515 for instrument in instruments:
1516 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, conform=False)
1517 datasetRefs.append(datasetRef)
1519 entities.extend(datasetRefs)
1521 datastoreErrorStr = None
1522 try:
1523 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1524 except ValidationError as e:
1525 datastoreErrorStr = str(e)
1527 # Also check that the LookupKeys used by the datastores match
1528 # registry and storage class definitions
1529 keys = self.datastore.getLookupKeys()
1531 failedNames = set()
1532 failedDataId = set()
1533 for key in keys:
1534 datasetType = None
1535 if key.name is not None:
1536 if key.name in ignore:
1537 continue
1539 # skip if specific datasetType names were requested and this
1540 # name does not match
1541 if datasetTypeNames and key.name not in datasetTypeNames:
1542 continue
1544 # See if it is a StorageClass or a DatasetType
1545 if key.name in self.storageClasses:
1546 pass
1547 else:
1548 try:
1549 self.registry.getDatasetType(key.name)
1550 except KeyError:
1551 if logFailures:
1552 log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1553 failedNames.add(key)
1554 else:
1555 # Dimensions are checked for consistency when the Butler
1556 # is created and rendezvoused with a universe.
1557 pass
1559 # Check that the instrument is a valid instrument
1560 # Currently only support instrument so check for that
1561 if key.dataId:
1562 dataIdKeys = set(key.dataId)
1563 if set(["instrument"]) != dataIdKeys:
1564 if logFailures:
1565 log.fatal("Key '%s' has unsupported DataId override", key)
1566 failedDataId.add(key)
1567 elif key.dataId["instrument"] not in instruments:
1568 if logFailures:
1569 log.fatal("Key '%s' has unknown instrument", key)
1570 failedDataId.add(key)
1572 messages = []
1574 if datastoreErrorStr:
1575 messages.append(datastoreErrorStr)
1577 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1578 (failedDataId, "Keys with bad DataId entries: ")):
1579 if failed:
1580 msg += ", ".join(str(k) for k in failed)
1581 messages.append(msg)
1583 if messages:
1584 raise ValidationError(";\n".join(messages))
1586 registry: Registry
1587 """The object that manages dataset metadata and relationships (`Registry`).
1589 Most operations that don't involve reading or writing butler datasets are
1590 accessible only via `Registry` methods.
1591 """
1593 datastore: Datastore
1594 """The object that manages actual dataset storage (`Datastore`).
1596 Direct user access to the datastore should rarely be necessary; the primary
1597 exception is the case where a `Datastore` implementation provides extra
1598 functionality beyond what the base class defines.
1599 """
1601 storageClasses: StorageClassFactory
1602 """An object that maps known storage class names to objects that fully
1603 describe them (`StorageClassFactory`).
1604 """
1606 collections: Optional[CollectionSearch]
1607 """The collections to search and any restrictions on the dataset types to
1608 search for within them, in order (`CollectionSearch`).
1609 """
1611 run: Optional[str]
1612 """Name of the run this butler writes outputs to (`str` or `None`).
1613 """
1615 tags: Tuple[str, ...]
1616 """Names of `~CollectionType.TAGGED` collections this butler associates
1617 with in `put` and `ingest`, and disassociates from in `pruneDatasets`
1618 (`tuple` [ `str` ]).
1619 """