27 """This module defines the Butler class."""
28 from builtins
import str
29 from past.builtins
import basestring
30 from builtins
import object
37 from lsst.log
import Log
38 import lsst.pex.policy
as pexPolicy
39 from .
import ReadProxy, ButlerSubset, ButlerDataRef, Persistence, \
40 Storage, Policy, NoResults, Repository, DataId, RepositoryCfg, \
41 RepositoryArgs, listify, setify, sequencify, doImport, ButlerComposite, genericAssembler, \
42 genericDisassembler, PosixStorage, ParentsMismatch

preinitedMapperWarning = ("Passing an instantiated mapper into "
                          "Butler.__init__ will prevent Butler from passing "
                          "parentRegistry or repositoryCfg information to "
                          "the mapper, which is done only at init time. "
                          "It is better to pass an importable string or "
                          "class object.")
53 """Represents a Butler configuration.
57 cfg is 'wet paint' and very likely to change. Use of it in production
58 code other than via the 'old butler' API is strongly discouraged.
60 yaml_tag =
u"!ButlerCfg"
63 super(ButlerCfg, self).
__init__({
'repoCfg': repoCfg,
'cls': cls})
67 """Container object for repository data used by Butler
72 The arguments that are used to find or create the RepositoryCfg.
74 "input", "output", or "parent", indicating why Butler loaded this repository.
75 * input: the Repository was passed as a Butler input.
76 * output: the Repository was passed as a Butler output.
77 * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository.
82 The configuration for the Repository.
85 "new", "existing", or "nested". Indicates the origin of the repository and its RepositoryCfg:
86 * new: it was created by this instance of Butler, and this instance of Butler will generate the
88 * existing: it was found (via the root or cfgRoot argument)
89 * nested: the full RepositoryCfg was nested in another RepositoryCfg's parents list (this can happen
90 if parameters of an input specified by RepositoryArgs or dict does not entirely match an existing
94 Path or URI to the location of the RepositoryCfg file.
96 repo : lsst.daf.persistence.Repository
97 The Repository class instance.
99 parentRepoDatas : list of RepoData
100 The parents of this Repository, as indicated this Repository's RepositoryCfg. If this is a new
101 Repository then these are the inputs to this Butler (and will be saved in the RepositoryCfg). These
102 RepoData objects are not owned by this RepoData, these are references to peer RepoData objects in the
103 Butler's RepoDataContainer.
105 isV1Repository : bool
106 True if this is an Old Butler repository. In this case the repository does not have a RepositoryCfg
107 file. It may have a _mapper file and may have a _parent symlink. It will never be treated as a "new"
108 repository, i.e. even though there is not a RepositoryCfg file, one will not be generated.
109 If False, this is a New Butler repository and is specified by RepositoryCfg file.
112 These are values that may be used to restrict the search of input repositories. Details are available
113 in the RepositoryArgs and DataId classes.
116 "input", "output", or "parent", indicating why Butler loaded this repository.
117 * input: the Repository was passed as a Butler input.
118 * output: the Repository was passed as a Butler output.
119 * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository.
121 _repoArgs : RepositoryArgs
122 Contains the arguments that were used to specify this Repository.
152 "parentRepoDatas={}," +
155 "parentRegistry={})").format(
156 self.__class__.__name__,

    def setCfg(self, cfg, origin, root, isV1Repository):
        """Set information about the cfg into the RepoData

        Parameters
        ----------
        cfg : RepositoryCfg
            The RepositoryCfg for the repo.
        origin : string
            'new', 'existing', or 'nested'
        root : string
            URI or absolute path to the location of the RepositoryCfg.yaml file.
        isV1Repository : bool
            True if the repository is an Old Butler repository.
        """
        if origin not in ('new', 'existing', 'nested'):
            raise RuntimeError("Invalid value for origin:{}".format(origin))
        self.cfg = cfg
        self.cfgOrigin = origin
        self.cfgRoot = root
        self.isV1Repository = isV1Repository

    @property
    def role(self):
        return self._role

    @role.setter
    def role(self, val):
        if val not in ('input', 'output', 'parent'):
            raise RuntimeError("Invalid value for role: {}".format(val))
        self._role = val
210 """Get the parents & grandparents etc of this repo data, in depth-first search order.
212 Duplicate entries will be removed in cases where the same parent appears more than once in the parent
217 context : set, optional
218 Users should typically omit context and accept the default argument. Context is used to keep a set
219 of known RepoDatas when calling this function recursively, for duplicate elimination.
224 A list of the parents & grandparents etc of a given repo data, in depth-first search order.
229 if id(self)
in context:
231 context.add(id(self))
233 parents.append(parent)
234 parents += parent.getParentRepoDatas(context)
238 self.parentRepoDatas.append(parentRepoData)
241 self.
tags = self.tags.union(tags)
245 """Container object for RepoData instances owned by a Butler instance.
249 repoDataList : list of RepoData
250 repoData - RepoData instance to add
256 self.
_all = repoDataList
260 """Get a list of RepoData that are used to as inputs to the Butler.
261 The list is created lazily as needed, and cached.
265 A list of RepoData with readable repositories, in the order to be used when searching.
268 raise RuntimeError(
"Inputs not yet initialized.")
272 """Get a list of RepoData that are used to as outputs to the Butler.
273 The list is created lazily as needed, and cached.
277 A list of RepoData with writable repositories, in the order to be use when searching.
280 raise RuntimeError(
"Outputs not yet initialized.")
284 """Get a list of all RepoData that are used to as by the Butler.
285 The list is created lazily as needed, and cached.
289 A list of RepoData with writable repositories, in the order to be use when searching.

    def __repr__(self):
        return "%s(_inputs=%r, \n_outputs=%s, \n_all=%s)" % (
            self.__class__.__name__,
            self._inputs,
            self._outputs,
            self._all)

    def _buildLookupLists(self):
        """Build the inputs and outputs lists based on the order of self.all()."""

        def addToList(repoData, lst):
            """Add a repoData and each of its parents (depth first) to a list."""
            if id(repoData) in alreadyAdded:
                return
            lst.append(repoData)
            alreadyAdded.add(id(repoData))
            for parent in repoData.parentRepoDatas:
                addToList(parent, lst)

        if self._inputs is not None or self._outputs is not None:
            raise RuntimeError("Lookup lists are already built.")

        inputs = [repoData for repoData in self.all() if repoData.role == 'input']
        outputs = [repoData for repoData in self.all() if repoData.role == 'output']

        self._inputs = []
        alreadyAdded = set()
        # readable outputs are searched first, then the inputs, each followed (depth first) by its parents:
        for repoData in outputs:
            if 'r' in repoData.repoArgs.mode:
                addToList(repoData.repoData, self._inputs)
        for repoData in inputs:
            addToList(repoData.repoData, self._inputs)

        self._outputs = [repoData.repoData for repoData in outputs]
327 """Butler provides a generic mechanism for persisting and retrieving data using mappers.
329 A Butler manages a collection of datasets known as a repository. Each dataset has a type representing its
330 intended usage and a location. Note that the dataset type is not the same as the C++ or Python type of the
331 object containing the data. For example, an ExposureF object might be used to hold the data for a raw
332 image, a post-ISR image, a calibrated science image, or a difference image. These would all be different
335 A Butler can produce a collection of possible values for a key (or tuples of values for multiple keys) if
336 given a partial data identifier. It can check for the existence of a file containing a dataset given its
337 type and data identifier. The Butler can then retrieve the dataset. Similarly, it can persist an object to
338 an appropriate location when given its associated data identifier.
340 Note that the Butler has two more advanced features when retrieving a data set. First, the retrieval is
341 lazy. Input does not occur until the data set is actually accessed. This allows datasets to be retrieved
342 and placed on a clipboard prospectively with little cost, even if the algorithm of a stage ends up not
343 using them. Second, the Butler will call a standardization hook upon retrieval of the dataset. This
344 function, contained in the input mapper object, must perform any necessary manipulations to force the
345 retrieved object to conform to standards, including translating metadata.
349 __init__(self, root, mapper=None, **mapperArgs)
351 defineAlias(self, alias, datasetType)
353 getKeys(self, datasetType=None, level=None)
355 queryMetadata(self, datasetType, format=None, dataId={}, **rest)
357 datasetExists(self, datasetType, dataId={}, **rest)
359 get(self, datasetType, dataId={}, immediate=False, **rest)
361 put(self, obj, datasetType, dataId={}, **rest)
363 subset(self, datasetType, level=None, dataId={}, **rest)
365 dataRef(self, datasetType, level=None, dataId={}, **rest)
369 The preferred method of initialization is to use the `inputs` and `outputs` __init__ parameters. These
370 are described in the parameters section, below.
372 For backward compatibility: this initialization method signature can take a posix root path, and
373 optionally a mapper class instance or class type that will be instantiated using the mapperArgs input
374 argument. However, for this to work in a backward compatible way it creates a single repository that is
375 used as both an input and an output repository. This is NOT preferred, and will likely break any
376 provenance system we have in place.
381 .. note:: Deprecated in 12_0
382 `root` will be removed in TBD, it is replaced by `inputs` and `outputs` for
383 multiple-repository support.
384 A file system path. Will only work with a PosixRepository.
385 mapper : string or instance
386 .. note:: Deprecated in 12_0
387 `mapper` will be removed in TBD, it is replaced by `inputs` and `outputs` for
388 multiple-repository support.
389 Provides a mapper to be used with Butler.
391 .. note:: Deprecated in 12_0
392 `mapperArgs` will be removed in TBD, it is replaced by `inputs` and `outputs` for
393 multiple-repository support.
394 Provides arguments to be passed to the mapper if the mapper input argument is a class type to be
395 instantiated by Butler.
396 inputs : RepositoryArgs, dict, or string
397 Can be a single item or a list. Provides arguments to load an existing repository (or repositories).
398 String is assumed to be a URI and is used as the cfgRoot (URI to the location of the cfg file). (Local
399 file system URI does not have to start with 'file://' and in this way can be a relative path). The
400 `RepositoryArgs` class can be used to provide more parameters with which to initialize a repository
401 (such as `mapper`, `mapperArgs`, `tags`, etc. See the `RepositoryArgs` documentation for more
402 details). A dict may be used as shorthand for a `RepositoryArgs` class instance. The dict keys must
403 match parameters to the `RepositoryArgs.__init__` function.
404 outputs : RepositoryArgs, dict, or string
405 Provides arguments to load one or more existing repositories or create new ones. The different types
406 are handled the same as for `inputs`.
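
    For example, a hypothetical construction (the repository paths are illustrative, not
    defaults of this module):

    >>> butler = Butler(inputs='/path/to/existingRepo',
    ...                 outputs={'root': '/path/to/newRepo', 'mode': 'rw'})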

    The Butler init sequence loads all of the input and output repositories.
    This creates the object hierarchy to read from and write to them. Each
    repository can have 0 or more parents, which also get loaded as inputs.
    This becomes a DAG of repositories. Ultimately, Butler creates a list of
    these Repositories in the order that they are used.

    Initialization Sequence
    =======================

    During initialization Butler creates a Repository class instance & support structure for each object
    passed to `inputs` and `outputs` as well as the parent repositories recorded in the `RepositoryCfg` of
    each existing readable repository.

    This process is complex. It is explained below to shed some light on the intent of each step.

    1. Input Argument Standardization
    ---------------------------------

    In `Butler._processInputArguments` the input arguments are verified to be legal (and a RuntimeError is
    raised if not), and they are converted into an expected format that is used for the rest of the Butler
    init sequence. See the docstring for `_processInputArguments`.

    2. Create RepoData Objects
    --------------------------

    Butler uses an object, called `RepoData`, to keep track of information about each repository; each
    repository is contained in a single `RepoData`. The attributes are explained in its docstring.

    After `_processInputArguments`, a RepoData is instantiated and put in a list for each repository in
    `outputs` and `inputs`. This list of RepoData, the `repoDataList`, now represents all the output and input
    repositories (but not parent repositories) that this Butler instance will use.

    3. Get `RepositoryCfg`s
    -----------------------

    `Butler._getCfgs` gets the `RepositoryCfg` for each repository in the `repoDataList`. The behavior is
    described in the docstring.

    4. Add Parents
    --------------

    `Butler._addParents` then considers the parents list in the `RepositoryCfg` of each `RepoData` in the
    `repoDataList` and inserts new `RepoData` objects for each parent not represented in the proper location
    in the `repoDataList`. Ultimately a flat list is built to represent the DAG of readable repositories
    represented in depth-first order.

    5. Set and Verify Parents of Outputs
    ------------------------------------

    To be able to load parent repositories when output repositories are used as inputs, the input repositories
    are recorded as parents in the `RepositoryCfg` file of new output repositories. When an output repository
    already exists, for consistency the Butler's inputs must match the list of parents specified in the
    already-existing output repository's `RepositoryCfg` file.

    In `Butler._setAndVerifyParentsLists`, the list of parents is recorded in the `RepositoryCfg` of new
    repositories. For existing repositories the list of parents is compared with the `RepositoryCfg`'s parents
    list, and if they do not match a `RuntimeError` is raised.

    6. Set the Default Mapper
    -------------------------

    If all the input repositories use the same mapper then we can assume that mapper to be the
    "default mapper". If there are new output repositories whose `RepositoryArgs` do not specify a mapper and
    there is a default mapper then the new output repository will be set to use that default mapper.

    This is handled in `Butler._setDefaultMapper`.

    7. Cache References to Parent RepoDatas
    ---------------------------------------

    In `Butler._connectParentRepoDatas`, in each `RepoData` in `repoDataList`, a list of `RepoData` object
    references is built that matches the parents specified in that `RepoData`'s `RepositoryCfg`.

    This list is used later to find things in that repository's parents without considering peer
    repositories' parents (e.g. finding the registry of a parent).

    8. Set Tags
    -----------

    Tags are described at https://ldm-463.lsst.io/v/draft/#tagging

    In `Butler._setRepoDataTags`, for each `RepoData`, the tags specified by its `RepositoryArgs` are recorded
    in a set, and added to the tags set in each of its parents, for ease of lookup when mapping.
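
    For example, a hypothetical tagged input (the path, tag name, and data id keys are illustrative):

    >>> butler = Butler(inputs=RepositoryArgs(cfgRoot='/path/to/repo', tags='calib'))
    >>> dataId = DataId({'visit': 101}, tag='calib')  # only repositories tagged 'calib' are searched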

    9. Find Parent Registry and Instantiate RepoData
    ------------------------------------------------

    At this point there is enough information to instantiate the `Repository` instances. There is one final
    step before instantiating the Repository, which is to try to get a parent registry that can be used by the
    child repository. The criteria for "can be used" is spelled out in `Butler._setParentRegistry`. However,
    to get the registry from the parent, the parent must be instantiated. The `repoDataList`, in depth-first
    search order, is built so that the most-dependent repositories are first, and the least dependent
    repositories are last. So the `repoDataList` is reversed and the Repositories are instantiated in that
    order; for each RepoData a parent registry is searched for, and then the Repository is instantiated with
    whatever registry could be found."""

    def __init__(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
        self._initArgs = {'root': root, 'mapper': mapper, 'inputs': inputs, 'outputs': outputs,
                          'mapperArgs': mapperArgs}

        self.log = Log.getLogger("daf.persistence.butler")

        self.datasetTypeAliasDict = {}
        self.storage = Storage()
        persistencePolicy = pexPolicy.Policy()
        self.persistence = Persistence.getPersistence(persistencePolicy)

        inputs, outputs = self._processInputArguments(
            root=root, mapper=mapper, inputs=inputs, outputs=outputs, **mapperArgs)

        # convert the RepositoryArgs into RepoData objects:
        inputs = [RepoData(args, 'input') for args in inputs]
        outputs = [RepoData(args, 'output') for args in outputs]
        repoDataList = outputs + inputs
        self._getCfgs(repoDataList)
        self._addParents(repoDataList)
        self._setAndVerifyParentsLists(repoDataList)
        self._setDefaultMapper(repoDataList)
        self._connectParentRepoDatas(repoDataList)
        self._repos = RepoDataContainer(repoDataList)
        self._setRepoDataTags()

        # instantiate the Repositories, least-dependent first:
        for repoData in reversed(repoDataList):
            self._setParentRegistry(repoData)
            repoData.repo = Repository(repoData)

    def _setParentRegistry(self, repoData):
        """Try to get a parent registry that can be used by this repository. To be usable the repository must
        "match", meaning the mapper in the passed-in repo is the same type as the mapper in the parent.
        """

        def getParentRegistry(repoData, context):
            """Get the first found registry that matches the passed-in repo.

            Parameters
            ----------
            repoData : RepoData
                The RepoData for the repository for which we are searching for a
                matching parent registry.

            Returns
            -------
            Registry or None
                A registry from a parent if one can be found, or None.

            Raises
            ------
            RuntimeError
                Indicates a butler init order problem, all parents should be initialized before child
                repositories, so this function should be able to get any parent of any child repo.
            """
            if id(self) in context:
                return None
            context.add(id(self))
            for parentRepoData in repoData.getParentRepoDatas():
                if parentRepoData.cfg.mapper == repoData.cfg.mapper:
                    if parentRepoData.repo is None:
                        self.log.debug(
                            "_getParentRegistry: Parent {} of new repo {} not yet created, ignoring.".format(
                                parentRepoData, repoData))
                    else:
                        parentRegistry = parentRepoData.repo.getRegistry()
                        if parentRegistry:
                            return parentRegistry
                        parentRegistry = getParentRegistry(parentRepoData, context)
                        if parentRegistry:
                            return parentRegistry
            return None

        repoData.repoData.parentRegistry = getParentRegistry(repoData.repoData, set())

    def _processInputArguments(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
        """Process, verify, and standardize the input arguments.

        * Inputs can not be for Old Butler (root, mapper, mapperArgs) AND New Butler (inputs, outputs).
          `root`, `mapper`, and `mapperArgs` are Old Butler init API.
          `inputs` and `outputs` are New Butler init API.
          Old Butler and New Butler init API may not be mixed; Butler may be initialized with only the Old
          arguments or the New arguments.
        * Verify that if there is a readable output that there is exactly one output. (This restriction is in
          place because all readable repositories must be parents of writable repositories, and for
          consistency the DAG of readable repositories must always be the same. Keeping the list of parents
          becomes very complicated in the presence of multiple readable output repositories. It is better to
          only write to output repositories, and then create a new Butler instance and use the outputs as
          inputs, and write to new output repositories.)
        * Make a copy of inputs & outputs so they may be modified without changing the passed-in arguments.
        * Convert any input/output values that are URI strings to RepositoryArgs.
        * Listify inputs & outputs.
        * Set default RW mode on inputs & outputs as needed.

        Parameters
        ----------
        Same as Butler.__init__

        Returns
        -------
        (list of RepositoryArgs, list of RepositoryArgs)
            First item is a list to use as inputs.
            Second item is a list to use as outputs.

        Raises
        ------
        RuntimeError
            If Old Butler and New Butler arguments are both used this will raise.
            If an output is readable and there is more than one output this will raise.
        """
        # inputs and outputs may be modified; do not change the external value.
        inputs = copy.deepcopy(inputs)
        outputs = copy.deepcopy(outputs)

        isV1Args = inputs is None and outputs is None
        if isV1Args:
            inputs, outputs = self._convertV1Args(root=root,
                                                  mapper=mapper,
                                                  mapperArgs=mapperArgs or None)
        elif root or mapper or mapperArgs:
            raise RuntimeError(
                'Butler version 1 API (root, mapper, **mapperArgs) may '
                'not be used with version 2 API (inputs, outputs)')

        # make sure inputs and outputs are lists, and if None then make them empty lists:
        inputs = listify(inputs)
        outputs = listify(outputs)

        # convert any values that are not RepositoryArgs to RepositoryArgs:
        inputs = [RepositoryArgs(cfgRoot=args) if not isinstance(args, RepositoryArgs) else args
                  for args in inputs]
        outputs = [RepositoryArgs(cfgRoot=args) if not isinstance(args, RepositoryArgs) else args
                   for args in outputs]

        # set the default mode on inputs & outputs as needed:
        for args in inputs:
            if args.mode is None:
                args.mode = 'r'
            elif 'rw' == args.mode:
                args.mode = 'r'
            elif 'r' != args.mode:
                raise RuntimeError("The mode of an input should be readable.")
        for args in outputs:
            if args.mode is None:
                args.mode = 'w'
            elif 'w' not in args.mode:
                raise RuntimeError("The mode of an output should be writable.")

        # check for class instances in args.mapper (not allowed):
        for args in inputs + outputs:
            if (args.mapper and not isinstance(args.mapper, basestring)
                    and not inspect.isclass(args.mapper)):
                self.log.warn(preinitedMapperWarning)

        # if an output is readable, there must be only one output:
        for o in outputs:
            if 'r' in o.mode and len(outputs) > 1:
                raise RuntimeError("Butler does not support multiple output repositories if any of the "
                                   "outputs are readable.")

        # if an input is also listed as a readable output, discard the input; the output will be used
        # as a readable repository:
        def inputIsInOutputs(inputArgs, outputArgsList):
            for o in outputArgsList:
                if ('r' in o.mode
                        and o.root == inputArgs.root
                        and o.mapper == inputArgs.mapper
                        and o.mapperArgs == inputArgs.mapperArgs
                        and o.tags == inputArgs.tags
                        and o.policy == inputArgs.policy):
                    self.log.debug(("Input repositoryArgs {} is also listed in outputs as readable; "
                                    "throwing away the input.").format(inputArgs))
                    return True
            return False

        inputs = [args for args in inputs if not inputIsInOutputs(args, outputs)]
        return inputs, outputs

    @staticmethod
    def _getParentVal(repoData):
        """Get the value of this repoData as it should appear in the parents
        list of other repositories"""
        if repoData.isV1Repository:
            return repoData.cfgRoot
        if repoData.cfgOrigin == 'nested':
            return repoData.cfg
        else:
            return repoData.cfg.root

    @staticmethod
    def _getParents(ofRepoData, repoInfo):
        """Create a parents list of repoData from inputs and (readable) outputs."""
        parents = []
        for repoData in repoInfo:
            if repoData is ofRepoData:
                continue
            if 'r' not in repoData.repoArgs.mode:
                continue
            parents.append(Butler._getParentVal(repoData))
        return parents

    @staticmethod
    def _getOldButlerRepositoryCfg(repositoryArgs):
        if not Storage.isPosix(repositoryArgs.cfgRoot):
            return None
        if not PosixStorage.v1RepoExists(repositoryArgs.cfgRoot):
            return None
        if not repositoryArgs.mapper:
            repositoryArgs.mapper = PosixStorage.getMapperClass(repositoryArgs.cfgRoot)
        cfg = RepositoryCfg.makeFromArgs(repositoryArgs)
        parent = PosixStorage.getParentSymlinkPath(repositoryArgs.cfgRoot)
        if parent:
            parent = Butler._getOldButlerRepositoryCfg(RepositoryArgs(cfgRoot=parent, mode='r'))
            if parent is not None:
                cfg.addParents([parent])
        return cfg

    def _getRepositoryCfg(self, repositoryArgs):
        """Try to get a repository from the location described by cfgRoot.

        Parameters
        ----------
        repositoryArgs : RepositoryArgs or string
            Provides arguments to load an existing repository (or repositories). String is assumed to be a URI
            and is used as the cfgRoot (URI to the location of the cfg file).

        Returns
        -------
        (RepositoryCfg or None, bool)
            The RepositoryCfg, or None if one cannot be found, and True if the RepositoryCfg was created by
            reading an Old Butler repository, or False if it is a New Butler Repository.
        """
        if not isinstance(repositoryArgs, RepositoryArgs):
            repositoryArgs = RepositoryArgs(cfgRoot=repositoryArgs, mode='r')
        cfg = self.storage.getRepositoryCfg(repositoryArgs.cfgRoot)
        isOldButlerRepository = False
        if cfg is None:
            cfg = Butler._getOldButlerRepositoryCfg(repositoryArgs)
            if cfg is not None:
                isOldButlerRepository = True
        return cfg, isOldButlerRepository

    def _getCfgs(self, repoDataList):
        """Get or make a RepositoryCfg for each RepoData, and add the cfg to the RepoData.

        If the cfg exists, compare values. If the values match then use the cfg as an "existing" cfg. If the
        values do not match, use the cfg as a "nested" cfg.
        If the cfg does not exist, the RepositoryArgs must be for a writable repository.

        Parameters
        ----------
        repoDataList : list of RepoData
            The RepoData that are output and inputs of this Butler

        Raises
        ------
        RuntimeError
            If the passed-in RepositoryArgs indicate an existing repository but other cfg parameters in those
            RepositoryArgs do not match the existing repository's cfg, a RuntimeError will be raised.
        """
        def cfgMatchesArgs(args, cfg):
            """Test that the values in the RepositoryArgs do not conflict with the values in the cfg."""
            if args.mapper is not None and cfg.mapper != args.mapper:
                return False
            if args.mapperArgs is not None and cfg.mapperArgs != args.mapperArgs:
                return False
            if args.policy is not None and cfg.policy != args.policy:
                return False
            return True

        for repoData in repoDataList:
            cfg, isOldButlerRepository = self._getRepositoryCfg(repoData.repoArgs)
            if cfg is None:
                if 'w' not in repoData.repoArgs.mode:
                    raise RuntimeError(
                        "No cfg found for read-only input repository at {}".format(repoData.repoArgs.cfgRoot))
                repoData.setCfg(cfg=RepositoryCfg.makeFromArgs(repoData.repoArgs),
                                origin='new',
                                root=repoData.repoArgs.cfgRoot,
                                isV1Repository=isOldButlerRepository)
            elif 'w' in repoData.repoArgs.mode:
                # if it is writable, the args must match the existing cfg:
                if not cfgMatchesArgs(repoData.repoArgs, cfg):
                    raise RuntimeError(("The RepositoryArgs and RepositoryCfg must match for writable "
                                        "repositories, RepositoryCfg:{}, RepositoryArgs:{}").format(
                                            cfg, repoData.repoArgs))
                repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
                                isV1Repository=isOldButlerRepository)
            else:
                # it is read-only; if the args match the cfg it is "existing", otherwise "nested":
                if cfgMatchesArgs(repoData.repoArgs, cfg):
                    repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
                                    isV1Repository=isOldButlerRepository)
                else:
                    repoData.setCfg(cfg=cfg, origin='nested', root=None,
                                    isV1Repository=isOldButlerRepository)

    def _addParents(self, repoDataList):
        """For each repoData in the input list, see if its parents are the next items in the list, and if not
        add the parent, so that the repoDataList includes parents and is in order to operate depth-first 0..n.

        Parameters
        ----------
        repoDataList : list of RepoData
            The RepoData for the Butler outputs + inputs.

        Raises
        ------
        RuntimeError
            Raised if a RepositoryCfg can not be found at a location where a parent repository should be.
        """
        repoDataIdx = 0
        while True:
            if repoDataIdx == len(repoDataList):
                break
            repoData = repoDataList[repoDataIdx]
            if 'r' not in repoData.repoArgs.mode:
                repoDataIdx += 1
                continue
            if repoData.isNewRepository:
                repoDataIdx += 1
                continue
            if repoData.cfg.parents is None:
                repoDataIdx += 1
                continue
            for repoParentIdx, repoParent in enumerate(repoData.cfg.parents):
                parentIdxInRepoDataList = repoDataIdx + repoParentIdx + 1
                if not isinstance(repoParent, RepositoryCfg):
                    repoParentCfg, isOldButlerRepository = self._getRepositoryCfg(repoParent)
                    if repoParentCfg is not None:
                        cfgOrigin = 'existing'
                    else:
                        raise RuntimeError(
                            "Could not find a RepositoryCfg for parent repository at {}".format(repoParent))
                else:
                    isOldButlerRepository = False
                    repoParentCfg = repoParent
                    cfgOrigin = 'nested'
                if (parentIdxInRepoDataList < len(repoDataList)
                        and repoDataList[parentIdxInRepoDataList].cfg == repoParentCfg):
                    continue
                args = RepositoryArgs(cfgRoot=repoParentCfg.root, mode='r')
                role = 'input' if repoData.role == 'output' else 'parent'
                newRepoInfo = RepoData(args, role)
                newRepoInfo.repoData.setCfg(cfg=repoParentCfg, origin=cfgOrigin, root=args.cfgRoot,
                                            isV1Repository=isOldButlerRepository)
                repoDataList.insert(parentIdxInRepoDataList, newRepoInfo)
            repoDataIdx += 1

    def _setAndVerifyParentsLists(self, repoDataList):
        """Make a list of all the input repositories of this Butler, these are the parents of the outputs.

        For new output repositories, set the parents in the RepositoryCfg. For existing output repositories
        verify that the RepositoryCfg's parents match the parents list.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            If an existing output repository is loaded and its parents do not match the parents of this Butler
            an error will be raised.
        """
        def getIOParents(ofRepoData, repoDataList):
            """Make a parents list for the repo in `ofRepoData` that is comprised of inputs and readable
            outputs (not parents-of-parents) of this butler."""
            parents = []
            for repoData in repoDataList:
                if repoData.role == 'parent':
                    continue
                if repoData is ofRepoData:
                    continue
                if repoData.role == 'output':
                    if 'r' in repoData.repoArgs.mode:
                        raise RuntimeError("If an output is readable it must be the only output.")
                    continue
                parents.append(self._getParentVal(repoData))
            return parents

        for repoData in repoDataList:
            if repoData.role != 'output':
                continue
            parents = getIOParents(repoData, repoDataList)
            # if this repoData is new, add the parents to its RepositoryCfg:
            if repoData.cfgOrigin == 'new':
                repoData.cfg.addParents(parents)
            elif repoData.cfgOrigin in ('existing', 'nested'):
                if repoData.cfg.parents != parents:
                    try:
                        repoData.cfg.extendParents(parents)
                    except ParentsMismatch as e:
                        raise RuntimeError(("Inputs of this Butler:{} do not match parents of existing "
                                            "writable cfg:{} (ParentsMismatch exception: {})").format(
                                                parents, repoData.cfg.parents, e))

    def _setDefaultMapper(self, repoDataList):
        """Establish a default mapper if there is one and assign it to outputs that do not have a mapper
        assigned.

        If all inputs have the same mapper it will be used as the default mapper.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            If a default mapper can not be established and there is an output that does not have a mapper.
        """
        needyOutputs = [rd for rd in repoDataList if rd.role == 'output' and rd.cfg.mapper is None]
        if len(needyOutputs) == 0:
            return
        mappers = set([rd.cfg.mapper for rd in repoDataList if rd.role == 'input'])
        if len(mappers) != 1:
            inputs = [rd for rd in repoDataList if rd.role == 'input']
            raise RuntimeError(
                ("No default mapper could be established from inputs:{} and no mapper specified "
                 "for outputs:{}").format(inputs, needyOutputs))
        defaultMapper = mappers.pop()
        for repoData in needyOutputs:
            repoData.cfg.mapper = defaultMapper

    def _connectParentRepoDatas(self, repoDataList):
        """For each RepoData in repoDataList, find its parent in the repoDataList and cache a reference to it.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            When a parent is listed in the parents list but not found in the repoDataList. This is not
            expected to ever happen and would indicate an internal Butler error.
        """
        for repoData in repoDataList:
            for parent in repoData.cfg.parents:
                parentToAdd = None
                for otherRepoData in repoDataList:
                    if isinstance(parent, RepositoryCfg):
                        if otherRepoData.repoData.repoData.cfg == parent:
                            parentToAdd = otherRepoData.repoData
                            break
                    elif otherRepoData.repoData.cfg.root == parent:
                        parentToAdd = otherRepoData.repoData
                        break
                if parentToAdd is None:
                    raise RuntimeError(
                        "Could not find a parent matching {} to add to {}".format(parent, repoData))
                repoData.addParentRepoData(parentToAdd)

    @staticmethod
    def _getParentRepoData(parent, repoDataList):
        """Get a parent RepoData from a cfg from a list of RepoData.

        Parameters
        ----------
        parent : string or RepositoryCfg
            cfgRoot of a repo or a cfg that describes the repo
        repoDataList : list of RepoData
            The list of RepoData to search.

        Returns
        -------
        RepoData or None
            A RepoData if one can be found, else None
        """
        repoData = None
        for otherRepoData in repoDataList:
            if isinstance(parent, RepositoryCfg):
                if otherRepoData.cfg == parent:
                    repoData = otherRepoData
                    break
            elif otherRepoData.cfg.root == parent:
                repoData = otherRepoData
                break
        return repoData

    def _setRepoDataTags(self):
        """Set the tags from each repoArgs into all its parent repoArgs so that they can be included in tagged
        searches."""
        def setTags(repoData, tags, context):
            if id(repoData) in context:
                return
            repoData.addTags(tags)
            context.add(id(repoData))
            for parentRepoData in repoData.parentRepoDatas:
                setTags(parentRepoData, tags, context)
        for repoData in self._repos.outputs() + self._repos.inputs():
            setTags(repoData.repoData, repoData.repoArgs.tags, set())

    def _convertV1Args(self, root, mapper, mapperArgs):
        """Convert Old Butler RepositoryArgs (root, mapper, mapperArgs) to New Butler RepositoryArgs
        (inputs, outputs).

        Parameters
        ----------
        root : string
            Posix path to repository root
        mapper : class, class instance, or string
            Instantiated class, a class object to be instantiated, or a string that refers to a class that
            can be imported & used as the mapper.
        mapperArgs : dict
            RepositoryArgs & their values used when instantiating the mapper.

        Returns
        -------
        tuple
            (inputs, outputs) - values to be used for inputs and outputs in Butler.__init__
        """
        if (mapper and not isinstance(mapper, basestring)
                and not inspect.isclass(mapper)):
            self.log.warn(preinitedMapperWarning)
        inputs = None
        if root is None:
            if hasattr(mapper, 'root'):
                # in legacy cases the mapper may be given the root directly:
                root = mapper.root
            else:
                root = '.'
        outputs = RepositoryArgs(mode='rw',
                                 root=root,
                                 mapper=mapper,
                                 mapperArgs=mapperArgs)
        return inputs, outputs

    def __repr__(self):
        return 'Butler(datasetTypeAliasDict=%s, repos=%s, persistence=%s)' % (
            self.datasetTypeAliasDict, self._repos, self.persistence)

    def _getDefaultMapper(self):
        """Get the default mapper. Currently this means if all the repositories use exactly the same mapper,
        that mapper may be considered the default.

        This definition may be changing; mappers may be able to exclude themselves as candidates for default,
        and they may nominate a different mapper instead. Also, we may not want to look at *all* the
        repositories, but only a depth-first search on each of the input & output repositories, and use the
        first-found mapper for each of those. TBD.

        Returns
        -------
        Mapper class or None
            Returns the class type of the default mapper, or None if a default
            mapper can not be determined.
        """
        defaultMapper = None

        for inputRepoData in self._repos.inputs():
            mapper = None
            if inputRepoData.cfg.mapper is not None:
                mapper = inputRepoData.cfg.mapper
                # if the mapper is a string, import it; if it is a class instance, get its class type:
                if isinstance(mapper, basestring):
                    mapper = doImport(mapper)
                elif not inspect.isclass(mapper):
                    mapper = mapper.__class__
            # note the first mapper found; if a subsequent repository uses a different mapper then there is
            # no default mapper:
            if defaultMapper is None:
                defaultMapper = mapper
            elif mapper == defaultMapper:
                continue
            elif mapper is not None:
                return None
        return defaultMapper

    def _assignDefaultMapper(self, defaultMapper):
        for repoData in self._repos.all():
            if repoData.cfg.mapper is None and (repoData.isNewRepository or repoData.isV1Repository):
                if defaultMapper is None:
                    raise RuntimeError(
                        "No mapper specified for %s and no default mapper could be determined." %
                        repoData)
                repoData.cfg.mapper = defaultMapper
1108 """posix-only; gets the mapper class at the path specified by root (if a file _mapper can be found at
1109 that location or in a parent location.
1111 As we abstract the storage and support different types of storage locations this method will be
1112 moved entirely into Butler Access, or made more dynamic, and the API will very likely change."""
1113 return Storage.getMapperClass(root)
1116 """Register an alias that will be substituted in datasetTypes.
1121 The alias keyword. It may start with @ or not. It may not contain @ except as the first character.
1122 datasetType - string
1123 The string that will be substituted when @alias is passed into datasetType. It may not contain '@'
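
        Examples
        --------
        Illustrative only; the dataset type names are hypothetical, not defaults of this module:

        >>> butler.defineAlias('@calexp', 'calexp')
        >>> exposure = butler.get('@calexp', visit=101, ccd=4)  # same as butler.get('calexp', ...)
        """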
        atLoc = alias.rfind('@')
        if atLoc == -1:
            alias = "@" + str(alias)
        elif atLoc > 0:
            raise RuntimeError("Badly formatted alias string: %s" % (alias,))

        # verify that the datasetType does not contain '@':
        if datasetType.count('@') != 0:
            raise RuntimeError("Badly formatted type string: %s" % (datasetType))

        # verify that the alias does not overlap with an existing alias:
        for key in self.datasetTypeAliasDict:
            if key.startswith(alias) or alias.startswith(key):
                raise RuntimeError("Alias: %s overlaps with existing alias: %s" % (alias, key))

        self.datasetTypeAliasDict[alias] = datasetType

    def getKeys(self, datasetType=None, level=None, tag=None):
        """Get the valid data id keys at or above the given level of hierarchy for the dataset type or the
        entire collection if None. The dict values are the basic Python types corresponding to the keys (int,
        float, string).

        Parameters
        ----------
        datasetType - string
            The type of dataset to get keys for, entire collection if None.
        level - string
            The hierarchy level to descend to. None if it should not be restricted. Use an empty string if the
            mapper should lookup the default level.
        tag - any, or list of any
            Any object that can be tested to be the same as the tag in a dataId passed into butler input
            functions. Applies only to input repositories: If tag is specified by the dataId then the repo
            will only be read from if the tag in the dataId matches a tag used for that repository.

        Returns
        -------
        Returns a dict. The dict keys are the valid data id keys at or above the given level of hierarchy for
        the dataset type or the entire collection if None. The dict values are the basic Python types
        corresponding to the keys (int, float, string).
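
        Examples
        --------
        Illustrative only; the available keys depend entirely on the mapper in use:

        >>> keys = butler.getKeys('raw')  # e.g. {'visit': int, 'ccd': int, 'filter': str}
        """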
        datasetType = self._resolveDatasetTypeAlias(datasetType)

        keys = None
        tag = setify(tag)
        for repoData in self._repos.inputs():
            if not tag or len(tag.intersection(repoData.tags)) > 0:
                keys = repoData.repo.getKeys(datasetType, level)
                # an empty dict is a valid "found" condition for keys; only None means "keep searching":
                if keys is not None:
                    break
        return keys
1182 """Returns the valid values for one or more keys when given a partial
1183 input collection data id.
1187 datasetType - string
1188 The type of dataset to inquire about.
1190 Key or tuple of keys to be returned.
1191 dataId - DataId, dict
1192 The partial data id.
1194 Keyword arguments for the partial data id.
1198 A list of valid values or tuples of valid values as specified by the
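
        Examples
        --------
        Illustrative only; the key names and values depend on the mapper and registry in use:

        >>> visits = butler.queryMetadata('raw', 'visit', dataId={'filter': 'r'})
        >>> pairs = butler.queryMetadata('raw', ('visit', 'ccd'))
        """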
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)
        format = sequencify(format)

        tuples = None
        for repoData in self._repos.inputs():
            if not dataId.tag or len(dataId.tag.intersection(repoData.tags)) > 0:
                tuples = repoData.repo.queryMetadata(datasetType, format, dataId)
                if tuples:
                    break

        if not tuples:
            return []
        if len(format) == 1:
            # unpack the single-item tuples:
            ret = []
            for x in tuples:
                try:
                    ret.append(x[0])
                except TypeError:
                    ret.append(x)
            return ret
        return tuples
1229 """Determines if a dataset file exists.
1233 datasetType - string
1234 The type of dataset to inquire about.
1235 dataId - DataId, dict
1236 The data id of the dataset.
1237 **rest keyword arguments for the data id.
1242 True if the dataset exists or is non-file-based.
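
        Examples
        --------
        Illustrative only; the dataset type and data id keys depend on the mapper in use:

        >>> if butler.datasetExists('raw', visit=101, ccd=4):
        ...     raw = butler.get('raw', visit=101, ccd=4)
        """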
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)
        location = self._locate(datasetType, dataId, write=False)
        if location is None:
            return False

        if isinstance(location, ButlerComposite):
            for name, componentInfo in location.componentInfo.items():
                if componentInfo.subset:
                    subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
                    exists = all([obj.datasetExists() for obj in subset])
                else:
                    exists = self.datasetExists(componentInfo.datasetType, location.dataId)
                if exists is False:
                    return False
        else:
            exists = location.repository.exists(location)
        return exists

    def _locate(self, datasetType, dataId, write):
        """Get one or more ButlerLocations and/or ButlerComposites.

        Parameters
        ----------
        datasetType : string
            The datasetType that is being searched for. The datasetType may be followed by a dot and
            a component name (component names are specified in the policy). IE datasetType.componentName

        dataId : dict or DataId class instance
            The dataId

        write : bool
            True if this is a search to write an object. False if it is a search to read an object. This
            affects what type (an object or a container) is returned.

        Returns
        -------
        If write is False, will return either a single object or None. If write is True, will return a list
        (which may be empty)
        """
        repos = self._repos.outputs() if write else self._repos.inputs()
        locations = []
        for repoData in repos:
            # enforce dataId & repository tags when reading:
            if not write and dataId.tag and len(dataId.tag.intersection(repoData.tags)) == 0:
                continue
            components = datasetType.split('.')
            datasetType = components[0]
            components = components[1:]
            location = repoData.repo.map(datasetType, dataId, write=write)
            if location is None:
                continue
            location.datasetType = datasetType
            if len(components) > 0:
                if not isinstance(location, ButlerComposite):
                    raise RuntimeError("The location for a dotted datasetType must be a composite.")
                # replace the first component name with the datasetType:
                components[0] = location.componentInfo[components[0]].datasetType
                # join components back into a dot-delimited string:
                datasetType = '.'.join(components)
                location = self._locate(datasetType, dataId, write)
                if location is None:
                    continue
            if not write:
                # if there is a bypass function for this dataset type, evaluate it and record the result on
                # the location; if evaluation fails, keep searching as though the dataset does not exist here:
                if hasattr(location.mapper, "bypass_" + location.datasetType):
                    bypass = self._getBypassFunc(location, dataId)
                    try:
                        bypass = bypass()
                        location.bypass = bypass
                    except (NoResults, IOError):
                        self.log.debug("Continuing dataset search while evaluating "
                                       "bypass function for Dataset type:{} Data ID:{} at "
                                       "location {}".format(datasetType, dataId, location))
                # if a location was found but the object does not exist there, keep looking in input
                # repositories (the registry may have had enough data for a lookup even though the object
                # exists in a different repository):
                if (isinstance(location, ButlerComposite) or hasattr(location, 'bypass')
                        or location.repository.exists(location)):
                    return location
            else:
                try:
                    locations.extend(location)
                except TypeError:
                    locations.append(location)
        if not write:
            return None
        return locations

    @staticmethod
    def _getBypassFunc(location, dataId):
        pythonType = location.getPythonType()
        if pythonType is not None:
            if isinstance(pythonType, basestring):
                pythonType = doImport(pythonType)
        bypassFunc = getattr(location.mapper, "bypass_" + location.datasetType)
        return lambda: bypassFunc(location.datasetType, pythonType, location, dataId)

    def get(self, datasetType, dataId=None, immediate=True, **rest):
        """Retrieves a dataset given an input collection data id.

        Parameters
        ----------
        datasetType - string
            The type of dataset to retrieve.
        dataId - dict
            The data id.
        immediate - bool
            If False use a proxy for delayed loading.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        An object retrieved from the dataset (or a proxy for one).
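
        Examples
        --------
        Illustrative only; the dataset type and data id keys depend on the mapper in use:

        >>> calexp = butler.get('calexp', visit=101, ccd=4)
        >>> proxy = butler.get('calexp', visit=101, ccd=4, immediate=False)  # read deferred until used
        """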
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        location = self._locate(datasetType, dataId, write=False)
        if location is None:
            raise NoResults("No locations for get:", datasetType, dataId)
        self.log.debug("Get type=%s keys=%s from %s", datasetType, dataId, str(location))

        if hasattr(location, 'bypass'):
            def callback():
                return location.bypass
        else:
            def callback():
                return self._read(location)
        if location.mapper.canStandardize(location.datasetType):
            innerCallback = callback

            def callback():
                return location.mapper.standardize(location.datasetType, innerCallback(), dataId)
        if immediate:
            return callback()
        return ReadProxy(callback)

    def put(self, obj, datasetType, dataId={}, doBackup=False, **rest):
        """Persists a dataset given an output collection data id.

        Parameters
        ----------
        obj -
            The object to persist.
        datasetType - string
            The type of dataset to persist.
        dataId - dict
            The data id.
        doBackup - bool
            If True, rename existing instead of overwriting.
            WARNING: Setting doBackup=True is not safe for parallel processing, as it may be subject to race
            conditions.
        **rest
            Keyword arguments for the data id.
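
        Examples
        --------
        Illustrative only; the dataset type and data id keys depend on the mapper in use:

        >>> butler.put(exposure, 'calexp', visit=101, ccd=4)
        """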
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        for location in self._locate(datasetType, dataId, write=True):
            if isinstance(location, ButlerComposite):
                disassembler = location.disassembler if location.disassembler else genericDisassembler
                disassembler(obj=obj, dataId=location.dataId, componentInfo=location.componentInfo)
                for name, info in location.componentInfo.items():
                    if not info.inputOnly:
                        self.put(info.obj, info.datasetType, location.dataId, doBackup=doBackup)
            else:
                if doBackup:
                    location.getRepository().backup(location.datasetType, dataId)
                location.getRepository().write(location, obj)

    def subset(self, datasetType, level=None, dataId={}, **rest):
        """Return complete dataIds for a dataset type that match a partial (or empty) dataId.

        Given a partial (or empty) dataId specified in dataId and **rest, find all datasets that match the
        dataId. Optionally restrict the results to a given level specified by a dataId key (e.g. visit or
        sensor or amp for a camera). Return an iterable collection of complete dataIds as ButlerDataRefs.
        Datasets with the resulting dataIds may not exist; that needs to be tested with datasetExists().

        Parameters
        ----------
        datasetType - string
            The type of dataset collection to subset
        level - string
            The level of dataId at which to subset. Use an empty string if the mapper should look up the
            default level.
        dataId - dict
            The data id.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        subset - ButlerSubset
            Collection of ButlerDataRefs for datasets matching the data id.

        Examples
        --------
        To print the full dataIds for all r-band measurements in a source catalog
        (note that the subset call is equivalent to: `butler.subset('src', dataId={'filter':'r'})`):

        >>> subset = butler.subset('src', filter='r')
        >>> for data_ref in subset: print(data_ref.dataId)
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)

        # Currently expected behavior of subset is that if specified level is None then the mapper's default
        # level should be used. Convention for level within Butler is that an empty string is used to indicate
        # the mapper's default level.
        if level is None:
            level = ''

        dataId = DataId(dataId)
        dataId.update(**rest)
        return ButlerSubset(self, datasetType, level, dataId)

    def dataRef(self, datasetType, level=None, dataId={}, **rest):
        """Returns a single ButlerDataRef.

        Given a complete dataId specified in dataId and **rest, find the unique dataset at the given level
        specified by a dataId key (e.g. visit or sensor or amp for a camera) and return a ButlerDataRef.

        Parameters
        ----------
        datasetType - string
            The type of dataset collection to reference
        level - string
            The level of dataId at which to reference
        dataId - dict
            The data id.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        dataRef - ButlerDataRef
            ButlerDataRef for dataset matching the data id
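
        Examples
        --------
        Illustrative only; the dataset type and data id keys depend on the mapper in use:

        >>> ref = butler.dataRef('raw', visit=101, ccd=4)
        >>> raw = ref.get('raw')
        """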
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        subset = self.subset(datasetType, level, dataId, **rest)
        if len(subset) != 1:
            raise RuntimeError("No unique dataset for: Dataset type:%s Level:%s Data ID:%s Keywords:%s" %
                               (str(datasetType), str(level), str(dataId), str(rest)))
        return ButlerDataRef(subset, subset.cache[0])

    def _read(self, location):
        """Unpersist an object using data inside a ButlerLocation or ButlerComposite object.

        Parameters
        ----------
        location : ButlerLocation or ButlerComposite
            A ButlerLocation or ButlerComposite instance populated with data needed to read the object.

        Returns
        -------
        An instance of the object specified by the location.
        """
        self.log.debug("Starting read from %s", location)

        if isinstance(location, ButlerComposite):
            for name, componentInfo in location.componentInfo.items():
                if componentInfo.subset:
                    subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
                    componentInfo.obj = [obj.get() for obj in subset]
                else:
                    obj = self.get(componentInfo.datasetType, location.dataId, immediate=True)
                    componentInfo.obj = obj
            assembler = location.assembler or genericAssembler
            results = assembler(dataId=location.dataId, componentInfo=location.componentInfo,
                                cls=location.python)
        else:
            results = location.repository.read(location)
            if len(results) == 1:
                results = results[0]
        self.log.debug("Ending read from %s", location)
        return results

    def __reduce__(self):
        ret = (_unreduce, (self._initArgs, self.datasetTypeAliasDict))
        return ret

    def _resolveDatasetTypeAlias(self, datasetType):
        """Replaces all the known alias keywords in the given string with the alias value.

        Parameters
        ----------
        datasetType - string
            A datasetType string to search & replace on

        Returns
        -------
        datasetType - string
            The de-aliased string
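
        Examples
        --------
        Illustrative only; assumes the alias was previously registered with defineAlias:

        >>> butler.defineAlias('@goodSeeingCoadd', 'deepCoadd')
        >>> butler._resolveDatasetTypeAlias('@goodSeeingCoadd_skyMap')
        'deepCoadd_skyMap'
        """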
        for key in self.datasetTypeAliasDict:
            # if all aliases have been replaced, bail out
            if datasetType.find('@') == -1:
                break
            datasetType = datasetType.replace(key, self.datasetTypeAliasDict[key])

        # If an alias specifier can not be resolved then throw.
        if datasetType.find('@') != -1:
            raise RuntimeError("Unresolvable alias specifier in datasetType: %s" % (datasetType))

        return datasetType


def _unreduce(initArgs, datasetTypeAliasDict):
    mapperArgs = initArgs.pop('mapperArgs')
    initArgs.update(mapperArgs)
    butler = Butler(**initArgs)
    butler.datasetTypeAliasDict = datasetTypeAliasDict
    return butler
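

# Example (illustrative): because __reduce__ delegates to _unreduce, a Butler can
# round-trip through pickle; the unpickled instance is rebuilt by re-invoking
# Butler.__init__ with the original init arguments:
#
#     import pickle
#     butler2 = pickle.loads(pickle.dumps(butler))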