"""This module defines the Butler class."""
import copy
import inspect

import yaml

from builtins import str, super
from past.builtins import basestring
from builtins import object

from lsst.log import Log
from . import ReadProxy, ButlerSubset, ButlerDataRef, Persistence, \
    Storage, Policy, NoResults, Repository, DataId, RepositoryCfg, \
    RepositoryArgs, listify, setify, sequencify, doImport, ButlerComposite, genericAssembler, \
    genericDisassembler, PosixStorage, ParentsMismatch
preinitedMapperWarning = ("Passing an instantiated mapper into "
                          "Butler.__init__ will prevent Butler from passing "
                          "parentRegistry or repositoryCfg information to "
                          "the mapper, which is done only at init time. "
                          "It is better to pass an importable string or "
                          "class object.")


class ButlerCfg(Policy, yaml.YAMLObject):
    """Represents a Butler configuration.

    .. warning::

        cfg is 'wet paint' and very likely to change. Use of it in production
        code other than via the 'old butler' API is strongly discouraged.
    """
    yaml_tag = u"!ButlerCfg"

    def __init__(self, cls, repoCfg):
        super().__init__({'repoCfg': repoCfg, 'cls': cls})


class RepoData(object):
    """Container object for repository data used by Butler

    Parameters
    ----------
    args : RepositoryArgs
        The arguments that are used to find or create the RepositoryCfg.
    role : string
        "input", "output", or "parent", indicating why Butler loaded this repository.
        * input: the Repository was passed as a Butler input.
        * output: the Repository was passed as a Butler output.
        * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository.

    Attributes
    ----------
    cfg : RepositoryCfg
        The configuration for the Repository.

    cfgOrigin : string
        "new", "existing", or "nested". Indicates the origin of the repository and its RepositoryCfg:
        * new: it was created by this instance of Butler, and this instance of Butler will generate the
          RepositoryCfg file.
        * existing: it was found (via the root or cfgRoot argument).
        * nested: the full RepositoryCfg was nested in another RepositoryCfg's parents list (this can happen
          if parameters of an input specified by RepositoryArgs or dict do not entirely match an existing
          RepositoryCfg).

    cfgRoot : string
        Path or URI to the location of the RepositoryCfg file.

    repo : lsst.daf.persistence.Repository
        The Repository class instance.

    parentRepoDatas : list of RepoData
        The parents of this Repository, as indicated by this Repository's RepositoryCfg. If this is a new
        Repository then these are the inputs to this Butler (and will be saved in the RepositoryCfg). These
        RepoData objects are not owned by this RepoData; they are references to peer RepoData objects in the
        Butler's RepoDataContainer.

    isV1Repository : bool
        True if this is an Old Butler repository. In this case the repository does not have a RepositoryCfg
        file. It may have a _mapper file and may have a _parent symlink. It will never be treated as a "new"
        repository, i.e. even though there is not a RepositoryCfg file, one will not be generated.
        If False, this is a New Butler repository and is specified by a RepositoryCfg file.

    tags : set
        These are values that may be used to restrict the search of input repositories. Details are available
        in the RepositoryArgs and DataId classes.

    role : string
        "input", "output", or "parent", indicating why Butler loaded this repository.
        * input: the Repository was passed as a Butler input.
        * output: the Repository was passed as a Butler output.
        * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository.

    _repoArgs : RepositoryArgs
        Contains the arguments that were used to specify this Repository.
    """

    def __init__(self, args, role):
        self.repoArgs = args
        self.role = role
        self.parentRepoDatas = []
        self.tags = set()
        self.cfg = None
        self.cfgOrigin = None
        self.cfgRoot = None
        self.repo = None
        self.isV1Repository = False
        self.parentRegistry = None

    def __repr__(self):
        return ("{}(id={}, repoArgs={}, cfg={!r}, cfgOrigin={}, cfgRoot={}, repo={}, "
                "parentRepoDatas={}, isV1Repository={}, role={}, "
                "parentRegistry={})").format(
                    self.__class__.__name__,
                    id(self),
                    self.repoArgs,
                    self.cfg,
                    self.cfgOrigin,
                    self.cfgRoot,
                    self.repo,
                    [id(p) for p in self.parentRepoDatas],
                    self.isV1Repository,
                    self.role,
                    self.parentRegistry)

    def setCfg(self, cfg, origin, root, isV1Repository):
        """Set information about the cfg into the RepoData

        Parameters
        ----------
        cfg : RepositoryCfg
            The RepositoryCfg for the repo.
        origin : string
            'new', 'existing', or 'nested'
        root : string
            URI or absolute path to the location of the RepositoryCfg.yaml file.
        isV1Repository : bool
            True if this is an Old Butler repository.

        Returns
        -------
        None
        """
        if origin not in ('new', 'existing', 'nested'):
            raise RuntimeError("Invalid value for origin: {}".format(origin))
        self.cfg = cfg
        self.cfgOrigin = origin
        self.cfgRoot = root
        self.isV1Repository = isV1Repository

    @property
    def isNewRepository(self):
        return self.cfgOrigin == 'new'

    @property
    def role(self):
        return self._role

    @role.setter
    def role(self, val):
        if val not in ('input', 'output', 'parent'):
            raise RuntimeError("Invalid value for role: {}".format(val))
        self._role = val

    def getParentRepoDatas(self, context=None):
        """Get the parents & grandparents etc of this repo data, in depth-first search order.

        Duplicate entries will be removed in cases where the same parent appears more than once in the parent
        graph.

        Parameters
        ----------
        context : set, optional
            Users should typically omit context and accept the default argument. Context is used to keep a set
            of known RepoDatas when calling this function recursively, for duplicate elimination.

        Returns
        -------
        list of RepoData
            A list of the parents & grandparents etc of a given repo data, in depth-first search order.
        """
        if context is None:
            context = set()
        parents = []
        if id(self) in context:
            return parents
        context.add(id(self))
        for parent in self.parentRepoDatas:
            parents.append(parent)
            parents += parent.getParentRepoDatas(context)
        return parents

    def addParentRepoData(self, parentRepoData):
        self.parentRepoDatas.append(parentRepoData)

    def addTags(self, tags):
        self.tags = self.tags.union(tags)
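
# A minimal standalone sketch of the depth-first parent traversal implemented
# by RepoData.getParentRepoDatas above; ``Node`` and ``walk`` are hypothetical
# illustrations, not lsst.daf.persistence classes:
#
#     class Node:
#         def __init__(self, name, parents=()):
#             self.name, self.parents = name, list(parents)
#
#     def walk(node, context=None):
#         context = set() if context is None else context
#         found = []
#         if id(node) in context:
#             return found
#         context.add(id(node))
#         for parent in node.parents:
#             found.append(parent)
#             found += walk(parent, context)
#         return found
#
#     base = Node("base")
#     child = Node("child", [Node("a", [base]), Node("b")])
#     print([n.name for n in walk(child)])  # ['a', 'base', 'b']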


class RepoDataContainer(object):
    """Container object for RepoData instances owned by a Butler instance.

    Parameters
    ----------
    repoDataList : list of RepoData
        The RepoData instances to be managed.
    """

    def __init__(self, repoDataList):
        self._inputs = None
        self._outputs = None
        self._all = repoDataList
        self._buildLookupLists()

    def inputs(self):
        """Get a list of RepoData that are used as inputs to the Butler.
        The list is created lazily as needed, and cached.

        Returns
        -------
        list of RepoData
            A list of RepoData with readable repositories, in the order to be used when searching.
        """
        if self._inputs is None:
            raise RuntimeError("Inputs not yet initialized.")
        return self._inputs

    def outputs(self):
        """Get a list of RepoData that are used as outputs to the Butler.
        The list is created lazily as needed, and cached.

        Returns
        -------
        list of RepoData
            A list of RepoData with writable repositories, in the order to be used when searching.
        """
        if self._outputs is None:
            raise RuntimeError("Outputs not yet initialized.")
        return self._outputs

    def all(self):
        """Get a list of all RepoData used by the Butler.
        The list is created lazily as needed, and cached.

        Returns
        -------
        list of RepoData
            A list of all RepoData, in the order to be used when searching.
        """
        return self._all

    def __repr__(self):
        return "%s(_inputs=%r, \n_outputs=%s, \n_all=%s)" % (
            self.__class__.__name__,
            self._inputs,
            self._outputs,
            self._all)

    def _buildLookupLists(self):
        """Build the inputs and outputs lists based on the order of self.all()."""

        def addToList(repoData, lst):
            """Add a repoData and each of its parents (depth first) to a list"""
            if id(repoData) in alreadyAdded:
                return
            lst.append(repoData)
            alreadyAdded.add(id(repoData))
            for parent in repoData.parentRepoDatas:
                addToList(parent, lst)

        if self._inputs is not None or self._outputs is not None:
            raise RuntimeError("Lookup lists are already built.")
        inputs = [repoData for repoData in self.all() if repoData.role == 'input']
        outputs = [repoData for repoData in self.all() if repoData.role == 'output']
        self._inputs = []
        alreadyAdded = set()
        for repoData in outputs:
            if 'r' in repoData.repoArgs.mode:
                addToList(repoData, self._inputs)
        for repoData in inputs:
            addToList(repoData, self._inputs)
        self._outputs = [repoData for repoData in outputs]
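
# Worked example of the ordering produced by _buildLookupLists (repository
# names are illustrative): given
#
#     butler = Butler(inputs=['in1', 'in2'],
#                     outputs=RepositoryArgs(root='out', mode='rw'))
#
# where existing repository 'in1' lists 'p1' as a parent in its RepositoryCfg,
# inputs() is ordered [out, in1, p1, in2]: readable outputs first, then each
# input followed depth-first by its parents.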


class Butler(object):
    """Butler provides a generic mechanism for persisting and retrieving data using mappers.

    A Butler manages a collection of datasets known as a repository. Each dataset has a type representing its
    intended usage and a location. Note that the dataset type is not the same as the C++ or Python type of the
    object containing the data. For example, an ExposureF object might be used to hold the data for a raw
    image, a post-ISR image, a calibrated science image, or a difference image. These would all be different
    dataset types.

    A Butler can produce a collection of possible values for a key (or tuples of values for multiple keys) if
    given a partial data identifier. It can check for the existence of a file containing a dataset given its
    type and data identifier. The Butler can then retrieve the dataset. Similarly, it can persist an object to
    an appropriate location when given its associated data identifier.

    Note that the Butler has two more advanced features when retrieving a data set. First, the retrieval is
    lazy. Input does not occur until the data set is actually accessed. This allows datasets to be retrieved
    and placed on a clipboard prospectively with little cost, even if the algorithm of a stage ends up not
    using them. Second, the Butler will call a standardization hook upon retrieval of the dataset. This
    function, contained in the input mapper object, must perform any necessary manipulations to force the
    retrieved object to conform to standards, including translating metadata.

    Public methods:

    __init__(self, root, mapper=None, **mapperArgs)

    defineAlias(self, alias, datasetType)

    getKeys(self, datasetType=None, level=None)

    queryMetadata(self, datasetType, format, dataId={}, **rest)

    datasetExists(self, datasetType, dataId={}, **rest)

    get(self, datasetType, dataId={}, immediate=True, **rest)

    put(self, obj, datasetType, dataId={}, **rest)

    subset(self, datasetType, level=None, dataId={}, **rest)

    dataRef(self, datasetType, level=None, dataId={}, **rest)

    Initialization:

    The preferred method of initialization is to use the `inputs` and `outputs` __init__ parameters. These
    are described in the parameters section, below.

    For backward compatibility: this initialization method signature can take a posix root path, and
    optionally a mapper class instance or class type that will be instantiated using the mapperArgs input
    argument. However, for this to work in a backward compatible way it creates a single repository that is
    used as both an input and an output repository. This is NOT preferred, and will likely break any
    provenance system we have in place.

    Parameters
    ----------
    root : string
        .. note:: Deprecated in 12_0
            `root` will be removed in TBD, it is replaced by `inputs` and `outputs` for
            multiple-repository support.
        A file system path. Will only work with a PosixRepository.
    mapper : string or instance
        .. note:: Deprecated in 12_0
            `mapper` will be removed in TBD, it is replaced by `inputs` and `outputs` for
            multiple-repository support.
        Provides a mapper to be used with Butler.
    mapperArgs : dict
        .. note:: Deprecated in 12_0
            `mapperArgs` will be removed in TBD, it is replaced by `inputs` and `outputs` for
            multiple-repository support.
        Provides arguments to be passed to the mapper if the mapper input argument is a class type to be
        instantiated by Butler.
    inputs : RepositoryArgs, dict, or string
        Can be a single item or a list. Provides arguments to load an existing repository (or repositories).
        String is assumed to be a URI and is used as the cfgRoot (URI to the location of the cfg file). (Local
        file system URI does not have to start with 'file://' and in this way can be a relative path). The
        `RepositoryArgs` class can be used to provide more parameters with which to initialize a repository
        (such as `mapper`, `mapperArgs`, `tags`, etc. See the `RepositoryArgs` documentation for more
        details). A dict may be used as shorthand for a `RepositoryArgs` class instance. The dict keys must
        match parameters to the `RepositoryArgs.__init__` function.
    outputs : RepositoryArgs, dict, or string
        Provides arguments to load one or more existing repositories or create new ones. The different types
        are handled the same as for `inputs`.

    The Butler init sequence loads all of the input and output repositories.
    This creates the object hierarchy to read from and write to them. Each
    repository can have 0 or more parents, which also get loaded as inputs.
    This becomes a DAG of repositories. Ultimately, Butler creates a list of
    these Repositories in the order that they are used.

    Initialization Sequence
    =======================

    During initialization Butler creates a Repository class instance & support structure for each object
    passed to `inputs` and `outputs` as well as the parent repositories recorded in the `RepositoryCfg` of
    each existing readable repository.

    This process is complex. It is explained below to shed some light on the intent of each step.

    1. Input Argument Standardization
    ---------------------------------

    In `Butler._processInputArguments` the input arguments are verified to be legal (and a RuntimeError is
    raised if not), and they are converted into an expected format that is used for the rest of the Butler
    init sequence. See the docstring for `_processInputArguments`.

    2. Create RepoData Objects
    --------------------------

    Butler uses an object, called `RepoData`, to keep track of information about each repository; each
    repository is contained in a single `RepoData`. The attributes are explained in its docstring.

    After `_processInputArguments`, a RepoData is instantiated and put in a list for each repository in
    `outputs` and `inputs`. This list of RepoData, the `repoDataList`, now represents all the output and input
    repositories (but not parent repositories) that this Butler instance will use.

    3. Get `RepositoryCfg`s
    -----------------------

    `Butler._getCfgs` gets the `RepositoryCfg` for each repository in the `repoDataList`. The behavior is
    described in the docstring.

    4. Add Parents
    --------------

    `Butler._addParents` then considers the parents list in the `RepositoryCfg` of each `RepoData` in the
    `repoDataList` and inserts new `RepoData` objects for each parent not represented in the proper location
    in the `repoDataList`. Ultimately a flat list is built to represent the DAG of readable repositories
    represented in depth-first order.

    5. Set and Verify Parents of Outputs
    ------------------------------------

    To be able to load parent repositories when output repositories are used as inputs, the input repositories
    are recorded as parents in the `RepositoryCfg` file of new output repositories. When an output repository
    already exists, for consistency the Butler's inputs must match the list of parents specified in the
    already-existing output repository's `RepositoryCfg` file.

    In `Butler._setAndVerifyParentsLists`, the list of parents is recorded in the `RepositoryCfg` of new
    repositories. For existing repositories the list of parents is compared with the `RepositoryCfg`'s parents
    list, and if they do not match a `RuntimeError` is raised.

    6. Set the Default Mapper
    -------------------------

    If all the input repositories use the same mapper then we can assume that mapper to be the
    "default mapper". If there are new output repositories whose `RepositoryArgs` do not specify a mapper and
    there is a default mapper then the new output repository will be set to use that default mapper.

    This is handled in `Butler._setDefaultMapper`.

    7. Cache References to Parent RepoDatas
    ---------------------------------------

    In `Butler._connectParentRepoDatas`, in each `RepoData` in `repoDataList`, a list of `RepoData` object
    references is built that matches the parents specified in that `RepoData`'s `RepositoryCfg`.

    This list is used later to find things in that repository's parents, without considering peer
    repositories' parents. (e.g. finding the registry of a parent)

    8. Set Tags
    -----------

    Tags are described at https://ldm-463.lsst.io/v/draft/#tagging

    In `Butler._setRepoDataTags`, for each `RepoData`, the tags specified by its `RepositoryArgs` are recorded
    in a set, and added to the tags set in each of its parents, for ease of lookup when mapping.

    9. Find Parent Registry and Instantiate RepoData
    ------------------------------------------------

    At this point there is enough information to instantiate the `Repository` instances. There is one final
    step before instantiating the Repository, which is to try to get a parent registry that can be used by the
    child repository. The criteria for "can be used" are spelled out in `Butler._setParentRegistry`. However,
    to get the registry from the parent, the parent must be instantiated. The `repoDataList`, in depth-first
    search order, is built so that the most-dependent repositories are first, and the least dependent
    repositories are last. So the `repoDataList` is reversed and the Repositories are instantiated in that
    order; for each RepoData a parent registry is searched for, and then the Repository is instantiated with
    whatever registry could be found.
    """

    def __init__(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
        self._initArgs = {'root': root, 'mapper': mapper, 'inputs': inputs, 'outputs': outputs,
                          'mapperArgs': mapperArgs}

        self.log = Log.getLogger("daf.persistence.butler")

        self.datasetTypeAliasDict = {}

        self.storage = Storage()

        # ...

        inputs, outputs = self._processInputArguments(
            root=root, mapper=mapper, inputs=inputs, outputs=outputs, **mapperArgs)

        # convert the RepositoryArgs into RepoData
        inputs = [RepoData(args, 'input') for args in inputs]
        outputs = [RepoData(args, 'output') for args in outputs]
        repoDataList = outputs + inputs

        self._getCfgs(repoDataList)
        self._addParents(repoDataList)
        self._setAndVerifyParentsLists(repoDataList)
        self._setDefaultMapper(repoDataList)
        self._connectParentRepoDatas(repoDataList)
        self._repos = RepoDataContainer(repoDataList)
        self._setRepoDataTags()

        for repoData in reversed(repoDataList):
            self._setParentRegistry(repoData)
            repoData.repo = Repository(repoData)
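
    # Hedged usage sketch of the construction path above; the repository paths
    # are illustrative assumptions only:
    #
    #     from lsst.daf.persistence import Butler, RepositoryArgs
    #     butler = Butler(inputs='/data/inRepo',
    #                     outputs=RepositoryArgs(root='/data/outRepo', mode='rw'))
    #
    # A plain string input is treated as the cfgRoot URI; RepositoryArgs lets
    # the caller also choose mode, mapper, mapperArgs, and tags.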

    def _setParentRegistry(self, repoData):
        """Try to get a parent registry that can be used by this repository. To be usable the repository must
        "match", meaning the mapper in the passed-in repo is the same type as the mapper in the parent.
        """

        def getParentRegistry(repoData, context):
            """Get the first found registry that matches the passed-in repo.

            Parameters
            ----------
            repoData : RepoData
                The RepoData for the repository for which we are searching for a
                matching parent registry.

            Returns
            -------
            Registry or None
                A registry from a parent if one can be found, or None.

            Raises
            ------
            RuntimeError
                Indicates a butler init order problem, all parents should be initialized before child
                repositories, so this function should be able to get any parent of any child repo.
            """
            if id(repoData) in context:
                return None
            context.add(id(repoData))
            for parentRepoData in repoData.getParentRepoDatas():
                if parentRepoData.cfg.mapper == repoData.cfg.mapper:
                    if parentRepoData.repo is None:
                        self.log.debug(
                            "_getParentRegistry: Parent {} of new repo {} not yet created, ignoring.".format(
                                parentRepoData, repoData))
                    else:
                        parentRegistry = parentRepoData.repo.getRegistry()
                        if parentRegistry:
                            return parentRegistry
                parentRegistry = getParentRegistry(parentRepoData, context)
                if parentRegistry:
                    return parentRegistry
            return None

        repoData.parentRegistry = getParentRegistry(repoData, set())

    def _processInputArguments(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
        """Process, verify, and standardize the input arguments.

        * Inputs can not be for Old Butler (root, mapper, mapperArgs) AND New Butler (inputs, outputs).
          `root`, `mapper`, and `mapperArgs` are Old Butler init API.
          `inputs` and `outputs` are New Butler init API.
          Old Butler and New Butler init API may not be mixed, Butler may be initialized with only the Old
          arguments or the New arguments.
        * Verify that if there is a readable output that there is exactly one output. (This restriction is in
          place because all readable repositories must be parents of writable repositories, and for
          consistency the DAG of readable repositories must always be the same. Keeping the list of parents
          becomes very complicated in the presence of multiple readable output repositories. It is better to
          only write to output repositories, and then create a new Butler instance and use the outputs as
          inputs, and write to new output repositories.)
        * Make a copy of inputs & outputs so they may be modified without changing the passed-in arguments.
        * Convert any input/output values that are URI strings to RepositoryArgs.
        * Listify inputs & outputs.
        * Set default RW mode on inputs & outputs as needed.

        Parameters
        ----------
        Same as Butler.__init__

        Returns
        -------
        (list of RepositoryArgs, list of RepositoryArgs)
            First item is a list to use as inputs.
            Second item is a list to use as outputs.

        Raises
        ------
        RuntimeError
            If Old Butler and New Butler arguments are both used this will raise.
            If an output is readable and there is more than one output this will raise.
        """
        # inputs and outputs may be modified; do not change the external value.
        inputs = copy.deepcopy(inputs)
        outputs = copy.deepcopy(outputs)

        isV1Args = inputs is None and outputs is None
        if isV1Args:
            inputs, outputs = self._convertV1Args(root=root,
                                                  mapper=mapper,
                                                  mapperArgs=mapperArgs or None)
        elif root or mapper or mapperArgs:
            raise RuntimeError(
                'Butler version 1 API (root, mapper, **mapperArgs) may '
                'not be used with version 2 API (inputs, outputs)')

        inputs = listify(inputs)
        outputs = listify(outputs)
        inputs = [RepositoryArgs(cfgRoot=args)
                  if not isinstance(args, RepositoryArgs) else args for args in inputs]
        outputs = [RepositoryArgs(cfgRoot=args)
                   if not isinstance(args, RepositoryArgs) else args for args in outputs]

        # The default mode for inputs is 'r', and inputs must be readable.
        for args in inputs:
            if args.mode is None:
                args.mode = 'r'
            elif 'rw' == args.mode:
                args.mode = 'r'
            elif 'r' != args.mode:
                raise RuntimeError("The mode of an input should be readable.")
        # The default mode for outputs is 'w', and outputs must be writable.
        for args in outputs:
            if args.mode is None:
                args.mode = 'w'
            elif 'w' not in args.mode:
                raise RuntimeError("The mode of an output should be writable.")
        # Instantiated mapper objects can not receive init-time information; warn about them.
        for args in inputs + outputs:
            if (args.mapper and not isinstance(args.mapper, basestring)
                    and not inspect.isclass(args.mapper)):
                self.log.warn(preinitedMapperWarning)
        # if an output is readable there must be only one output:
        if len(outputs) > 1 and any(('r' in args.mode for args in outputs)):
            raise RuntimeError("Butler does not support multiple output repositories if any of the "
                               "outputs are readable.")

        # If a readable output is also passed in as an input, discard the input and use the output.
        def inputIsInOutputs(inputArgs, outputArgsList):
            for o in outputArgsList:
                if ('r' in o.mode
                        and o.root == inputArgs.root
                        and o.mapper == inputArgs.mapper
                        and o.mapperArgs == inputArgs.mapperArgs
                        and o.tags == inputArgs.tags
                        and o.policy == inputArgs.policy):
                    self.log.debug(("Input repositoryArgs {} is also listed in outputs as readable; "
                                    "throwing away the input.").format(inputArgs))
                    return True
            return False

        inputs = [args for args in inputs if not inputIsInOutputs(args, outputs)]
        return inputs, outputs
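
    # Worked example of the mode rules above:
    #     input  mode None -> 'r';  'rw' -> 'r';  anything else must be 'r'
    #     output mode None -> 'w';  otherwise must contain 'w'
    # So Butler(inputs='in', outputs='out') reads 'in' ('r') and writes 'out'
    # ('w'), while an output created with RepositoryArgs(root='out', mode='rw')
    # is also searched for reads, and must then be the only output.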

    @staticmethod
    def _getParentVal(repoData):
        """Get the value of this repoData as it should appear in the parents
        list of other repositories"""
        if repoData.isV1Repository:
            return repoData.cfg
        if repoData.cfgOrigin == 'nested':
            return repoData.cfg
        else:
            return repoData.cfg.root

    @staticmethod
    def _getParents(ofRepoData, repoInfo):
        """Create a parents list of repoData from inputs and (readable) outputs."""
        parents = []
        for repoData in repoInfo:
            if repoData is ofRepoData:
                continue
            if 'r' not in repoData.repoArgs.mode:
                continue
            parents.append(Butler._getParentVal(repoData))
        return parents

    @staticmethod
    def _getOldButlerRepositoryCfg(repositoryArgs):
        if not Storage.isPosix(repositoryArgs.cfgRoot):
            return None
        if not PosixStorage.v1RepoExists(repositoryArgs.cfgRoot):
            return None
        if not repositoryArgs.mapper:
            repositoryArgs.mapper = PosixStorage.getMapperClass(repositoryArgs.cfgRoot)
        cfg = RepositoryCfg.makeFromArgs(repositoryArgs)
        parent = PosixStorage.getParentSymlinkPath(repositoryArgs.cfgRoot)
        if parent:
            parent = Butler._getOldButlerRepositoryCfg(RepositoryArgs(cfgRoot=parent, mode='r'))
            if parent is not None:
                cfg.addParents([parent])
        return cfg

    def _getRepositoryCfg(self, repositoryArgs):
        """Try to get a repository from the location described by cfgRoot.

        Parameters
        ----------
        repositoryArgs : RepositoryArgs or string
            Provides arguments to load an existing repository (or repositories). String is assumed to be a URI
            and is used as the cfgRoot (URI to the location of the cfg file).

        Returns
        -------
        (RepositoryCfg or None, bool)
            The RepositoryCfg, or None if one cannot be found, and True if the RepositoryCfg was created by
            reading an Old Butler repository, or False if it is a New Butler Repository.
        """
        if not isinstance(repositoryArgs, RepositoryArgs):
            repositoryArgs = RepositoryArgs(cfgRoot=repositoryArgs)
        cfg = self.storage.getRepositoryCfg(repositoryArgs.cfgRoot)
        isOldButlerRepository = False
        if cfg is None:
            cfg = Butler._getOldButlerRepositoryCfg(repositoryArgs)
            if cfg is not None:
                isOldButlerRepository = True
        return cfg, isOldButlerRepository

    def _getCfgs(self, repoDataList):
        """Get or make a RepositoryCfg for each RepoData, and add the cfg to the RepoData.

        If the cfg exists, compare values. If values match then use the cfg as an "existing" cfg. If the
        values do not match, use the cfg as a "nested" cfg.
        If the cfg does not exist, the RepositoryArgs must be for a writable repository.

        Parameters
        ----------
        repoDataList : list of RepoData
            The RepoData that are output and inputs of this Butler

        Raises
        ------
        RuntimeError
            If the passed-in RepositoryArgs indicate an existing repository but other cfg parameters in those
            args do not match the existing repository's cfg a RuntimeError will be raised.
        """

        def cfgMatchesArgs(args, cfg):
            """Test if there are any values in an RepositoryArgs that conflict with the values in a cfg"""
            if args.mapper is not None and cfg.mapper != args.mapper:
                return False
            if args.mapperArgs is not None and cfg.mapperArgs != args.mapperArgs:
                return False
            if args.policy is not None and cfg.policy != args.policy:
                return False
            return True

        for repoData in repoDataList:
            cfg, isOldButlerRepository = self._getRepositoryCfg(repoData.repoArgs)
            if cfg is None:
                if 'w' not in repoData.repoArgs.mode:
                    raise RuntimeError(
                        "No cfg found for read-only input repository at {}".format(repoData.repoArgs.cfgRoot))
                repoData.setCfg(cfg=RepositoryCfg.makeFromArgs(repoData.repoArgs),
                                origin='new',
                                root=repoData.repoArgs.cfgRoot,
                                isV1Repository=isOldButlerRepository)
            else:
                # replace Old Butler parent paths with RepositoryCfgs that carry the child's mapperArgs:
                for i, parent in enumerate(cfg.parents):
                    if isinstance(parent, RepositoryCfg):
                        continue
                    parentCfg, parentIsOldButlerRepository = self._getRepositoryCfg(parent)
                    if parentIsOldButlerRepository:
                        parentCfg.mapperArgs = cfg.mapperArgs
                        self.log.info(("Butler is replacing an Old Butler parent repository path '{}' "
                                       "found in the parents list of a New Butler repositoryCfg: {} "
                                       "with a repositoryCfg that includes the child repository's "
                                       "mapperArgs: {}. This affects the instantiated RepositoryCfg "
                                       "but does not change the persisted child repositoryCfg.yaml file."
                                       ).format(parent, cfg, parentCfg))
                        cfg._parents[i] = cfg._normalizeParents(cfg.root, [parentCfg])[0]
                if 'w' in repoData.repoArgs.mode:
                    # if it's writable, the args must match the existing cfg.
                    if not cfgMatchesArgs(repoData.repoArgs, cfg):
                        raise RuntimeError(("The RepositoryArgs and RepositoryCfg must match for writable "
                                            "repositories, RepositoryCfg:{}, RepositoryArgs:{}").format(
                                                cfg, repoData.repoArgs))
                    repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
                                    isV1Repository=isOldButlerRepository)
                else:
                    if cfgMatchesArgs(repoData.repoArgs, cfg):
                        repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
                                        isV1Repository=isOldButlerRepository)
                    else:
                        repoData.setCfg(cfg=cfg, origin='nested', root=None,
                                        isV1Repository=isOldButlerRepository)

    def _addParents(self, repoDataList):
        """For each repoData in the input list, see if its parents are the next items in the list, and if not
        add the parent, so that the repoDataList includes parents and is in order to operate depth-first
        0..n.

        Parameters
        ----------
        repoDataList : list of RepoData
            The RepoData for the Butler outputs + inputs.

        Raises
        ------
        RuntimeError
            Raised if a RepositoryCfg can not be found at a location where a parent repository should be.
        """
        repoDataIdx = 0
        while True:
            if repoDataIdx == len(repoDataList):
                break
            repoData = repoDataList[repoDataIdx]
            if 'r' not in repoData.repoArgs.mode:
                repoDataIdx += 1
                continue
            if repoData.isNewRepository:
                repoDataIdx += 1
                continue
            if repoData.cfg.parents is None:
                repoDataIdx += 1
                continue
            for repoParentIdx, repoParent in enumerate(repoData.cfg.parents):
                parentIdxInRepoDataList = repoDataIdx + repoParentIdx + 1
                if not isinstance(repoParent, RepositoryCfg):
                    repoParentCfg, isOldButlerRepository = self._getRepositoryCfg(repoParent)
                    if repoParentCfg is not None:
                        cfgOrigin = 'existing'
                else:
                    isOldButlerRepository = False
                    repoParentCfg = repoParent
                    cfgOrigin = 'nested'
                if (parentIdxInRepoDataList < len(repoDataList)
                        and repoDataList[parentIdxInRepoDataList].cfg == repoParentCfg):
                    continue
                args = RepositoryArgs(cfgRoot=repoParentCfg.root, mode='r')
                role = 'input' if repoData.role == 'output' else 'parent'
                newRepoInfo = RepoData(args, role)
                newRepoInfo.setCfg(cfg=repoParentCfg, origin=cfgOrigin, root=args.cfgRoot,
                                   isV1Repository=isOldButlerRepository)
                repoDataList.insert(parentIdxInRepoDataList, newRepoInfo)
            repoDataIdx += 1

    def _setAndVerifyParentsLists(self, repoDataList):
        """Make a list of all the input repositories of this Butler, these are the parents of the outputs.
        For new output repositories, set the parents in the RepositoryCfg. For existing output repositories
        verify that the RepositoryCfg's parents match the parents list.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            If an existing output repository is loaded and its parents do not match the parents of this Butler
            an error will be raised.
        """

        def getIOParents(ofRepoData, repoDataList):
            """make a parents list for repo in `ofRepoData` that is comprised of inputs and readable
            outputs (not parents-of-parents) of this butler"""
            parents = []
            for repoData in repoDataList:
                if repoData.role == 'parent':
                    continue
                if repoData is ofRepoData:
                    continue
                if repoData.role == 'output':
                    if 'r' in repoData.repoArgs.mode:
                        raise RuntimeError("If an output is readable it must be the only output.")
                    continue
                parents.append(self._getParentVal(repoData))
            return parents

        for repoData in repoDataList:
            if repoData.role != 'output':
                continue
            parents = getIOParents(repoData, repoDataList)
            # if repoData is new, add the parents to its RepositoryCfg.
            if repoData.cfgOrigin == 'new':
                repoData.cfg.addParents(parents)
            elif repoData.cfgOrigin in ('existing', 'nested'):
                if repoData.cfg.parents != parents:
                    try:
                        repoData.cfg.extendParents(parents)
                    except ParentsMismatch as e:
                        raise RuntimeError(("Inputs of this Butler:{} do not match parents of existing "
                                            "writable cfg:{} (ParentMismatch exception: {})").format(
                                                parents, repoData.cfg.parents, e))
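
    # Hedged sketch of the verification rule above (paths illustrative): a
    # Butler created as Butler(inputs='in1', outputs='existingOut') succeeds
    # only if 'existingOut' already lists 'in1' (or can be extended to include
    # it) in the parents of its repositoryCfg.yaml; otherwise a RuntimeError is
    # raised, e.g. when it was originally written with inputs='in2'.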

    def _setDefaultMapper(self, repoDataList):
        """Establish a default mapper if there is one and assign it to outputs that do not have a mapper
        assigned.

        If all inputs have the same mapper it will be used as the default mapper.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            If a default mapper can not be established and there is an output that does not have a mapper.
        """
        needyOutputs = [rd for rd in repoDataList if rd.role == 'output' and rd.cfg.mapper is None]
        if len(needyOutputs) == 0:
            return
        mappers = set([rd.cfg.mapper for rd in repoDataList if rd.role == 'input'])
        if len(mappers) != 1:
            inputs = [rd for rd in repoDataList if rd.role == 'input']
            raise RuntimeError(
                ("No default mapper could be established from inputs:{} and no mapper specified "
                 "for outputs:{}").format(inputs, needyOutputs))
        defaultMapper = mappers.pop()
        for repoData in needyOutputs:
            repoData.cfg.mapper = defaultMapper
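
    # Worked example (mapper names are illustrative): if every input repository
    # uses CameraMapper, an output created without an explicit mapper inherits
    # CameraMapper; if the inputs use two different mappers, the same Butler
    # call raises RuntimeError unless the output names its mapper explicitly.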

    def _connectParentRepoDatas(self, repoDataList):
        """For each RepoData in repoDataList, find its parent in the repoDataList and cache a reference to it.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            When a parent is listed in the parents list but not found in the repoDataList. This is not
            expected to ever happen and would indicate an internal Butler error.
        """
        for repoData in repoDataList:
            for parent in repoData.cfg.parents:
                parentToAdd = None
                for otherRepoData in repoDataList:
                    if isinstance(parent, RepositoryCfg):
                        if otherRepoData.cfg == parent:
                            parentToAdd = otherRepoData
                            break
                    elif otherRepoData.cfg.root == parent:
                        parentToAdd = otherRepoData
                        break
                if parentToAdd is None:
                    raise RuntimeError(
                        "Could not find a parent matching {} to add to {}".format(parent, repoData))
                repoData.addParentRepoData(parentToAdd)

    @staticmethod
    def _getParentRepoData(parent, repoDataList):
        """get a parent RepoData from a cfg from a list of RepoData

        Parameters
        ----------
        parent : string or RepositoryCfg
            cfgRoot of a repo or a cfg that describes the repo
        repoDataList : list of RepoData
            The RepoData list to search in.

        Returns
        -------
        RepoData or None
            A RepoData if one can be found, else None
        """
        repoData = None
        for otherRepoData in repoDataList:
            if isinstance(parent, RepositoryCfg):
                if otherRepoData.cfg == parent:
                    repoData = otherRepoData
                    break
            elif otherRepoData.cfg.root == parent:
                repoData = otherRepoData
                break
        return repoData

    def _setRepoDataTags(self):
        """Set the tags from each repoArgs into all its parent repoArgs so that they can be included in tagged
        searches."""

        def setTags(repoData, tags, context):
            if id(repoData) in context:
                return
            repoData.addTags(tags)
            context.add(id(repoData))
            for parentRepoData in repoData.parentRepoDatas:
                setTags(parentRepoData, tags, context)

        for repoData in self._repos.outputs() + self._repos.inputs():
            setTags(repoData, repoData.repoArgs.tags, set())
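
    # Hedged sketch of tag propagation (names are illustrative): tagging an
    # input makes the tag match that repository and all of its parents, so a
    # dataId carrying the tag is only searched for in that lineage:
    #
    #     butler = Butler(inputs=RepositoryArgs(cfgRoot='in1', tags='run1'))
    #     raw = butler.get('raw', DataId({'visit': 1}, tag='run1'))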

    def _convertV1Args(self, root, mapper, mapperArgs):
        """Convert Old Butler RepositoryArgs (root, mapper, mapperArgs) to New Butler RepositoryArgs
        (inputs, outputs)

        Parameters
        ----------
        root : string
            Posix path to repository root
        mapper : class, class instance, or string
            Instantiated class, a class object to be instantiated, or a string that refers to a class that
            can be imported & used as the mapper.
        mapperArgs : dict
            RepositoryArgs & their values used when instantiating the mapper.

        Returns
        -------
        tuple
            (inputs, outputs) - values to be used for inputs and outputs in Butler.__init__
        """
        if (mapper and not isinstance(mapper, basestring)
                and not inspect.isclass(mapper)):
            self.log.warn(preinitedMapperWarning)
        inputs = None
        if root is None:
            if hasattr(mapper, 'root'):
                # in legacy repositories, the mapper may be given the root directly.
                root = mapper.root
            else:
                # in the past root="None" could be used to mean root='.'
                root = '.'
        outputs = RepositoryArgs(mode='rw',
                                 root=root,
                                 mapper=mapper,
                                 mapperArgs=mapperArgs)
        return inputs, outputs

    def __repr__(self):
        return 'Butler(datasetTypeAliasDict=%s, repos=%s, persistence=%s)' % (
            self.datasetTypeAliasDict, self._repos, self.persistence)

    def _getDefaultMapper(self):
        """Get the default mapper. Currently this means if all the repositories use exactly the same mapper,
        that mapper may be considered the default.

        This definition may be changing; mappers may be able to exclude themselves as candidates for default,
        and they may nominate a different mapper instead. Also, we may not want to look at *all* the
        repositories, but only a depth-first search on each of the input & output repositories, and use the
        first-found mapper for each of those. TBD.

        Returns
        -------
        Mapper class or None
            Returns the class type of the default mapper, or None if a default
            mapper can not be determined.
        """
        defaultMapper = None
        for inputRepoData in self._repos.inputs():
            mapper = None
            if inputRepoData.cfg.mapper is not None:
                mapper = inputRepoData.cfg.mapper
                # if the mapper is a string, import it; if it is a class instance, get its class type.
                if isinstance(mapper, basestring):
                    mapper = doImport(mapper)
                elif not inspect.isclass(mapper):
                    mapper = mapper.__class__
            # If no mapper has been found yet, note this one. If this mapper matches the previously found
            # mapper, keep it. If it does not match, there is no default mapper; give up.
            if defaultMapper is None:
                defaultMapper = mapper
            elif mapper == defaultMapper:
                continue
            elif mapper is not None:
                return None
        return defaultMapper

    def _assignDefaultMapper(self, defaultMapper):
        for repoData in self._repos.all():
            if repoData.cfg.mapper is None and (repoData.isNewRepository or repoData.isV1Repository):
                if defaultMapper is None:
                    raise RuntimeError(
                        "No mapper specified for %s and no default mapper could be determined." %
                        repoData)
                repoData.cfg.mapper = defaultMapper

    @staticmethod
    def getMapperClass(root):
        """posix-only; gets the mapper class at the path specified by root (if a file _mapper can be found at
        that location or in a parent location).

        As we abstract the storage and support different types of storage locations this method will be
        moved entirely into Butler Access, or made more dynamic, and the API will very likely change."""
        return Storage.getMapperClass(root)
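
    # Hedged usage sketch for defineAlias below ('sci' and 'calexp' are
    # illustrative names):
    #
    #     butler.defineAlias('sci', 'calexp')
    #     exposure = butler.get('@sci', dataId={'visit': 1})
    #
    # '@sci' is expanded to 'calexp' by _resolveDatasetTypeAlias before the
    # dataset is looked up.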

    def defineAlias(self, alias, datasetType):
        """Register an alias that will be substituted in datasetTypes.

        Parameters
        ----------
        alias - string
            The alias keyword. It may start with @ or not. It may not contain @ except as the first character.
        datasetType - string
            The string that will be substituted when @alias is passed into datasetType. It may not contain
            '@'.
        """
        # verify formatting of alias: it may have '@' as the first character only (if missing, add it).
        atLoc = alias.rfind('@')
        if atLoc == -1:
            alias = "@" + str(alias)
        elif atLoc > 0:
            raise RuntimeError("Badly formatted alias string: %s" % (alias,))

        # verify that datasetType does not contain '@'
        if datasetType.count('@') != 0:
            raise RuntimeError("Badly formatted type string: %s" % (datasetType))

        # verify that the alias keyword does not overlap any existing alias keyword
        for key in self.datasetTypeAliasDict:
            if key.startswith(alias) or alias.startswith(key):
                raise RuntimeError("Alias: %s overlaps with existing alias: %s" % (alias, key))

        self.datasetTypeAliasDict[alias] = datasetType

    def getKeys(self, datasetType=None, level=None, tag=None):
        """Get the valid data id keys at or above the given level of hierarchy for the dataset type or the
        entire collection if None. The dict values are the basic Python types corresponding to the keys (int,
        float, string).

        Parameters
        ----------
        datasetType - string
            The type of dataset to get keys for, entire collection if None.
        level - string
            The hierarchy level to descend to. None if it should not be restricted. Use an empty string if the
            mapper should lookup the default level.
        tag - any, or list of any
            Any object that can be tested to be the same as the tag in a dataId passed into butler input
            functions. Applies only to input repositories: If tag is specified by the dataId then the repo
            will only be read from if the tag in the dataId matches a tag used for that repository.

        Returns
        -------
        Returns a dict. The dict keys are the valid data id keys at or above the given level of hierarchy for
        the dataset type or the entire collection if None. The dict values are the basic Python types
        corresponding to the keys (int, float, string).
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)

        keys = None
        tag = setify(tag)
        for repoData in self._repos.inputs():
            if not tag or len(tag.intersection(repoData.tags)) > 0:
                keys = repoData.repo.getKeys(datasetType, level)
                # An empty dict is a valid "found" condition for keys. None is not.
                if keys is not None:
                    break
        return keys

    def queryMetadata(self, datasetType, format, dataId={}, **rest):
        """Returns the valid values for one or more keys when given a partial
        input collection data id.

        Parameters
        ----------
        datasetType - string
            The type of dataset to inquire about.
        format - str, tuple
            Key or tuple of keys to be returned.
        dataId - DataId, dict
            The partial data id.
        **rest
            Keyword arguments for the partial data id.

        Returns
        -------
        A list of valid values or tuples of valid values as specified by the
        format.
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)
        format = sequencify(format)

        tuples = None
        for repoData in self._repos.inputs():
            if not dataId.tag or len(dataId.tag.intersection(repoData.tags)) > 0:
                tuples = repoData.repo.queryMetadata(datasetType, format, dataId)
                if tuples:
                    break

        if not tuples:
            return []

        if len(format) == 1:
            ret = []
            for x in tuples:
                try:
                    ret.append(x[0])
                except TypeError:
                    ret.append(x)
        else:
            ret = tuples

        return ret

    def datasetExists(self, datasetType, dataId={}, write=False, **rest):
        """Determines if a dataset file exists.

        Parameters
        ----------
        datasetType - string
            The type of dataset to inquire about.
        dataId - DataId, dict
            The data id of the dataset.
        write - bool
            If True, look only in locations where the dataset could be written,
            and return True only if it is present in all of them.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        exists - bool
            True if the dataset exists or is non-file-based.
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)
        locations = self._locate(datasetType, dataId, write=write)
        if not write:  # when write=True, locations is already a sequence
            if locations is None:
                return False
            locations = [locations]

        if not locations:  # empty list case
            return False

        for location in locations:
            # If the location is a ButlerComposite (as opposed to a ButlerLocation), verify the component
            # objects exist.
            if isinstance(location, ButlerComposite):
                for name, componentInfo in location.componentInfo.items():
                    if componentInfo.subset:
                        subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
                        exists = all([obj.datasetExists() for obj in subset])
                    else:
                        exists = self.datasetExists(componentInfo.datasetType, location.dataId)
                    if not exists:
                        return False
            else:
                if not location.repository.exists(location):
                    return False
        return True
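
    # Hedged usage sketch (dataset type and dataId keys are illustrative):
    #
    #     if butler.datasetExists('calexp', visit=1, ccd=2):
    #         calexp = butler.get('calexp', visit=1, ccd=2)
    #
    # With write=True the same call reports whether the dataset is already
    # present at every location a put() would target.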

    def _locate(self, datasetType, dataId, write):
        """Get one or more ButlerLocations and/or ButlerComposites.

        Parameters
        ----------
        datasetType : string
            The datasetType that is being searched for. The datasetType may be followed by a dot and
            a component name (component names are specified in the policy). IE datasetType.componentName

        dataId : dict or DataId class instance
            The dataId.

        write : bool
            True if this is a search to write an object. False if it is a search to read an object. This
            affects what type (an object or a container) is returned.

        Returns
        -------
        If write is False, will return either a single object or None. If write is True, will return a list
        (which may be empty)
        """
        repos = self._repos.outputs() if write else self._repos.inputs()
        locations = []
        for repoData in repos:
            # enforce dataId & repository tags when reading:
            if not write and dataId.tag and len(dataId.tag.intersection(repoData.tags)) == 0:
                continue
            components = datasetType.split('.')
            datasetType = components[0]
            components = components[1:]
            location = repoData.repo.map(datasetType, dataId, write=write)
            if location is None:
                continue
            location.datasetType = datasetType
            if len(components) > 0:
                if not isinstance(location, ButlerComposite):
                    raise RuntimeError("The location for a dotted datasetType must be a composite.")
                # replace the first component name with the datasetType
                components[0] = location.componentInfo[components[0]].datasetType
                # join components back into a dot-delimited string
                datasetType = '.'.join(components)
                location = self._locate(datasetType, dataId, write)
                # if a component location is not found, move on to the next repository.
                if location is None:
                    continue
            if not write:
                # If there is a bypass function for this dataset type, execute it and attach the result to
                # the location; if evaluating it fails, keep searching.
                if hasattr(location.mapper, "bypass_" + location.datasetType):
                    bypass = self._getBypassFunc(location, dataId)
                    try:
                        bypass = bypass()
                        location.bypass = bypass
                    except (NoResults, IOError):
                        self.log.debug("Continuing dataset search while evaluating "
                                       "bypass function for Dataset type:{} Data ID:{} at "
                                       "location {}".format(datasetType, dataId, location))
                # If a location was found but the object does not exist at that location, keep searching the
                # input repositories.
                if (isinstance(location, ButlerComposite) or hasattr(location, 'bypass')
                        or location.repository.exists(location)):
                    return location
            else:
                try:
                    locations.extend(location)
                except TypeError:
                    locations.append(location)
        if not write:
            return None
        return locations

    @staticmethod
    def _getBypassFunc(location, dataId):
        pythonType = location.getPythonType()
        if pythonType is not None:
            if isinstance(pythonType, basestring):
                pythonType = doImport(pythonType)
        bypassFunc = getattr(location.mapper, "bypass_" + location.datasetType)
        return lambda: bypassFunc(location.datasetType, pythonType, location, dataId)
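
    # Hedged sketch of the mapper hook used above: a mapper exposes a bypass
    # for a dataset type by defining a method named "bypass_<datasetType>".
    # MyMapper and 'someType' are illustrative, not part of the Butler API:
    #
    #     class MyMapper(Mapper):
    #         def bypass_someType(self, datasetType, pythonType, location, dataId):
    #             # return the fully-constructed object, skipping the normal read
    #             return ...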

    def get(self, datasetType, dataId=None, immediate=True, **rest):
        """Retrieves a dataset given an input collection data id.

        Parameters
        ----------
        datasetType - string
            The type of dataset to retrieve.
        dataId - dict
            The data id.
        immediate - bool
            If False use a proxy for delayed loading.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        An object retrieved from the dataset (or a proxy for one).
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        location = self._locate(datasetType, dataId, write=False)
        if location is None:
            raise NoResults("No locations for get:", datasetType, dataId)
        self.log.debug("Get type=%s keys=%s from %s", datasetType, dataId, str(location))

        if hasattr(location, 'bypass'):
            def callback():
                return location.bypass
        else:
            def callback():
                return self._read(location)
        if location.mapper.canStandardize(location.datasetType):
            innerCallback = callback

            def callback():
                return location.mapper.standardize(location.datasetType, innerCallback(), dataId)
        if immediate:
            return callback()
        return ReadProxy(callback)
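
    # Hedged usage sketch (dataset type and dataId keys are illustrative):
    #
    #     calexp = butler.get('calexp', visit=1, ccd=2)            # load now
    #     proxy = butler.get('calexp', dataId={'visit': 1, 'ccd': 2},
    #                        immediate=False)                       # lazy ReadProxy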

    def put(self, obj, datasetType, dataId={}, doBackup=False, **rest):
        """Persists a dataset given an output collection data id.

        Parameters
        ----------
        obj -
            The object to persist.
        datasetType - string
            The type of dataset to persist.
        dataId - dict
            The data id.
        doBackup - bool
            If True, rename existing instead of overwriting.
            WARNING: Setting doBackup=True is not safe for parallel processing, as it may be subject to race
            conditions.
        **rest
            Keyword arguments for the data id.
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        for location in self._locate(datasetType, dataId, write=True):
            if isinstance(location, ButlerComposite):
                disassembler = location.disassembler if location.disassembler else genericDisassembler
                disassembler(obj=obj, dataId=location.dataId, componentInfo=location.componentInfo)
                for name, info in location.componentInfo.items():
                    if not info.inputOnly:
                        self.put(info.obj, info.datasetType, location.dataId, doBackup=doBackup)
            else:
                if doBackup:
                    location.getRepository().backup(location.datasetType, dataId)
                location.getRepository().write(location, obj)
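
    # Hedged usage sketch, the mirror of get() above (names illustrative):
    #
    #     butler.put(calexp, 'calexp', visit=1, ccd=2)
    #     butler.put(calexp, 'calexp', {'visit': 1, 'ccd': 2}, doBackup=True)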

    def subset(self, datasetType, level=None, dataId={}, **rest):
        """Return complete dataIds for a dataset type that match a partial (or empty) dataId.

        Given a partial (or empty) dataId specified in dataId and **rest, find all datasets that match the
        dataId. Optionally restrict the results to a given level specified by a dataId key (e.g. visit or
        sensor or amp for a camera). Return an iterable collection of complete dataIds as ButlerDataRefs.
        Datasets with the resulting dataIds may not exist; that needs to be tested with datasetExists().

        Parameters
        ----------
        datasetType - string
            The type of dataset collection to subset
        level - string
            The level of dataId at which to subset. Use an empty string if the mapper should look up the
            default level.
        dataId - dict
            The data id.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        subset - ButlerSubset
            Collection of ButlerDataRefs for datasets matching the data id.

        Examples
        --------
        To print the full dataIds for all r-band measurements in a source catalog
        (note that the subset call is equivalent to: `butler.subset('src', dataId={'filter':'r'})`):

        >>> subset = butler.subset('src', filter='r')
        >>> for data_ref in subset: print(data_ref.dataId)
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)

        # Currently expected behavior of subset is that if specified level is None then the mapper's default
        # level should be used. Convention for level within Butler is that an empty string is used to indicate
        # the default level.
        if level is None:
            level = ''

        dataId = DataId(dataId)
        dataId.update(**rest)
        return ButlerSubset(self, datasetType, level, dataId)

    def dataRef(self, datasetType, level=None, dataId={}, **rest):
        """Returns a single ButlerDataRef.

        Given a complete dataId specified in dataId and **rest, find the unique dataset at the given level
        specified by a dataId key (e.g. visit or sensor or amp for a camera) and return a ButlerDataRef.

        Parameters
        ----------
        datasetType - string
            The type of dataset collection to reference
        level - string
            The level of dataId at which to reference
        dataId - dict
            The data id.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        dataRef - ButlerDataRef
            ButlerDataRef for dataset matching the data id
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        subset = self.subset(datasetType, level, dataId, **rest)
        if len(subset) != 1:
            raise RuntimeError("No unique dataset for: Dataset type:%s Level:%s Data ID:%s Keywords:%s" %
                               (str(datasetType), str(level), str(dataId), str(rest)))
        return ButlerDataRef(subset, subset.cache[0])

    def getUri(self, datasetType, dataId=None, write=False, **rest):
        """Return the URI for a dataset

        .. warning:: This is intended only for debugging. The URI should
            never be used for anything other than printing.

        .. note:: In the event there are multiple URIs for read, we return only
            the first.

        .. note:: getUri() does not currently support composite datasets.

        Parameters
        ----------
        datasetType : `str`
            The dataset type of interest.
        dataId : `dict`, optional
            The data identifier.
        write : `bool`, optional
            Return the URI for writing?
        rest : `dict`, optional
            Keyword arguments for the data id.

        Returns
        -------
        uri : `str`
            URI for the dataset.
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)
        locations = self._locate(datasetType, dataId, write=write)
        if locations is None:
            raise NoResults("No locations for getUri: ", datasetType, dataId)

        if write:
            # Follow the write path.
            # Return the first valid write location.
            for location in locations:
                if isinstance(location, ButlerComposite):
                    for name, info in location.componentInfo.items():
                        if not info.inputOnly:
                            return self.getUri(info.datasetType, location.dataId, write=True)
                else:
                    return location.getLocationsWithRoot()[0]
            # fall back to raise
            raise NoResults("No locations for getUri(write=True): ", datasetType, dataId)
        else:
            # Follow the read path, only return the first valid read
            return locations.getLocationsWithRoot()[0]

    def _read(self, location):
        """Unpersist an object using data inside a ButlerLocation or ButlerComposite object.

        Parameters
        ----------
        location : ButlerLocation or ButlerComposite
            A ButlerLocation or ButlerComposite instance populated with data needed to read the object.

        Returns
        -------
        object
            An instance of the object specified by the location.
        """
        self.log.debug("Starting read from %s", location)

        if isinstance(location, ButlerComposite):
            for name, componentInfo in location.componentInfo.items():
                if componentInfo.subset:
                    subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
                    componentInfo.obj = [obj.get() for obj in subset]
                else:
                    obj = self.get(componentInfo.datasetType, location.dataId, immediate=True)
                    componentInfo.obj = obj
            assembler = location.assembler or genericAssembler
            results = assembler(dataId=location.dataId, componentInfo=location.componentInfo,
                                cls=location.python)
        else:
            results = location.repository.read(location)
            if len(results) == 1:
                results = results[0]
        self.log.debug("Ending read from %s", location)
        return results

    def __reduce__(self):
        ret = (_unreduce, (self._initArgs, self.datasetTypeAliasDict))
        return ret

    def _resolveDatasetTypeAlias(self, datasetType):
        """Replaces all the known alias keywords in the given string with the alias value.

        Parameters
        ----------
        datasetType - string
            A datasetType string to search & replace on

        Returns
        -------
        datasetType - string
            The de-aliased string
        """
        for key in self.datasetTypeAliasDict:
            # if all aliases have been replaced, bail out
            if datasetType.find('@') == -1:
                break
            datasetType = datasetType.replace(key, self.datasetTypeAliasDict[key])

        # If an alias specifier can not be resolved then throw.
        if datasetType.find('@') != -1:
            raise RuntimeError("Unresolvable alias specifier in datasetType: %s" % (datasetType))

        return datasetType


def _unreduce(initArgs, datasetTypeAliasDict):
    mapperArgs = initArgs.pop('mapperArgs')
    initArgs.update(mapperArgs)
    butler = Butler(**initArgs)
    butler.datasetTypeAliasDict = datasetTypeAliasDict
    return butler
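
# Hedged usage sketch of the __reduce__/_unreduce pickle support above: a
# Butler round-trips through pickle by re-running __init__ with the saved
# arguments and restoring the alias dict (the repository path is illustrative):
#
#     import pickle
#     butler = Butler(inputs='/data/repo')
#     butler2 = pickle.loads(pickle.dumps(butler))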