27 """This module defines the Butler class.""" 28 from builtins
import str, super
29 from past.builtins
import basestring
30 from builtins
import object
39 from .
import ReadProxy, ButlerSubset, ButlerDataRef, Persistence, \
40 Storage, Policy, NoResults, Repository, DataId, RepositoryCfg, \
41 RepositoryArgs, listify, setify, sequencify, doImport, ButlerComposite, genericAssembler, \
42 genericDisassembler, PosixStorage, ParentsMismatch
preinitedMapperWarning = ("Passing an instantiated mapper into " +
                          "Butler.__init__ will prevent Butler from passing " +
                          "parentRegistry or repositoryCfg information to " +
                          "the mapper, which is done only at init time. " +
                          "It is better to pass an importable string or " +
                          "class object.")
53 """Represents a Butler configuration. 57 cfg is 'wet paint' and very likely to change. Use of it in production 58 code other than via the 'old butler' API is strongly discouraged. 60 yaml_tag =
u"!ButlerCfg" 63 super().
__init__({
'repoCfg': repoCfg,
'cls': cls})
67 """Container object for repository data used by Butler 72 The arguments that are used to find or create the RepositoryCfg. 74 "input", "output", or "parent", indicating why Butler loaded this repository. 75 * input: the Repository was passed as a Butler input. 76 * output: the Repository was passed as a Butler output. 77 * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository. 82 The configuration for the Repository. 85 "new", "existing", or "nested". Indicates the origin of the repository and its RepositoryCfg: 86 * new: it was created by this instance of Butler, and this instance of Butler will generate the 88 * existing: it was found (via the root or cfgRoot argument) 89 * nested: the full RepositoryCfg was nested in another RepositoryCfg's parents list (this can happen 90 if parameters of an input specified by RepositoryArgs or dict does not entirely match an existing 94 Path or URI to the location of the RepositoryCfg file. 96 repo : lsst.daf.persistence.Repository 97 The Repository class instance. 99 parentRepoDatas : list of RepoData 100 The parents of this Repository, as indicated this Repository's RepositoryCfg. If this is a new 101 Repository then these are the inputs to this Butler (and will be saved in the RepositoryCfg). These 102 RepoData objects are not owned by this RepoData, these are references to peer RepoData objects in the 103 Butler's RepoDataContainer. 105 isV1Repository : bool 106 True if this is an Old Butler repository. In this case the repository does not have a RepositoryCfg 107 file. It may have a _mapper file and may have a _parent symlink. It will never be treated as a "new" 108 repository, i.e. even though there is not a RepositoryCfg file, one will not be generated. 109 If False, this is a New Butler repository and is specified by RepositoryCfg file. 112 These are values that may be used to restrict the search of input repositories. Details are available 113 in the RepositoryArgs and DataId classes. 116 "input", "output", or "parent", indicating why Butler loaded this repository. 117 * input: the Repository was passed as a Butler input. 118 * output: the Repository was passed as a Butler output. 119 * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository. 121 _repoArgs : RepositoryArgs 122 Contains the arguments that were used to specify this Repository. 152 "parentRepoDatas={}," +
155 "parentRegistry={})").format(
156 self.__class__.__name__,
    def setCfg(self, cfg, origin, root, isV1Repository):
        """Set information about the cfg into the RepoData

        Parameters
        ----------
        cfg : RepositoryCfg
            The RepositoryCfg for the repo.
        origin : string
            'new', 'existing', or 'nested'
        root : string
            URI or absolute path to the location of the RepositoryCfg.yaml file.

        Returns
        -------
        None
        """
        if origin not in ('new', 'existing', 'nested'):
            raise RuntimeError("Invalid value for origin:{}".format(origin))
        self._cfg = cfg
        self._cfgOrigin = origin
        self.cfgRoot = root
        self.isV1Repository = isV1Repository
    @property
    def cfg(self):
        return self._cfg

    @property
    def cfgOrigin(self):
        return self._cfgOrigin

    @property
    def isNewRepository(self):
        return self.cfgOrigin == 'new'

    @property
    def repoData(self):
        return self

    @property
    def role(self):
        return self._role

    @role.setter
    def role(self, val):
        if val not in ('input', 'output', 'parent'):
            raise RuntimeError("Invalid value for role: {}".format(val))
        self._role = val
210 """Get the parents & grandparents etc of this repo data, in depth-first search order. 212 Duplicate entries will be removed in cases where the same parent appears more than once in the parent 217 context : set, optional 218 Users should typically omit context and accept the default argument. Context is used to keep a set 219 of known RepoDatas when calling this function recursively, for duplicate elimination. 224 A list of the parents & grandparents etc of a given repo data, in depth-first search order. 229 if id(self)
in context:
231 context.add(id(self))
233 parents.append(parent)
234 parents += parent.getParentRepoDatas(context)
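    # A minimal sketch of the traversal above (the repo wiring is hypothetical,
    # not from the original source): given RepoData instances a, b, c with
    # a.addParentRepoData(b) and b.addParentRepoData(c), then
    # a.getParentRepoDatas() returns [b, c]. In a diamond graph where the same
    # grandparent is reachable through two parents, it is reported only once,
    # at its first (depth-first) visit.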
245 """Container object for RepoData instances owned by a Butler instance. 249 repoDataList : list of RepoData 250 repoData - RepoData instance to add 256 self.
_all = repoDataList
260 """Get a list of RepoData that are used to as inputs to the Butler. 261 The list is created lazily as needed, and cached. 265 A list of RepoData with readable repositories, in the order to be used when searching. 268 raise RuntimeError(
"Inputs not yet initialized.")
272 """Get a list of RepoData that are used to as outputs to the Butler. 273 The list is created lazily as needed, and cached. 277 A list of RepoData with writable repositories, in the order to be use when searching. 280 raise RuntimeError(
"Outputs not yet initialized.")
284 """Get a list of all RepoData that are used to as by the Butler. 285 The list is created lazily as needed, and cached. 289 A list of RepoData with writable repositories, in the order to be use when searching. 294 return "%s(_inputs=%r, \n_outputs=%s, \n_all=%s)" % (
295 self.__class__.__name__,
    def _buildLookupLists(self):
        """Build the inputs and outputs lists based on the order of self.all()."""

        def addToList(repoData, lst):
            """Add a repoData and each of its parents (depth first) to a list"""
            if id(repoData) in alreadyAdded:
                return
            lst.append(repoData)
            alreadyAdded.add(id(repoData))
            for parent in repoData.parentRepoDatas:
                addToList(parent, lst)

        if self._inputs is not None or self._outputs is not None:
            raise RuntimeError("Lookup lists are already built.")

        inputs = [repoData for repoData in self.all() if repoData.role == 'input']
        outputs = [repoData for repoData in self.all() if repoData.role == 'output']

        self._inputs = []
        alreadyAdded = set()
        for repoData in outputs:
            if 'r' in repoData.repoArgs.mode:
                addToList(repoData.repoData, self._inputs)
        for repoData in inputs:
            addToList(repoData.repoData, self._inputs)

        self._outputs = [repoData.repoData for repoData in outputs]
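# A hedged usage sketch for RepoDataContainer (the setup is hypothetical, not
# from the original source): given RepoData objects out1 (role 'output', mode
# 'rw') and in1 (role 'input'), RepoDataContainer([out1, in1]) builds lookup
# lists so that inputs() begins with the readable output and its parents
# (depth first), followed by in1 and its parents, while outputs() contains
# only out1.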
327 """Butler provides a generic mechanism for persisting and retrieving data using mappers. 329 A Butler manages a collection of datasets known as a repository. Each dataset has a type representing its 330 intended usage and a location. Note that the dataset type is not the same as the C++ or Python type of the 331 object containing the data. For example, an ExposureF object might be used to hold the data for a raw 332 image, a post-ISR image, a calibrated science image, or a difference image. These would all be different 335 A Butler can produce a collection of possible values for a key (or tuples of values for multiple keys) if 336 given a partial data identifier. It can check for the existence of a file containing a dataset given its 337 type and data identifier. The Butler can then retrieve the dataset. Similarly, it can persist an object to 338 an appropriate location when given its associated data identifier. 340 Note that the Butler has two more advanced features when retrieving a data set. First, the retrieval is 341 lazy. Input does not occur until the data set is actually accessed. This allows datasets to be retrieved 342 and placed on a clipboard prospectively with little cost, even if the algorithm of a stage ends up not 343 using them. Second, the Butler will call a standardization hook upon retrieval of the dataset. This 344 function, contained in the input mapper object, must perform any necessary manipulations to force the 345 retrieved object to conform to standards, including translating metadata. 349 __init__(self, root, mapper=None, **mapperArgs) 351 defineAlias(self, alias, datasetType) 353 getKeys(self, datasetType=None, level=None) 355 queryMetadata(self, datasetType, format=None, dataId={}, **rest) 357 datasetExists(self, datasetType, dataId={}, **rest) 359 get(self, datasetType, dataId={}, immediate=False, **rest) 361 put(self, obj, datasetType, dataId={}, **rest) 363 subset(self, datasetType, level=None, dataId={}, **rest) 365 dataRef(self, datasetType, level=None, dataId={}, **rest) 369 The preferred method of initialization is to use the `inputs` and `outputs` __init__ parameters. These 370 are described in the parameters section, below. 372 For backward compatibility: this initialization method signature can take a posix root path, and 373 optionally a mapper class instance or class type that will be instantiated using the mapperArgs input 374 argument. However, for this to work in a backward compatible way it creates a single repository that is 375 used as both an input and an output repository. This is NOT preferred, and will likely break any 376 provenance system we have in place. 381 .. note:: Deprecated in 12_0 382 `root` will be removed in TBD, it is replaced by `inputs` and `outputs` for 383 multiple-repository support. 384 A file system path. Will only work with a PosixRepository. 385 mapper : string or instance 386 .. note:: Deprecated in 12_0 387 `mapper` will be removed in TBD, it is replaced by `inputs` and `outputs` for 388 multiple-repository support. 389 Provides a mapper to be used with Butler. 391 .. note:: Deprecated in 12_0 392 `mapperArgs` will be removed in TBD, it is replaced by `inputs` and `outputs` for 393 multiple-repository support. 394 Provides arguments to be passed to the mapper if the mapper input argument is a class type to be 395 instantiated by Butler. 396 inputs : RepositoryArgs, dict, or string 397 Can be a single item or a list. Provides arguments to load an existing repository (or repositories). 
398 String is assumed to be a URI and is used as the cfgRoot (URI to the location of the cfg file). (Local 399 file system URI does not have to start with 'file://' and in this way can be a relative path). The 400 `RepositoryArgs` class can be used to provide more parameters with which to initialize a repository 401 (such as `mapper`, `mapperArgs`, `tags`, etc. See the `RepositoryArgs` documentation for more 402 details). A dict may be used as shorthand for a `RepositoryArgs` class instance. The dict keys must 403 match parameters to the `RepositoryArgs.__init__` function. 404 outputs : RepositoryArgs, dict, or string 405 Provides arguments to load one or more existing repositories or create new ones. The different types 406 are handled the same as for `inputs`. 408 The Butler init sequence loads all of the input and output repositories. 409 This creates the object hierarchy to read from and write to them. Each 410 repository can have 0 or more parents, which also get loaded as inputs. 411 This becomes a DAG of repositories. Ultimately, Butler creates a list of 412 these Repositories in the order that they are used. 414 Initialization Sequence 415 ======================= 417 During initialization Butler creates a Repository class instance & support structure for each object 418 passed to `inputs` and `outputs` as well as the parent repositories recorded in the `RepositoryCfg` of 419 each existing readable repository. 421 This process is complex. It is explained below to shed some light on the intent of each step. 423 1. Input Argument Standardization 424 --------------------------------- 426 In `Butler._processInputArguments` the input arguments are verified to be legal (and a RuntimeError is 427 raised if not), and they are converted into an expected format that is used for the rest of the Butler 428 init sequence. See the docstring for `_processInputArguments`. 430 2. Create RepoData Objects 431 -------------------------- 433 Butler uses an object, called `RepoData`, to keep track of information about each repository; each 434 repository is contained in a single `RepoData`. The attributes are explained in its docstring. 436 After `_processInputArguments`, a RepoData is instantiated and put in a list for each repository in 437 `outputs` and `inputs`. This list of RepoData, the `repoDataList`, now represents all the output and input 438 repositories (but not parent repositories) that this Butler instance will use. 440 3. Get `RepositoryCfg`s 441 ----------------------- 443 `Butler._getCfgs` gets the `RepositoryCfg` for each repository the `repoDataList`. The behavior is 444 described in the docstring. 449 `Butler._addParents` then considers the parents list in the `RepositoryCfg` of each `RepoData` in the 450 `repoDataList` and inserts new `RepoData` objects for each parent not represented in the proper location 451 in the `repoDataList`. Ultimately a flat list is built to represent the DAG of readable repositories 452 represented in depth-first order. 454 5. Set and Verify Parents of Outputs 455 ------------------------------------ 457 To be able to load parent repositories when output repositories are used as inputs, the input repositories 458 are recorded as parents in the `RepositoryCfg` file of new output repositories. When an output repository 459 already exists, for consistency the Butler's inputs must match the list of parents specified the already- 460 existing output repository's `RepositoryCfg` file. 
462 In `Butler._setAndVerifyParentsLists`, the list of parents is recorded in the `RepositoryCfg` of new 463 repositories. For existing repositories the list of parents is compared with the `RepositoryCfg`'s parents 464 list, and if they do not match a `RuntimeError` is raised. 466 6. Set the Default Mapper 467 ------------------------- 469 If all the input repositories use the same mapper then we can assume that mapper to be the 470 "default mapper". If there are new output repositories whose `RepositoryArgs` do not specify a mapper and 471 there is a default mapper then the new output repository will be set to use that default mapper. 473 This is handled in `Butler._setDefaultMapper`. 475 7. Cache References to Parent RepoDatas 476 --------------------------------------- 478 In `Butler._connectParentRepoDatas`, in each `RepoData` in `repoDataList`, a list of `RepoData` object 479 references is built that matches the parents specified in that `RepoData`'s `RepositoryCfg`. 481 This list is used later to find things in that repository's parents, without considering peer repository's 482 parents. (e.g. finding the registry of a parent) 487 Tags are described at https://ldm-463.lsst.io/v/draft/#tagging 489 In `Butler._setRepoDataTags`, for each `RepoData`, the tags specified by its `RepositoryArgs` are recorded 490 in a set, and added to the tags set in each of its parents, for ease of lookup when mapping. 492 9. Find Parent Registry and Instantiate RepoData 493 ------------------------------------------------ 495 At this point there is enough information to instantiate the `Repository` instances. There is one final 496 step before instantiating the Repository, which is to try to get a parent registry that can be used by the 497 child repository. The criteria for "can be used" is spelled out in `Butler._setParentRegistry`. However, 498 to get the registry from the parent, the parent must be instantiated. The `repoDataList`, in depth-first 499 search order, is built so that the most-dependent repositories are first, and the least dependent 500 repositories are last. So the `repoDataList` is reversed and the Repositories are instantiated in that 501 order; for each RepoData a parent registry is searched for, and then the Repository is instantiated with 502 whatever registry could be found.""" 504 def __init__(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
    def __init__(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
        self._initArgs = {'root': root, 'mapper': mapper, 'inputs': inputs, 'outputs': outputs,
                          'mapperArgs': mapperArgs}

        self.log = Log.getLogger("daf.persistence.butler")

        self.datasetTypeAliasDict = {}
        self.storage = Storage()

        inputs, outputs = self._processInputArguments(
            root=root, mapper=mapper, inputs=inputs, outputs=outputs, **mapperArgs)

        # convert the RepositoryArgs into RepoData
        inputs = [RepoData(args, 'input') for args in inputs]
        outputs = [RepoData(args, 'output') for args in outputs]
        repoDataList = outputs + inputs

        self._getCfgs(repoDataList)
        self._addParents(repoDataList)
        self._setAndVerifyParentsLists(repoDataList)
        self._setDefaultMapper(repoDataList)
        self._connectParentRepoDatas(repoDataList)

        self._repos = RepoDataContainer(repoDataList)
        self._setRepoDataTags()

        for repoData in reversed(repoDataList):
            self._setParentRegistry(repoData)
            # instantiate the Repository for this RepoData, with whatever parent registry was found.
            repoData.repo = Repository(repoData.repoData)
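    # A minimal construction sketch (the repository paths below are
    # hypothetical placeholders, not from the original source):
    #
    #   butler = Butler(inputs='/datasets/inputRepo',
    #                   outputs=RepositoryArgs(root='/datasets/outputRepo', mode='rw'))
    #
    # A string input is treated as the cfgRoot of an existing repository;
    # RepositoryArgs allows passing mapper, mapperArgs, tags, and mode as well.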
    def _setParentRegistry(self, repoData):
        """Try to get a parent registry that can be used by this repository. To be usable the repository must
        "match", meaning the mapper in the passed-in repo is the same type as the mapper in the parent.
        """

        def getParentRegistry(repoData, context):
            """Get the first found registry that matches the passed-in repo.

            Parameters
            ----------
            repoData : RepoData
                The RepoData for the repository for which we are searching for a
                parent registry.

            Returns
            -------
            Registry or None
                A registry from a parent if one can be found, or None.

            Raises
            ------
            RuntimeError
                Indicates a butler init order problem, all parents should be initialized before child
                repositories, so this function should be able to get any parent of any child repo.
            """
            if id(repoData) in context:
                return None
            context.add(id(repoData))
            for parentRepoData in repoData.getParentRepoDatas():
                if parentRepoData.cfg.mapper == repoData.cfg.mapper:
                    if parentRepoData.repo is None:
                        self.log.debug(
                            "_getParentRegistry: Parent {} of new repo {} not yet created, ignoring.".format(
                                parentRepoData, repoData))
                    else:
                        parentRegistry = parentRepoData.repo.getRegistry()
                        if parentRegistry:
                            return parentRegistry
                parentRegistry = getParentRegistry(parentRepoData, context)
                if parentRegistry is not None:
                    return parentRegistry
            return None

        repoData.repoData.parentRegistry = getParentRegistry(repoData.repoData, set())
    def _processInputArguments(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
        """Process, verify, and standardize the input arguments.

        * Inputs can not be for Old Butler (root, mapper, mapperArgs) AND New Butler (inputs, outputs).

          `root`, `mapper`, and `mapperArgs` are Old Butler init API.
          `inputs` and `outputs` are New Butler init API.
          Old Butler and New Butler init API may not be mixed, Butler may be initialized with only the Old
          arguments or the New arguments.
        * Verify that if there is a readable output that there is exactly one output. (This restriction is in
          place because all readable repositories must be parents of writable repositories, and for
          consistency the DAG of readable repositories must always be the same. Keeping the list of parents
          becomes very complicated in the presence of multiple readable output repositories. It is better to
          only write to output repositories, and then create a new Butler instance and use the outputs as
          inputs, and write to new output repositories.)
        * Make a copy of inputs & outputs so they may be modified without changing the passed-in arguments.
        * Convert any input/output values that are URI strings to RepositoryArgs.
        * Listify inputs & outputs.
        * Set default RW mode on inputs & outputs as needed.

        Parameters
        ----------
        Same as Butler.__init__

        Returns
        -------
        (list of RepositoryArgs, list of RepositoryArgs)
            First item is a list to use as inputs.
            Second item is a list to use as outputs.

        Raises
        ------
        RuntimeError
            If Old Butler and New Butler arguments are both used this will raise.
            If an output is readable and there is more than one output this will raise.
        """
        # inputs and outputs may be modified; do not change the external value.
        inputs = copy.deepcopy(inputs)
        outputs = copy.deepcopy(outputs)

        isV1Args = inputs is None and outputs is None
        if isV1Args:
            inputs, outputs = self._convertV1Args(root=root,
                                                  mapper=mapper,
                                                  mapperArgs=mapperArgs or None)
        elif root or mapper or mapperArgs:
            raise RuntimeError(
                'Butler version 1 API (root, mapper, **mapperArgs) may ' +
                'not be used with version 2 API (inputs, outputs)')

        # make sure inputs and outputs are lists, and if list items are strings convert them to
        # RepositoryArgs.
        inputs = listify(inputs)
        outputs = listify(outputs)
        inputs = [RepositoryArgs(cfgRoot=args)
                  if not isinstance(args, RepositoryArgs) else args for args in inputs]
        outputs = [RepositoryArgs(cfgRoot=args)
                   if not isinstance(args, RepositoryArgs) else args for args in outputs]

        # Set the default mode of inputs & outputs, and verify the required values
        # ('r' for inputs, 'w' for outputs).
        for args in inputs:
            if args.mode is None:
                args.mode = 'r'
            elif 'rw' == args.mode:
                args.mode = 'r'
            elif 'r' != args.mode:
                raise RuntimeError("The mode of an input should be readable.")
        for args in outputs:
            if args.mode is None:
                args.mode = 'w'
            elif 'w' not in args.mode:
                raise RuntimeError("The mode of an output should be writable.")

        # check for class instances in args.mapper (not allowed)
        for args in inputs + outputs:
            if (args.mapper and not isinstance(args.mapper, basestring) and
                    not inspect.isclass(args.mapper)):
                self.log.warn(preinitedMapperWarning)

        # if there are multiple outputs, none of them may be readable.
        if len(outputs) > 1:
            for args in outputs:
                if 'r' in args.mode:
                    raise RuntimeError("Butler does not support multiple output repositories if any of the "
                                       "outputs are readable.")

        # Handle the case where the output is readable and is also passed in as one of the inputs by removing
        # the input.
        def inputIsInOutputs(inputArgs, outputArgsList):
            for o in outputArgsList:
                if ('r' in o.mode and
                        o.root == inputArgs.root and
                        o.mapper == inputArgs.mapper and
                        o.mapperArgs == inputArgs.mapperArgs and
                        o.tags == inputArgs.tags and
                        o.policy == inputArgs.policy):
                    self.log.debug(("Input repositoryArgs {} is also listed in outputs as readable; " +
                                    "throwing away the input.").format(inputArgs))
                    return True
            return False

        inputs = [args for args in inputs if not inputIsInOutputs(args, outputs)]
        return inputs, outputs
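    # Equivalence sketch for the two init APIs handled above (the path and
    # mapper name are hypothetical, not from the original source): the Old
    # Butler call
    #   Butler(root='/repo', mapper=SomeMapper)
    # is converted by _convertV1Args into the New Butler form
    #   Butler(outputs=RepositoryArgs(mode='rw', root='/repo', mapper=SomeMapper))
    # with no separate inputs; mixing the two calling styles raises RuntimeError.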
    @staticmethod
    def _getParentVal(repoData):
        """Get the value of this repoData as it should appear in the parents
        list of other repositories"""
        if repoData.isV1Repository:
            return repoData.cfg
        if repoData.cfgOrigin == 'nested':
            return repoData.cfg
        else:
            return repoData.cfg.root

    @staticmethod
    def _getParents(ofRepoData, repoInfo):
        """Create a parents list of repoData from inputs and (readable) outputs."""
        parents = []
        for repoData in repoInfo:
            if repoData is ofRepoData:
                continue
            if 'r' not in repoData.repoArgs.mode:
                continue
            parents.append(Butler._getParentVal(repoData))
        return parents
    @staticmethod
    def _getOldButlerRepositoryCfg(repositoryArgs):
        if not Storage.isPosix(repositoryArgs.cfgRoot):
            return None
        if not PosixStorage.v1RepoExists(repositoryArgs.cfgRoot):
            return None
        if not repositoryArgs.mapper:
            repositoryArgs.mapper = PosixStorage.getMapperClass(repositoryArgs.cfgRoot)
        cfg = RepositoryCfg.makeFromArgs(repositoryArgs)
        parent = PosixStorage.getParentSymlinkPath(repositoryArgs.cfgRoot)
        if parent:
            parent = Butler._getOldButlerRepositoryCfg(RepositoryArgs(cfgRoot=parent, mode='r'))
            if parent is not None:
                cfg.addParents([parent])
        return cfg
    def _getRepositoryCfg(self, repositoryArgs):
        """Try to get a repository from the location described by cfgRoot.

        Parameters
        ----------
        repositoryArgs : RepositoryArgs or string
            Provides arguments to load an existing repository (or repositories). String is assumed to be a URI
            and is used as the cfgRoot (URI to the location of the cfg file).

        Returns
        -------
        (RepositoryCfg or None, bool)
            The RepositoryCfg, or None if one cannot be found, and True if the RepositoryCfg was created by
            reading an Old Butler repository, or False if it is a New Butler Repository.
        """
        if not isinstance(repositoryArgs, RepositoryArgs):
            repositoryArgs = RepositoryArgs(cfgRoot=repositoryArgs)
        cfg = self.storage.getRepositoryCfg(repositoryArgs.cfgRoot)
        isOldButlerRepository = False
        if cfg is None:
            cfg = Butler._getOldButlerRepositoryCfg(repositoryArgs)
            if cfg is not None:
                isOldButlerRepository = True
        return cfg, isOldButlerRepository
    def _getCfgs(self, repoDataList):
        """Get or make a RepositoryCfg for each RepoData, and add the cfg to the RepoData.

        If the cfg exists, compare values. If the values match then use the cfg as an "existing" cfg. If the
        values do not match, use the cfg as a "nested" cfg.
        If the cfg does not exist, the RepositoryArgs must be for a writable repository.

        Parameters
        ----------
        repoDataList : list of RepoData
            The RepoData that are output and inputs of this Butler

        Raises
        ------
        RuntimeError
            If the passed-in RepositoryArgs indicate an existing repository but other cfg parameters in those
            RepositoryArgs do not match the existing repository's cfg a RuntimeError will be raised.
        """
        def cfgMatchesArgs(args, cfg):
            """Test if there are any values in an RepositoryArgs that conflict with the values in a cfg"""
            if args.mapper is not None and cfg.mapper != args.mapper:
                return False
            if args.mapperArgs is not None and cfg.mapperArgs != args.mapperArgs:
                return False
            if args.policy is not None and cfg.policy != args.policy:
                return False
            return True

        for repoData in repoDataList:
            cfg, isOldButlerRepository = self._getRepositoryCfg(repoData.repoArgs)
            if cfg is None:
                if 'w' not in repoData.repoArgs.mode:
                    raise RuntimeError(
                        "No cfg found for read-only input repository at {}".format(repoData.repoArgs.cfgRoot))
                repoData.setCfg(cfg=RepositoryCfg.makeFromArgs(repoData.repoArgs),
                                origin='new',
                                root=repoData.repoArgs.cfgRoot,
                                isV1Repository=isOldButlerRepository)
            else:
                if 'w' in repoData.repoArgs.mode:
                    # if it's an output repository, the args must match the existing cfg.
                    if not cfgMatchesArgs(repoData.repoArgs, cfg):
                        raise RuntimeError(("The RepositoryArgs and RepositoryCfg must match for writable " +
                                            "repositories, RepositoryCfg:{}, RepositoryArgs:{}").format(
                                                cfg, repoData.repoArgs))
                    repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
                                    isV1Repository=isOldButlerRepository)
                else:
                    # if it's an input repository and the args do not match the cfg, use the cfg as "nested".
                    if cfgMatchesArgs(repoData.repoArgs, cfg):
                        repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
                                        isV1Repository=isOldButlerRepository)
                    else:
                        repoData.setCfg(cfg=cfg, origin='nested', root=None,
                                        isV1Repository=isOldButlerRepository)
    def _addParents(self, repoDataList):
        """For each repoData in the input list, see if its parents are the next items in the list, and if not
        add the parent, so that the repoDataList includes parents and is in order to operate depth-first 0..n.

        Parameters
        ----------
        repoDataList : list of RepoData
            The RepoData for the Butler outputs + inputs.

        Raises
        ------
        RuntimeError
            Raised if a RepositoryCfg can not be found at a location where a parent repository should be.
        """
        repoDataIdx = 0
        while True:
            if repoDataIdx == len(repoDataList):
                break
            repoData = repoDataList[repoDataIdx]
            if 'r' not in repoData.repoArgs.mode:
                repoDataIdx += 1
                continue
            if repoData.isNewRepository:
                repoDataIdx += 1
                continue
            if repoData.cfg.parents is None:
                repoDataIdx += 1
                continue
            for repoParentIdx, repoParent in enumerate(repoData.cfg.parents):
                parentIdxInRepoDataList = repoDataIdx + repoParentIdx + 1
                if not isinstance(repoParent, RepositoryCfg):
                    repoParentCfg, isOldButlerRepository = self._getRepositoryCfg(repoParent)
                    if repoParentCfg is not None:
                        cfgOrigin = 'existing'
                    else:
                        raise RuntimeError(
                            "Could not find a RepositoryCfg for parent repository at {}".format(repoParent))
                else:
                    isOldButlerRepository = False
                    repoParentCfg = repoParent
                    cfgOrigin = 'nested'
                if (parentIdxInRepoDataList < len(repoDataList) and
                        repoDataList[parentIdxInRepoDataList].cfg == repoParentCfg):
                    continue
                args = RepositoryArgs(cfgRoot=repoParentCfg.root, mode='r')
                role = 'input' if repoData.role == 'output' else 'parent'
                newRepoInfo = RepoData(args, role)
                newRepoInfo.repoData.setCfg(cfg=repoParentCfg, origin=cfgOrigin, root=args.cfgRoot,
                                            isV1Repository=isOldButlerRepository)
                repoDataList.insert(parentIdxInRepoDataList, newRepoInfo)
            repoDataIdx += 1
    def _setAndVerifyParentsLists(self, repoDataList):
        """Make a list of all the input repositories of this Butler, these are the parents of the outputs.
        For new output repositories, set the parents in the RepositoryCfg. For existing output repositories
        verify that the RepositoryCfg's parents match the parents list.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            If an existing output repository is loaded and its parents do not match the parents of this Butler
            an error will be raised.
        """
        def getIOParents(ofRepoData, repoDataList):
            """Make a parents list for the repo in `ofRepoData` that is comprised of inputs and readable
            outputs (not parents-of-parents) of this butler"""
            parents = []
            for repoData in repoDataList:
                if repoData.role == 'parent':
                    continue
                if repoData is ofRepoData:
                    continue
                if repoData.role == 'output':
                    if 'r' in repoData.repoArgs.mode:
                        raise RuntimeError("If an output is readable it must be the only output.")
                    # and write-only outputs are not parents.
                    continue
                parents.append(Butler._getParentVal(repoData))
            return parents

        for repoData in repoDataList:
            if repoData.role != 'output':
                continue
            parents = getIOParents(repoData, repoDataList)
            # if the repoData is new, add the parent RepositoryCfgs to it.
            if repoData.cfgOrigin == 'new':
                repoData.cfg.addParents(parents)
            elif repoData.cfgOrigin in ('existing', 'nested'):
                if repoData.cfg.parents != parents:
                    try:
                        repoData.cfg.extendParents(parents)
                    except ParentsMismatch as e:
                        raise RuntimeError(("Inputs of this Butler:{} do not match parents of existing " +
                                            "writable cfg:{} (ParentsMismatch exception: {})").format(
                                                parents, repoData.cfg.parents, e))
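    # Consistency sketch for the check above (the paths are hypothetical, not
    # from the original source): if the output repository at '/out' was first
    # created by Butler(inputs='/inA', outputs='/out'), then reopening it with
    # Butler(inputs='/inB', outputs='/out') raises RuntimeError here, because
    # the existing RepositoryCfg at '/out' records '/inA' as its parent and
    # extendParents cannot reconcile the two lists.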
    def _setDefaultMapper(self, repoDataList):
        """Establish a default mapper if there is one and assign it to outputs that do not have a mapper
        assigned.

        If all inputs have the same mapper it will be used as the default mapper.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            If a default mapper can not be established and there is an output that does not have a mapper.
        """
        needyOutputs = [rd for rd in repoDataList if rd.role == 'output' and rd.cfg.mapper is None]
        if len(needyOutputs) == 0:
            return
        mappers = set([rd.cfg.mapper for rd in repoDataList if rd.role == 'input'])
        if len(mappers) != 1:
            inputs = [rd for rd in repoDataList if rd.role == 'input']
            raise RuntimeError(
                ("No default mapper could be established from inputs:{} and no mapper specified " +
                 "for outputs:{}").format(inputs, needyOutputs))
        defaultMapper = mappers.pop()
        for repoData in needyOutputs:
            repoData.cfg.mapper = defaultMapper
    def _connectParentRepoDatas(self, repoDataList):
        """For each RepoData in repoDataList, find its parent in the repoDataList and cache a reference to it.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            When a parent is listed in the parents list but not found in the repoDataList. This is not
            expected to ever happen and would indicate an internal Butler error.
        """
        for repoData in repoDataList:
            for parent in repoData.cfg.parents:
                parentToAdd = None
                for otherRepoData in repoDataList:
                    if isinstance(parent, RepositoryCfg):
                        if otherRepoData.repoData.cfg == parent:
                            parentToAdd = otherRepoData.repoData
                            break
                    elif otherRepoData.repoData.cfg.root == parent:
                        parentToAdd = otherRepoData.repoData
                        break
                if parentToAdd is None:
                    raise RuntimeError(
                        "Could not find a parent matching {} to add to {}".format(parent, repoData))
                repoData.addParentRepoData(parentToAdd)
    @staticmethod
    def _getParentRepoData(parent, repoDataList):
        """get a parent RepoData from a cfg from a list of RepoData

        Parameters
        ----------
        parent : string or RepositoryCfg
            cfgRoot of a repo or a cfg that describes the repo
        repoDataList : list of RepoData

        Returns
        -------
        RepoData or None
            A RepoData if one can be found, else None
        """
        repoData = None
        for otherRepoData in repoDataList:
            if isinstance(parent, RepositoryCfg):
                if otherRepoData.cfg == parent:
                    repoData = otherRepoData
                    break
            elif otherRepoData.cfg.root == parent:
                repoData = otherRepoData
                break
        return repoData
    def _setRepoDataTags(self):
        """Set the tags from each repoArgs into all its parent repoArgs so that they can be included in tagged
        searches."""
        def setTags(repoData, tags, context):
            if id(repoData) in context:
                return
            repoData.addTags(tags)
            context.add(id(repoData))
            for parentRepoData in repoData.parentRepoDatas:
                setTags(parentRepoData, tags, context)
        for repoData in self._repos.outputs() + self._repos.inputs():
            setTags(repoData.repoData, repoData.repoArgs.tags, set())
    def _convertV1Args(self, root, mapper, mapperArgs):
        """Convert Old Butler RepositoryArgs (root, mapper, mapperArgs) to New Butler RepositoryArgs
        (inputs, outputs)

        Parameters
        ----------
        root : string
            Posix path to repository root
        mapper : class, class instance, or string
            Instantiated class, a class object to be instantiated, or a string that refers to a class that
            can be imported & used as the mapper.
        mapperArgs : dict
            RepositoryArgs & their values used when instantiating the mapper.

        Returns
        -------
        tuple
            (inputs, outputs) - values to be used for inputs and outputs in Butler.__init__
        """
        if (mapper and not isinstance(mapper, basestring) and
                not inspect.isclass(mapper)):
            self.log.warn(preinitedMapperWarning)
        inputs = None
        if root is None:
            if hasattr(mapper, 'root'):
                # in legacy usage the mapper may already know the root; use it.
                root = mapper.root
        outputs = RepositoryArgs(mode='rw',
                                 root=root,
                                 mapper=mapper,
                                 mapperArgs=mapperArgs)
        return inputs, outputs
    def __repr__(self):
        return 'Butler(datasetTypeAliasDict=%s, repos=%s, persistence=%s)' % (
            self.datasetTypeAliasDict, self._repos, self.persistence)
    def _getDefaultMapper(self):
        """Get the default mapper. Currently this means if all the repositories use exactly the same mapper,
        that mapper may be considered the default.

        This definition may be changing; mappers may be able to exclude themselves as candidates for default,
        and they may nominate a different mapper instead. Also, we may not want to look at *all* the
        repositories, but only a depth-first search on each of the input & output repositories, and use the
        first-found mapper for each of those. TBD.

        Parameters
        ----------
        None

        Returns
        -------
        Mapper class or None
            Returns the class type of the default mapper, or None if a default
            mapper can not be determined.
        """
        defaultMapper = None

        for inputRepoData in self._repos.inputs():
            mapper = None
            if inputRepoData.cfg.mapper is not None:
                mapper = inputRepoData.cfg.mapper
                # if the mapper is:
                # * a string, import it.
                # * a class instance, get its class type.
                # * a class, use it as-is.
                if isinstance(mapper, basestring):
                    mapper = doImport(mapper)
                elif not inspect.isclass(mapper):
                    mapper = mapper.__class__
            # If no mapper has been found yet, note this one. If one has been found and this one
            # matches it, continue. If a non-matching mapper is found then there is no default.
            if defaultMapper is None:
                defaultMapper = mapper
            elif mapper == defaultMapper:
                continue
            elif mapper is not None:
                return None
        return defaultMapper
    def _assignDefaultMapper(self, defaultMapper):
        for repoData in self._repos.all():
            if repoData.cfg.mapper is None and (repoData.isNewRepository or repoData.isV1Repository):
                if defaultMapper is None:
                    raise RuntimeError(
                        "No mapper specified for %s and no default mapper could be determined." %
                        repoData)
                repoData.cfg.mapper = defaultMapper
1108 """posix-only; gets the mapper class at the path specified by root (if a file _mapper can be found at 1109 that location or in a parent location. 1111 As we abstract the storage and support different types of storage locations this method will be 1112 moved entirely into Butler Access, or made more dynamic, and the API will very likely change.""" 1113 return Storage.getMapperClass(root)
1116 """Register an alias that will be substituted in datasetTypes. 1121 The alias keyword. It may start with @ or not. It may not contain @ except as the first character. 1122 datasetType - string 1123 The string that will be substituted when @alias is passed into datasetType. It may not contain '@' 1127 atLoc = alias.rfind(
'@')
1129 alias =
"@" + str(alias)
1131 raise RuntimeError(
"Badly formatted alias string: %s" % (alias,))
1134 if datasetType.count(
'@') != 0:
1135 raise RuntimeError(
"Badly formatted type string: %s" % (datasetType))
1140 if key.startswith(alias)
or alias.startswith(key):
1141 raise RuntimeError(
"Alias: %s overlaps with existing alias: %s" % (alias, key))
    def getKeys(self, datasetType=None, level=None, tag=None):
        """Get the valid data id keys at or above the given level of hierarchy for the dataset type or the
        entire collection if None. The dict values are the basic Python types corresponding to the keys (int,
        float, string).

        Parameters
        ----------
        datasetType - string
            The type of dataset to get keys for, entire collection if None.
        level - string
            The hierarchy level to descend to. None if it should not be restricted. Use an empty string if the
            mapper should lookup the default level.
        tag - any, or list of any
            Any object that can be tested to be the same as the tag in a dataId passed into butler input
            functions. Applies only to input repositories: if a tag is specified then the repo will only be
            read from if the tag in the dataId matches a tag used for that repository.

        Returns
        -------
        Returns a dict. The dict keys are the valid data id keys at or above the given level of hierarchy for
        the dataset type or the entire collection if None. The dict values are the basic Python types
        corresponding to the keys (int, float, string).
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)

        keys = None
        tag = setify(tag)
        for repoData in self._repos.inputs():
            if not tag or len(tag.intersection(repoData.tags)) > 0:
                keys = repoData.repo.getKeys(datasetType, level)
                # An empty dict is a valid "found" condition for keys. keys==None signifies no find.
                if keys is not None:
                    break
        return keys
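    # Usage sketch (the dataset type and returned keys are hypothetical):
    #
    #   keys = butler.getKeys('raw')
    #   # e.g. {'visit': int, 'ccd': int, 'filter': str}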
1182 """Returns the valid values for one or more keys when given a partial 1183 input collection data id. 1187 datasetType - string 1188 The type of dataset to inquire about. 1190 Key or tuple of keys to be returned. 1191 dataId - DataId, dict 1192 The partial data id. 1194 Keyword arguments for the partial data id. 1198 A list of valid values or tuples of valid values as specified by the 1204 dataId.update(**rest)
1208 for repoData
in self.
_repos.inputs():
1209 if not dataId.tag
or len(dataId.tag.intersection(repoData.tags)) > 0:
1210 tuples = repoData.repo.queryMetadata(datasetType, format, dataId)
1217 if len(format) == 1:
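    # Usage sketch (the dataset type, keys, and dataId are hypothetical):
    #
    #   visits = butler.queryMetadata('raw', 'visit', dataId={'filter': 'r'})
    #   visitCcdPairs = butler.queryMetadata('raw', ('visit', 'ccd'))
    #
    # A single-key format returns a list of values; a tuple of keys returns a
    # list of value tuples.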
1229 """Determines if a dataset file exists. 1233 datasetType - string 1234 The type of dataset to inquire about. 1235 dataId - DataId, dict 1236 The data id of the dataset. 1238 If True, look only in locations where the dataset could be written, 1239 and return True only if it is present in all of them. 1240 **rest keyword arguments for the data id. 1245 True if the dataset exists or is non-file-based. 1249 dataId.update(**rest)
1250 locations = self.
_locate(datasetType, dataId, write=write)
1252 if locations
is None:
1254 locations = [locations]
1259 for location
in locations:
1262 if isinstance(location, ButlerComposite):
1263 for name, componentInfo
in location.componentInfo.items():
1264 if componentInfo.subset:
1265 subset = self.
subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
1266 exists = all([obj.datasetExists()
for obj
in subset])
1268 exists = self.
datasetExists(componentInfo.datasetType, location.dataId)
1272 if not location.repository.exists(location):
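    # Usage sketch (the dataset type and dataId are hypothetical):
    #
    #   if butler.datasetExists('calexp', dataId={'visit': 1, 'ccd': 2}):
    #       exposure = butler.get('calexp', dataId={'visit': 1, 'ccd': 2})
    #
    # With write=True the test is made against the output locations instead,
    # and all of them must contain the dataset.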
    def _locate(self, datasetType, dataId, write):
        """Get one or more ButlerLocations and/or ButlerComposites.

        Parameters
        ----------
        datasetType : string
            The datasetType that is being searched for. The datasetType may be followed by a dot and
            a component name (component names are specified in the policy). IE datasetType.componentName

        dataId : dict or DataId class instance

        write : bool
            True if this is a search to write an object. False if it is a search to read an object. This
            affects what type (an object or a container) is returned.

        Returns
        -------
        If write is False, will return either a single object or None. If write is True, will return a list
        (which may be empty)
        """
        repos = self._repos.outputs() if write else self._repos.inputs()
        locations = []
        for repoData in repos:
            # enforce dataId & repository tags when reading:
            if not write and dataId.tag and len(dataId.tag.intersection(repoData.tags)) == 0:
                continue
            components = datasetType.split('.')
            datasetType = components[0]
            components = components[1:]
            try:
                location = repoData.repo.map(datasetType, dataId, write=write)
            except NoResults:
                continue
            if location is None:
                continue
            location.datasetType = datasetType
            if len(components) > 0:
                if not isinstance(location, ButlerComposite):
                    raise RuntimeError("The location for a dotted datasetType must be a composite.")
                # replace the first component name with the datasetType
                components[0] = location.componentInfo[components[0]].datasetType
                # join components back into a dot-delimited string
                datasetType = '.'.join(components)
                location = self._locate(datasetType, dataId, write)
                if location is None:
                    continue
            # optionally set the location's bypass function:
            if hasattr(location.mapper, "bypass_" + location.datasetType):
                bypass = self._getBypassFunc(location, dataId)
                location.bypass = bypass
            # If a location was found but the location does not exist, keep looking in input repositories
            # (the registry may have had enough data for a lookup even though the object exists in a
            # different repository).
            if (isinstance(location, ButlerComposite) or hasattr(location, 'bypass') or
                    location.repository.exists(location)):
                if not write:
                    # in read mode, only one location is desired.
                    return location
                try:
                    locations.extend(location)
                except TypeError:
                    locations.append(location)
        if not write:
            return None
        return locations
    @staticmethod
    def _getBypassFunc(location, dataId):
        pythonType = location.getPythonType()
        if pythonType is not None:
            if isinstance(pythonType, basestring):
                pythonType = doImport(pythonType)
        bypassFunc = getattr(location.mapper, "bypass_" + location.datasetType)
        return lambda: bypassFunc(location.datasetType, pythonType, location, dataId)
    def get(self, datasetType, dataId=None, immediate=True, **rest):
        """Retrieves a dataset given an input collection data id.

        Parameters
        ----------
        datasetType - string
            The type of dataset to retrieve.
        dataId - dict
            The data id.
        immediate - bool
            If False use a proxy for delayed loading.
        **rest
            keyword arguments for the data id.

        Returns
        -------
        An object retrieved from the dataset (or a proxy for one).
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        location = self._locate(datasetType, dataId, write=False)
        if location is None:
            raise NoResults("No locations for get:", datasetType, dataId)
        self.log.debug("Get type=%s keys=%s from %s", datasetType, dataId, str(location))

        if hasattr(location, 'bypass'):
            # this type loader block should get moved into a helper someplace, and duplications removed.
            def callback():
                return location.bypass
        else:
            def callback():
                return self._read(location)
        if location.mapper.canStandardize(location.datasetType):
            innerCallback = callback

            def callback():
                return location.mapper.standardize(location.datasetType, innerCallback(), dataId)
        if immediate:
            return callback()
        return ReadProxy(callback)
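    # Usage sketch (the dataset type and dataId are hypothetical):
    #
    #   exposure = butler.get('calexp', dataId={'visit': 1, 'ccd': 2})
    #   lazy = butler.get('calexp', dataId={'visit': 1, 'ccd': 2}, immediate=False)
    #
    # With immediate=False a ReadProxy is returned and the read is deferred
    # until the object is first used.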
    def put(self, obj, datasetType, dataId={}, doBackup=False, **rest):
        """Persists a dataset given an output collection data id.

        Parameters
        ----------
        obj -
            The object to persist.
        datasetType - string
            The type of dataset to persist.
        dataId - dict
            The data id.
        doBackup - bool
            If True, rename existing instead of overwriting.
            WARNING: Setting doBackup=True is not safe for parallel processing, as it may be subject to race
            conditions.
        **rest
            Keyword arguments for the data id.
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        for location in self._locate(datasetType, dataId, write=True):
            if isinstance(location, ButlerComposite):
                disassembler = location.disassembler if location.disassembler else genericDisassembler
                disassembler(obj=obj, dataId=location.dataId, componentInfo=location.componentInfo)
                for name, info in location.componentInfo.items():
                    if not info.inputOnly:
                        self.put(info.obj, info.datasetType, location.dataId, doBackup=doBackup)
            else:
                if doBackup:
                    location.getRepository().backup(location.datasetType, dataId)
                location.getRepository().write(location, obj)
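    # Usage sketch (the dataset type and dataId are hypothetical):
    #
    #   butler.put(exposure, 'calexp', dataId={'visit': 1, 'ccd': 2})
    #
    # Composite dataset types are disassembled and each writable component is
    # put individually, as implemented above.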
    def subset(self, datasetType, level=None, dataId={}, **rest):
        """Return complete dataIds for a dataset type that match a partial (or empty) dataId.

        Given a partial (or empty) dataId specified in dataId and **rest, find all datasets that match the
        dataId. Optionally restrict the results to a given level specified by a dataId key (e.g. visit or
        sensor or amp for a camera). Return an iterable collection of complete dataIds as ButlerDataRefs.
        Datasets with the resulting dataIds may not exist; that needs to be tested with datasetExists().

        Parameters
        ----------
        datasetType - string
            The type of dataset collection to subset
        level - string
            The level of dataId at which to subset. Use an empty string if the mapper should look up the
            default level.
        dataId - dict
            The data id.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        subset - ButlerSubset
            Collection of ButlerDataRefs for datasets matching the data id.

        Examples
        --------
        To print the full dataIds for all r-band measurements in a source catalog
        (note that the subset call is equivalent to: `butler.subset('src', dataId={'filter':'r'})`):

        >>> subset = butler.subset('src', filter='r')
        >>> for data_ref in subset: print(data_ref.dataId)
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)

        # Currently expected behavior of subset is that if specified level is None then the mapper's default
        # level should be used. Convention for level within Butler is that an empty string is used to indicate
        # the mapper's default level.
        if level is None:
            level = ''

        dataId = DataId(dataId)
        dataId.update(**rest)
        return ButlerSubset(self, datasetType, level, dataId)
    def dataRef(self, datasetType, level=None, dataId={}, **rest):
        """Returns a single ButlerDataRef.

        Given a complete dataId specified in dataId and **rest, find the unique dataset at the given level
        specified by a dataId key (e.g. visit or sensor or amp for a camera) and return a ButlerDataRef.

        Parameters
        ----------
        datasetType - string
            The type of dataset collection to reference
        level - string
            The level of dataId at which to reference
        dataId - dict
            The data id.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        dataRef - ButlerDataRef
            ButlerDataRef for dataset matching the data id
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        subset = self.subset(datasetType, level, dataId, **rest)
        if len(subset) != 1:
            raise RuntimeError("No unique dataset for: Dataset type:%s Level:%s Data ID:%s Keywords:%s" %
                               (str(datasetType), str(level), str(dataId), str(rest)))
        return ButlerDataRef(subset, subset.cache[0])
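    # Usage sketch (the dataset type and dataId are hypothetical):
    #
    #   ref = butler.dataRef('raw', dataId={'visit': 1, 'ccd': 2})
    #   exposure = ref.get()
    #
    # Unlike subset, dataRef requires the dataId to resolve to exactly one
    # dataset and raises RuntimeError otherwise.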
    def _read(self, location):
        """Unpersist an object using data inside a ButlerLocation or ButlerComposite object.

        Parameters
        ----------
        location : ButlerLocation or ButlerComposite
            A ButlerLocation or ButlerComposite instance populated with data needed to read the object.

        Returns
        -------
        object
            An instance of the object specified by the location.
        """
        self.log.debug("Starting read from %s", location)

        if isinstance(location, ButlerComposite):
            for name, componentInfo in location.componentInfo.items():
                if componentInfo.subset:
                    subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
                    componentInfo.obj = [obj.get() for obj in subset]
                else:
                    obj = self.get(componentInfo.datasetType, location.dataId, immediate=True)
                    componentInfo.obj = obj
            assembler = location.assembler or genericAssembler
            results = assembler(dataId=location.dataId, componentInfo=location.componentInfo,
                                cls=location.python)
        else:
            results = location.repository.read(location)
            if len(results) == 1:
                results = results[0]
        self.log.debug("Ending read from %s", location)
        return results

    def __reduce__(self):
        ret = (_unreduce, (self._initArgs, self.datasetTypeAliasDict))
        return ret
    def _resolveDatasetTypeAlias(self, datasetType):
        """Replaces all the known alias keywords in the given string with the alias value.

        Parameters
        ----------
        datasetType - string
            A datasetType string to search & replace on

        Returns
        -------
        datasetType - string
            The de-aliased string
        """
        for key in self.datasetTypeAliasDict:
            # if all aliases have been replaced, bail out
            if datasetType.find('@') == -1:
                break
            datasetType = datasetType.replace(key, self.datasetTypeAliasDict[key])

        # If an alias specifier can not be resolved then throw.
        if datasetType.find('@') != -1:
            raise RuntimeError("Unresolvable alias specifier in datasetType: %s" % (datasetType))

        return datasetType


def _unreduce(initArgs, datasetTypeAliasDict):
    mapperArgs = initArgs.pop('mapperArgs')
    initArgs.update(mapperArgs)
    butler = Butler(**initArgs)
    butler.datasetTypeAliasDict = datasetTypeAliasDict
    return butler
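# A minimal end-to-end sketch, assuming an existing repository at the
# (hypothetical) input path below; nothing in this block is from the original
# source.
if __name__ == "__main__":
    butler = Butler(inputs='/datasets/exampleRepo', outputs='/datasets/exampleOutput')
    # Butler instances are picklable: __reduce__ captures the init arguments
    # and the alias dict, and _unreduce reconstructs an equivalent Butler.
    import pickle
    rebuilt = pickle.loads(pickle.dumps(butler))
    assert rebuilt.datasetTypeAliasDict == butler.datasetTypeAliasDict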