27 """This module defines the Butler class.""" 28 from builtins
import str, super
29 from past.builtins
import basestring
30 from builtins
import object
37 from lsst.log
import Log
38 import lsst.pex.policy
as pexPolicy
39 from .
import ReadProxy, ButlerSubset, ButlerDataRef, Persistence, \
40 Storage, Policy, NoResults, Repository, DataId, RepositoryCfg, \
41 RepositoryArgs, listify, setify, sequencify, doImport, ButlerComposite, genericAssembler, \
42 genericDisassembler, PosixStorage, ParentsMismatch
# Warning text emitted when a caller hands Butler an already-instantiated
# mapper: Butler can only pass parentRegistry / repositoryCfg information to
# a mapper it constructs itself at init time.
# NOTE(review): the final "class object." fragment is reconstructed from a
# truncated source line — verify against upstream.
preinitedMapperWarning = (
    "Passing an instantiated mapper into "
    "Butler.__init__ will prevent Butler from passing "
    "parentRegistry or repositoryCfg information to "
    "the mapper, which is done only at init time. "
    "It is better to pass a importable string or "
    "class object.")
53 """Represents a Butler configuration. 57 cfg is 'wet paint' and very likely to change. Use of it in production 58 code other than via the 'old butler' API is strongly discouraged. 60 yaml_tag =
u"!ButlerCfg" 63 super().
__init__({
'repoCfg': repoCfg,
'cls': cls})
67 """Container object for repository data used by Butler 72 The arguments that are used to find or create the RepositoryCfg. 74 "input", "output", or "parent", indicating why Butler loaded this repository. 75 * input: the Repository was passed as a Butler input. 76 * output: the Repository was passed as a Butler output. 77 * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository. 82 The configuration for the Repository. 85 "new", "existing", or "nested". Indicates the origin of the repository and its RepositoryCfg: 86 * new: it was created by this instance of Butler, and this instance of Butler will generate the 88 * existing: it was found (via the root or cfgRoot argument) 89 * nested: the full RepositoryCfg was nested in another RepositoryCfg's parents list (this can happen 90 if parameters of an input specified by RepositoryArgs or dict does not entirely match an existing 94 Path or URI to the location of the RepositoryCfg file. 96 repo : lsst.daf.persistence.Repository 97 The Repository class instance. 99 parentRepoDatas : list of RepoData 100 The parents of this Repository, as indicated this Repository's RepositoryCfg. If this is a new 101 Repository then these are the inputs to this Butler (and will be saved in the RepositoryCfg). These 102 RepoData objects are not owned by this RepoData, these are references to peer RepoData objects in the 103 Butler's RepoDataContainer. 105 isV1Repository : bool 106 True if this is an Old Butler repository. In this case the repository does not have a RepositoryCfg 107 file. It may have a _mapper file and may have a _parent symlink. It will never be treated as a "new" 108 repository, i.e. even though there is not a RepositoryCfg file, one will not be generated. 109 If False, this is a New Butler repository and is specified by RepositoryCfg file. 112 These are values that may be used to restrict the search of input repositories. 
Details are available 113 in the RepositoryArgs and DataId classes. 116 "input", "output", or "parent", indicating why Butler loaded this repository. 117 * input: the Repository was passed as a Butler input. 118 * output: the Repository was passed as a Butler output. 119 * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository. 121 _repoArgs : RepositoryArgs 122 Contains the arguments that were used to specify this Repository. 152 "parentRepoDatas={}," +
155 "parentRegistry={})").format(
156 self.__class__.__name__,
168 def setCfg(self, cfg, origin, root, isV1Repository):
169 """Set information about the cfg into the RepoData 174 The RepositoryCfg for the repo. 176 'new', 'existing', or 'nested' 178 URI or absolute path to the location of the RepositoryCfg.yaml file. 184 if origin
not in (
'new',
'existing',
'nested'):
185 raise RuntimeError(
"Invalid value for origin:{}".format(origin))
205 if val
not in (
'input',
'output',
'parent'):
206 raise RuntimeError(
"Invalid value for role: {}".format(val))
210 """Get the parents & grandparents etc of this repo data, in depth-first search order. 212 Duplicate entries will be removed in cases where the same parent appears more than once in the parent 217 context : set, optional 218 Users should typically omit context and accept the default argument. Context is used to keep a set 219 of known RepoDatas when calling this function recursively, for duplicate elimination. 224 A list of the parents & grandparents etc of a given repo data, in depth-first search order. 229 if id(self)
in context:
231 context.add(id(self))
233 parents.append(parent)
234 parents += parent.getParentRepoDatas(context)
245 """Container object for RepoData instances owned by a Butler instance. 249 repoDataList : list of RepoData 250 repoData - RepoData instance to add 256 self.
_all = repoDataList
260 """Get a list of RepoData that are used to as inputs to the Butler. 261 The list is created lazily as needed, and cached. 265 A list of RepoData with readable repositories, in the order to be used when searching. 268 raise RuntimeError(
"Inputs not yet initialized.")
272 """Get a list of RepoData that are used to as outputs to the Butler. 273 The list is created lazily as needed, and cached. 277 A list of RepoData with writable repositories, in the order to be use when searching. 280 raise RuntimeError(
"Outputs not yet initialized.")
284 """Get a list of all RepoData that are used to as by the Butler. 285 The list is created lazily as needed, and cached. 289 A list of RepoData with writable repositories, in the order to be use when searching. 294 return "%s(_inputs=%r, \n_outputs=%s, \n_all=%s)" % (
295 self.__class__.__name__,
300 def _buildLookupLists(self):
301 """Build the inputs and outputs lists based on the order of self.all().""" 303 def addToList(repoData, lst):
304 """Add a repoData and each of its parents (depth first) to a list""" 305 if id(repoData)
in alreadyAdded:
308 alreadyAdded.add(id(repoData))
309 for parent
in repoData.parentRepoDatas:
310 addToList(parent, lst)
313 raise RuntimeError(
"Lookup lists are already built.")
314 inputs = [repoData
for repoData
in self.
all()
if repoData.role ==
'input']
315 outputs = [repoData
for repoData
in self.
all()
if repoData.role ==
'output']
318 for repoData
in outputs:
319 if 'r' in repoData.repoArgs.mode: 320 addToList(repoData.repoData, self._inputs) 321 for repoData
in inputs:
322 addToList(repoData.repoData, self.
_inputs)
323 self.
_outputs = [repoData.repoData
for repoData
in outputs]
327 """Butler provides a generic mechanism for persisting and retrieving data using mappers. 329 A Butler manages a collection of datasets known as a repository. Each dataset has a type representing its 330 intended usage and a location. Note that the dataset type is not the same as the C++ or Python type of the 331 object containing the data. For example, an ExposureF object might be used to hold the data for a raw 332 image, a post-ISR image, a calibrated science image, or a difference image. These would all be different 335 A Butler can produce a collection of possible values for a key (or tuples of values for multiple keys) if 336 given a partial data identifier. It can check for the existence of a file containing a dataset given its 337 type and data identifier. The Butler can then retrieve the dataset. Similarly, it can persist an object to 338 an appropriate location when given its associated data identifier. 340 Note that the Butler has two more advanced features when retrieving a data set. First, the retrieval is 341 lazy. Input does not occur until the data set is actually accessed. This allows datasets to be retrieved 342 and placed on a clipboard prospectively with little cost, even if the algorithm of a stage ends up not 343 using them. Second, the Butler will call a standardization hook upon retrieval of the dataset. This 344 function, contained in the input mapper object, must perform any necessary manipulations to force the 345 retrieved object to conform to standards, including translating metadata. 
349 __init__(self, root, mapper=None, **mapperArgs) 351 defineAlias(self, alias, datasetType) 353 getKeys(self, datasetType=None, level=None) 355 queryMetadata(self, datasetType, format=None, dataId={}, **rest) 357 datasetExists(self, datasetType, dataId={}, **rest) 359 get(self, datasetType, dataId={}, immediate=False, **rest) 361 put(self, obj, datasetType, dataId={}, **rest) 363 subset(self, datasetType, level=None, dataId={}, **rest) 365 dataRef(self, datasetType, level=None, dataId={}, **rest) 369 The preferred method of initialization is to use the `inputs` and `outputs` __init__ parameters. These 370 are described in the parameters section, below. 372 For backward compatibility: this initialization method signature can take a posix root path, and 373 optionally a mapper class instance or class type that will be instantiated using the mapperArgs input 374 argument. However, for this to work in a backward compatible way it creates a single repository that is 375 used as both an input and an output repository. This is NOT preferred, and will likely break any 376 provenance system we have in place. 381 .. note:: Deprecated in 12_0 382 `root` will be removed in TBD, it is replaced by `inputs` and `outputs` for 383 multiple-repository support. 384 A file system path. Will only work with a PosixRepository. 385 mapper : string or instance 386 .. note:: Deprecated in 12_0 387 `mapper` will be removed in TBD, it is replaced by `inputs` and `outputs` for 388 multiple-repository support. 389 Provides a mapper to be used with Butler. 391 .. note:: Deprecated in 12_0 392 `mapperArgs` will be removed in TBD, it is replaced by `inputs` and `outputs` for 393 multiple-repository support. 394 Provides arguments to be passed to the mapper if the mapper input argument is a class type to be 395 instantiated by Butler. 396 inputs : RepositoryArgs, dict, or string 397 Can be a single item or a list. Provides arguments to load an existing repository (or repositories). 
398 String is assumed to be a URI and is used as the cfgRoot (URI to the location of the cfg file). (Local 399 file system URI does not have to start with 'file://' and in this way can be a relative path). The 400 `RepositoryArgs` class can be used to provide more parameters with which to initialize a repository 401 (such as `mapper`, `mapperArgs`, `tags`, etc. See the `RepositoryArgs` documentation for more 402 details). A dict may be used as shorthand for a `RepositoryArgs` class instance. The dict keys must 403 match parameters to the `RepositoryArgs.__init__` function. 404 outputs : RepositoryArgs, dict, or string 405 Provides arguments to load one or more existing repositories or create new ones. The different types 406 are handled the same as for `inputs`. 408 The Butler init sequence loads all of the input and output repositories. 409 This creates the object hierarchy to read from and write to them. Each 410 repository can have 0 or more parents, which also get loaded as inputs. 411 This becomes a DAG of repositories. Ultimately, Butler creates a list of 412 these Repositories in the order that they are used. 414 Initialization Sequence 415 ======================= 417 During initialization Butler creates a Repository class instance & support structure for each object 418 passed to `inputs` and `outputs` as well as the parent repositories recorded in the `RepositoryCfg` of 419 each existing readable repository. 421 This process is complex. It is explained below to shed some light on the intent of each step. 423 1. Input Argument Standardization 424 --------------------------------- 426 In `Butler._processInputArguments` the input arguments are verified to be legal (and a RuntimeError is 427 raised if not), and they are converted into an expected format that is used for the rest of the Butler 428 init sequence. See the docstring for `_processInputArguments`. 430 2. 
Create RepoData Objects 431 -------------------------- 433 Butler uses an object, called `RepoData`, to keep track of information about each repository; each 434 repository is contained in a single `RepoData`. The attributes are explained in its docstring. 436 After `_processInputArguments`, a RepoData is instantiated and put in a list for each repository in 437 `outputs` and `inputs`. This list of RepoData, the `repoDataList`, now represents all the output and input 438 repositories (but not parent repositories) that this Butler instance will use. 440 3. Get `RepositoryCfg`s 441 ----------------------- 443 `Butler._getCfgs` gets the `RepositoryCfg` for each repository the `repoDataList`. The behavior is 444 described in the docstring. 449 `Butler._addParents` then considers the parents list in the `RepositoryCfg` of each `RepoData` in the 450 `repoDataList` and inserts new `RepoData` objects for each parent not represented in the proper location 451 in the `repoDataList`. Ultimately a flat list is built to represent the DAG of readable repositories 452 represented in depth-first order. 454 5. Set and Verify Parents of Outputs 455 ------------------------------------ 457 To be able to load parent repositories when output repositories are used as inputs, the input repositories 458 are recorded as parents in the `RepositoryCfg` file of new output repositories. When an output repository 459 already exists, for consistency the Butler's inputs must match the list of parents specified the already- 460 existing output repository's `RepositoryCfg` file. 462 In `Butler._setAndVerifyParentsLists`, the list of parents is recorded in the `RepositoryCfg` of new 463 repositories. For existing repositories the list of parents is compared with the `RepositoryCfg`'s parents 464 list, and if they do not match a `RuntimeError` is raised. 466 6. 
Set the Default Mapper 467 ------------------------- 469 If all the input repositories use the same mapper then we can assume that mapper to be the 470 "default mapper". If there are new output repositories whose `RepositoryArgs` do not specify a mapper and 471 there is a default mapper then the new output repository will be set to use that default mapper. 473 This is handled in `Butler._setDefaultMapper`. 475 7. Cache References to Parent RepoDatas 476 --------------------------------------- 478 In `Butler._connectParentRepoDatas`, in each `RepoData` in `repoDataList`, a list of `RepoData` object 479 references is built that matches the parents specified in that `RepoData`'s `RepositoryCfg`. 481 This list is used later to find things in that repository's parents, without considering peer repository's 482 parents. (e.g. finding the registry of a parent) 487 Tags are described at https://ldm-463.lsst.io/v/draft/#tagging 489 In `Butler._setRepoDataTags`, for each `RepoData`, the tags specified by its `RepositoryArgs` are recorded 490 in a set, and added to the tags set in each of its parents, for ease of lookup when mapping. 492 9. Find Parent Registry and Instantiate RepoData 493 ------------------------------------------------ 495 At this point there is enough information to instantiate the `Repository` instances. There is one final 496 step before instantiating the Repository, which is to try to get a parent registry that can be used by the 497 child repository. The criteria for "can be used" is spelled out in `Butler._setParentRegistry`. However, 498 to get the registry from the parent, the parent must be instantiated. The `repoDataList`, in depth-first 499 search order, is built so that the most-dependent repositories are first, and the least dependent 500 repositories are last. 
So the `repoDataList` is reversed and the Repositories are instantiated in that 501 order; for each RepoData a parent registry is searched for, and then the Repository is instantiated with 502 whatever registry could be found.""" 504 def __init__(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
505 self.
_initArgs = {
'root': root,
'mapper': mapper,
'inputs': inputs,
'outputs': outputs,
506 'mapperArgs': mapperArgs}
508 self.
log = Log.getLogger(
"daf.persistence.butler")
510 persistencePolicy = pexPolicy.Policy()
514 root=root, mapper=mapper, inputs=inputs, outputs=outputs, **mapperArgs)
517 inputs = [
RepoData(args,
'input')
for args
in inputs]
518 outputs = [
RepoData(args,
'output')
for args
in outputs]
519 repoDataList = outputs + inputs
535 for repoData
in reversed(repoDataList):
539 def _setParentRegistry(self, repoData):
540 """Try to get a parent registry that can be used by this repository. To be usable the repository must 541 "match", meaning the mapper in the passed-in repo is the same type as the mapper in the parent. 544 def getParentRegsitry(repoData, context):
545 """Get the first found registry that matches the the passed-in repo. 550 The RepoData for the repository for which we are searching for a 556 A registry from a parent if one can be found, or None. 561 Indicates a butler init order problem, all parents should be initialized before child 562 repositories, so this function should be able to get any parent of any child repo. 564 if id(self)
in context:
567 context.add(id(self))
568 for parentRepoData
in repoData.getParentRepoDatas():
569 if parentRepoData.cfg.mapper == repoData.cfg.mapper:
570 if parentRepoData.repo
is None:
572 "_getParentRegistry: Parent {} of new repo {} not yet created, ignoring.".format(
573 parentRepoData, repoData))
575 parentRegistry = parentRepoData.repo.getRegistry()
577 return parentRegistry
579 parentRegistry = getParentRegsitry(parentRepoData, context)
581 return parentRegistry
584 repoData.repoData.parentRegistry = getParentRegsitry(repoData.repoData, set())
586 def _processInputArguments(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
587 """Process, verify, and standardize the input arguments. 588 * Inputs can not be for Old Butler (root, mapper, mapperArgs) AND New Butler (inputs, outputs) 589 `root`, `mapper`, and `mapperArgs` are Old Butler init API. 590 `inputs` and `outputs` are New Butler init API. 591 Old Butler and New Butler init API may not be mixed, Butler may be initialized with only the Old 592 arguments or the New arguments. 593 * Verify that if there is a readable output that there is exactly one output. (This restriction is in 594 place because all readable repositories must be parents of writable repositories, and for 595 consistency the DAG of readable repositories must always be the same. Keeping the list of parents 596 becomes very complicated in the presence of multiple readable output repositories. It is better to 597 only write to output repositories, and then create a new Butler instance and use the outputs as 598 inputs, and write to new output repositories.) 599 * Make a copy of inputs & outputs so they may be modified without changing the passed-in arguments. 600 * Convert any input/output values that are URI strings to RepositoryArgs. 601 * Listify inputs & outputs. 602 * Set default RW mode on inputs & outputs as needed. 606 Same as Butler.__init__ 610 (list of RepositoryArgs, list of RepositoryArgs) 611 First item is a list to use as inputs. 612 Second item is a list to use as outputs. 617 If Old Butler and New Butler arguments are both used this will raise. 618 If an output is readable there is more than one output this will raise. 621 inputs = copy.deepcopy(inputs)
622 outputs = copy.deepcopy(outputs)
624 isV1Args = inputs
is None and outputs
is None 628 mapperArgs=mapperArgs
or None)
629 elif root
or mapper
or mapperArgs:
631 'Butler version 1 API (root, mapper, **mapperArgs) may ' +
632 'not be used with version 2 API (inputs, outputs)')
641 if not isinstance(args, RepositoryArgs)
else args
for args
in inputs]
643 if not isinstance(args, RepositoryArgs)
else args
for args
in outputs]
647 if args.mode
is None:
649 elif 'rw' == args.mode:
651 elif 'r' != args.mode: 652 raise RuntimeError(
"The mode of an input should be readable.")
654 if args.mode
is None:
656 elif 'w' not in args.mode:
657 raise RuntimeError(
"The mode of an output should be writable.")
659 for args
in inputs + outputs:
660 if (args.mapper
and not isinstance(args.mapper, basestring)
and 661 not inspect.isclass(args.mapper)):
662 self.
log.warn(preinitedMapperWarning)
667 raise RuntimeError(
"Butler does not support multiple output repositories if any of the " 668 "outputs are readable.")
673 def inputIsInOutputs(inputArgs, outputArgsList):
674 for o
in outputArgsList:
675 if (
'r' in o.mode and 676 o.root == inputArgs.root and 677 o.mapper == inputArgs.mapper
and 678 o.mapperArgs == inputArgs.mapperArgs
and 679 o.tags == inputArgs.tags
and 680 o.policy == inputArgs.policy):
681 self.
log.debug((
"Input repositoryArgs {} is also listed in outputs as readable; " +
682 "throwing away the input.").format(inputArgs))
686 inputs = [args
for args
in inputs
if not inputIsInOutputs(args, outputs)]
687 return inputs, outputs
690 def _getParentVal(repoData):
691 """Get the value of this repoData as it should appear in the parents 692 list of other repositories""" 693 if repoData.isV1Repository:
695 if repoData.cfgOrigin ==
'nested':
698 return repoData.cfg.root
def _getParents(ofRepoData, repoInfo):
    """Create a parents list of repoData from inputs and (readable) outputs.

    ``ofRepoData`` itself and non-readable repositories are excluded.
    """
    parents = []
    for repoData in repoInfo:
        if repoData is ofRepoData:
            continue
        if 'r' not in repoData.repoArgs.mode:
            continue
        parents.append(Butler._getParentVal(repoData))
    return parents
def _getOldButlerRepositoryCfg(repositoryArgs):
    """Build a RepositoryCfg for an Old Butler (V1) posix repository.

    Returns None when the location is not posix-accessible or no V1
    repository exists there. The _parent symlink is followed recursively, so
    the returned cfg's parents chain is fully populated.

    NOTE(review): the early-return and final-return lines are reconstructed
    from a fragmentary source — verify against upstream.
    """
    if not Storage.isPosix(repositoryArgs.cfgRoot):
        return None
    if not PosixStorage.v1RepoExists(repositoryArgs.cfgRoot):
        return None
    if not repositoryArgs.mapper:
        # Mutates the caller's RepositoryArgs by filling in the mapper class.
        repositoryArgs.mapper = PosixStorage.getMapperClass(repositoryArgs.cfgRoot)
    cfg = RepositoryCfg.makeFromArgs(repositoryArgs)
    parent = PosixStorage.getParentSymlinkPath(repositoryArgs.cfgRoot)
    if parent:
        parent = Butler._getOldButlerRepositoryCfg(RepositoryArgs(cfgRoot=parent, mode='r'))
        if parent is not None:
            cfg.addParents([parent])
    return cfg
def _getRepositoryCfg(self, repositoryArgs):
    """Try to get a repository cfg from the location described by cfgRoot.

    Parameters
    ----------
    repositoryArgs : RepositoryArgs or string
        Arguments describing an existing repository; a bare string is taken
        to be the cfgRoot URI.

    Returns
    -------
    (RepositoryCfg or None, bool)
        The cfg (None if one cannot be found) and True when the cfg was
        derived from an Old Butler repository, False for a New Butler one.
    """
    # NOTE(review): the string-to-RepositoryArgs promotion and the two `if`
    # guards are reconstructed from a fragmentary source — verify upstream.
    if not isinstance(repositoryArgs, RepositoryArgs):
        repositoryArgs = RepositoryArgs(cfgRoot=repositoryArgs)
    cfg = self.storage.getRepositoryCfg(repositoryArgs.cfgRoot)
    isOldButlerRepository = False
    if cfg is None:
        cfg = Butler._getOldButlerRepositoryCfg(repositoryArgs)
        if cfg is not None:
            isOldButlerRepository = True
    return cfg, isOldButlerRepository
755 def _getCfgs(self, repoDataList):
756 """Get or make a RepositoryCfg for each RepoData, and add the cfg to the RepoData. 757 If the cfg exists, compare values. If values match then use the cfg as an "existing" cfg. If the 758 values do not match, use the cfg as a "nested" cfg. 759 If the cfg does not exist, the RepositoryArgs must be for a writable repository. 763 repoDataList : list of RepoData 764 The RepoData that are output and inputs of this Butler 769 If the passed-in RepositoryArgs indicate an existing repository but other cfg parameters in those 771 match the existing repository's cfg a RuntimeError will be raised. 773 def cfgMatchesArgs(args, cfg):
774 """Test if there are any values in an RepositoryArgs that conflict with the values in a cfg""" 775 if args.mapper
is not None and cfg.mapper != args.mapper:
777 if args.mapperArgs
is not None and cfg.mapperArgs != args.mapperArgs:
779 if args.policy
is not None and cfg.policy != args.policy:
783 for repoData
in repoDataList:
786 if 'w' not in repoData.repoArgs.mode:
788 "No cfg found for read-only input repository at {}".format(repoData.repoArgs.cfgRoot))
789 repoData.setCfg(cfg=RepositoryCfg.makeFromArgs(repoData.repoArgs),
791 root=repoData.repoArgs.cfgRoot,
792 isV1Repository=isOldButlerRepository)
794 if 'w' in repoData.repoArgs.mode:
796 if not cfgMatchesArgs(repoData.repoArgs, cfg):
797 raise RuntimeError((
"The RepositoryArgs and RepositoryCfg must match for writable " +
798 "repositories, RepositoryCfg:{}, RepositoryArgs:{}").format(
799 cfg, repoData.repoArgs))
800 repoData.setCfg(cfg=cfg, origin=
'existing', root=repoData.repoArgs.cfgRoot,
801 isV1Repository=isOldButlerRepository)
804 if cfgMatchesArgs(repoData.repoArgs, cfg):
805 repoData.setCfg(cfg=cfg, origin=
'existing', root=repoData.repoArgs.cfgRoot,
806 isV1Repository=isOldButlerRepository)
808 repoData.setCfg(cfg=cfg, origin=
'nested', root=
None,
809 isV1Repository=isOldButlerRepository)
811 def _addParents(self, repoDataList):
812 """For each repoData in the input list, see if its parents are the next items in the list, and if not 813 add the parent, so that the repoDataList includes parents and is in order to operate depth-first 0..n. 817 repoDataList : list of RepoData 818 The RepoData for the Butler outputs + inputs. 823 Raised if a RepositoryCfg can not be found at a location where a parent repository should be. 827 if repoDataIdx == len(repoDataList):
829 repoData = repoDataList[repoDataIdx]
830 if 'r' not in repoData.repoArgs.mode: 833 if repoData.isNewRepository:
836 if repoData.cfg.parents
is None:
839 for repoParentIdx, repoParent
in enumerate(repoData.cfg.parents):
840 parentIdxInRepoDataList = repoDataIdx + repoParentIdx + 1
841 if not isinstance(repoParent, RepositoryCfg):
843 if repoParentCfg
is not None:
844 cfgOrigin =
'existing' 846 isOldButlerRepository =
False 847 repoParentCfg = repoParent
849 if (parentIdxInRepoDataList < len(repoDataList)
and 850 repoDataList[parentIdxInRepoDataList].cfg == repoParentCfg):
853 role = 'input' if repoData.role ==
'output' else 'parent' 855 newRepoInfo.repoData.setCfg(cfg=repoParentCfg, origin=cfgOrigin, root=args.cfgRoot,
856 isV1Repository=isOldButlerRepository)
857 repoDataList.insert(parentIdxInRepoDataList, newRepoInfo)
860 def _setAndVerifyParentsLists(self, repoDataList):
861 """Make a list of all the input repositories of this Butler, these are the parents of the outputs. 862 For new output repositories, set the parents in the RepositoryCfg. For existing output repositories 863 verify that the RepositoryCfg's parents match the parents list. 867 repoDataList : list of RepoData 868 All the RepoDatas loaded by this butler, in search order. 873 If an existing output repository is loaded and its parents do not match the parents of this Butler 874 an error will be raised. 876 def getIOParents(ofRepoData, repoDataList):
877 """make a parents list for repo in `ofRepoData` that is comprised of inputs and readable 878 outputs (not parents-of-parents) of this butler""" 880 for repoData
in repoDataList:
881 if repoData.role ==
'parent':
883 if repoData
is ofRepoData:
885 if repoData.role ==
'output':
886 if 'r' in repoData.repoArgs.mode: 887 raise RuntimeError(
"If an output is readable it must be the only output.")
894 for repoData
in repoDataList:
895 if repoData.role !=
'output':
897 parents = getIOParents(repoData, repoDataList)
899 if repoData.cfgOrigin ==
'new':
900 repoData.cfg.addParents(parents)
901 elif repoData.cfgOrigin
in (
'existing',
'nested'):
902 if repoData.cfg.parents != parents:
904 repoData.cfg.extendParents(parents)
905 except ParentsMismatch
as e:
906 raise RuntimeError((
"Inputs of this Butler:{} do not match parents of existing " +
907 "writable cfg:{} (ParentMismatch exception: {}").format(
908 parents, repoData.cfg.parents, e))
910 def _setDefaultMapper(self, repoDataList):
911 """Establish a default mapper if there is one and assign it to outputs that do not have a mapper 914 If all inputs have the same mapper it will be used as the default mapper. 918 repoDataList : list of RepoData 919 All the RepoDatas loaded by this butler, in search order. 924 If a default mapper can not be established and there is an output that does not have a mapper. 926 needyOutputs = [rd
for rd
in repoDataList
if rd.role ==
'output' and rd.cfg.mapper
is None]
927 if len(needyOutputs)
is 0:
929 mappers = set([rd.cfg.mapper
for rd
in repoDataList
if rd.role ==
'input'])
930 if len(mappers) != 1:
931 inputs = [rd
for rd
in repoDataList
if rd.role ==
'input']
933 (
"No default mapper could be established from inputs:{} and no mapper specified " +
934 "for outputs:{}").format(inputs, needyOutputs))
935 defaultMapper = mappers.pop()
936 for repoData
in needyOutputs:
937 repoData.cfg.mapper = defaultMapper
def _connectParentRepoDatas(self, repoDataList):
    """For each RepoData in repoDataList, find its parents in repoDataList
    and cache references to them.

    Parameters
    ----------
    repoDataList : list of RepoData
        All the RepoDatas loaded by this butler, in search order.

    Raises
    ------
    RuntimeError
        When a parent listed in a cfg's parents list cannot be found in
        repoDataList; indicates an internal Butler error.
    """
    for repoData in repoDataList:
        for parent in repoData.cfg.parents:
            parentToAdd = None
            for otherRepoData in repoDataList:
                if isinstance(parent, RepositoryCfg):
                    # NOTE(review): the double `.repoData.repoData` matches
                    # the original text; presumably the repoData attribute is
                    # self-referential so the extra hop is harmless — confirm.
                    if otherRepoData.repoData.repoData.cfg == parent:
                        parentToAdd = otherRepoData.repoData
                        break
                elif otherRepoData.repoData.cfg.root == parent:
                    parentToAdd = otherRepoData.repoData
                    break
            if parentToAdd is None:
                raise RuntimeError(
                    "Could not find a parent matching {} to add to {}".format(parent, repoData))
            repoData.addParentRepoData(parentToAdd)
def _getParentRepoData(parent, repoDataList):
    """Get a parent RepoData, matched by cfg or by cfg root, from a list of
    RepoData.

    Parameters
    ----------
    parent : string or RepositoryCfg
        cfgRoot of a repo or a cfg that describes the repo.
    repoDataList : list of RepoData

    Returns
    -------
    RepoData or None
        A RepoData if one can be found, else None.
    """
    found = None
    for candidate in repoDataList:
        if isinstance(parent, RepositoryCfg):
            if candidate.cfg == parent:
                found = candidate
                break
        elif candidate.cfg.root == parent:
            found = candidate
            break
    return found
996 def _setRepoDataTags(self):
997 """Set the tags from each repoArgs into all its parent repoArgs so that they can be included in tagged 999 def setTags(repoData, tags, context):
1000 if id(repoData)
in context:
1002 repoData.addTags(tags)
1003 context.add(id(repoData))
1004 for parentRepoData
in repoData.parentRepoDatas:
1005 setTags(parentRepoData, tags, context)
1006 for repoData
in self.
_repos.outputs() + self.
_repos.inputs():
1007 setTags(repoData.repoData, repoData.repoArgs.tags, set())
1009 def _convertV1Args(self, root, mapper, mapperArgs):
1010 """Convert Old Butler RepositoryArgs (root, mapper, mapperArgs) to New Butler RepositoryArgs 1016 Posix path to repository root 1017 mapper : class, class instance, or string 1018 Instantiated class, a class object to be instantiated, or a string that refers to a class that 1019 can be imported & used as the mapper. 1021 RepositoryArgs & their values used when instantiating the mapper. 1026 (inputs, outputs) - values to be used for inputs and outputs in Butler.__init__ 1028 if (mapper
and not isinstance(mapper, basestring)
and 1029 not inspect.isclass(mapper)):
1030 self.
log.warn(preinitedMapperWarning)
1033 if hasattr(mapper,
'root'):
1042 mapperArgs=mapperArgs)
1043 return inputs, outputs
1046 return 'Butler(datasetTypeAliasDict=%s, repos=%s, persistence=%s)' % (
def _getDefaultMapper(self):
    """Get the default mapper. Currently this means if all the repositories use exactly the same mapper,
    that mapper may be considered the default.

    This definition may be changing; mappers may be able to exclude themselves as candidates for default,
    and they may nominate a different mapper instead. Also, we may not want to look at *all* the
    repositories, but only a depth-first search on each of the input & output repositories, and use the
    first-found mapper for each of those. TBD.

    Returns
    -------
    Mapper class or None
        Returns the class type of the default mapper, or None if a default
        mapper can not be determined.
    """
    defaultMapper = None
    for inputRepoData in self._repos.inputs():
        mapper = None
        if inputRepoData.cfg.mapper is not None:
            mapper = inputRepoData.cfg.mapper
            # Normalize the cfg's mapper to a class object: it may be stored as
            # an importable dotted string, an instance, or already a class.
            if isinstance(mapper, basestring):
                mapper = doImport(mapper)
            elif not inspect.isclass(mapper):
                mapper = mapper.__class__
        # The first mapper seen becomes the candidate default. A later,
        # different, non-None mapper means there is no unique default.
        if defaultMapper is None:
            defaultMapper = mapper
        elif mapper == defaultMapper:
            continue
        elif mapper is not None:
            return None
    return defaultMapper
def _assignDefaultMapper(self, defaultMapper):
    """Assign the given default mapper to every held repository that needs one
    (repositories whose cfg has no mapper and that are new or Old Butler).

    Parameters
    ----------
    defaultMapper : class or None
        The mapper class to assign.

    Raises
    ------
    RuntimeError
        If a repository needs a mapper and defaultMapper is None.
    """
    for repoData in self._repos.all().values():
        if repoData.cfg.mapper is None and (repoData.isNewRepository or repoData.isV1Repository):
            if defaultMapper is None:
                raise RuntimeError(
                    "No mapper specified for %s and no default mapper could be determined." %
                    repoData.args)  # TODO confirm: the formatted value was dropped from this extract
            repoData.cfg.mapper = defaultMapper
@staticmethod
def getMapperClass(root):
    """posix-only; gets the mapper class at the path specified by root (if a file _mapper can be found at
    that location or in a parent location.

    As we abstract the storage and support different types of storage locations this method will be
    moved entirely into Butler Access, or made more dynamic, and the API will very likely change."""
    # NOTE(review): the def header was dropped from this extract; reconstructed
    # as a staticmethod since the visible body takes only `root` — TODO confirm.
    return Storage.getMapperClass(root)
def defineAlias(self, alias, datasetType):
    """Register an alias that will be substituted in datasetTypes.

    Parameters
    ----------
    alias - string
        The alias keyword. It may start with @ or not. It may not contain @ except as the first character.
    datasetType - string
        The string that will be substituted when @alias is passed into datasetType. It may not contain '@'

    Raises
    ------
    RuntimeError
        If alias or datasetType is badly formatted, or if the alias overlaps
        an already-registered alias.
    """
    # Normalize the alias to start with '@'; an '@' anywhere else is an error.
    atLoc = alias.rfind('@')
    if atLoc == -1:
        alias = "@" + str(alias)
    elif atLoc > 0:
        raise RuntimeError("Badly formatted alias string: %s" % (alias,))

    # The substitution target may not itself contain '@'.
    if datasetType.count('@') != 0:
        raise RuntimeError("Badly formatted type string: %s" % (datasetType))

    # Substitution is plain string replacement, so an alias that is a prefix
    # of (or prefixed by) an existing alias would be ambiguous; reject it.
    for key in self.datasetTypeAliasDict:
        if key.startswith(alias) or alias.startswith(key):
            raise RuntimeError("Alias: %s overlaps with existing alias: %s" % (alias, key))

    self.datasetTypeAliasDict[alias] = datasetType
def getKeys(self, datasetType=None, level=None, tag=None):
    """Get the valid data id keys at or above the given level of hierarchy for the dataset type or the
    entire collection if None.

    Parameters
    ----------
    datasetType - string
        The type of dataset to get keys for, entire collection if None.
    level - string
        The hierarchy level to descend to. None if it should not be restricted. Use an empty string if the
        mapper should lookup the default level.
    tag - any, or list of any
        Any object that can be tested to be the same as the tag in a dataId passed into butler input
        functions. Applies only to input repositories: If tag is specified by the dataId then the repo
        will only be read from used if the tag in the dataId matches a tag used for that repository.

    Returns
    -------
    Returns a dict. The dict keys are the valid data id keys at or above the given level of hierarchy for
    the dataset type or the entire collection if None. The dict values are the basic Python types
    corresponding to the keys (int, float, string).
    """
    datasetType = self._resolveDatasetTypeAlias(datasetType)

    keys = None
    tag = setify(tag)
    # Ask each input repository in search order; the first repository that
    # knows the dataset type answers. An empty dict is a valid "found" answer;
    # not-found is None.
    for repoData in self._repos.inputs():
        if not tag or len(tag.intersection(repoData.tags)) > 0:
            keys = repoData.repo.getKeys(datasetType, level)
            if keys is not None:
                break
    return keys
def queryMetadata(self, datasetType, format, dataId={}, **rest):
    """Returns the valid values for one or more keys when given a partial
    input collection data id.

    Parameters
    ----------
    datasetType - string
        The type of dataset to inquire about.
    format - str, tuple
        Key or tuple of keys to be returned.
    dataId - DataId, dict
        The partial data id.
    **rest -
        Keyword arguments for the partial data id.

    Returns
    -------
    A list of valid values or tuples of valid values as specified by the
    format.
    """
    datasetType = self._resolveDatasetTypeAlias(datasetType)
    dataId = DataId(dataId)
    dataId.update(**rest)
    format = sequencify(format)

    tuples = None
    # The first input repository (honoring dataId tags) that returns results
    # answers the query.
    for repoData in self._repos.inputs():
        if not dataId.tag or len(dataId.tag.intersection(repoData.tags)) > 0:
            tuples = repoData.repo.queryMetadata(datasetType, format, dataId)
            if tuples:
                break

    if not tuples:
        return []

    if len(format) == 1:
        # unpack single-key results from 1-tuples to bare values
        # NOTE(review): reconstructed from a mangled extract — TODO confirm the
        # single-key unpacking against the unmangled source.
        ret = []
        for x in tuples:
            try:
                ret.append(x[0])
            except TypeError:
                ret.append(x)
        return ret

    return tuples
def datasetExists(self, datasetType, dataId={}, **rest):
    """Determines if a dataset file exists.

    Parameters
    ----------
    datasetType - string
        The type of dataset to inquire about.
    dataId - DataId, dict
        The data id of the dataset.
    **rest keyword arguments for the data id.

    Returns
    -------
    exists - bool
        True if the dataset exists or is non-file-based.
    """
    datasetType = self._resolveDatasetTypeAlias(datasetType)
    dataId = DataId(dataId)
    dataId.update(**rest)
    location = self._locate(datasetType, dataId, write=False)
    if location is None:
        return False

    if isinstance(location, ButlerComposite):
        # A composite exists only if all of its components exist; stop at the
        # first missing component.
        for name, componentInfo in location.componentInfo.items():
            if componentInfo.subset:
                subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
                exists = all([obj.datasetExists() for obj in subset])
            else:
                exists = self.datasetExists(componentInfo.datasetType, location.dataId)
            if exists is False:
                break
    else:
        exists = location.repository.exists(location)
    return exists
# NOTE(review): mangled extract of Butler._locate. Statements below are split
# mid-expression and several original lines are missing (the embedded original
# line numbers jump, e.g. 1297 -> 1300 and 1312 -> 1323), so the bodies of some
# conditionals are absent. Comments below describe only what is visible.
1266 def _locate(self, datasetType, dataId, write):
1267 """Get one or more ButlerLocations and/or ButlercComposites. 1271 datasetType : string 1272 The datasetType that is being searched for. The datasetType may be followed by a dot and 1273 a component name (component names are specified in the policy). IE datasetType.componentName 1275 dataId : dict or DataId class instance 1279 True if this is a search to write an object. False if it is a search to read an object. This 1280 affects what type (an object or a container) is returned. 1284 If write is False, will return either a single object or None. If write is True, will return a list 1285 (which may be empty) 1287 repos = self.
# search output repositories when writing, input repositories when reading
_repos.outputs()
if write
else self.
_repos.inputs()
1289 for repoData
in repos:
# when reading, skip repositories whose tags do not intersect the dataId's tags
1291 if not write
and dataId.tag
and len(dataId.tag.intersection(repoData.tags)) == 0:
# split a dotted datasetType into the base type plus trailing component names
1293 components = datasetType.split(
'.')
1294 datasetType = components[0]
1295 components = components[1:]
1297 location = repoData.repo.map(datasetType, dataId, write=write)
1300 if location
is None:
1302 location.datasetType = datasetType
# a dotted datasetType is only valid on a composite; recurse to locate the component
1303 if len(components) > 0:
1304 if not isinstance(location, ButlerComposite):
1305 raise RuntimeError(
"The location for a dotted datasetType must be a composite.")
1307 components[0] = location.componentInfo[components[0]].datasetType
1309 datasetType =
'.'.join(components)
1310 location = self.
_locate(datasetType, dataId, write)
1312 if location
is None:
# if the mapper defines a bypass_<datasetType> function, its result is cached on
# the location (presumably evaluated via _getBypassFunc — TODO confirm, the call
# site is among the missing lines)
1323 if hasattr(location.mapper,
"bypass_" + location.datasetType):
1327 location.bypass = bypass
# keep a location when it is a composite, carries a bypass result, or exists in storage
1333 if (isinstance(location, ButlerComposite)
or hasattr(location,
'bypass')
or 1334 location.repository.exists(location)):
1338 locations.extend(location)
1340 locations.append(location)
@staticmethod
def _getBypassFunc(location, dataId):
    """Return a no-argument callable that runs the mapper's bypass function
    for the location's dataset type and returns its result."""
    pythonType = location.getPythonType()
    if pythonType is not None:
        if isinstance(pythonType, basestring):
            # the python type may be expressed as a dotted import path; import it
            pythonType = doImport(pythonType)
    bypassFunc = getattr(location.mapper, "bypass_" + location.datasetType)
    return lambda: bypassFunc(location.datasetType, pythonType, location, dataId)
def get(self, datasetType, dataId=None, immediate=True, **rest):
    """Retrieves a dataset given an input collection data id.

    Parameters
    ----------
    datasetType - string
        The type of dataset to retrieve.
    dataId - DataId, dict
        The data id.
    immediate - bool
        If False use a proxy for delayed loading.
    **rest
        keyword arguments for the data id.

    Returns
    -------
    An object retrieved from the dataset (or a proxy for one).
    """
    datasetType = self._resolveDatasetTypeAlias(datasetType)
    dataId = DataId(dataId)
    dataId.update(**rest)

    location = self._locate(datasetType, dataId, write=False)
    if location is None:
        raise NoResults("No locations for get:", datasetType, dataId)
    self.log.debug("Get type=%s keys=%s from %s", datasetType, dataId, str(location))

    if hasattr(location, 'bypass'):
        # the bypass result was already computed while locating; just hand it back
        def callback():
            return location.bypass
    else:
        def callback():
            return self._read(location)
    if location.mapper.canStandardize(location.datasetType):
        # wrap the reader so the mapper can standardize the object on the way out
        innerCallback = callback

        def callback():
            return location.mapper.standardize(location.datasetType, innerCallback(), dataId)
    if immediate:
        return callback()
    return ReadProxy(callback)
def put(self, obj, datasetType, dataId={}, doBackup=False, **rest):
    """Persists a dataset given an output collection data id.

    Parameters
    ----------
    obj -
        The object to persist.
    datasetType - string
        The type of dataset to persist.
    dataId - DataId, dict
        The data id.
    doBackup - bool
        If True, rename existing instead of overwriting.
        WARNING: Setting doBackup=True is not safe for parallel processing, as it may be subject to race
        conditions.
    **rest
        Keyword arguments for the data id.
    """
    datasetType = self._resolveDatasetTypeAlias(datasetType)
    # the mutable default dataId={} is never mutated here: DataId() makes a copy
    dataId = DataId(dataId)
    dataId.update(**rest)

    for location in self._locate(datasetType, dataId, write=True):
        if isinstance(location, ButlerComposite):
            # disassemble the composite, then put each component that is not input-only
            disassembler = location.disassembler if location.disassembler else genericDisassembler
            disassembler(obj=obj, dataId=location.dataId, componentInfo=location.componentInfo)
            for name, info in location.componentInfo.items():
                if not info.inputOnly:
                    self.put(info.obj, info.datasetType, location.dataId, doBackup=doBackup)
        else:
            if doBackup:
                location.getRepository().backup(location.datasetType, dataId)
            location.getRepository().write(location, obj)
def subset(self, datasetType, level=None, dataId={}, **rest):
    """Return complete dataIds for a dataset type that match a partial (or empty) dataId.

    Given a partial (or empty) dataId specified in dataId and **rest, find all datasets that match the
    dataId. Optionally restrict the results to a given level specified by a dataId key (e.g. visit or
    sensor or amp for a camera). Return an iterable collection of complete dataIds as ButlerDataRefs.
    Datasets with the resulting dataIds may not exist; that needs to be tested with datasetExists().

    Parameters
    ----------
    datasetType - string
        The type of dataset collection to subset
    level - string
        The level of dataId at which to subset. Use an empty string if the mapper should look up the
        default level.
    dataId - DataId, dict
        The data id.
    **rest
        Keyword arguments for the data id.

    Returns
    -------
    subset - ButlerSubset
        Collection of ButlerDataRefs for datasets matching the data id.

    Examples
    --------
    To print the full dataIds for all r-band measurements in a source catalog
    (note that the subset call is equivalent to: `butler.subset('src', dataId={'filter':'r'})`):

    >>> subset = butler.subset('src', filter='r')
    >>> for data_ref in subset: print(data_ref.dataId)
    """
    datasetType = self._resolveDatasetTypeAlias(datasetType)

    # Currently expected behavior of subset is that if specified level is None then the mapper's default
    # level should be used. Convention for level within Butler is that an empty string is used to indicate
    # the mapper's default level.
    if level is None:
        level = ''

    dataId = DataId(dataId)
    dataId.update(**rest)
    return ButlerSubset(self, datasetType, level, dataId)
def dataRef(self, datasetType, level=None, dataId={}, **rest):
    """Returns a single ButlerDataRef.

    Given a complete dataId specified in dataId and **rest, find the unique dataset at the given level
    specified by a dataId key (e.g. visit or sensor or amp for a camera) and return a ButlerDataRef.

    Parameters
    ----------
    datasetType - string
        The type of dataset collection to reference
    level - string
        The level of dataId at which to reference
    dataId - DataId, dict
        The data id.
    **rest
        Keyword arguments for the data id.

    Returns
    -------
    dataRef - ButlerDataRef
        ButlerDataRef for dataset matching the data id

    Raises
    ------
    RuntimeError
        If the dataId does not identify exactly one dataset.
    """
    datasetType = self._resolveDatasetTypeAlias(datasetType)
    dataId = DataId(dataId)
    subset = self.subset(datasetType, level, dataId, **rest)
    if len(subset) != 1:
        raise RuntimeError("No unique dataset for: Dataset type:%s Level:%s Data ID:%s Keywords:%s" %
                           (str(datasetType), str(level), str(dataId), str(rest)))
    return ButlerDataRef(subset, subset.cache[0])
def _read(self, location):
    """Unpersist an object using data inside a ButlerLocation or ButlerComposite object.

    Parameters
    ----------
    location : ButlerLocation or ButlerComposite
        A ButlerLocation or ButlerComposite instance populated with data needed to read the object.

    Returns
    -------
    An instance of the object specified by the location.
    """
    self.log.debug("Starting read from %s", location)

    if isinstance(location, ButlerComposite):
        # read every component, then hand them to the assembler to build the object
        for name, componentInfo in location.componentInfo.items():
            if componentInfo.subset:
                subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
                componentInfo.obj = [obj.get() for obj in subset]
            else:
                obj = self.get(componentInfo.datasetType, location.dataId, immediate=True)
                componentInfo.obj = obj
        assembler = location.assembler or genericAssembler
        results = assembler(dataId=location.dataId, componentInfo=location.componentInfo,
                            cls=location.python)
    else:
        results = location.repository.read(location)
        if len(results) == 1:
            results = results[0]
    self.log.debug("Ending read from %s", location)
    return results

def __reduce__(self):
    # pickle via _unreduce so unpickling re-runs Butler.__init__ with the
    # original init arguments and restores the alias dict
    ret = (_unreduce, (self._initArgs, self.datasetTypeAliasDict))
    return ret

def _resolveDatasetTypeAlias(self, datasetType):
    """Replaces all the known alias keywords in the given string with the alias value.

    Parameters
    ----------
    datasetType - string
        A datasetType string to search & replace on

    Returns
    -------
    datasetType - string
        The de-aliased string
    """
    for key in self.datasetTypeAliasDict:
        # if all aliases have been replaced, bail out
        if datasetType.find('@') == -1:
            break
        datasetType = datasetType.replace(key, self.datasetTypeAliasDict[key])

    # If an alias specifier can not be resolved then throw.
    if datasetType.find('@') != -1:
        raise RuntimeError("Unresolvable alias specifier in datasetType: %s" % (datasetType))

    return datasetType


def _unreduce(initArgs, datasetTypeAliasDict):
    # module-level pickle helper: rebuild the Butler from its saved init args
    mapperArgs = initArgs.pop('mapperArgs')
    initArgs.update(mapperArgs)
    butler = Butler(**initArgs)
    butler.datasetTypeAliasDict = datasetTypeAliasDict
    return butler
def _buildLookupLists(self)
def _resolveDatasetTypeAlias(self, datasetType)
def _convertV1Args(self, root, mapper, mapperArgs)
def _setRepoDataTags(self)
def __init__(self, root=None, mapper=None, inputs=None, outputs=None, mapperArgs)
def setCfg(self, cfg, origin, root, isV1Repository)
def _getRepositoryCfg(self, repositoryArgs)
def getParentRepoDatas(self, context=None)
def _setParentRegistry(self, repoData)
def _getCfgs(self, repoDataList)
def subset(self, datasetType, level=None, dataId={}, rest)
def __init__(self, cls, repoCfg)
def isNewRepository(self)
def _read(self, location)
def _setDefaultMapper(self, repoDataList)
def defineAlias(self, alias, datasetType)
def _connectParentRepoDatas(self, repoDataList)
def __init__(self, repoDataList)
def _addParents(self, repoDataList)
def getKeys(self, datasetType=None, level=None, tag=None)
def _getBypassFunc(location, dataId)
def put(self, obj, datasetType, dataId={}, doBackup=False, rest)
def queryMetadata(self, datasetType, format, dataId={}, rest)
def _processInputArguments(self, root=None, mapper=None, inputs=None, outputs=None, mapperArgs)
def addParentRepoData(self, parentRepoData)
def _locate(self, datasetType, dataId, write)
def _getParentVal(repoData)
def _setAndVerifyParentsLists(self, repoDataList)
def get(self, datasetType, dataId=None, immediate=True, rest)
def __init__(self, args, role)
def datasetExists(self, datasetType, dataId={}, rest)