27 """This module defines the Butler class.""" 28 from future
import standard_library
29 standard_library.install_aliases()
30 from builtins
import str
31 from past.builtins
import basestring
32 from builtins
import object
43 from lsst.log
import Log
44 import lsst.pex.policy
as pexPolicy
45 from .
import LogicalLocation, ReadProxy, ButlerSubset, ButlerDataRef, Persistence, \
46 Storage, Policy, NoResults, Repository, DataId, RepositoryCfg, \
47 RepositoryArgs, listify, setify, sequencify, doImport, ButlerComposite, genericAssembler, \
48 genericDisassembler, PosixStorage, ParentsMismatch
preinitedMapperWarning = ("Passing an instantiated mapper into "
                          "Butler.__init__ will prevent Butler from passing "
                          "parentRegistry or repositoryCfg information to "
                          "the mapper, which is done only at init time. "
                          "It is better to pass an importable string or "
                          "class object.")
59 """Represents a Butler configuration. 63 cfg is 'wet paint' and very likely to change. Use of it in production 64 code other than via the 'old butler' API is strongly discouraged. 66 yaml_tag =
u"!ButlerCfg" 69 super(ButlerCfg, self).
__init__({
'repoCfg': repoCfg,
'cls': cls})
73 """Container object for repository data used by Butler 78 The arguments that are used to find or create the RepositoryCfg. 80 "input", "output", or "parent", indicating why Butler loaded this repository. 81 * input: the Repository was passed as a Butler input. 82 * output: the Repository was passed as a Butler output. 83 * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository. 88 The configuration for the Repository. 91 "new", "existing", or "nested". Indicates the origin of the repository and its RepositoryCfg: 92 * new: it was created by this instance of Butler, and this instance of Butler will generate the 94 * existing: it was found (via the root or cfgRoot argument) 95 * nested: the full RepositoryCfg was nested in another RepositoryCfg's parents list (this can happen 96 if parameters of an input specified by RepositoryArgs or dict does not entirely match an existing 100 Path or URI to the location of the RepositoryCfg file. 102 repo : lsst.daf.persistence.Repository 103 The Repository class instance. 105 parentRepoDatas : list of RepoData 106 The parents of this Repository, as indicated this Repository's RepositoryCfg. If this is a new 107 Repository then these are the inputs to this Butler (and will be saved in the RepositoryCfg). These 108 RepoData objects are not owned by this RepoData, these are references to peer RepoData objects in the 109 Butler's RepoDataContainer. 111 isV1Repository : bool 112 True if this is an Old Butler repository. In this case the repository does not have a RepositoryCfg 113 file. It may have a _mapper file and may have a _parent symlink. It will never be treated as a "new" 114 repository, i.e. even though there is not a RepositoryCfg file, one will not be generated. 115 If False, this is a New Butler repository and is specified by RepositoryCfg file. 118 These are values that may be used to restrict the search of input repositories. Details are available 119 in the RepositoryArgs and DataId classes. 122 "input", "output", or "parent", indicating why Butler loaded this repository. 123 * input: the Repository was passed as a Butler input. 124 * output: the Repository was passed as a Butler output. 125 * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository. 127 _repoArgs : RepositoryArgs 128 Contains the arguments that were used to specify this Repository. 158 "parentRepoDatas={}," +
161 "parentRegistry={})").format(
162 self.__class__.__name__,
    def setCfg(self, cfg, origin, root, isV1Repository):
        """Set information about the cfg into the RepoData.

        Parameters
        ----------
        cfg : RepositoryCfg
            The RepositoryCfg for the repo.
        origin : string
            'new', 'existing', or 'nested'.
        root : string
            URI or absolute path to the location of the RepositoryCfg.yaml file.
        isV1Repository : bool
            True if this is an Old Butler repository.

        Returns
        -------
        None
        """
        if origin not in ('new', 'existing', 'nested'):
            raise RuntimeError("Invalid value for origin:{}".format(origin))
        self.cfg = cfg
        self.cfgOrigin = origin
        self.cfgRoot = root
        self.isV1Repository = isV1Repository

    @property
    def role(self):
        return self._role

    @role.setter
    def role(self, val):
        if val not in ('input', 'output', 'parent'):
            raise RuntimeError("Invalid value for role: {}".format(val))
        self._role = val
216 """Get the parents & grandparents etc of this repo data, in depth-first search order. 218 Duplicate entries will be removed in cases where the same parent appears more than once in the parent 223 context : set, optional 224 Users should typically omit context and accept the default argument. Context is used to keep a set 225 of known RepoDatas when calling this function recursively, for duplicate elimination. 230 A list of the parents & grandparents etc of a given repo data, in depth-first search order. 235 if id(self)
in context:
237 context.add(id(self))
239 parents.append(parent)
240 parents += parent.getParentRepoDatas(context)
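    # A sketch of the traversal (hypothetical repositories): if this RepoData's
    # parents are [B, C] and both B and C list D as a parent, the method returns
    # [B, D, C] -- depth-first order, with the second visit to D suppressed by
    # the `context` set.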
251 """Container object for RepoData instances owned by a Butler instance. 255 repoDataList : list of RepoData 256 repoData - RepoData instance to add 262 self.
_all = repoDataList
266 """Get a list of RepoData that are used to as inputs to the Butler. 267 The list is created lazily as needed, and cached. 271 A list of RepoData with readable repositories, in the order to be used when searching. 274 raise RuntimeError(
"Inputs not yet initialized.")
278 """Get a list of RepoData that are used to as outputs to the Butler. 279 The list is created lazily as needed, and cached. 283 A list of RepoData with writable repositories, in the order to be use when searching. 286 raise RuntimeError(
"Outputs not yet initialized.")
290 """Get a list of all RepoData that are used to as by the Butler. 291 The list is created lazily as needed, and cached. 295 A list of RepoData with writable repositories, in the order to be use when searching. 300 return "%s(_inputs=%r, \n_outputs=%s, \n_all=%s)" % (
301 self.__class__.__name__,
    def _buildLookupLists(self):
        """Build the inputs and outputs lists based on the order of self.all()."""

        def addToList(repoData, lst):
            """Add a repoData and each of its parents (depth first) to a list."""
            if id(repoData) in alreadyAdded:
                return
            lst.append(repoData)
            alreadyAdded.add(id(repoData))
            for parent in repoData.parentRepoDatas:
                addToList(parent, lst)

        if self._inputs is not None or self._outputs is not None:
            raise RuntimeError("Lookup lists are already built.")
        inputs = [repoData for repoData in self.all() if repoData.role == 'input']
        outputs = [repoData for repoData in self.all() if repoData.role == 'output']
        self._inputs = []
        alreadyAdded = set()
        for repoData in outputs:
            if 'r' in repoData.repoArgs.mode:
                addToList(repoData.repoData, self._inputs)
        for repoData in inputs:
            addToList(repoData.repoData, self._inputs)
        self._outputs = [repoData.repoData for repoData in outputs]
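    # Illustrative ordering (hypothetical repositories): given an input repo
    # 'in' and an output repo 'out' opened with mode 'rw', _buildLookupLists
    # places 'out' before 'in' in self._inputs, so readable outputs are
    # searched before inputs when reading.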
333 """Butler provides a generic mechanism for persisting and retrieving data using mappers. 335 A Butler manages a collection of datasets known as a repository. Each dataset has a type representing its 336 intended usage and a location. Note that the dataset type is not the same as the C++ or Python type of the 337 object containing the data. For example, an ExposureF object might be used to hold the data for a raw 338 image, a post-ISR image, a calibrated science image, or a difference image. These would all be different 341 A Butler can produce a collection of possible values for a key (or tuples of values for multiple keys) if 342 given a partial data identifier. It can check for the existence of a file containing a dataset given its 343 type and data identifier. The Butler can then retrieve the dataset. Similarly, it can persist an object to 344 an appropriate location when given its associated data identifier. 346 Note that the Butler has two more advanced features when retrieving a data set. First, the retrieval is 347 lazy. Input does not occur until the data set is actually accessed. This allows datasets to be retrieved 348 and placed on a clipboard prospectively with little cost, even if the algorithm of a stage ends up not 349 using them. Second, the Butler will call a standardization hook upon retrieval of the dataset. This 350 function, contained in the input mapper object, must perform any necessary manipulations to force the 351 retrieved object to conform to standards, including translating metadata. 355 __init__(self, root, mapper=None, **mapperArgs) 357 defineAlias(self, alias, datasetType) 359 getKeys(self, datasetType=None, level=None) 361 queryMetadata(self, datasetType, format=None, dataId={}, **rest) 363 datasetExists(self, datasetType, dataId={}, **rest) 365 get(self, datasetType, dataId={}, immediate=False, **rest) 367 put(self, obj, datasetType, dataId={}, **rest) 369 subset(self, datasetType, level=None, dataId={}, **rest) 371 dataRef(self, datasetType, level=None, dataId={}, **rest) 375 The preferred method of initialization is to use the `inputs` and `outputs` __init__ parameters. These 376 are described in the parameters section, below. 378 For backward compatibility: this initialization method signature can take a posix root path, and 379 optionally a mapper class instance or class type that will be instantiated using the mapperArgs input 380 argument. However, for this to work in a backward compatible way it creates a single repository that is 381 used as both an input and an output repository. This is NOT preferred, and will likely break any 382 provenance system we have in place. 387 .. note:: Deprecated in 12_0 388 `root` will be removed in TBD, it is replaced by `inputs` and `outputs` for 389 multiple-repository support. 390 A file system path. Will only work with a PosixRepository. 391 mapper : string or instance 392 .. note:: Deprecated in 12_0 393 `mapper` will be removed in TBD, it is replaced by `inputs` and `outputs` for 394 multiple-repository support. 395 Provides a mapper to be used with Butler. 397 .. note:: Deprecated in 12_0 398 `mapperArgs` will be removed in TBD, it is replaced by `inputs` and `outputs` for 399 multiple-repository support. 400 Provides arguments to be passed to the mapper if the mapper input argument is a class type to be 401 instantiated by Butler. 402 inputs : RepositoryArgs, dict, or string 403 Can be a single item or a list. Provides arguments to load an existing repository (or repositories). 
404 String is assumed to be a URI and is used as the cfgRoot (URI to the location of the cfg file). (Local 405 file system URI does not have to start with 'file://' and in this way can be a relative path). The 406 `RepositoryArgs` class can be used to provide more parameters with which to initialize a repository 407 (such as `mapper`, `mapperArgs`, `tags`, etc. See the `RepositoryArgs` documentation for more 408 details). A dict may be used as shorthand for a `RepositoryArgs` class instance. The dict keys must 409 match parameters to the `RepositoryArgs.__init__` function. 410 outputs : RepositoryArgs, dict, or string 411 Provides arguments to load one or more existing repositories or create new ones. The different types 412 are handled the same as for `inputs`. 414 The Butler init sequence loads all of the input and output repositories. 415 This creates the object hierarchy to read from and write to them. Each 416 repository can have 0 or more parents, which also get loaded as inputs. 417 This becomes a DAG of repositories. Ultimately, Butler creates a list of 418 these Repositories in the order that they are used. 420 Initialization Sequence 421 ======================= 423 During initialization Butler creates a Repository class instance & support structure for each object 424 passed to `inputs` and `outputs` as well as the parent repositories recorded in the `RepositoryCfg` of 425 each existing readable repository. 427 This process is complex. It is explained below to shed some light on the intent of each step. 429 1. Input Argument Standardization 430 --------------------------------- 432 In `Butler._processInputArguments` the input arguments are verified to be legal (and a RuntimeError is 433 raised if not), and they are converted into an expected format that is used for the rest of the Butler 434 init sequence. See the docstring for `_processInputArguments`. 436 2. Create RepoData Objects 437 -------------------------- 439 Butler uses an object, called `RepoData`, to keep track of information about each repository; each 440 repository is contained in a single `RepoData`. The attributes are explained in its docstring. 442 After `_processInputArguments`, a RepoData is instantiated and put in a list for each repository in 443 `outputs` and `inputs`. This list of RepoData, the `repoDataList`, now represents all the output and input 444 repositories (but not parent repositories) that this Butler instance will use. 446 3. Get `RepositoryCfg`s 447 ----------------------- 449 `Butler._getCfgs` gets the `RepositoryCfg` for each repository the `repoDataList`. The behavior is 450 described in the docstring. 455 `Butler._addParents` then considers the parents list in the `RepositoryCfg` of each `RepoData` in the 456 `repoDataList` and inserts new `RepoData` objects for each parent not represented in the proper location 457 in the `repoDataList`. Ultimately a flat list is built to represent the DAG of readable repositories 458 represented in depth-first order. 460 5. Set and Verify Parents of Outputs 461 ------------------------------------ 463 To be able to load parent repositories when output repositories are used as inputs, the input repositories 464 are recorded as parents in the `RepositoryCfg` file of new output repositories. When an output repository 465 already exists, for consistency the Butler's inputs must match the list of parents specified the already- 466 existing output repository's `RepositoryCfg` file. 
468 In `Butler._setAndVerifyParentsLists`, the list of parents is recorded in the `RepositoryCfg` of new 469 repositories. For existing repositories the list of parents is compared with the `RepositoryCfg`'s parents 470 list, and if they do not match a `RuntimeError` is raised. 472 6. Set the Default Mapper 473 ------------------------- 475 If all the input repositories use the same mapper then we can assume that mapper to be the 476 "default mapper". If there are new output repositories whose `RepositoryArgs` do not specify a mapper and 477 there is a default mapper then the new output repository will be set to use that default mapper. 479 This is handled in `Butler._setDefaultMapper`. 481 7. Cache References to Parent RepoDatas 482 --------------------------------------- 484 In `Butler._connectParentRepoDatas`, in each `RepoData` in `repoDataList`, a list of `RepoData` object 485 references is built that matches the parents specified in that `RepoData`'s `RepositoryCfg`. 487 This list is used later to find things in that repository's parents, without considering peer repository's 488 parents. (e.g. finding the registry of a parent) 493 Tags are described at https://ldm-463.lsst.io/v/draft/#tagging 495 In `Butler._setRepoDataTags`, for each `RepoData`, the tags specified by its `RepositoryArgs` are recorded 496 in a set, and added to the tags set in each of its parents, for ease of lookup when mapping. 498 9. Find Parent Registry and Instantiate RepoData 499 ------------------------------------------------ 501 At this point there is enough information to instantiate the `Repository` instances. There is one final 502 step before instantiating the Repository, which is to try to get a parent registry that can be used by the 503 child repository. The criteria for "can be used" is spelled out in `Butler._setParentRegistry`. However, 504 to get the registry from the parent, the parent must be instantiated. The `repoDataList`, in depth-first 505 search order, is built so that the most-dependent repositories are first, and the least dependent 506 repositories are last. So the `repoDataList` is reversed and the Repositories are instantiated in that 507 order; for each RepoData a parent registry is searched for, and then the Repository is instantiated with 508 whatever registry could be found.""" 510 def __init__(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
        self._initArgs = {'root': root, 'mapper': mapper, 'inputs': inputs, 'outputs': outputs,
                          'mapperArgs': mapperArgs}

        self.log = Log.getLogger("daf.persistence.butler")

        self.datasetTypeAliasDict = {}

        self.storage = Storage()

        persistencePolicy = pexPolicy.Policy()
        self.persistence = Persistence.getPersistence(persistencePolicy)

        inputs, outputs = self._processInputArguments(
            root=root, mapper=mapper, inputs=inputs, outputs=outputs, **mapperArgs)

        # convert the RepositoryArgs into RepoData objects; outputs are searched before inputs.
        inputs = [RepoData(args, 'input') for args in inputs]
        outputs = [RepoData(args, 'output') for args in outputs]
        repoDataList = outputs + inputs

        # perform the init sequence described in the class docstring.
        self._getCfgs(repoDataList)
        self._addParents(repoDataList)
        self._setAndVerifyParentsLists(repoDataList)
        self._setDefaultMapper(repoDataList)
        self._connectParentRepoDatas(repoDataList)
        self._repos = RepoDataContainer(repoDataList)
        self._repos._buildLookupLists()
        self._setRepoDataTags()

        # instantiate the Repositories least-dependent first, so that a parent
        # registry exists before any child repository needs it.
        for repoData in reversed(repoDataList):
            self._setParentRegistry(repoData)
            repoData.repoData.repo = Repository(repoData.repoData)
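    # A minimal usage sketch, assuming posix repositories whose RepositoryCfg
    # (or Old Butler _mapper file) already names a mapper; paths are
    # hypothetical:
    #
    #     butler = Butler(inputs='/datasets/inputRepo', outputs='/scratch/outputRepo')
    #
    # The Old Butler form, deprecated but still accepted, creates one
    # read/write repository:
    #
    #     butler = Butler(root='/datasets/repo')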
    def _setParentRegistry(self, repoData):
        """Try to get a parent registry that can be used by this repository. To be usable the repository must
        "match", meaning the mapper in the passed-in repo is the same type as the mapper in the parent.
        """

        def getParentRegistry(repoData, context):
            """Get the first found registry that matches the passed-in repo.

            Parameters
            ----------
            repoData : RepoData
                The RepoData for the repository for which we are searching for a
                matching parent registry.

            Returns
            -------
            Registry or None
                A registry from a parent if one can be found, or None.

            Raises
            ------
            RuntimeError
                Indicates a butler init order problem; all parents should be initialized before child
                repositories, so this function should be able to get any parent of any child repo.
            """
            if id(self) in context:
                return None
            context.add(id(self))
            for parentRepoData in repoData.getParentRepoDatas():
                if parentRepoData.cfg.mapper == repoData.cfg.mapper:
                    if parentRepoData.repo is None:
                        self.log.debug(
                            "_getParentRegistry: Parent {} of new repo {} not yet created, ignoring.".format(
                                parentRepoData, repoData))
                    else:
                        parentRegistry = parentRepoData.repo.getRegistry()
                        if parentRegistry:
                            return parentRegistry
                parentRegistry = getParentRegistry(parentRepoData, context)
                if parentRegistry:
                    return parentRegistry
            return None

        repoData.repoData.parentRegistry = getParentRegistry(repoData.repoData, set())
    def _processInputArguments(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
        """Process, verify, and standardize the input arguments.

        * Inputs can not be for Old Butler (root, mapper, mapperArgs) AND New Butler (inputs, outputs):
          `root`, `mapper`, and `mapperArgs` are Old Butler init API.
          `inputs` and `outputs` are New Butler init API.
          Old Butler and New Butler init API may not be mixed; Butler may be initialized with only the Old
          arguments or the New arguments.
        * Verify that if there is a readable output that there is exactly one output. (This restriction is in
          place because all readable repositories must be parents of writable repositories, and for
          consistency the DAG of readable repositories must always be the same. Keeping the list of parents
          becomes very complicated in the presence of multiple readable output repositories. It is better to
          only write to output repositories, and then create a new Butler instance and use the outputs as
          inputs, and write to new output repositories.)
        * Make a copy of inputs & outputs so they may be modified without changing the passed-in arguments.
        * Convert any input/output values that are URI strings to RepositoryArgs.
        * Listify inputs & outputs.
        * Set default RW mode on inputs & outputs as needed.

        Parameters
        ----------
        Same as Butler.__init__

        Returns
        -------
        (list of RepositoryArgs, list of RepositoryArgs)
            First item is a list to use as inputs.
            Second item is a list to use as outputs.

        Raises
        ------
        RuntimeError
            If Old Butler and New Butler arguments are both used this will raise.
            If an output is readable and there is more than one output this will raise.
        """
        # inputs and outputs may be modified; do not change the external value.
        inputs = copy.deepcopy(inputs)
        outputs = copy.deepcopy(outputs)

        isV1Args = inputs is None and outputs is None
        if isV1Args:
            inputs, outputs = self._convertV1Args(root=root,
                                                  mapper=mapper,
                                                  mapperArgs=mapperArgs or None)
        elif root or mapper or mapperArgs:
            raise RuntimeError(
                'Butler version 1 API (root, mapper, **mapperArgs) may '
                'not be used with version 2 API (inputs, outputs)')

        inputs = listify(inputs)
        outputs = listify(outputs)
        inputs = [RepositoryArgs(cfgRoot=args)
                  if not isinstance(args, RepositoryArgs) else args for args in inputs]
        outputs = [RepositoryArgs(cfgRoot=args)
                   if not isinstance(args, RepositoryArgs) else args for args in outputs]

        # set default modes on inputs & outputs and verify they make sense.
        for args in inputs:
            if args.mode is None:
                args.mode = 'r'
            elif 'rw' == args.mode:
                args.mode = 'r'
            elif 'r' != args.mode:
                raise RuntimeError("The mode of an input should be readable.")
        for args in outputs:
            if args.mode is None:
                args.mode = 'w'
            elif 'w' not in args.mode:
                raise RuntimeError("The mode of an output should be writable.")

        # check for class instances in args.mapper (not allowed)
        for args in inputs + outputs:
            if (args.mapper and not isinstance(args.mapper, basestring)
                    and not inspect.isclass(args.mapper)):
                self.log.warn(preinitedMapperWarning)

        # if there is more than one output, none of the outputs may be readable.
        if len(outputs) > 1 and any('r' in args.mode for args in outputs):
            raise RuntimeError("Butler does not support multiple output repositories if any of the "
                               "outputs are readable.")

        # if a readable output is also listed as an input, discard the input;
        # it will be read via the output.
        def inputIsInOutputs(inputArgs, outputArgsList):
            for o in outputArgsList:
                if ('r' in o.mode
                        and o.root == inputArgs.root
                        and o.mapper == inputArgs.mapper
                        and o.mapperArgs == inputArgs.mapperArgs
                        and o.tags == inputArgs.tags
                        and o.policy == inputArgs.policy):
                    self.log.debug(("Input repositoryArgs {} is also listed in outputs as readable; "
                                    "throwing away the input.").format(inputArgs))
                    return True
            return False

        inputs = [args for args in inputs if not inputIsInOutputs(args, outputs)]
        return inputs, outputs
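    # For example (hypothetical paths): a bare string input '/in' becomes
    # RepositoryArgs(cfgRoot='/in', mode='r'); a bare string output '/out'
    # becomes RepositoryArgs(cfgRoot='/out', mode='w'); and passing both
    # root='/repo' and inputs='/in' raises RuntimeError.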
    @staticmethod
    def _getParentVal(repoData):
        """Get the value of this repoData as it should appear in the parents
        list of other repositories."""
        if repoData.isV1Repository:
            return repoData.cfg
        if repoData.cfgOrigin == 'nested':
            return repoData.cfg
        else:
            return repoData.cfg.root

    @staticmethod
    def _getParents(ofRepoData, repoInfo):
        """Create a parents list of repoData from inputs and (readable) outputs."""
        parents = []
        for repoData in repoInfo:
            if repoData is ofRepoData:
                continue
            if 'r' not in repoData.repoArgs.mode:
                continue
            parents.append(Butler._getParentVal(repoData))
        return parents
    @staticmethod
    def _getOldButlerRepositoryCfg(repositoryArgs):
        if not Storage.isPosix(repositoryArgs.cfgRoot):
            return None
        if not PosixStorage.v1RepoExists(repositoryArgs.cfgRoot):
            return None
        if not repositoryArgs.mapper:
            repositoryArgs.mapper = PosixStorage.getMapperClass(repositoryArgs.cfgRoot)
        cfg = RepositoryCfg.makeFromArgs(repositoryArgs)
        parent = PosixStorage.getParentSymlinkPath(repositoryArgs.cfgRoot)
        if parent:
            parent = Butler._getOldButlerRepositoryCfg(RepositoryArgs(cfgRoot=parent, mode='r'))
            if parent is not None:
                cfg.addParents([parent])
        return cfg

    def _getRepositoryCfg(self, repositoryArgs):
        """Try to get a repository from the location described by cfgRoot.

        Parameters
        ----------
        repositoryArgs : RepositoryArgs or string
            Provides arguments to load an existing repository (or repositories). String is assumed to be a URI
            and is used as the cfgRoot (URI to the location of the cfg file).

        Returns
        -------
        (RepositoryCfg or None, bool)
            The RepositoryCfg, or None if one cannot be found, and True if the RepositoryCfg was created by
            reading an Old Butler repository, or False if it is a New Butler Repository.
        """
        if not isinstance(repositoryArgs, RepositoryArgs):
            repositoryArgs = RepositoryArgs(cfgRoot=repositoryArgs)
        cfg = self.storage.getRepositoryCfg(repositoryArgs.cfgRoot)
        isOldButlerRepository = False
        if cfg is None:
            cfg = Butler._getOldButlerRepositoryCfg(repositoryArgs)
            if cfg is not None:
                isOldButlerRepository = True
        return cfg, isOldButlerRepository
    def _getCfgs(self, repoDataList):
        """Get or make a RepositoryCfg for each RepoData, and add the cfg to the RepoData.

        If the cfg exists, compare values. If the values match then use the cfg as an "existing" cfg. If the
        values do not match, use the cfg as a "nested" cfg.
        If the cfg does not exist, the RepositoryArgs must be for a writable repository.

        Parameters
        ----------
        repoDataList : list of RepoData
            The RepoData that are the outputs and inputs of this Butler.

        Raises
        ------
        RuntimeError
            If the passed-in RepositoryArgs indicate an existing repository but other cfg parameters in those
            args do not match the existing repository's cfg, a RuntimeError will be raised.
        """
        def cfgMatchesArgs(args, cfg):
            """Test if there are any values in the RepositoryArgs that conflict with the values in a cfg."""
            if args.mapper is not None and cfg.mapper != args.mapper:
                return False
            if args.mapperArgs is not None and cfg.mapperArgs != args.mapperArgs:
                return False
            if args.policy is not None and cfg.policy != args.policy:
                return False
            return True

        for repoData in repoDataList:
            cfg, isOldButlerRepository = self._getRepositoryCfg(repoData.repoArgs)
            if cfg is None:
                if 'w' not in repoData.repoArgs.mode:
                    raise RuntimeError(
                        "No cfg found for read-only input repository at {}".format(repoData.repoArgs.cfgRoot))
                repoData.setCfg(cfg=RepositoryCfg.makeFromArgs(repoData.repoArgs),
                                origin='new',
                                root=repoData.repoArgs.cfgRoot,
                                isV1Repository=isOldButlerRepository)
            else:
                if 'w' in repoData.repoArgs.mode:
                    # if it's an output repository, the args must match the existing cfg.
                    if not cfgMatchesArgs(repoData.repoArgs, cfg):
                        raise RuntimeError(("The RepositoryArgs and RepositoryCfg must match for writable "
                                            "repositories, RepositoryCfg:{}, RepositoryArgs:{}").format(
                                                cfg, repoData.repoArgs))
                    repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
                                    isV1Repository=isOldButlerRepository)
                else:
                    # if it's an input repository and the args match the existing cfg, use it as-is;
                    # otherwise treat the cfg as nested.
                    if cfgMatchesArgs(repoData.repoArgs, cfg):
                        repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
                                        isV1Repository=isOldButlerRepository)
                    else:
                        repoData.setCfg(cfg=cfg, origin='nested', root=None,
                                        isV1Repository=isOldButlerRepository)
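    # Classification sketch (hypothetical values): an input whose on-disk cfg
    # names mapper M, loaded with RepositoryArgs(cfgRoot=..., mapper=M), is
    # 'existing'; the same input loaded with a different mapper argument
    # becomes 'nested'; a writable location with no cfg on disk becomes 'new'.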
    def _addParents(self, repoDataList):
        """For each repoData in the input list, see if its parents are the next items in the list, and if not
        add the parent, so that the repoDataList includes parents and is in order to operate depth-first 0..n.

        Parameters
        ----------
        repoDataList : list of RepoData
            The RepoData for the Butler outputs + inputs.

        Raises
        ------
        RuntimeError
            Raised if a RepositoryCfg can not be found at a location where a parent repository should be.
        """
        repoDataIdx = 0
        while True:
            if repoDataIdx == len(repoDataList):
                break
            repoData = repoDataList[repoDataIdx]
            if 'r' not in repoData.repoArgs.mode:
                repoDataIdx += 1
                continue
            if repoData.isNewRepository:
                repoDataIdx += 1
                continue
            if repoData.cfg.parents is None:
                repoDataIdx += 1
                continue
            for repoParentIdx, repoParent in enumerate(repoData.cfg.parents):
                parentIdxInRepoDataList = repoDataIdx + repoParentIdx + 1
                if not isinstance(repoParent, RepositoryCfg):
                    repoParentCfg, isOldButlerRepository = self._getRepositoryCfg(repoParent)
                    if repoParentCfg is not None:
                        cfgOrigin = 'existing'
                    else:
                        raise RuntimeError(
                            "Could not get a RepositoryCfg for parent repository at {}".format(repoParent))
                else:
                    isOldButlerRepository = False
                    repoParentCfg = repoParent
                    cfgOrigin = 'nested'
                if (parentIdxInRepoDataList < len(repoDataList)
                        and repoDataList[parentIdxInRepoDataList].cfg == repoParentCfg):
                    continue
                args = RepositoryArgs(cfgRoot=repoParentCfg.root, mode='r')
                role = 'input' if repoData.role == 'output' else 'parent'
                newRepoInfo = RepoData(args, role)
                newRepoInfo.repoData.setCfg(cfg=repoParentCfg, origin=cfgOrigin, root=args.cfgRoot,
                                            isV1Repository=isOldButlerRepository)
                repoDataList.insert(parentIdxInRepoDataList, newRepoInfo)
            repoDataIdx += 1
    def _setAndVerifyParentsLists(self, repoDataList):
        """Make a list of all the input repositories of this Butler; these are the parents of the outputs.

        For new output repositories, set the parents in the RepositoryCfg. For existing output repositories
        verify that the RepositoryCfg's parents match the parents list.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            If an existing output repository is loaded and its parents do not match the parents of this Butler
            an error will be raised.
        """
        def getIOParents(ofRepoData, repoDataList):
            """Make a parents list for the repo in `ofRepoData` that is comprised of inputs and readable
            outputs (not parents-of-parents) of this butler."""
            parents = []
            for repoData in repoDataList:
                if repoData.role == 'parent':
                    continue
                if repoData is ofRepoData:
                    continue
                if repoData.role == 'output':
                    if 'r' in repoData.repoArgs.mode:
                        raise RuntimeError("If an output is readable it must be the only output.")
                    continue
                parents.append(Butler._getParentVal(repoData))
            return parents

        for repoData in repoDataList:
            if repoData.role != 'output':
                continue
            parents = getIOParents(repoData, repoDataList)
            # if repoData is new, add the parents to its RepositoryCfg.
            if repoData.cfgOrigin == 'new':
                repoData.cfg.addParents(parents)
            elif repoData.cfgOrigin in ('existing', 'nested'):
                if repoData.cfg.parents != parents:
                    try:
                        repoData.cfg.extendParents(parents)
                    except ParentsMismatch as e:
                        raise RuntimeError(("Inputs of this Butler:{} do not match parents of existing "
                                            "writable cfg:{} (ParentsMismatch exception: {})").format(
                                                parents, repoData.cfg.parents, e))
    def _setDefaultMapper(self, repoDataList):
        """Establish a default mapper if there is one and assign it to outputs that do not have a mapper
        assigned.

        If all inputs have the same mapper it will be used as the default mapper.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            If a default mapper can not be established and there is an output that does not have a mapper.
        """
        needyOutputs = [rd for rd in repoDataList if rd.role == 'output' and rd.cfg.mapper is None]
        if len(needyOutputs) == 0:
            return
        mappers = set([rd.cfg.mapper for rd in repoDataList if rd.role == 'input'])
        if len(mappers) != 1:
            inputs = [rd for rd in repoDataList if rd.role == 'input']
            raise RuntimeError(
                ("No default mapper could be established from inputs:{} and no mapper specified "
                 "for outputs:{}").format(inputs, needyOutputs))
        defaultMapper = mappers.pop()
        for repoData in needyOutputs:
            repoData.cfg.mapper = defaultMapper
    def _connectParentRepoDatas(self, repoDataList):
        """For each RepoData in repoDataList, find its parent in the repoDataList and cache a reference to it.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            When a parent is listed in the parents list but not found in the repoDataList. This is not
            expected to ever happen and would indicate an internal Butler error.
        """
        for repoData in repoDataList:
            for parent in repoData.cfg.parents:
                parentToAdd = None
                for otherRepoData in repoDataList:
                    if isinstance(parent, RepositoryCfg):
                        if otherRepoData.repoData.cfg == parent:
                            parentToAdd = otherRepoData.repoData
                            break
                    elif otherRepoData.repoData.cfg.root == parent:
                        parentToAdd = otherRepoData.repoData
                        break
                if parentToAdd is None:
                    raise RuntimeError(
                        "Could not find a parent matching {} to add to {}".format(parent, repoData))
                repoData.addParentRepoData(parentToAdd)
    @staticmethod
    def _getParentRepoData(parent, repoDataList):
        """Get a parent RepoData from a cfg from a list of RepoData.

        Parameters
        ----------
        parent : string or RepositoryCfg
            cfgRoot of a repo or a cfg that describes the repo
        repoDataList : list of RepoData

        Returns
        -------
        RepoData or None
            A RepoData if one can be found, else None
        """
        repoData = None
        for otherRepoData in repoDataList:
            if isinstance(parent, RepositoryCfg):
                if otherRepoData.cfg == parent:
                    repoData = otherRepoData
                    break
            elif otherRepoData.cfg.root == parent:
                repoData = otherRepoData
                break
        return repoData
    def _setRepoDataTags(self):
        """Set the tags from each repoArgs into all its parent repoArgs so that they can be included in
        tagged searches."""
        def setTags(repoData, tags, context):
            if id(repoData) in context:
                return
            repoData.addTags(tags)
            context.add(id(repoData))
            for parentRepoData in repoData.parentRepoDatas:
                setTags(parentRepoData, tags, context)
        for repoData in self._repos.outputs() + self._repos.inputs():
            setTags(repoData.repoData, repoData.repoArgs.tags, set())
    def _convertV1Args(self, root, mapper, mapperArgs):
        """Convert Old Butler init arguments (root, mapper, mapperArgs) to New Butler init arguments
        (inputs, outputs).

        Parameters
        ----------
        root : string
            Posix path to repository root.
        mapper : class, class instance, or string
            Instantiated class, a class object to be instantiated, or a string that refers to a class that
            can be imported & used as the mapper.
        mapperArgs : dict
            RepositoryArgs & their values used when instantiating the mapper.

        Returns
        -------
        tuple
            (inputs, outputs) - values to be used for inputs and outputs in Butler.__init__
        """
        if (mapper and not isinstance(mapper, basestring)
                and not inspect.isclass(mapper)):
            self.log.warn(preinitedMapperWarning)
        inputs = None
        if root is None and hasattr(mapper, 'root'):
            # in legacy repositories, the mapper may be given the root directly.
            root = mapper.root
        outputs = RepositoryArgs(mode='rw',
                                 root=root,
                                 mapper=mapper,
                                 mapperArgs=mapperArgs)
        return inputs, outputs
    def __repr__(self):
        return 'Butler(datasetTypeAliasDict=%s, repos=%s, persistence=%s)' % (
            self.datasetTypeAliasDict, self._repos, self.persistence)
    def _getDefaultMapper(self):
        """Get the default mapper. Currently this means if all the repositories use exactly the same mapper,
        that mapper may be considered the default.

        This definition may be changing; mappers may be able to exclude themselves as candidates for default,
        and they may nominate a different mapper instead. Also, we may not want to look at *all* the
        repositories, but only a depth-first search on each of the input & output repositories, and use the
        first-found mapper for each of those. TBD.

        Returns
        -------
        Mapper class or None
            Returns the class type of the default mapper, or None if a default
            mapper can not be determined.
        """
        defaultMapper = None
        for inputRepoData in self._repos.inputs():
            mapper = None
            if inputRepoData.cfg.mapper is not None:
                mapper = inputRepoData.cfg.mapper
                # if the mapper is a string, import it; if it is a class instance, get its class type.
                if isinstance(mapper, basestring):
                    mapper = doImport(mapper)
                elif not inspect.isclass(mapper):
                    mapper = mapper.__class__
            # If no mapper has been found yet, note this one. If a mapper has been found and this one
            # matches it, continue. If it does not match, there is no default mapper; return None.
            if defaultMapper is None:
                defaultMapper = mapper
            elif mapper == defaultMapper:
                continue
            elif mapper is not None:
                return None
        return defaultMapper
    def _assignDefaultMapper(self, defaultMapper):
        for repoData in self._repos.all():
            if repoData.cfg.mapper is None and (repoData.isNewRepository or repoData.isV1Repository):
                if defaultMapper is None:
                    raise RuntimeError(
                        "No mapper specified for %s and no default mapper could be determined." %
                        repoData)
                repoData.cfg.mapper = defaultMapper
1114 """posix-only; gets the mapper class at the path specified by root (if a file _mapper can be found at 1115 that location or in a parent location. 1117 As we abstract the storage and support different types of storage locations this method will be 1118 moved entirely into Butler Access, or made more dynamic, and the API will very likely change.""" 1119 return Storage.getMapperClass(root)
1122 """Register an alias that will be substituted in datasetTypes. 1127 The alias keyword. It may start with @ or not. It may not contain @ except as the first character. 1128 datasetType - string 1129 The string that will be substituted when @alias is passed into datasetType. It may not contain '@' 1133 atLoc = alias.rfind(
'@')
1135 alias =
"@" + str(alias)
1137 raise RuntimeError(
"Badly formatted alias string: %s" % (alias,))
1140 if datasetType.count(
'@') != 0:
1141 raise RuntimeError(
"Badly formatted type string: %s" % (datasetType))
1146 if key.startswith(alias)
or alias.startswith(key):
1147 raise RuntimeError(
"Alias: %s overlaps with existing alias: %s" % (alias, key))
    def getKeys(self, datasetType=None, level=None, tag=None):
        """Get the valid data id keys at or above the given level of hierarchy for the dataset type or the
        entire collection if None. The dict values are the basic Python types corresponding to the keys (int,
        float, string).

        Parameters
        ----------
        datasetType - string
            The type of dataset to get keys for, entire collection if None.
        level - string
            The hierarchy level to descend to. None if it should not be restricted. Use an empty string if the
            mapper should lookup the default level.
        tag - any, or list of any
            Any object that can be tested to be the same as the tag in a dataId passed into butler input
            functions. Applies only to input repositories: If tag is specified by the dataId then the repo
            will only be read from if the tag in the dataId matches a tag used for that repository.

        Returns
        -------
        Returns a dict. The dict keys are the valid data id keys at or above the given level of hierarchy for
        the dataset type or the entire collection if None. The dict values are the basic Python types
        corresponding to the keys (int, float, string).
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)

        keys = None
        tag = setify(tag)
        for repoData in self._repos.inputs():
            if not tag or len(tag.intersection(repoData.tags)) > 0:
                keys = repoData.repo.getKeys(datasetType, level)
                # An empty dict is a valid "found" condition for keys. The only value for keys that should
                # cause the search to continue is None.
                if keys is not None:
                    break
        return keys

    def queryMetadata(self, datasetType, format, dataId={}, **rest):
        """Returns the valid values for one or more keys when given a partial
        input collection data id.

        Parameters
        ----------
        datasetType - string
            The type of dataset to inquire about.
        format - str, tuple
            Key or tuple of keys to be returned.
        dataId - DataId, dict
            The partial data id.
        **rest
            Keyword arguments for the partial data id.

        Returns
        -------
        A list of valid values or tuples of valid values as specified by the
        format.
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)
        format = sequencify(format)

        tuples = None
        for repoData in self._repos.inputs():
            if not dataId.tag or len(dataId.tag.intersection(repoData.tags)) > 0:
                tuples = repoData.repo.queryMetadata(datasetType, format, dataId)
                if tuples:
                    break

        if not tuples:
            return []

        if len(format) == 1:
            ret = []
            for x in tuples:
                try:
                    ret.append(x[0])
                except TypeError:
                    ret.append(x)
            return ret

        return tuples
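    # Example (hypothetical keys and values): list the visits with g-band 'raw' data:
    #
    #     visits = butler.queryMetadata('raw', 'visit', dataId={'filter': 'g'})
    #
    # With a single key the result is a flat list; with a tuple of keys it is a
    # list of tuples.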
1235 """Determines if a dataset file exists. 1239 datasetType - string 1240 The type of dataset to inquire about. 1241 dataId - DataId, dict 1242 The data id of the dataset. 1243 **rest keyword arguments for the data id. 1248 True if the dataset exists or is non-file-based. 1252 dataId.update(**rest)
1253 location = self.
_locate(datasetType, dataId, write=
False)
1254 if location
is None:
1259 if isinstance(location, ButlerComposite):
1260 for name, componentInfo
in location.componentInfo.items():
1261 if componentInfo.subset:
1262 subset = self.
subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
1263 exists = all([obj.datasetExists()
for obj
in subset])
1265 exists = self.
datasetExists(componentInfo.datasetType, location.dataId)
1269 exists = location.repository.exists(location)
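    # Example (hypothetical data id):
    #
    #     if butler.datasetExists('raw', visit=1, ccd=2):
    #         raw = butler.get('raw', visit=1, ccd=2)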
    def _locate(self, datasetType, dataId, write):
        """Get one or more ButlerLocations and/or ButlerComposites.

        Parameters
        ----------
        datasetType : string
            The datasetType that is being searched for. The datasetType may be followed by a dot and
            a component name (component names are specified in the policy). IE datasetType.componentName

        dataId : dict or DataId class instance
            The data id.

        write : bool
            True if this is a search to write an object. False if it is a search to read an object. This
            affects what type (an object or a container) is returned.

        Returns
        -------
        If write is False, will return either a single object or None. If write is True, will return a list
        (which may be empty).
        """
        repos = self._repos.outputs() if write else self._repos.inputs()
        locations = []
        for repoData in repos:
            # enforce dataId & repository tags when reading:
            if not write and dataId.tag and len(dataId.tag.intersection(repoData.tags)) == 0:
                continue
            components = datasetType.split('.')
            datasetType = components[0]
            components = components[1:]
            location = repoData.repo.map(datasetType, dataId, write=write)
            if location is None:
                continue
            location.datasetType = datasetType
            if len(components) > 0:
                if not isinstance(location, ButlerComposite):
                    raise RuntimeError("The location for a dotted datasetType must be a composite.")
                # replace the first component name with the datasetType
                components[0] = location.componentInfo[components[0]].datasetType
                # join components back into a dot-delimited string
                datasetType = '.'.join(components)
                location = self._locate(datasetType, dataId, write)
                # if a component location is not found, stop searching.
                if location is None:
                    break
            if not write:
                # If there is a bypass function for this dataset type, we can not test to see if the object
                # exists in storage, because the bypass function may not use the location according to the
                # template. Instead execute the bypass function; if it raises an exception (commonly because
                # a file does not exist), continue searching other repositories.
                if hasattr(location.mapper, "bypass_" + location.datasetType):
                    bypass = self._getBypassFunc(location, dataId)
                    try:
                        bypass = bypass()
                        location.bypass = bypass
                    except (NoResults, IOError):
                        self.log.debug("Continuing dataset search while evaluating "
                                       "bypass function for Dataset type:%s Data ID:%s at "
                                       "location %s", datasetType, dataId, location)
                # If a location was found but the location does not exist, keep looking in input
                # repositories (the registry may have had enough data for a lookup even though the object
                # exists in a different repository.)
                if (isinstance(location, ButlerComposite) or hasattr(location, 'bypass')
                        or location.repository.exists(location)):
                    return location
            else:
                try:
                    locations.extend(location)
                except TypeError:
                    locations.append(location)
        if not write:
            return None
        return locations
    @staticmethod
    def _getBypassFunc(location, dataId):
        pythonType = location.getPythonType()
        if pythonType is not None:
            if isinstance(pythonType, basestring):
                pythonType = doImport(pythonType)
        bypassFunc = getattr(location.mapper, "bypass_" + location.datasetType)
        return lambda: bypassFunc(location.datasetType, pythonType, location, dataId)
    def get(self, datasetType, dataId=None, immediate=True, **rest):
        """Retrieves a dataset given an input collection data id.

        Parameters
        ----------
        datasetType - string
            The type of dataset to retrieve.
        dataId - dict
            The data id.
        immediate - bool
            If False use a proxy for delayed loading.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        An object retrieved from the dataset (or a proxy for one).
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        location = self._locate(datasetType, dataId, write=False)
        if location is None:
            raise NoResults("No locations for get:", datasetType, dataId)
        self.log.debug("Get type=%s keys=%s from %s", datasetType, dataId, str(location))

        if hasattr(location, 'bypass'):
            # this type loader block should get moved into a helper someplace, and duplications removed.
            callback = lambda: location.bypass
        else:
            callback = lambda: self._read(location)
        if location.mapper.canStandardize(location.datasetType):
            innerCallback = callback
            callback = lambda: location.mapper.standardize(location.datasetType, innerCallback(), dataId)
        if immediate:
            return callback()
        return ReadProxy(callback)
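    # Example (hypothetical data id): immediate=True reads now; immediate=False
    # returns a ReadProxy that defers I/O until the object is first accessed:
    #
    #     calexp = butler.get('calexp', visit=1, ccd=2)
    #     lazy = butler.get('calexp', visit=1, ccd=2, immediate=False)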
    def put(self, obj, datasetType, dataId={}, doBackup=False, **rest):
        """Persists a dataset given an output collection data id.

        Parameters
        ----------
        obj -
            The object to persist.
        datasetType - string
            The type of dataset to persist.
        dataId - dict
            The data id.
        doBackup - bool
            If True, rename existing instead of overwriting.
            WARNING: Setting doBackup=True is not safe for parallel processing, as it may be subject to race
            conditions.
        **rest
            Keyword arguments for the data id.
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        for location in self._locate(datasetType, dataId, write=True):
            if isinstance(location, ButlerComposite):
                disassembler = location.disassembler if location.disassembler else genericDisassembler
                disassembler(obj=obj, dataId=location.dataId, componentInfo=location.componentInfo)
                for name, info in location.componentInfo.items():
                    if not info.inputOnly:
                        self.put(info.obj, info.datasetType, location.dataId, doBackup=doBackup)
            else:
                if doBackup:
                    location.getRepository().backup(location.datasetType, dataId)
                location.getRepository().write(location, obj)
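    # Example (hypothetical data id): persist an object to the writable output
    # repositories that map this dataset type:
    #
    #     butler.put(exposure, 'calexp', visit=1, ccd=2)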
    def subset(self, datasetType, level=None, dataId={}, **rest):
        """Return complete dataIds for a dataset type that match a partial (or empty) dataId.

        Given a partial (or empty) dataId specified in dataId and **rest, find all datasets that match the
        dataId. Optionally restrict the results to a given level specified by a dataId key (e.g. visit or
        sensor or amp for a camera). Return an iterable collection of complete dataIds as ButlerDataRefs.
        Datasets with the resulting dataIds may not exist; that needs to be tested with datasetExists().

        Parameters
        ----------
        datasetType - string
            The type of dataset collection to subset
        level - string
            The level of dataId at which to subset. Use an empty string if the mapper should look up the
            default level.
        dataId - dict
            The data id.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        subset - ButlerSubset
            Collection of ButlerDataRefs for datasets matching the data id.

        Examples
        --------
        To print the full dataIds for all r-band measurements in a source catalog
        (note that the subset call is equivalent to: `butler.subset('src', dataId={'filter':'r'})`):

        >>> subset = butler.subset('src', filter='r')
        >>> for data_ref in subset: print(data_ref.dataId)
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)

        # Currently expected behavior of subset is that if specified level is None then the mapper's default
        # level should be used. Convention for level within Butler is that an empty string is used to
        # indicate the mapper's default level.
        if level is None:
            level = ''

        dataId = DataId(dataId)
        dataId.update(**rest)
        return ButlerSubset(self, datasetType, level, dataId)

    def dataRef(self, datasetType, level=None, dataId={}, **rest):
        """Returns a single ButlerDataRef.

        Given a complete dataId specified in dataId and **rest, find the unique dataset at the given level
        specified by a dataId key (e.g. visit or sensor or amp for a camera) and return a ButlerDataRef.

        Parameters
        ----------
        datasetType - string
            The type of dataset collection to reference
        level - string
            The level of dataId at which to reference
        dataId - dict
            The data id.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        dataRef - ButlerDataRef
            ButlerDataRef for dataset matching the data id
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        subset = self.subset(datasetType, level, dataId, **rest)
        if len(subset) != 1:
            raise RuntimeError("No unique dataset for: Dataset type:%s Level:%s Data ID:%s Keywords:%s" %
                               (str(datasetType), str(level), str(dataId), str(rest)))
        return ButlerDataRef(subset, subset.cache[0])

    def _read(self, location):
        """Unpersist an object using data inside a ButlerLocation or ButlerComposite object.

        Parameters
        ----------
        location : ButlerLocation or ButlerComposite
            A ButlerLocation or ButlerComposite instance populated with data needed to read the object.

        Returns
        -------
        object
            An instance of the object specified by the location.
        """
        self.log.debug("Starting read from %s", location)

        if isinstance(location, ButlerComposite):
            for name, componentInfo in location.componentInfo.items():
                if componentInfo.subset:
                    subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
                    componentInfo.obj = [obj.get() for obj in subset]
                else:
                    obj = self.get(componentInfo.datasetType, location.dataId, immediate=True)
                    componentInfo.obj = obj
            assembler = location.assembler or genericAssembler
            results = assembler(dataId=location.dataId, componentInfo=location.componentInfo,
                                cls=location.python)
        else:
            results = location.repository.read(location)
            if len(results) == 1:
                results = results[0]
        self.log.debug("Ending read from %s", location)
        return results

    def __reduce__(self):
        ret = (_unreduce, (self._initArgs, self.datasetTypeAliasDict))
        return ret

    def _resolveDatasetTypeAlias(self, datasetType):
        """Replaces all the known alias keywords in the given string with the alias value.

        Parameters
        ----------
        datasetType - string
            A datasetType string to search & replace on

        Returns
        -------
        datasetType - string
            The de-aliased string
        """
        for key in self.datasetTypeAliasDict:
            # if all aliases have been replaced, bail out
            if datasetType.find('@') == -1:
                break
            datasetType = datasetType.replace(key, self.datasetTypeAliasDict[key])

        # If an alias specifier can not be resolved then throw.
        if datasetType.find('@') != -1:
            raise RuntimeError("Unresolvable alias specifier in datasetType: %s" % (datasetType))

        return datasetType


def _unreduce(initArgs, datasetTypeAliasDict):
    mapperArgs = initArgs.pop('mapperArgs')
    initArgs.update(mapperArgs)
    butler = Butler(**initArgs)
    butler.datasetTypeAliasDict = datasetTypeAliasDict
    return butler
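# A short round-trip sketch tying subset and dataRef together (hypothetical
# dataset types and data id keys):
#
#     for ref in butler.subset('raw', dataId={'filter': 'r'}):
#         if ref.datasetExists():
#             exposure = ref.get('raw')
#
#     ref = butler.dataRef('raw', dataId={'visit': 1, 'ccd': 2})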