27 """This module defines the Butler class.""" 28 from builtins
import str, super
29 from past.builtins
import basestring
30 from builtins
import object
39 from .
import ReadProxy, ButlerSubset, ButlerDataRef, Persistence, \
40 Storage, Policy, NoResults, Repository, DataId, RepositoryCfg, \
41 RepositoryArgs, listify, setify, sequencify, doImport, ButlerComposite, genericAssembler, \
42 genericDisassembler, PosixStorage, ParentsMismatch
preinitedMapperWarning = ("Passing an instantiated mapper into "
                          "Butler.__init__ will prevent Butler from passing "
                          "parentRegistry or repositoryCfg information to "
                          "the mapper, which is done only at init time. "
                          "It is better to pass an importable string or "
                          "class object.")
53 """Represents a Butler configuration. 57 cfg is 'wet paint' and very likely to change. Use of it in production 58 code other than via the 'old butler' API is strongly discouraged. 60 yaml_tag =
u"!ButlerCfg" 63 super().
__init__({
'repoCfg': repoCfg,
'cls': cls})
67 """Container object for repository data used by Butler 72 The arguments that are used to find or create the RepositoryCfg. 74 "input", "output", or "parent", indicating why Butler loaded this repository. 75 * input: the Repository was passed as a Butler input. 76 * output: the Repository was passed as a Butler output. 77 * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository. 82 The configuration for the Repository. 85 "new", "existing", or "nested". Indicates the origin of the repository and its RepositoryCfg: 86 * new: it was created by this instance of Butler, and this instance of Butler will generate the 88 * existing: it was found (via the root or cfgRoot argument) 89 * nested: the full RepositoryCfg was nested in another RepositoryCfg's parents list (this can happen 90 if parameters of an input specified by RepositoryArgs or dict does not entirely match an existing 94 Path or URI to the location of the RepositoryCfg file. 96 repo : lsst.daf.persistence.Repository 97 The Repository class instance. 99 parentRepoDatas : list of RepoData 100 The parents of this Repository, as indicated this Repository's RepositoryCfg. If this is a new 101 Repository then these are the inputs to this Butler (and will be saved in the RepositoryCfg). These 102 RepoData objects are not owned by this RepoData, these are references to peer RepoData objects in the 103 Butler's RepoDataContainer. 105 isV1Repository : bool 106 True if this is an Old Butler repository. In this case the repository does not have a RepositoryCfg 107 file. It may have a _mapper file and may have a _parent symlink. It will never be treated as a "new" 108 repository, i.e. even though there is not a RepositoryCfg file, one will not be generated. 109 If False, this is a New Butler repository and is specified by RepositoryCfg file. 112 These are values that may be used to restrict the search of input repositories. Details are available 113 in the RepositoryArgs and DataId classes. 116 "input", "output", or "parent", indicating why Butler loaded this repository. 117 * input: the Repository was passed as a Butler input. 118 * output: the Repository was passed as a Butler output. 119 * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository. 121 _repoArgs : RepositoryArgs 122 Contains the arguments that were used to specify this Repository. 152 "parentRepoDatas={}," +
155 "parentRegistry={})").format(
156 self.__class__.__name__,
    def setCfg(self, cfg, origin, root, isV1Repository):
        """Set information about the cfg into the RepoData

        Parameters
        ----------
        cfg : RepositoryCfg
            The RepositoryCfg for the repo.
        origin : string
            'new', 'existing', or 'nested'
        root : string
            URI or absolute path to the location of the RepositoryCfg.yaml file.
        isV1Repository : bool
            True if this is an Old Butler repository (one without a RepositoryCfg file).
        """
        if origin not in ('new', 'existing', 'nested'):
            raise RuntimeError("Invalid value for origin:{}".format(origin))
        self.cfg = cfg
        self.cfgOrigin = origin
        self.cfgRoot = root
        self.isV1Repository = isV1Repository
    @property
    def role(self):
        return self._role

    @role.setter
    def role(self, val):
        if val not in ('input', 'output', 'parent'):
            raise RuntimeError("Invalid value for role: {}".format(val))
        self._role = val
210 """Get the parents & grandparents etc of this repo data, in depth-first search order. 212 Duplicate entries will be removed in cases where the same parent appears more than once in the parent 217 context : set, optional 218 Users should typically omit context and accept the default argument. Context is used to keep a set 219 of known RepoDatas when calling this function recursively, for duplicate elimination. 224 A list of the parents & grandparents etc of a given repo data, in depth-first search order. 229 if id(self)
in context:
231 context.add(id(self))
233 parents.append(parent)
234 parents += parent.getParentRepoDatas(context)
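

# The id()-based `context` set in getParentRepoDatas implements depth-first
# traversal with duplicate elimination. A minimal standalone analogue of the
# pattern (illustrative only; the `Node` shape and `parents` attribute are
# hypothetical, not part of this module):
#
#   def walkParents(node, context=None):
#       if context is None:
#           context = set()
#       out = []
#       if id(node) in context:
#           return out
#       context.add(id(node))
#       for parent in node.parents:
#           out.append(parent)
#           out += walkParents(parent, context)
#       return out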
245 """Container object for RepoData instances owned by a Butler instance. 249 repoDataList : list of RepoData 250 repoData - RepoData instance to add 256 self.
_all = repoDataList
260 """Get a list of RepoData that are used to as inputs to the Butler. 261 The list is created lazily as needed, and cached. 265 A list of RepoData with readable repositories, in the order to be used when searching. 268 raise RuntimeError(
"Inputs not yet initialized.")
272 """Get a list of RepoData that are used to as outputs to the Butler. 273 The list is created lazily as needed, and cached. 277 A list of RepoData with writable repositories, in the order to be use when searching. 280 raise RuntimeError(
"Outputs not yet initialized.")
284 """Get a list of all RepoData that are used to as by the Butler. 285 The list is created lazily as needed, and cached. 289 A list of RepoData with writable repositories, in the order to be use when searching. 294 return "%s(_inputs=%r, \n_outputs=%s, \n_all=%s)" % (
295 self.__class__.__name__,
    def _buildLookupLists(self):
        """Build the inputs and outputs lists based on the order of self.all()."""

        def addToList(repoData, lst):
            """Add a repoData and each of its parents (depth first) to a list"""
            if id(repoData) in alreadyAdded:
                return
            lst.append(repoData)
            alreadyAdded.add(id(repoData))
            for parent in repoData.parentRepoDatas:
                addToList(parent, lst)

        if self._inputs is not None or self._outputs is not None:
            raise RuntimeError("Lookup lists are already built.")
        inputs = [repoData for repoData in self.all() if repoData.role == 'input']
        outputs = [repoData for repoData in self.all() if repoData.role == 'output']
        self._inputs = []
        alreadyAdded = set()
        for repoData in outputs:
            if 'r' in repoData.repoArgs.mode:
                addToList(repoData.repoData, self._inputs)
        for repoData in inputs:
            addToList(repoData.repoData, self._inputs)
        self._outputs = [repoData.repoData for repoData in outputs]
327 """Butler provides a generic mechanism for persisting and retrieving data using mappers. 329 A Butler manages a collection of datasets known as a repository. Each dataset has a type representing its 330 intended usage and a location. Note that the dataset type is not the same as the C++ or Python type of the 331 object containing the data. For example, an ExposureF object might be used to hold the data for a raw 332 image, a post-ISR image, a calibrated science image, or a difference image. These would all be different 335 A Butler can produce a collection of possible values for a key (or tuples of values for multiple keys) if 336 given a partial data identifier. It can check for the existence of a file containing a dataset given its 337 type and data identifier. The Butler can then retrieve the dataset. Similarly, it can persist an object to 338 an appropriate location when given its associated data identifier. 340 Note that the Butler has two more advanced features when retrieving a data set. First, the retrieval is 341 lazy. Input does not occur until the data set is actually accessed. This allows datasets to be retrieved 342 and placed on a clipboard prospectively with little cost, even if the algorithm of a stage ends up not 343 using them. Second, the Butler will call a standardization hook upon retrieval of the dataset. This 344 function, contained in the input mapper object, must perform any necessary manipulations to force the 345 retrieved object to conform to standards, including translating metadata. 349 __init__(self, root, mapper=None, **mapperArgs) 351 defineAlias(self, alias, datasetType) 353 getKeys(self, datasetType=None, level=None) 355 queryMetadata(self, datasetType, format=None, dataId={}, **rest) 357 datasetExists(self, datasetType, dataId={}, **rest) 359 get(self, datasetType, dataId={}, immediate=False, **rest) 361 put(self, obj, datasetType, dataId={}, **rest) 363 subset(self, datasetType, level=None, dataId={}, **rest) 365 dataRef(self, datasetType, level=None, dataId={}, **rest) 369 The preferred method of initialization is to use the `inputs` and `outputs` __init__ parameters. These 370 are described in the parameters section, below. 372 For backward compatibility: this initialization method signature can take a posix root path, and 373 optionally a mapper class instance or class type that will be instantiated using the mapperArgs input 374 argument. However, for this to work in a backward compatible way it creates a single repository that is 375 used as both an input and an output repository. This is NOT preferred, and will likely break any 376 provenance system we have in place. 381 .. note:: Deprecated in 12_0 382 `root` will be removed in TBD, it is replaced by `inputs` and `outputs` for 383 multiple-repository support. 384 A file system path. Will only work with a PosixRepository. 385 mapper : string or instance 386 .. note:: Deprecated in 12_0 387 `mapper` will be removed in TBD, it is replaced by `inputs` and `outputs` for 388 multiple-repository support. 389 Provides a mapper to be used with Butler. 391 .. note:: Deprecated in 12_0 392 `mapperArgs` will be removed in TBD, it is replaced by `inputs` and `outputs` for 393 multiple-repository support. 394 Provides arguments to be passed to the mapper if the mapper input argument is a class type to be 395 instantiated by Butler. 396 inputs : RepositoryArgs, dict, or string 397 Can be a single item or a list. Provides arguments to load an existing repository (or repositories). 
398 String is assumed to be a URI and is used as the cfgRoot (URI to the location of the cfg file). (Local 399 file system URI does not have to start with 'file://' and in this way can be a relative path). The 400 `RepositoryArgs` class can be used to provide more parameters with which to initialize a repository 401 (such as `mapper`, `mapperArgs`, `tags`, etc. See the `RepositoryArgs` documentation for more 402 details). A dict may be used as shorthand for a `RepositoryArgs` class instance. The dict keys must 403 match parameters to the `RepositoryArgs.__init__` function. 404 outputs : RepositoryArgs, dict, or string 405 Provides arguments to load one or more existing repositories or create new ones. The different types 406 are handled the same as for `inputs`. 408 The Butler init sequence loads all of the input and output repositories. 409 This creates the object hierarchy to read from and write to them. Each 410 repository can have 0 or more parents, which also get loaded as inputs. 411 This becomes a DAG of repositories. Ultimately, Butler creates a list of 412 these Repositories in the order that they are used. 414 Initialization Sequence 415 ======================= 417 During initialization Butler creates a Repository class instance & support structure for each object 418 passed to `inputs` and `outputs` as well as the parent repositories recorded in the `RepositoryCfg` of 419 each existing readable repository. 421 This process is complex. It is explained below to shed some light on the intent of each step. 423 1. Input Argument Standardization 424 --------------------------------- 426 In `Butler._processInputArguments` the input arguments are verified to be legal (and a RuntimeError is 427 raised if not), and they are converted into an expected format that is used for the rest of the Butler 428 init sequence. See the docstring for `_processInputArguments`. 430 2. Create RepoData Objects 431 -------------------------- 433 Butler uses an object, called `RepoData`, to keep track of information about each repository; each 434 repository is contained in a single `RepoData`. The attributes are explained in its docstring. 436 After `_processInputArguments`, a RepoData is instantiated and put in a list for each repository in 437 `outputs` and `inputs`. This list of RepoData, the `repoDataList`, now represents all the output and input 438 repositories (but not parent repositories) that this Butler instance will use. 440 3. Get `RepositoryCfg`s 441 ----------------------- 443 `Butler._getCfgs` gets the `RepositoryCfg` for each repository the `repoDataList`. The behavior is 444 described in the docstring. 449 `Butler._addParents` then considers the parents list in the `RepositoryCfg` of each `RepoData` in the 450 `repoDataList` and inserts new `RepoData` objects for each parent not represented in the proper location 451 in the `repoDataList`. Ultimately a flat list is built to represent the DAG of readable repositories 452 represented in depth-first order. 454 5. Set and Verify Parents of Outputs 455 ------------------------------------ 457 To be able to load parent repositories when output repositories are used as inputs, the input repositories 458 are recorded as parents in the `RepositoryCfg` file of new output repositories. When an output repository 459 already exists, for consistency the Butler's inputs must match the list of parents specified the already- 460 existing output repository's `RepositoryCfg` file. 
462 In `Butler._setAndVerifyParentsLists`, the list of parents is recorded in the `RepositoryCfg` of new 463 repositories. For existing repositories the list of parents is compared with the `RepositoryCfg`'s parents 464 list, and if they do not match a `RuntimeError` is raised. 466 6. Set the Default Mapper 467 ------------------------- 469 If all the input repositories use the same mapper then we can assume that mapper to be the 470 "default mapper". If there are new output repositories whose `RepositoryArgs` do not specify a mapper and 471 there is a default mapper then the new output repository will be set to use that default mapper. 473 This is handled in `Butler._setDefaultMapper`. 475 7. Cache References to Parent RepoDatas 476 --------------------------------------- 478 In `Butler._connectParentRepoDatas`, in each `RepoData` in `repoDataList`, a list of `RepoData` object 479 references is built that matches the parents specified in that `RepoData`'s `RepositoryCfg`. 481 This list is used later to find things in that repository's parents, without considering peer repository's 482 parents. (e.g. finding the registry of a parent) 487 Tags are described at https://ldm-463.lsst.io/v/draft/#tagging 489 In `Butler._setRepoDataTags`, for each `RepoData`, the tags specified by its `RepositoryArgs` are recorded 490 in a set, and added to the tags set in each of its parents, for ease of lookup when mapping. 492 9. Find Parent Registry and Instantiate RepoData 493 ------------------------------------------------ 495 At this point there is enough information to instantiate the `Repository` instances. There is one final 496 step before instantiating the Repository, which is to try to get a parent registry that can be used by the 497 child repository. The criteria for "can be used" is spelled out in `Butler._setParentRegistry`. However, 498 to get the registry from the parent, the parent must be instantiated. The `repoDataList`, in depth-first 499 search order, is built so that the most-dependent repositories are first, and the least dependent 500 repositories are last. So the `repoDataList` is reversed and the Repositories are instantiated in that 501 order; for each RepoData a parent registry is searched for, and then the Repository is instantiated with 502 whatever registry could be found.""" 504 def __init__(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
505 self.
_initArgs = {
'root': root,
'mapper': mapper,
'inputs': inputs,
'outputs': outputs,
506 'mapperArgs': mapperArgs}
508 self.
log = Log.getLogger(
"daf.persistence.butler")
514 root=root, mapper=mapper, inputs=inputs, outputs=outputs, **mapperArgs)
517 inputs = [
RepoData(args,
'input')
for args
in inputs]
518 outputs = [
RepoData(args,
'output')
for args
in outputs]
519 repoDataList = outputs + inputs
535 for repoData
in repoDataList:
    def _initRepo(self, repoData):
        if repoData.repo is not None:
            # this repository may have already been initialized by its children;
            # in that case there is nothing more to do.
            return
        for parentRepoData in repoData.parentRepoDatas:
            if parentRepoData.cfg.mapper != repoData.cfg.mapper:
                continue
            if parentRepoData.repo is None:
                self._initRepo(parentRepoData)
            parentRegistry = parentRepoData.repo.getRegistry()
            repoData.parentRegistry = parentRegistry if parentRegistry else parentRepoData.parentRegistry
            if repoData.parentRegistry:
                break
        repoData.repo = Repository(repoData)
    def _processInputArguments(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
        """Process, verify, and standardize the input arguments.

        * Inputs can not be for Old Butler (root, mapper, mapperArgs) AND New Butler (inputs, outputs).
          `root`, `mapper`, and `mapperArgs` are Old Butler init API.
          `inputs` and `outputs` are New Butler init API.
          Old Butler and New Butler init API may not be mixed; Butler may be initialized with only the Old
          arguments or the New arguments.
        * Verify that if there is a readable output that there is exactly one output. (This restriction is in
          place because all readable repositories must be parents of writable repositories, and for
          consistency the DAG of readable repositories must always be the same. Keeping the list of parents
          becomes very complicated in the presence of multiple readable output repositories. It is better to
          only write to output repositories, and then create a new Butler instance and use the outputs as
          inputs, and write to new output repositories.)
        * Make a copy of inputs & outputs so they may be modified without changing the passed-in arguments.
        * Convert any input/output values that are URI strings to RepositoryArgs.
        * Listify inputs & outputs.
        * Set default RW mode on inputs & outputs as needed.

        Parameters
        ----------
        Same as Butler.__init__

        Returns
        -------
        (list of RepositoryArgs, list of RepositoryArgs)
            First item is a list to use as inputs.
            Second item is a list to use as outputs.

        Raises
        ------
        RuntimeError
            If Old Butler and New Butler arguments are both used this will raise.
            If an output is readable and there is more than one output this will raise.
        """
        # inputs and outputs may be modified; do not change the external value.
        inputs = copy.deepcopy(inputs)
        outputs = copy.deepcopy(outputs)

        isV1Args = inputs is None and outputs is None
        if isV1Args:
            inputs, outputs = self._convertV1Args(root=root,
                                                  mapper=mapper,
                                                  mapperArgs=mapperArgs or None)
        elif root or mapper or mapperArgs:
            raise RuntimeError(
                'Butler version 1 API (root, mapper, **mapperArgs) may '
                'not be used with version 2 API (inputs, outputs)')

        # convert inputs & outputs to lists, and convert URI strings to RepositoryArgs.
        inputs = listify(inputs)
        outputs = listify(outputs)
        inputs = [RepositoryArgs(cfgRoot=args)
                  if not isinstance(args, RepositoryArgs) else args for args in inputs]
        outputs = [RepositoryArgs(cfgRoot=args)
                   if not isinstance(args, RepositoryArgs) else args for args in outputs]

        # set the default mode on inputs & outputs and verify the required values.
        for args in inputs:
            if args.mode is None:
                args.mode = 'r'
            elif 'rw' == args.mode:
                args.mode = 'r'
            elif 'r' != args.mode:
                raise RuntimeError("The mode of an input should be readable.")
        for args in outputs:
            if args.mode is None:
                args.mode = 'w'
            elif 'w' not in args.mode:
                raise RuntimeError("The mode of an output should be writable.")
        # check for class instances in args.mapper (not allowed)
        for args in inputs + outputs:
            if (args.mapper and not isinstance(args.mapper, basestring)
                    and not inspect.isclass(args.mapper)):
                self.log.warn(preinitedMapperWarning)
        # if an output is readable, there must be only one output:
        for args in outputs:
            if 'r' in args.mode and len(outputs) > 1:
                raise RuntimeError("Butler does not support multiple output repositories if any of the "
                                   "outputs are readable.")

        # Handle the case where an input is also listed as a readable output by
        # discarding the input.
        def inputIsInOutputs(inputArgs, outputArgsList):
            for o in outputArgsList:
                if ('r' in o.mode
                        and o.root == inputArgs.root
                        and o.mapper == inputArgs.mapper
                        and o.mapperArgs == inputArgs.mapperArgs
                        and o.tags == inputArgs.tags
                        and o.policy == inputArgs.policy):
                    self.log.debug(("Input repositoryArgs {} is also listed in outputs as readable; "
                                    "throwing away the input.").format(inputArgs))
                    return True
            return False

        inputs = [args for args in inputs if not inputIsInOutputs(args, outputs)]
        return inputs, outputs
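
    # Example (a hedged sketch; the paths are hypothetical). The two init styles
    # that _processInputArguments accepts:
    #
    #   # New Butler API: distinct input and output repositories.
    #   butler = Butler(inputs='/datasets/input_repo', outputs='/scratch/output_repo')
    #
    #   # Old Butler API (deprecated): one root used as both input and output.
    #   butler = Butler(root='/datasets/repo')
    #
    # A string is treated as a cfgRoot URI; a RepositoryArgs instance (or a dict
    # of RepositoryArgs.__init__ parameters) can carry mapper, mapperArgs, tags, etc.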
    @staticmethod
    def _getParentVal(repoData):
        """Get the value of this repoData as it should appear in the parents
        list of other repositories"""
        if repoData.isV1Repository:
            return repoData.cfgRoot
        if repoData.cfgOrigin == 'nested':
            return repoData.cfg
        return repoData.cfg.root

    @staticmethod
    def _getParents(ofRepoData, repoInfo):
        """Create a parents list of repoData from inputs and (readable) outputs."""
        parents = []
        for repoData in repoInfo:
            if repoData is ofRepoData:
                break
            if 'r' not in repoData.repoArgs.mode:
                continue
            parents.append(Butler._getParentVal(repoData))
        return parents
    @staticmethod
    def _getOldButlerRepositoryCfg(repositoryArgs):
        if not Storage.isPosix(repositoryArgs.cfgRoot):
            return None
        if not PosixStorage.v1RepoExists(repositoryArgs.cfgRoot):
            return None
        if not repositoryArgs.mapper:
            repositoryArgs.mapper = PosixStorage.getMapperClass(repositoryArgs.cfgRoot)
        cfg = RepositoryCfg.makeFromArgs(repositoryArgs)
        parent = PosixStorage.getParentSymlinkPath(repositoryArgs.cfgRoot)
        if parent:
            parent = Butler._getOldButlerRepositoryCfg(RepositoryArgs(cfgRoot=parent, mode='r'))
            if parent is not None:
                cfg.addParents([parent])
        return cfg

    def _getRepositoryCfg(self, repositoryArgs):
        """Try to get a repository from the location described by cfgRoot.

        Parameters
        ----------
        repositoryArgs : RepositoryArgs or string
            Provides arguments to load an existing repository (or repositories). String is assumed to be a
            URI and is used as the cfgRoot (URI to the location of the cfg file).

        Returns
        -------
        (RepositoryCfg or None, bool)
            The RepositoryCfg, or None if one cannot be found, and True if the RepositoryCfg was created by
            reading an Old Butler repository, or False if it is a New Butler Repository.
        """
        if not isinstance(repositoryArgs, RepositoryArgs):
            repositoryArgs = RepositoryArgs(cfgRoot=repositoryArgs)
        cfg = self.storage.getRepositoryCfg(repositoryArgs.cfgRoot)
        isOldButlerRepository = False
        if cfg is None:
            cfg = Butler._getOldButlerRepositoryCfg(repositoryArgs)
            if cfg is not None:
                isOldButlerRepository = True
        return cfg, isOldButlerRepository
    def _getCfgs(self, repoDataList):
        """Get or make a RepositoryCfg for each RepoData, and add the cfg to the RepoData.

        If the cfg exists, compare values. If values match then use the cfg as an "existing" cfg. If the
        values do not match, use the cfg as a "nested" cfg.
        If the cfg does not exist, the RepositoryArgs must be for a writable repository.

        Parameters
        ----------
        repoDataList : list of RepoData
            The RepoData that are output and inputs of this Butler

        Raises
        ------
        RuntimeError
            If the passed-in RepositoryArgs indicate an existing repository but other cfg parameters in those
            args do not match the existing repository's cfg, a RuntimeError will be raised.
        """
        def cfgMatchesArgs(args, cfg):
            """Test if there are any values in a RepositoryArgs that conflict with the values in a cfg"""
            if args.mapper is not None and cfg.mapper != args.mapper:
                return False
            if args.mapperArgs is not None and cfg.mapperArgs != args.mapperArgs:
                return False
            if args.policy is not None and cfg.policy != args.policy:
                return False
            return True

        for repoData in repoDataList:
            cfg, isOldButlerRepository = self._getRepositoryCfg(repoData.repoArgs)
            if cfg is None:
                if 'w' not in repoData.repoArgs.mode:
                    raise RuntimeError(
                        "No cfg found for read-only input repository at {}".format(repoData.repoArgs.cfgRoot))
                repoData.setCfg(cfg=RepositoryCfg.makeFromArgs(repoData.repoArgs),
                                origin='new',
                                root=repoData.repoArgs.cfgRoot,
                                isV1Repository=isOldButlerRepository)
            else:
                # replace Old Butler parent repository paths in the cfg's parents list with
                # fully-expanded RepositoryCfgs:
                for i, parent in enumerate(cfg.parents):
                    if isinstance(parent, RepositoryCfg):
                        continue
                    parentCfg, parentIsOldButlerRepository = self._getRepositoryCfg(parent)
                    if parentIsOldButlerRepository:
                        parentCfg.mapperArgs = cfg.mapperArgs
                        self.log.info(("Butler is replacing an Old Butler parent repository path '{}' "
                                       "found in the parents list of a New Butler repositoryCfg: {} "
                                       "with a repositoryCfg that includes the child repository's "
                                       "mapperArgs: {}. This affects the instantiated RepositoryCfg "
                                       "but does not change the persisted child repositoryCfg.yaml file."
                                       ).format(parent, cfg, parentCfg))
                        cfg._parents[i] = cfg._normalizeParents(cfg.root, [parentCfg])[0]

                if 'w' in repoData.repoArgs.mode:
                    # if it's an output repository, the args must match the existing cfg.
                    if not cfgMatchesArgs(repoData.repoArgs, cfg):
                        raise RuntimeError(("The RepositoryArgs and RepositoryCfg must match for writable "
                                            "repositories, RepositoryCfg:{}, RepositoryArgs:{}").format(
                                                cfg, repoData.repoArgs))
                    repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
                                    isV1Repository=isOldButlerRepository)
                else:
                    # if it's an input repository, use the cfg as-is when it matches the args,
                    # else treat it as nested.
                    if cfgMatchesArgs(repoData.repoArgs, cfg):
                        repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
                                        isV1Repository=isOldButlerRepository)
                    else:
                        repoData.setCfg(cfg=cfg, origin='nested', root=None,
                                        isV1Repository=isOldButlerRepository)
    def _addParents(self, repoDataList):
        """For each repoData in the input list, see if its parents are the next items in the list, and if not
        add the parent, so that the repoDataList includes parents and is in order to operate depth-first 0..n.

        Parameters
        ----------
        repoDataList : list of RepoData
            The RepoData for the Butler outputs + inputs.

        Raises
        ------
        RuntimeError
            Raised if a RepositoryCfg can not be found at a location where a parent repository should be.
        """
        repoDataIdx = 0
        while True:
            if repoDataIdx == len(repoDataList):
                break
            repoData = repoDataList[repoDataIdx]
            if 'r' not in repoData.repoArgs.mode:
                repoDataIdx += 1
                continue
            if repoData.isNewRepository:
                repoDataIdx += 1
                continue
            if repoData.cfg.parents is None:
                repoDataIdx += 1
                continue
            for repoParentIdx, repoParent in enumerate(repoData.cfg.parents):
                parentIdxInRepoDataList = repoDataIdx + repoParentIdx + 1
                if not isinstance(repoParent, RepositoryCfg):
                    repoParentCfg, isOldButlerRepository = self._getRepositoryCfg(repoParent)
                    if repoParentCfg is not None:
                        cfgOrigin = 'existing'
                    else:
                        raise RuntimeError(
                            "Could not get a RepositoryCfg for parent {}".format(repoParent))
                else:
                    isOldButlerRepository = False
                    repoParentCfg = repoParent
                    cfgOrigin = 'nested'
                if (parentIdxInRepoDataList < len(repoDataList)
                        and repoDataList[parentIdxInRepoDataList].cfg == repoParentCfg):
                    continue
                args = RepositoryArgs(cfgRoot=repoParentCfg.root, mode='r')
                role = 'input' if repoData.role == 'output' else 'parent'
                newRepoInfo = RepoData(args, role)
                newRepoInfo.repoData.setCfg(cfg=repoParentCfg, origin=cfgOrigin, root=args.cfgRoot,
                                            isV1Repository=isOldButlerRepository)
                repoDataList.insert(parentIdxInRepoDataList, newRepoInfo)
            repoDataIdx += 1
    def _setAndVerifyParentsLists(self, repoDataList):
        """Make a list of all the input repositories of this Butler; these are the parents of the outputs.

        For new output repositories, set the parents in the RepositoryCfg. For existing output repositories
        verify that the RepositoryCfg's parents match the parents list.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            If an existing output repository is loaded and its parents do not match the parents of this
            Butler an error will be raised.
        """
        def getIOParents(ofRepoData, repoDataList):
            """make a parents list for repo in `ofRepoData` that is comprised of inputs and readable
            outputs (not parents-of-parents) of this butler"""
            parents = []
            for repoData in repoDataList:
                if repoData.role == 'parent':
                    continue
                if repoData is ofRepoData:
                    continue
                if repoData.role == 'output':
                    if 'r' in repoData.repoArgs.mode:
                        raise RuntimeError("If an output is readable it must be the only output.")
                    # non-readable outputs are not parents.
                    continue
                parents.append(self._getParentVal(repoData))
            return parents

        for repoData in repoDataList:
            if repoData.role != 'output':
                continue
            parents = getIOParents(repoData, repoDataList)
            # if repoData is new, add the parents to its RepositoryCfg.
            if repoData.cfgOrigin == 'new':
                repoData.cfg.addParents(parents)
            elif repoData.cfgOrigin in ('existing', 'nested'):
                if repoData.cfg.parents != parents:
                    try:
                        repoData.cfg.extendParents(parents)
                    except ParentsMismatch as e:
                        raise RuntimeError(("Inputs of this Butler:{} do not match parents of existing "
                                            "writable cfg:{} (ParentsMismatch exception: {})").format(
                                                parents, repoData.cfg.parents, e))
    def _setDefaultMapper(self, repoDataList):
        """Establish a default mapper if there is one and assign it to outputs that do not have a mapper
        assigned.

        If all inputs have the same mapper it will be used as the default mapper.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            If a default mapper can not be established and there is an output that does not have a mapper.
        """
        needyOutputs = [rd for rd in repoDataList if rd.role == 'output' and rd.cfg.mapper is None]
        if len(needyOutputs) == 0:
            return
        mappers = set([rd.cfg.mapper for rd in repoDataList if rd.role == 'input'])
        if len(mappers) != 1:
            inputs = [rd for rd in repoDataList if rd.role == 'input']
            raise RuntimeError(
                ("No default mapper could be established from inputs:{} and no mapper specified "
                 "for outputs:{}").format(inputs, needyOutputs))
        defaultMapper = mappers.pop()
        for repoData in needyOutputs:
            repoData.cfg.mapper = defaultMapper
    def _connectParentRepoDatas(self, repoDataList):
        """For each RepoData in repoDataList, find its parent in the repoDataList and cache a reference to it.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            When a parent is listed in the parents list but not found in the repoDataList. This is not
            expected to ever happen and would indicate an internal Butler error.
        """
        for repoData in repoDataList:
            for parent in repoData.cfg.parents:
                parentToAdd = None
                for otherRepoData in repoDataList:
                    if isinstance(parent, RepositoryCfg):
                        if otherRepoData.repoData.cfg == parent:
                            parentToAdd = otherRepoData.repoData
                            break
                    elif otherRepoData.repoData.cfg.root == parent:
                        parentToAdd = otherRepoData.repoData
                        break
                if parentToAdd is None:
                    raise RuntimeError(
                        "Could not find a parent matching {} to add to {}".format(parent, repoData))
                repoData.addParentRepoData(parentToAdd)
    @staticmethod
    def _getParentRepoData(parent, repoDataList):
        """get a parent RepoData from a cfg from a list of RepoData

        Parameters
        ----------
        parent : string or RepositoryCfg
            cfgRoot of a repo or a cfg that describes the repo
        repoDataList : list of RepoData
            The list to search.

        Returns
        -------
        RepoData or None
            A RepoData if one can be found, else None
        """
        repoData = None
        for otherRepoData in repoDataList:
            if isinstance(parent, RepositoryCfg):
                if otherRepoData.cfg == parent:
                    repoData = otherRepoData
                    break
            elif otherRepoData.cfg.root == parent:
                repoData = otherRepoData
                break
        return repoData
    def _setRepoDataTags(self):
        """Set the tags from each repoArgs into all its parent repoArgs so that they can be included in
        tagged searches."""
        def setTags(repoData, tags, context):
            if id(repoData) in context:
                return
            repoData.addTags(tags)
            context.add(id(repoData))
            for parentRepoData in repoData.parentRepoDatas:
                setTags(parentRepoData, tags, context)
        for repoData in self._repos.outputs() + self._repos.inputs():
            setTags(repoData.repoData, repoData.repoArgs.tags, set())
    def _convertV1Args(self, root, mapper, mapperArgs):
        """Convert Old Butler init arguments (root, mapper, mapperArgs) to New Butler RepositoryArgs
        (inputs, outputs)

        Parameters
        ----------
        root : string
            Posix path to repository root
        mapper : class, class instance, or string
            Instantiated class, a class object to be instantiated, or a string that refers to a class that
            can be imported & used as the mapper.
        mapperArgs : dict
            RepositoryArgs & their values used when instantiating the mapper.

        Returns
        -------
        tuple
            (inputs, outputs) - values to be used for inputs and outputs in Butler.__init__
        """
        if (mapper and not isinstance(mapper, basestring)
                and not inspect.isclass(mapper)):
            self.log.warn(preinitedMapperWarning)
        inputs = None
        if hasattr(mapper, 'root'):
            # a mapper instance that carries its own root is used as a read-only input.
            inputs = RepositoryArgs(mode='r', root=mapper.root)
        outputs = RepositoryArgs(mode='rw',
                                 root=root,
                                 mapper=mapper,
                                 mapperArgs=mapperArgs)
        return inputs, outputs
    def __repr__(self):
        return 'Butler(datasetTypeAliasDict=%s, repos=%s, persistence=%s)' % (
            self.datasetTypeAliasDict, self._repos, self.persistence)
    def _getDefaultMapper(self):
        """Get the default mapper. Currently this means if all the repositories use exactly the same mapper,
        that mapper may be considered the default.

        This definition may be changing; mappers may be able to exclude themselves as candidates for default,
        and they may nominate a different mapper instead. Also, we may not want to look at *all* the
        repositories, but only a depth-first search on each of the input & output repositories, and use the
        first-found mapper for each of those. TBD.

        Returns
        -------
        Mapper class or None
            Returns the class type of the default mapper, or None if a default
            mapper can not be determined.
        """
        defaultMapper = None
        for inputRepoData in self._repos.inputs():
            mapper = None
            if inputRepoData.cfg.mapper is not None:
                mapper = inputRepoData.cfg.mapper
                # if the mapper is a string, import it; if it is a class instance,
                # use its class type; if it is already a class, use it as-is.
                if isinstance(mapper, basestring):
                    mapper = doImport(mapper)
                elif not inspect.isclass(mapper):
                    mapper = mapper.__class__
            if defaultMapper is None:
                defaultMapper = mapper
            elif mapper == defaultMapper:
                continue
            elif mapper is not None:
                return None
        return defaultMapper
    def _assignDefaultMapper(self, defaultMapper):
        for repoData in self._repos.all().values():
            if repoData.cfg.mapper is None and (repoData.isNewRepository or repoData.isV1Repository):
                if defaultMapper is None:
                    raise RuntimeError(
                        "No mapper specified for %s and no default mapper could be determined." %
                        repoData)
                repoData.cfg.mapper = defaultMapper
1098 """posix-only; gets the mapper class at the path specified by root (if a file _mapper can be found at 1099 that location or in a parent location. 1101 As we abstract the storage and support different types of storage locations this method will be 1102 moved entirely into Butler Access, or made more dynamic, and the API will very likely change.""" 1103 return Storage.getMapperClass(root)
1106 """Register an alias that will be substituted in datasetTypes. 1111 The alias keyword. It may start with @ or not. It may not contain @ except as the first character. 1112 datasetType - string 1113 The string that will be substituted when @alias is passed into datasetType. It may not contain '@' 1117 atLoc = alias.rfind(
'@')
1119 alias =
"@" + str(alias)
1121 raise RuntimeError(
"Badly formatted alias string: %s" % (alias,))
1124 if datasetType.count(
'@') != 0:
1125 raise RuntimeError(
"Badly formatted type string: %s" % (datasetType))
1130 if key.startswith(alias)
or alias.startswith(key):
1131 raise RuntimeError(
"Alias: %s overlaps with existing alias: %s" % (alias, key))
    def getKeys(self, datasetType=None, level=None, tag=None):
        """Get the valid data id keys at or above the given level of hierarchy for the dataset type or the
        entire collection if None. The dict values are the basic Python types corresponding to the keys (int,
        float, string).

        Parameters
        ----------
        datasetType - string
            The type of dataset to get keys for, entire collection if None.
        level - string
            The hierarchy level to descend to. None if it should not be restricted. Use an empty string if
            the mapper should lookup the default level.
        tag - any, or list of any
            Any object that can be tested to be the same as the tag in a dataId passed into butler input
            functions. Applies only to input repositories: If tag is specified by the dataId then the repo
            will only be read from if the tag in the dataId matches a tag used for that repository.

        Returns
        -------
        Returns a dict. The dict keys are the valid data id keys at or above the given level of hierarchy for
        the dataset type or the entire collection if None. The dict values are the basic Python types
        corresponding to the keys (int, float, string).
        """
        keys = None
        tag = setify(tag)
        for repoData in self._repos.inputs():
            if not tag or len(tag.intersection(repoData.tags)) > 0:
                keys = repoData.repo.getKeys(datasetType, level)
                # An empty dict is a valid "found" condition; None is "not found".
                if keys is not None:
                    break
        return keys
1172 """Returns the valid values for one or more keys when given a partial 1173 input collection data id. 1177 datasetType - string 1178 The type of dataset to inquire about. 1180 Key or tuple of keys to be returned. 1181 dataId - DataId, dict 1182 The partial data id. 1184 Keyword arguments for the partial data id. 1188 A list of valid values or tuples of valid values as specified by the 1194 dataId.update(**rest)
1198 for repoData
in self.
_repos.inputs():
1199 if not dataId.tag
or len(dataId.tag.intersection(repoData.tags)) > 0:
1200 tuples = repoData.repo.queryMetadata(datasetType, format, dataId)
1207 if len(format) == 1:
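
    # Example (a hedged sketch; key names and dataset types depend on the
    # mapper):
    #
    #   # Valid visit values for 'raw' data in r band; a single format key
    #   # returns a list of scalars:
    #   visits = butler.queryMetadata('raw', ['visit'], dataId={'filter': 'r'})
    #
    #   # Multiple format keys return a list of tuples:
    #   pairs = butler.queryMetadata('raw', ['visit', 'ccd'])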
1219 """Determines if a dataset file exists. 1223 datasetType - string 1224 The type of dataset to inquire about. 1225 dataId - DataId, dict 1226 The data id of the dataset. 1228 If True, look only in locations where the dataset could be written, 1229 and return True only if it is present in all of them. 1230 **rest keyword arguments for the data id. 1235 True if the dataset exists or is non-file-based. 1239 dataId.update(**rest)
1240 locations = self.
_locate(datasetType, dataId, write=write)
1242 if locations
is None:
1244 locations = [locations]
1249 for location
in locations:
1252 if isinstance(location, ButlerComposite):
1253 for name, componentInfo
in location.componentInfo.items():
1254 if componentInfo.subset:
1255 subset = self.
subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
1256 exists = all([obj.datasetExists()
for obj
in subset])
1258 exists = self.
datasetExists(componentInfo.datasetType, location.dataId)
1262 if not location.repository.exists(location):
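
    # Example (a hedged sketch; dataId keys depend on the mapper):
    #
    #   dataId = {'visit': 12345, 'ccd': 1}
    #   if butler.datasetExists('calexp', dataId):
    #       exposure = butler.get('calexp', dataId)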
    def _locate(self, datasetType, dataId, write):
        """Get one or more ButlerLocations and/or ButlerComposites.

        Parameters
        ----------
        datasetType : string
            The datasetType that is being searched for. The datasetType may be followed by a dot and
            a component name (component names are specified in the policy). IE datasetType.componentName

        dataId : dict or DataId class instance
            The data id.

        write : bool
            True if this is a search to write an object. False if it is a search to read an object. This
            affects what type (an object or a container) is returned.

        Returns
        -------
        If write is False, will return either a single object or None. If write is True, will return a list
        (which may be empty)
        """
        repos = self._repos.outputs() if write else self._repos.inputs()
        locations = []
        for repoData in repos:
            # enforce dataId & repository tags when reading:
            if not write and dataId.tag and len(dataId.tag.intersection(repoData.tags)) == 0:
                continue
            components = datasetType.split('.')
            datasetType = components[0]
            components = components[1:]
            location = repoData.repo.map(datasetType, dataId, write=write)
            if location is None:
                continue
            location.datasetType = datasetType
            if len(components) > 0:
                if not isinstance(location, ButlerComposite):
                    raise RuntimeError("The location for a dotted datasetType must be a composite.")
                # replace the first component name with the datasetType for the component:
                components[0] = location.componentInfo[components[0]].datasetType
                # join the components back into a dot-delimited string:
                datasetType = '.'.join(components)
                location = self._locate(datasetType, dataId, write)
                # if a component location is not found, we can not continue with this repo;
                # move on to the next repo.
                if location is None:
                    continue
            # If there is a bypass function for this dataset type, evaluate it and store its result in
            # the bypass attribute of the location. If the bypass function raises (most commonly because
            # a file does not exist), continue the dataset search.
            if hasattr(location.mapper, "bypass_" + location.datasetType):
                bypass = self._getBypassFunc(location, dataId)
                try:
                    bypass = bypass()
                    location.bypass = bypass
                except (NoResults, IOError):
                    self.log.debug("Continuing dataset search while evaluating "
                                   "bypass function for Dataset type:{} Data ID:{} at "
                                   "location {}".format(datasetType, dataId, location))
            # If a location was found, use it if the object exists (or if it is a composite or carries
            # a bypass result); otherwise keep looking in other repositories.
            if (isinstance(location, ButlerComposite) or hasattr(location, 'bypass')
                    or location.repository.exists(location)):
                if not write:
                    # in read mode, only one location is desired.
                    return location
                if isinstance(location, list):
                    locations.extend(location)
                else:
                    locations.append(location)
        return locations if write else None
    @staticmethod
    def _getBypassFunc(location, dataId):
        pythonType = location.getPythonType()
        if pythonType is not None:
            if isinstance(pythonType, basestring):
                pythonType = doImport(pythonType)
        bypassFunc = getattr(location.mapper, "bypass_" + location.datasetType)
        return lambda: bypassFunc(location.datasetType, pythonType, location, dataId)
    def get(self, datasetType, dataId=None, immediate=True, **rest):
        """Retrieves a dataset given an input collection data id.

        Parameters
        ----------
        datasetType - string
            The type of dataset to retrieve.
        dataId - dict
            The data id.
        immediate - bool
            If False use a proxy for delayed loading.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        An object retrieved from the dataset (or a proxy for one).
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        location = self._locate(datasetType, dataId, write=False)
        if location is None:
            raise NoResults("No locations for get:", datasetType, dataId)
        self.log.debug("Get type=%s keys=%s from %s", datasetType, dataId, str(location))

        if hasattr(location, 'bypass'):
            # this type loader block should get moved into a helper someplace, and duplications removed.
            def callback():
                return location.bypass
        else:
            def callback():
                return self._read(location)
        if location.mapper.canStandardize(location.datasetType):
            innerCallback = callback

            def callback():
                return location.mapper.standardize(location.datasetType, innerCallback(), dataId)
        if immediate:
            return callback()
        return ReadProxy(callback)
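
    # Example (a hedged sketch; the dataset type and dataId keys depend on the
    # mapper). Immediate vs. proxied retrieval:
    #
    #   exposure = butler.get('calexp', dataId={'visit': 12345, 'ccd': 1})
    #
    #   # With immediate=False a ReadProxy is returned and the actual read is
    #   # deferred until the object is first accessed.
    #   lazy = butler.get('calexp', dataId={'visit': 12345, 'ccd': 1}, immediate=False)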
    def put(self, obj, datasetType, dataId={}, doBackup=False, **rest):
        """Persists a dataset given an output collection data id.

        Parameters
        ----------
        obj -
            The object to persist.
        datasetType - string
            The type of dataset to persist.
        dataId - dict
            The data id.
        doBackup - bool
            If True, rename existing instead of overwriting.
            WARNING: Setting doBackup=True is not safe for parallel processing, as it may be subject to race
            conditions.
        **rest
            Keyword arguments for the data id.
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        for location in self._locate(datasetType, dataId, write=True):
            if isinstance(location, ButlerComposite):
                disassembler = location.disassembler if location.disassembler else genericDisassembler
                disassembler(obj=obj, dataId=location.dataId, componentInfo=location.componentInfo)
                for name, info in location.componentInfo.items():
                    if not info.inputOnly:
                        self.put(info.obj, info.datasetType, location.dataId, doBackup=doBackup)
            else:
                if doBackup:
                    location.getRepository().backup(location.datasetType, dataId)
                location.getRepository().write(location, obj)
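
    # Example (a hedged sketch; the dataset type and dataId keys depend on the
    # mapper):
    #
    #   butler.put(catalog, 'src', dataId={'visit': 12345, 'ccd': 1})
    #
    #   # doBackup=True renames an existing dataset instead of overwriting it,
    #   # but is not safe under parallel writes (see the warning above).
    #   butler.put(catalog, 'src', dataId={'visit': 12345, 'ccd': 1}, doBackup=True)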
    def subset(self, datasetType, level=None, dataId={}, **rest):
        """Return complete dataIds for a dataset type that match a partial (or empty) dataId.

        Given a partial (or empty) dataId specified in dataId and **rest, find all datasets that match the
        dataId. Optionally restrict the results to a given level specified by a dataId key (e.g. visit or
        sensor or amp for a camera). Return an iterable collection of complete dataIds as ButlerDataRefs.
        Datasets with the resulting dataIds may not exist; that needs to be tested with datasetExists().

        Parameters
        ----------
        datasetType - string
            The type of dataset collection to subset
        level - string
            The level of dataId at which to subset. Use an empty string if the mapper should look up the
            default level.
        dataId - dict
            The data id.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        subset - ButlerSubset
            Collection of ButlerDataRefs for datasets matching the data id.

        Examples
        --------
        To print the full dataIds for all r-band measurements in a source catalog
        (note that the subset call is equivalent to: `butler.subset('src', dataId={'filter':'r'})`):

        >>> subset = butler.subset('src', filter='r')
        >>> for data_ref in subset: print(data_ref.dataId)
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)

        # Currently expected behavior of subset is that if specified level is None then the mapper's default
        # level should be used. Convention for level within Butler is that an empty string is used to
        # indicate the default level.
        if level is None:
            level = ''

        dataId = DataId(dataId)
        dataId.update(**rest)
        return ButlerSubset(self, datasetType, level, dataId)

    def dataRef(self, datasetType, level=None, dataId={}, **rest):
        """Returns a single ButlerDataRef.

        Given a complete dataId specified in dataId and **rest, find the unique dataset at the given level
        specified by a dataId key (e.g. visit or sensor or amp for a camera) and return a ButlerDataRef.

        Parameters
        ----------
        datasetType - string
            The type of dataset collection to reference
        level - string
            The level of dataId at which to reference
        dataId - dict
            The data id.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        dataRef - ButlerDataRef
            ButlerDataRef for dataset matching the data id
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        subset = self.subset(datasetType, level, dataId, **rest)
        if len(subset) != 1:
            raise RuntimeError("No unique dataset for: Dataset type:%s Level:%s Data ID:%s Keywords:%s" %
                               (str(datasetType), str(level), str(dataId), str(rest)))
        return ButlerDataRef(subset, subset.cache[0])

    def getUri(self, datasetType, dataId=None, write=False, **rest):
        """Return the URI for a dataset

        .. warning:: This is intended only for debugging. The URI should
            never be used for anything other than printing.

        .. note:: In the event there are multiple URIs for read, we return only
            the first.

        .. note:: getUri() does not currently support composite datasets.

        Parameters
        ----------
        datasetType : `str`
            The dataset type of interest.
        dataId : `dict`, optional
            The data identifier.
        write : `bool`, optional
            Return the URI for writing?
        rest : `dict`, optional
            Keyword arguments for the data id.

        Returns
        -------
        uri : `str`
            URI for the dataset.
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)
        locations = self._locate(datasetType, dataId, write=write)
        if locations is None:
            raise NoResults("No locations for getUri: ", datasetType, dataId)

        if write:
            # Follow the write path.
            # Return the first valid write location.
            for location in locations:
                if isinstance(location, ButlerComposite):
                    for name, info in location.componentInfo.items():
                        if not info.inputOnly:
                            return self.getUri(info.datasetType, location.dataId, write=True)
                else:
                    return location.getLocationsWithRoot()[0]
            # fall back to raise
            raise NoResults("No locations for getUri(write=True): ", datasetType, dataId)
        else:
            # Follow the read path; only return the first valid read location.
            return locations.getLocationsWithRoot()[0]

    def _read(self, location):
        """Unpersist an object using data inside a ButlerLocation or ButlerComposite object.

        Parameters
        ----------
        location : ButlerLocation or ButlerComposite
            A ButlerLocation or ButlerComposite instance populated with data needed to read the object.

        Returns
        -------
        An instance of the object specified by the location.
        """
        self.log.debug("Starting read from %s", location)

        if isinstance(location, ButlerComposite):
            for name, componentInfo in location.componentInfo.items():
                if componentInfo.subset:
                    subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
                    componentInfo.obj = [obj.get() for obj in subset]
                else:
                    obj = self.get(componentInfo.datasetType, location.dataId, immediate=True)
                    componentInfo.obj = obj
            assembler = location.assembler or genericAssembler
            results = assembler(dataId=location.dataId, componentInfo=location.componentInfo,
                                cls=location.python)
        else:
            results = location.repository.read(location)
            if len(results) == 1:
                results = results[0]
        self.log.debug("Ending read from %s", location)
        return results

    def __reduce__(self):
        ret = (_unreduce, (self._initArgs, self.datasetTypeAliasDict))
        return ret

    def _resolveDatasetTypeAlias(self, datasetType):
        """Replaces all the known alias keywords in the given string with the alias value.

        Parameters
        ----------
        datasetType - string
            A datasetType string to search & replace on

        Returns
        -------
        datasetType - string
            The de-aliased string
        """
        for key in self.datasetTypeAliasDict:
            # if all aliases have been replaced, bail out
            if datasetType.find('@') == -1:
                break
            datasetType = datasetType.replace(key, self.datasetTypeAliasDict[key])

        # If an alias specifier can not be resolved then throw.
        if datasetType.find('@') != -1:
            raise RuntimeError("Unresolvable alias specifier in datasetType: %s" % (datasetType))

        return datasetType


def _unreduce(initArgs, datasetTypeAliasDict):
    mapperArgs = initArgs.pop('mapperArgs')
    initArgs.update(mapperArgs)
    butler = Butler(**initArgs)
    butler.datasetTypeAliasDict = datasetTypeAliasDict
    return butler
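

# Example (a hedged end-to-end sketch; repository paths, dataset types, and
# dataId keys are hypothetical):
#
#   butler = Butler(inputs='/datasets/input_repo', outputs='/scratch/output_repo')
#   for ref in butler.subset('raw', dataId={'filter': 'r'}):
#       if ref.datasetExists():
#           raw = ref.get()
#   print(butler.getUri('raw', dataId={'visit': 12345, 'ccd': 1}))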