lsst.daf.persistence  15.0-6-g4cfb9db+1
butler.py
1 #!/usr/bin/env python
2 
3 #
4 # LSST Data Management System
5 # Copyright 2008-2015 LSST Corporation.
6 #
7 # This product includes software developed by the
8 # LSST Project (http://www.lsst.org/).
9 #
10 # This program is free software: you can redistribute it and/or modify
11 # it under the terms of the GNU General Public License as published by
12 # the Free Software Foundation, either version 3 of the License, or
13 # (at your option) any later version.
14 #
15 # This program is distributed in the hope that it will be useful,
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 # GNU General Public License for more details.
19 #
20 # You should have received a copy of the LSST License Statement and
21 # the GNU General Public License along with this program. If not,
22 # see <http://www.lsstcorp.org/LegalNotices/>.
23 #
24 
25 # -*- python -*-
26 
27 """This module defines the Butler class."""
28 from builtins import str, super
29 from past.builtins import basestring
30 from builtins import object
31 
32 import copy
33 import inspect
34 
35 import yaml
36 
37 from lsst.log import Log
38 import lsst.pex.policy as pexPolicy
39 from . import ReadProxy, ButlerSubset, ButlerDataRef, Persistence, \
40  Storage, Policy, NoResults, Repository, DataId, RepositoryCfg, \
41  RepositoryArgs, listify, setify, sequencify, doImport, ButlerComposite, genericAssembler, \
42  genericDisassembler, PosixStorage, ParentsMismatch
43 
44 preinitedMapperWarning = ("Passing an instantiated mapper into " +
45  "Butler.__init__ will prevent Butler from passing " +
46  "parentRegistry or repositoryCfg information to " +
47  "the mapper, which is done only at init time. " +
48  "It is better to pass a importable string or " +
49  "class object.")
50 
51 
52 class ButlerCfg(Policy, yaml.YAMLObject):
53  """Represents a Butler configuration.
54 
55  .. warning::
56 
57  cfg is 'wet paint' and very likely to change. Use of it in production
58  code other than via the 'old butler' API is strongly discouraged.
59  """
60  yaml_tag = u"!ButlerCfg"
61 
62  def __init__(self, cls, repoCfg):
63  super().__init__({'repoCfg': repoCfg, 'cls': cls})
64 
65 
66 class RepoData(object):
67  """Container object for repository data used by Butler
68 
69  Parameters
70  ----------
71  args : RepositoryArgs
72  The arguments that are used to find or create the RepositoryCfg.
73  role : string
74  "input", "output", or "parent", indicating why Butler loaded this repository.
75  * input: the Repository was passed as a Butler input.
76  * output: the Repository was passed as a Butler output.
77  * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository.
78 
79  Attributes
80  ----------
81  cfg: RepositoryCfg
82  The configuration for the Repository.
83 
84  _cfgOrigin : string
85  "new", "existing", or "nested". Indicates the origin of the repository and its RepositoryCfg:
86  * new: it was created by this instance of Butler, and this instance of Butler will generate the
87  RepositoryCfg file.
 88  * existing: it was found (via the root or cfgRoot argument).
 89  * nested: the full RepositoryCfg was nested in another RepositoryCfg's parents list (this can happen
 90  if the parameters of an input specified by RepositoryArgs or dict do not entirely match an existing
 91  RepositoryCfg).
92 
93  cfgRoot : string
94  Path or URI to the location of the RepositoryCfg file.
95 
96  repo : lsst.daf.persistence.Repository
97  The Repository class instance.
98 
99  parentRepoDatas : list of RepoData
 100  The parents of this Repository, as indicated in this Repository's RepositoryCfg. If this is a new
 101  Repository then these are the inputs to this Butler (and will be saved in the RepositoryCfg). These
 102  RepoData objects are not owned by this RepoData; they are references to peer RepoData objects in the
103  Butler's RepoDataContainer.
104 
105  isV1Repository : bool
106  True if this is an Old Butler repository. In this case the repository does not have a RepositoryCfg
107  file. It may have a _mapper file and may have a _parent symlink. It will never be treated as a "new"
108  repository, i.e. even though there is not a RepositoryCfg file, one will not be generated.
 109  If False, this is a New Butler repository and is specified by a RepositoryCfg file.
110 
111  tags : set
112  These are values that may be used to restrict the search of input repositories. Details are available
113  in the RepositoryArgs and DataId classes.
114 
115  role : string
116  "input", "output", or "parent", indicating why Butler loaded this repository.
117  * input: the Repository was passed as a Butler input.
118  * output: the Repository was passed as a Butler output.
119  * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository.
120 
121  _repoArgs : RepositoryArgs
122  Contains the arguments that were used to specify this Repository.
123  """
124 
125  def __init__(self, args, role):
126  self.cfg = None
127  self._cfgOrigin = None
128  self.cfgRoot = None
129  self.repo = None
130  self.parentRepoDatas = []
131  self.isV1Repository = False
132  self.tags = set()
133  self.role = role
134  self.parentRegistry = None
135  self._repoArgs = args
136 
137  @property
138  def repoArgs(self):
139  return self._repoArgs
140 
141  @property
142  def repoData(self):
143  return self
144 
145  def __repr__(self):
146  return ("{}(id={},"
147  "repoArgs={}"
148  "cfg={!r},"
149  "cfgOrigin={},"
150  "cfgRoot={}," +
151  "repo={},"
152  "parentRepoDatas={}," +
153  "isV1Repository={},"
154  "role={}," +
155  "parentRegistry={})").format(
156  self.__class__.__name__,
157  id(self),
158  self.repoArgs,
159  self.cfg,
160  self.cfgOrigin,
161  self.cfgRoot,
162  self.repo,
163  [id(p) for p in self.parentRepoDatas],
164  self.isV1Repository,
165  self.role,
166  self.parentRegistry)
167 
168  def setCfg(self, cfg, origin, root, isV1Repository):
169  """Set information about the cfg into the RepoData
170 
171  Parameters
172  ----------
173  cfg : RepositoryCfg
174  The RepositoryCfg for the repo.
175  origin : string
176  'new', 'existing', or 'nested'
177  root : string
 178  URI or absolute path to the location of the RepositoryCfg.yaml file.
 isV1Repository : bool
 True if this is an Old Butler repository, in which case there is no RepositoryCfg file on disk.
179 
180  Returns
181  -------
182  None
183  """
184  if origin not in ('new', 'existing', 'nested'):
 185  raise RuntimeError("Invalid value for origin: {}".format(origin))
186  self.cfg = cfg
187  self._cfgOrigin = origin
188  self.cfgRoot = root
189  self.isV1Repository = isV1Repository
190 
191  @property
192  def cfgOrigin(self):
193  return self._cfgOrigin
194 
195  @property
196  def isNewRepository(self):
197  return self.cfgOrigin == 'new'
198 
199  @property
200  def role(self):
201  return self._role
202 
203  @role.setter
204  def role(self, val):
205  if val not in ('input', 'output', 'parent'):
206  raise RuntimeError("Invalid value for role: {}".format(val))
207  self._role = val
208 
209  def getParentRepoDatas(self, context=None):
210  """Get the parents & grandparents etc of this repo data, in depth-first search order.
211 
212  Duplicate entries will be removed in cases where the same parent appears more than once in the parent
213  graph.
214 
215  Parameters
216  ----------
217  context : set, optional
218  Users should typically omit context and accept the default argument. Context is used to keep a set
219  of known RepoDatas when calling this function recursively, for duplicate elimination.
220 
221  Returns
222  -------
223  list of RepoData
 224  A list of the parents, grandparents, etc. of a given repo data, in depth-first search order.
225  """
226  if context is None:
227  context = set()
228  parents = []
229  if id(self) in context:
230  return parents
231  context.add(id(self))
232  for parent in self.parentRepoDatas:
233  parents.append(parent)
234  parents += parent.getParentRepoDatas(context)
235  return parents
236 
237  def addParentRepoData(self, parentRepoData):
238  self.parentRepoDatas.append(parentRepoData)
239 
240  def addTags(self, tags):
241  self.tags = self.tags.union(tags)
242 
243 
244 class RepoDataContainer(object):
245  """Container object for RepoData instances owned by a Butler instance.
246 
247  Parameters
248  ----------
249  repoDataList : list of RepoData
 250  The RepoData instances, in search order, that this container will hold.
251  """
252 
253  def __init__(self, repoDataList):
254  self._inputs = None
255  self._outputs = None
256  self._all = repoDataList
257  self._buildLookupLists()
258 
259  def inputs(self):
260  """Get a list of RepoData that are used to as inputs to the Butler.
261  The list is created lazily as needed, and cached.
262 
263  Returns
264  -------
265  A list of RepoData with readable repositories, in the order to be used when searching.
266  """
267  if self._inputs is None:
268  raise RuntimeError("Inputs not yet initialized.")
269  return self._inputs
270 
271  def outputs(self):
272  """Get a list of RepoData that are used to as outputs to the Butler.
273  The list is created lazily as needed, and cached.
274 
275  Returns
276  -------
277  A list of RepoData with writable repositories, in the order to be use when searching.
278  """
279  if self._outputs is None:
280  raise RuntimeError("Outputs not yet initialized.")
281  return self._outputs
282 
283  def all(self):
284  """Get a list of all RepoData that are used to as by the Butler.
285  The list is created lazily as needed, and cached.
286 
287  Returns
288  -------
289  A list of RepoData with writable repositories, in the order to be use when searching.
290  """
291  return self._all
292 
293  def __repr__(self):
294  return "%s(_inputs=%r, \n_outputs=%s, \n_all=%s)" % (
295  self.__class__.__name__,
296  self._inputs,
297  self._outputs,
298  self._all)
299 
300  def _buildLookupLists(self):
301  """Build the inputs and outputs lists based on the order of self.all()."""
302 
303  def addToList(repoData, lst):
304  """Add a repoData and each of its parents (depth first) to a list"""
305  if id(repoData) in alreadyAdded:
306  return
307  lst.append(repoData)
308  alreadyAdded.add(id(repoData))
309  for parent in repoData.parentRepoDatas:
310  addToList(parent, lst)
311 
312  if self._inputs is not None or self._outputs is not None:
313  raise RuntimeError("Lookup lists are already built.")
314  inputs = [repoData for repoData in self.all() if repoData.role == 'input']
315  outputs = [repoData for repoData in self.all() if repoData.role == 'output']
316  self._inputs = []
317  alreadyAdded = set()
318  for repoData in outputs:
319  if 'r' in repoData.repoArgs.mode:
320  addToList(repoData.repoData, self._inputs)
321  for repoData in inputs:
322  addToList(repoData.repoData, self._inputs)
323  self._outputs = [repoData.repoData for repoData in outputs]
324 
325 
326 class Butler(object):
327  """Butler provides a generic mechanism for persisting and retrieving data using mappers.
328 
329  A Butler manages a collection of datasets known as a repository. Each dataset has a type representing its
330  intended usage and a location. Note that the dataset type is not the same as the C++ or Python type of the
331  object containing the data. For example, an ExposureF object might be used to hold the data for a raw
332  image, a post-ISR image, a calibrated science image, or a difference image. These would all be different
333  dataset types.
334 
335  A Butler can produce a collection of possible values for a key (or tuples of values for multiple keys) if
336  given a partial data identifier. It can check for the existence of a file containing a dataset given its
337  type and data identifier. The Butler can then retrieve the dataset. Similarly, it can persist an object to
338  an appropriate location when given its associated data identifier.
339 
340  Note that the Butler has two more advanced features when retrieving a data set. First, the retrieval is
341  lazy. Input does not occur until the data set is actually accessed. This allows datasets to be retrieved
342  and placed on a clipboard prospectively with little cost, even if the algorithm of a stage ends up not
343  using them. Second, the Butler will call a standardization hook upon retrieval of the dataset. This
344  function, contained in the input mapper object, must perform any necessary manipulations to force the
345  retrieved object to conform to standards, including translating metadata.
346 
347  Public methods:
348 
 349  __init__(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs)
 350 
 351  defineAlias(self, alias, datasetType)
 352 
 353  getKeys(self, datasetType=None, level=None, tag=None)
 354 
 355  queryMetadata(self, datasetType, format, dataId={}, **rest)
 356 
 357  datasetExists(self, datasetType, dataId={}, write=False, **rest)
 358 
 359  get(self, datasetType, dataId=None, immediate=True, **rest)
 360 
 361  put(self, obj, datasetType, dataId={}, doBackup=False, **rest)
362 
363  subset(self, datasetType, level=None, dataId={}, **rest)
364 
365  dataRef(self, datasetType, level=None, dataId={}, **rest)
366 
367  Initialization:
368 
369  The preferred method of initialization is to use the `inputs` and `outputs` __init__ parameters. These
370  are described in the parameters section, below.
371 
372  For backward compatibility: this initialization method signature can take a posix root path, and
373  optionally a mapper class instance or class type that will be instantiated using the mapperArgs input
374  argument. However, for this to work in a backward compatible way it creates a single repository that is
375  used as both an input and an output repository. This is NOT preferred, and will likely break any
376  provenance system we have in place.
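
 For example, a minimal sketch of the preferred initialization (the repository paths, dataset type,
 and dataId values here are hypothetical; valid values depend on the mapper in use):

     butler = Butler(inputs='/datasets/inputRepo', outputs='/scratch/outputRepo')
     exposure = butler.get('calexp', dataId={'visit': 1, 'ccd': 2})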
377 
378  Parameters
379  ----------
380  root : string
381  .. note:: Deprecated in 12_0
382  `root` will be removed in TBD, it is replaced by `inputs` and `outputs` for
383  multiple-repository support.
384  A file system path. Will only work with a PosixRepository.
385  mapper : string or instance
386  .. note:: Deprecated in 12_0
387  `mapper` will be removed in TBD, it is replaced by `inputs` and `outputs` for
388  multiple-repository support.
389  Provides a mapper to be used with Butler.
390  mapperArgs : dict
391  .. note:: Deprecated in 12_0
392  `mapperArgs` will be removed in TBD, it is replaced by `inputs` and `outputs` for
393  multiple-repository support.
394  Provides arguments to be passed to the mapper if the mapper input argument is a class type to be
395  instantiated by Butler.
396  inputs : RepositoryArgs, dict, or string
397  Can be a single item or a list. Provides arguments to load an existing repository (or repositories).
398  String is assumed to be a URI and is used as the cfgRoot (URI to the location of the cfg file). (Local
399  file system URI does not have to start with 'file://' and in this way can be a relative path). The
400  `RepositoryArgs` class can be used to provide more parameters with which to initialize a repository
401  (such as `mapper`, `mapperArgs`, `tags`, etc. See the `RepositoryArgs` documentation for more
402  details). A dict may be used as shorthand for a `RepositoryArgs` class instance. The dict keys must
403  match parameters to the `RepositoryArgs.__init__` function.
404  outputs : RepositoryArgs, dict, or string
405  Provides arguments to load one or more existing repositories or create new ones. The different types
406  are handled the same as for `inputs`.
407 
408  The Butler init sequence loads all of the input and output repositories.
409  This creates the object hierarchy to read from and write to them. Each
410  repository can have 0 or more parents, which also get loaded as inputs.
411  This becomes a DAG of repositories. Ultimately, Butler creates a list of
412  these Repositories in the order that they are used.
413 
414  Initialization Sequence
415  =======================
416 
417  During initialization Butler creates a Repository class instance & support structure for each object
418  passed to `inputs` and `outputs` as well as the parent repositories recorded in the `RepositoryCfg` of
419  each existing readable repository.
420 
421  This process is complex. It is explained below to shed some light on the intent of each step.
422 
423  1. Input Argument Standardization
424  ---------------------------------
425 
426  In `Butler._processInputArguments` the input arguments are verified to be legal (and a RuntimeError is
427  raised if not), and they are converted into an expected format that is used for the rest of the Butler
428  init sequence. See the docstring for `_processInputArguments`.
429 
430  2. Create RepoData Objects
431  --------------------------
432 
433  Butler uses an object, called `RepoData`, to keep track of information about each repository; each
434  repository is contained in a single `RepoData`. The attributes are explained in its docstring.
435 
436  After `_processInputArguments`, a RepoData is instantiated and put in a list for each repository in
437  `outputs` and `inputs`. This list of RepoData, the `repoDataList`, now represents all the output and input
438  repositories (but not parent repositories) that this Butler instance will use.
439 
440  3. Get `RepositoryCfg`s
441  -----------------------
442 
 443  `Butler._getCfgs` gets the `RepositoryCfg` for each repository in the `repoDataList`. The behavior is
444  described in the docstring.
445 
446  4. Add Parents
447  --------------
448 
449  `Butler._addParents` then considers the parents list in the `RepositoryCfg` of each `RepoData` in the
450  `repoDataList` and inserts new `RepoData` objects for each parent not represented in the proper location
451  in the `repoDataList`. Ultimately a flat list is built to represent the DAG of readable repositories
452  represented in depth-first order.
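
 For example (a hypothetical layout): if an output repository lists parents [A, B] and A itself lists
 parent C, the flattened depth-first list of readable repositories is [output, A, C, B].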
453 
454  5. Set and Verify Parents of Outputs
455  ------------------------------------
456 
457  To be able to load parent repositories when output repositories are used as inputs, the input repositories
458  are recorded as parents in the `RepositoryCfg` file of new output repositories. When an output repository
 459  already exists, for consistency the Butler's inputs must match the list of parents specified in the already-
460  existing output repository's `RepositoryCfg` file.
461 
462  In `Butler._setAndVerifyParentsLists`, the list of parents is recorded in the `RepositoryCfg` of new
463  repositories. For existing repositories the list of parents is compared with the `RepositoryCfg`'s parents
464  list, and if they do not match a `RuntimeError` is raised.
465 
466  6. Set the Default Mapper
467  -------------------------
468 
469  If all the input repositories use the same mapper then we can assume that mapper to be the
470  "default mapper". If there are new output repositories whose `RepositoryArgs` do not specify a mapper and
471  there is a default mapper then the new output repository will be set to use that default mapper.
472 
473  This is handled in `Butler._setDefaultMapper`.
474 
475  7. Cache References to Parent RepoDatas
476  ---------------------------------------
477 
478  In `Butler._connectParentRepoDatas`, in each `RepoData` in `repoDataList`, a list of `RepoData` object
479  references is built that matches the parents specified in that `RepoData`'s `RepositoryCfg`.
480 
 481  This list is used later to find things in that repository's parents without considering peer
 482  repositories' parents (e.g. finding the registry of a parent).
483 
484  8. Set Tags
485  -----------
486 
487  Tags are described at https://ldm-463.lsst.io/v/draft/#tagging
488 
489  In `Butler._setRepoDataTags`, for each `RepoData`, the tags specified by its `RepositoryArgs` are recorded
490  in a set, and added to the tags set in each of its parents, for ease of lookup when mapping.
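
 For example (hypothetical values): a read with dataId={'visit': 1, 'tag': 'calib'} will only consult
 input repositories tagged 'calib', such as one created with RepositoryArgs(cfgRoot='/repoA', tags='calib').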
491 
492  9. Find Parent Registry and Instantiate RepoData
493  ------------------------------------------------
494 
495  At this point there is enough information to instantiate the `Repository` instances. There is one final
496  step before instantiating the Repository, which is to try to get a parent registry that can be used by the
 497  child repository. The criteria for "can be used" are spelled out in `Butler._setParentRegistry`. However,
498  to get the registry from the parent, the parent must be instantiated. The `repoDataList`, in depth-first
499  search order, is built so that the most-dependent repositories are first, and the least dependent
500  repositories are last. So the `repoDataList` is reversed and the Repositories are instantiated in that
501  order; for each RepoData a parent registry is searched for, and then the Repository is instantiated with
502  whatever registry could be found."""
503 
504  def __init__(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
505  self._initArgs = {'root': root, 'mapper': mapper, 'inputs': inputs, 'outputs': outputs,
506  'mapperArgs': mapperArgs}
507 
 508  self.log = Log.getLogger("daf.persistence.butler")
 self.datasetTypeAliasDict = {}  # aliases registered via defineAlias; used by _resolveDatasetTypeAlias
509  # Always use an empty Persistence policy until we can get rid of it
510  persistencePolicy = pexPolicy.Policy()
511  self.persistence = Persistence.getPersistence(persistencePolicy)
512 
513  inputs, outputs = self._processInputArguments(
514  root=root, mapper=mapper, inputs=inputs, outputs=outputs, **mapperArgs)
515 
516  # convert the RepoArgs into RepoData
517  inputs = [RepoData(args, 'input') for args in inputs]
518  outputs = [RepoData(args, 'output') for args in outputs]
519  repoDataList = outputs + inputs
520 
521  self._getCfgs(repoDataList)
522 
523  self._addParents(repoDataList)
524 
525  self._setAndVerifyParentsLists(repoDataList)
526 
527  self._setDefaultMapper(repoDataList)
528 
529  self._connectParentRepoDatas(repoDataList)
530 
531  self._repos = RepoDataContainer(repoDataList)
532 
533  self._setRepoDataTags()
534 
535  for repoData in reversed(repoDataList):
536  self._setParentRegistry(repoData)
537  repoData.repo = Repository(repoData)
538 
539  def _setParentRegistry(self, repoData):
540  """Try to get a parent registry that can be used by this repository. To be usable the repository must
541  "match", meaning the mapper in the passed-in repo is the same type as the mapper in the parent.
542  """
543 
 544  def getParentRegistry(repoData, context):
 545  """Get the first found registry that matches the passed-in repo.
546 
547  Parameters
548  ----------
549  repoData : RepoData
550  The RepoData for the repository for which we are searching for a
 551  parent registry.
 context : set
 A set of ids of already-visited RepoDatas, used to avoid revisiting repositories during recursion.
552 
553  Returns
554  -------
555  Registry or None
556  A registry from a parent if one can be found, or None.
557 
558  Raises
559  ------
560  RuntimeError
561  Indicates a butler init order problem, all parents should be initialized before child
562  repositories, so this function should be able to get any parent of any child repo.
563  """
 564  if id(repoData) in context:
 565  return None
 566  else:
 567  context.add(id(repoData))
568  for parentRepoData in repoData.getParentRepoDatas():
569  if parentRepoData.cfg.mapper == repoData.cfg.mapper:
570  if parentRepoData.repo is None:
571  self.log.debug(
572  "_getParentRegistry: Parent {} of new repo {} not yet created, ignoring.".format(
573  parentRepoData, repoData))
574  else:
575  parentRegistry = parentRepoData.repo.getRegistry()
576  if parentRegistry:
577  return parentRegistry
578  else:
 579  parentRegistry = getParentRegistry(parentRepoData, context)
580  if parentRegistry:
581  return parentRegistry
582  return None
583 
 584  repoData.repoData.parentRegistry = getParentRegistry(repoData.repoData, set())
585 
586  def _processInputArguments(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
587  """Process, verify, and standardize the input arguments.
588  * Inputs can not be for Old Butler (root, mapper, mapperArgs) AND New Butler (inputs, outputs)
589  `root`, `mapper`, and `mapperArgs` are Old Butler init API.
590  `inputs` and `outputs` are New Butler init API.
591  Old Butler and New Butler init API may not be mixed, Butler may be initialized with only the Old
592  arguments or the New arguments.
 593  * Verify that if there is a readable output, there is exactly one output. (This restriction is in
594  place because all readable repositories must be parents of writable repositories, and for
595  consistency the DAG of readable repositories must always be the same. Keeping the list of parents
596  becomes very complicated in the presence of multiple readable output repositories. It is better to
597  only write to output repositories, and then create a new Butler instance and use the outputs as
598  inputs, and write to new output repositories.)
599  * Make a copy of inputs & outputs so they may be modified without changing the passed-in arguments.
600  * Convert any input/output values that are URI strings to RepositoryArgs.
601  * Listify inputs & outputs.
602  * Set default RW mode on inputs & outputs as needed.
603 
604  Parameters
605  ----------
606  Same as Butler.__init__
607 
608  Returns
609  -------
610  (list of RepositoryArgs, list of RepositoryArgs)
611  First item is a list to use as inputs.
612  Second item is a list to use as outputs.
613 
614  Raises
615  ------
616  RuntimeError
617  If Old Butler and New Butler arguments are both used this will raise.
 618  If an output is readable and there is more than one output this will raise.
619  """
620  # inputs and outputs may be modified, do not change the external value.
621  inputs = copy.deepcopy(inputs)
622  outputs = copy.deepcopy(outputs)
623 
624  isV1Args = inputs is None and outputs is None
625  if isV1Args:
626  inputs, outputs = self._convertV1Args(root=root,
627  mapper=mapper,
628  mapperArgs=mapperArgs or None)
629  elif root or mapper or mapperArgs:
630  raise RuntimeError(
631  'Butler version 1 API (root, mapper, **mapperArgs) may ' +
632  'not be used with version 2 API (inputs, outputs)')
634 
635  self.storage = Storage()
636 
 637  # make sure inputs and outputs are lists, and if a list item is a string convert it to RepositoryArgs.
638  inputs = listify(inputs)
639  outputs = listify(outputs)
640  inputs = [RepositoryArgs(cfgRoot=args)
641  if not isinstance(args, RepositoryArgs) else args for args in inputs]
642  outputs = [RepositoryArgs(cfgRoot=args)
643  if not isinstance(args, RepositoryArgs) else args for args in outputs]
644  # Set the default value of inputs & outputs, verify the required values ('r' for inputs, 'w' for
645  # outputs) and remove the 'w' from inputs if needed.
646  for args in inputs:
647  if args.mode is None:
648  args.mode = 'r'
649  elif 'rw' == args.mode:
650  args.mode = 'r'
651  elif 'r' != args.mode:
652  raise RuntimeError("The mode of an input should be readable.")
653  for args in outputs:
654  if args.mode is None:
655  args.mode = 'w'
656  elif 'w' not in args.mode:
657  raise RuntimeError("The mode of an output should be writable.")
658  # check for class instances in args.mapper (not allowed)
659  for args in inputs + outputs:
660  if (args.mapper and not isinstance(args.mapper, basestring) and
661  not inspect.isclass(args.mapper)):
662  self.log.warn(preinitedMapperWarning)
663  # if the output is readable, there must be only one output:
664  for o in outputs:
665  if 'r' in o.mode:
666  if len(outputs) > 1:
667  raise RuntimeError("Butler does not support multiple output repositories if any of the "
668  "outputs are readable.")
669 
670  # Handle the case where the output is readable and is also passed in as one of the inputs by removing
671  # the input. This supports a legacy use case in pipe_tasks where the input is also passed as the
672  # output, to the command line parser.
673  def inputIsInOutputs(inputArgs, outputArgsList):
674  for o in outputArgsList:
675  if ('r' in o.mode and
676  o.root == inputArgs.root and
677  o.mapper == inputArgs.mapper and
678  o.mapperArgs == inputArgs.mapperArgs and
679  o.tags == inputArgs.tags and
680  o.policy == inputArgs.policy):
681  self.log.debug(("Input repositoryArgs {} is also listed in outputs as readable; " +
682  "throwing away the input.").format(inputArgs))
683  return True
684  return False
685 
686  inputs = [args for args in inputs if not inputIsInOutputs(args, outputs)]
687  return inputs, outputs
688 
689  @staticmethod
690  def _getParentVal(repoData):
691  """Get the value of this repoData as it should appear in the parents
692  list of other repositories"""
693  if repoData.isV1Repository:
694  return repoData.cfg
695  if repoData.cfgOrigin == 'nested':
696  return repoData.cfg
697  else:
698  return repoData.cfg.root
699 
700  @staticmethod
701  def _getParents(ofRepoData, repoInfo):
702  """Create a parents list of repoData from inputs and (readable) outputs."""
703  parents = []
704  # get the parents list of repoData:
705  for repoData in repoInfo:
706  if repoData is ofRepoData:
707  continue
708  if 'r' not in repoData.repoArgs.mode:
709  continue
710  parents.append(Butler._getParentVal(repoData))
711  return parents
712 
713  @staticmethod
714  def _getOldButlerRepositoryCfg(repositoryArgs):
715  if not Storage.isPosix(repositoryArgs.cfgRoot):
716  return None
717  if not PosixStorage.v1RepoExists(repositoryArgs.cfgRoot):
718  return None
719  if not repositoryArgs.mapper:
720  repositoryArgs.mapper = PosixStorage.getMapperClass(repositoryArgs.cfgRoot)
721  cfg = RepositoryCfg.makeFromArgs(repositoryArgs)
722  parent = PosixStorage.getParentSymlinkPath(repositoryArgs.cfgRoot)
723  if parent:
724  parent = Butler._getOldButlerRepositoryCfg(RepositoryArgs(cfgRoot=parent, mode='r'))
725  if parent is not None:
726  cfg.addParents([parent])
727  return cfg
728 
729  def _getRepositoryCfg(self, repositoryArgs):
730  """Try to get a repository from the location described by cfgRoot.
731 
732  Parameters
733  ----------
734  repositoryArgs : RepositoryArgs or string
735  Provides arguments to load an existing repository (or repositories). String is assumed to be a URI
736  and is used as the cfgRoot (URI to the location of the cfg file).
737 
 738  Returns
 739  -------
740  (RepositoryCfg or None, bool)
741  The RepositoryCfg, or None if one cannot be found, and True if the RepositoryCfg was created by
742  reading an Old Butler repository, or False if it is a New Butler Repository.
743  """
744  if not isinstance(repositoryArgs, RepositoryArgs):
745  repositoryArgs = RepositoryArgs(cfgRoot=repositoryArgs, mode='r')
746 
747  cfg = self.storage.getRepositoryCfg(repositoryArgs.cfgRoot)
748  isOldButlerRepository = False
749  if cfg is None:
750  cfg = Butler._getOldButlerRepositoryCfg(repositoryArgs)
751  if cfg is not None:
752  isOldButlerRepository = True
753  return cfg, isOldButlerRepository
754 
755  def _getCfgs(self, repoDataList):
756  """Get or make a RepositoryCfg for each RepoData, and add the cfg to the RepoData.
757  If the cfg exists, compare values. If values match then use the cfg as an "existing" cfg. If the
758  values do not match, use the cfg as a "nested" cfg.
759  If the cfg does not exist, the RepositoryArgs must be for a writable repository.
760 
761  Parameters
762  ----------
763  repoDataList : list of RepoData
764  The RepoData that are output and inputs of this Butler
765 
766  Raises
767  ------
768  RuntimeError
 769  If the passed-in RepositoryArgs indicate an existing repository but other cfg parameters in those
 770  RepositoryArgs don't match the existing repository's cfg, a RuntimeError will be raised.
772  """
773  def cfgMatchesArgs(args, cfg):
774  """Test if there are any values in an RepositoryArgs that conflict with the values in a cfg"""
775  if args.mapper is not None and cfg.mapper != args.mapper:
776  return False
777  if args.mapperArgs is not None and cfg.mapperArgs != args.mapperArgs:
778  return False
779  if args.policy is not None and cfg.policy != args.policy:
780  return False
781  return True
782 
783  for repoData in repoDataList:
784  cfg, isOldButlerRepository = self._getRepositoryCfg(repoData.repoArgs)
785  if cfg is None:
786  if 'w' not in repoData.repoArgs.mode:
787  raise RuntimeError(
788  "No cfg found for read-only input repository at {}".format(repoData.repoArgs.cfgRoot))
789  repoData.setCfg(cfg=RepositoryCfg.makeFromArgs(repoData.repoArgs),
790  origin='new',
791  root=repoData.repoArgs.cfgRoot,
792  isV1Repository=isOldButlerRepository)
793  else:
794 
795  # This is a hack fix for an issue introduced by DM-11284; Old Butler parent repositories used
796  # to be stored as a path to the repository in the parents list and it was changed so that the
797  # whole RepositoryCfg, that described the Old Butler repository (including the mapperArgs that
798  # were used with it), was recorded as a "nested" repository cfg. That checkin did not account
799  # for the fact that there were repositoryCfg.yaml files in the world with only the path to
800  # Old Butler repositories in the parents list.
801  if cfg.parents:
802  for i, parent in enumerate(cfg.parents):
803  if isinstance(parent, RepositoryCfg):
804  continue
805  parentCfg, parentIsOldButlerRepository = self._getRepositoryCfg(parent)
806  if parentIsOldButlerRepository:
807  parentCfg.mapperArgs = cfg.mapperArgs
808  self.log.info(("Butler is replacing an Old Butler parent repository path '{}' "
809  "found in the parents list of a New Butler repositoryCfg: {} "
810  "with a repositoryCfg that includes the child repository's "
811  "mapperArgs: {}. This affects the instantiated RepositoryCfg "
812  "but does not change the persisted child repositoryCfg.yaml file."
813  ).format(parent, cfg, parentCfg))
814  cfg._parents[i] = cfg._normalizeParents(cfg.root, [parentCfg])[0]
815 
816  if 'w' in repoData.repoArgs.mode:
817  # if it's an output repository, the RepositoryArgs must match the existing cfg.
818  if not cfgMatchesArgs(repoData.repoArgs, cfg):
819  raise RuntimeError(("The RepositoryArgs and RepositoryCfg must match for writable " +
820  "repositories, RepositoryCfg:{}, RepositoryArgs:{}").format(
821  cfg, repoData.repoArgs))
822  repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
823  isV1Repository=isOldButlerRepository)
824  else:
825  # if it's an input repository, the cfg can overwrite the in-repo cfg.
826  if cfgMatchesArgs(repoData.repoArgs, cfg):
827  repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
828  isV1Repository=isOldButlerRepository)
829  else:
830  repoData.setCfg(cfg=cfg, origin='nested', root=None,
831  isV1Repository=isOldButlerRepository)
832 
833  def _addParents(self, repoDataList):
834  """For each repoData in the input list, see if its parents are the next items in the list, and if not
835  add the parent, so that the repoDataList includes parents and is in order to operate depth-first 0..n.
836 
837  Parameters
838  ----------
839  repoDataList : list of RepoData
840  The RepoData for the Butler outputs + inputs.
841 
842  Raises
843  ------
844  RuntimeError
845  Raised if a RepositoryCfg can not be found at a location where a parent repository should be.
846  """
847  repoDataIdx = 0
848  while True:
849  if repoDataIdx == len(repoDataList):
850  break
851  repoData = repoDataList[repoDataIdx]
852  if 'r' not in repoData.repoArgs.mode:
853  repoDataIdx += 1
854  continue # the repoData only needs parents if it's readable.
855  if repoData.isNewRepository:
856  repoDataIdx += 1
857  continue # if it's new the parents will be the inputs of this butler.
858  if repoData.cfg.parents is None:
859  repoDataIdx += 1
860  continue # if there are no parents then there's nothing to do.
861  for repoParentIdx, repoParent in enumerate(repoData.cfg.parents):
862  parentIdxInRepoDataList = repoDataIdx + repoParentIdx + 1
863  if not isinstance(repoParent, RepositoryCfg):
864  repoParentCfg, isOldButlerRepository = self._getRepositoryCfg(repoParent)
865  if repoParentCfg is not None:
866  cfgOrigin = 'existing'
867  else:
868  isOldButlerRepository = False
869  repoParentCfg = repoParent
870  cfgOrigin = 'nested'
871  if (parentIdxInRepoDataList < len(repoDataList) and
872  repoDataList[parentIdxInRepoDataList].cfg == repoParentCfg):
873  continue
874  args = RepositoryArgs(cfgRoot=repoParentCfg.root, mode='r')
875  role = 'input' if repoData.role == 'output' else 'parent'
876  newRepoInfo = RepoData(args, role)
877  newRepoInfo.repoData.setCfg(cfg=repoParentCfg, origin=cfgOrigin, root=args.cfgRoot,
878  isV1Repository=isOldButlerRepository)
879  repoDataList.insert(parentIdxInRepoDataList, newRepoInfo)
880  repoDataIdx += 1
881 
882  def _setAndVerifyParentsLists(self, repoDataList):
883  """Make a list of all the input repositories of this Butler, these are the parents of the outputs.
884  For new output repositories, set the parents in the RepositoryCfg. For existing output repositories
885  verify that the RepositoryCfg's parents match the parents list.
886 
887  Parameters
888  ----------
889  repoDataList : list of RepoData
890  All the RepoDatas loaded by this butler, in search order.
891 
892  Raises
893  ------
894  RuntimeError
895  If an existing output repository is loaded and its parents do not match the parents of this Butler
896  an error will be raised.
897  """
898  def getIOParents(ofRepoData, repoDataList):
899  """make a parents list for repo in `ofRepoData` that is comprised of inputs and readable
900  outputs (not parents-of-parents) of this butler"""
901  parents = []
902  for repoData in repoDataList:
903  if repoData.role == 'parent':
904  continue
905  if repoData is ofRepoData:
906  continue
907  if repoData.role == 'output':
908  if 'r' in repoData.repoArgs.mode:
909  raise RuntimeError("If an output is readable it must be the only output.")
910  # and if this is the only output, this should have continued in
911  # "if repoData is ofRepoData"
912  continue
913  parents.append(self._getParentVal(repoData))
914  return parents
915 
916  for repoData in repoDataList:
917  if repoData.role != 'output':
918  continue
919  parents = getIOParents(repoData, repoDataList)
920  # if repoData is new, add the parent RepositoryCfgs to it.
921  if repoData.cfgOrigin == 'new':
922  repoData.cfg.addParents(parents)
923  elif repoData.cfgOrigin in ('existing', 'nested'):
924  if repoData.cfg.parents != parents:
925  try:
926  repoData.cfg.extendParents(parents)
927  except ParentsMismatch as e:
928  raise RuntimeError(("Inputs of this Butler:{} do not match parents of existing " +
929  "writable cfg:{} (ParentMismatch exception: {}").format(
930  parents, repoData.cfg.parents, e))
931 
932  def _setDefaultMapper(self, repoDataList):
933  """Establish a default mapper if there is one and assign it to outputs that do not have a mapper
934  assigned.
935 
936  If all inputs have the same mapper it will be used as the default mapper.
937 
938  Parameters
939  ----------
940  repoDataList : list of RepoData
941  All the RepoDatas loaded by this butler, in search order.
942 
943  Raises
944  ------
945  RuntimeError
946  If a default mapper can not be established and there is an output that does not have a mapper.
947  """
948  needyOutputs = [rd for rd in repoDataList if rd.role == 'output' and rd.cfg.mapper is None]
 949  if len(needyOutputs) == 0:
950  return
951  mappers = set([rd.cfg.mapper for rd in repoDataList if rd.role == 'input'])
952  if len(mappers) != 1:
953  inputs = [rd for rd in repoDataList if rd.role == 'input']
954  raise RuntimeError(
955  ("No default mapper could be established from inputs:{} and no mapper specified " +
956  "for outputs:{}").format(inputs, needyOutputs))
957  defaultMapper = mappers.pop()
958  for repoData in needyOutputs:
959  repoData.cfg.mapper = defaultMapper
960 
961  def _connectParentRepoDatas(self, repoDataList):
962  """For each RepoData in repoDataList, find its parent in the repoDataList and cache a reference to it.
963 
964  Parameters
965  ----------
966  repoDataList : list of RepoData
967  All the RepoDatas loaded by this butler, in search order.
968 
969  Raises
970  ------
971  RuntimeError
972  When a parent is listed in the parents list but not found in the repoDataList. This is not
973  expected to ever happen and would indicate an internal Butler error.
974  """
975  for repoData in repoDataList:
976  for parent in repoData.cfg.parents:
977  parentToAdd = None
978  for otherRepoData in repoDataList:
979  if isinstance(parent, RepositoryCfg):
 980  if otherRepoData.repoData.cfg == parent:
981  parentToAdd = otherRepoData.repoData
982  break
983  elif otherRepoData.repoData.cfg.root == parent:
984  parentToAdd = otherRepoData.repoData
985  break
986  if parentToAdd is None:
987  raise RuntimeError(
988  "Could not find a parent matching {} to add to {}".format(parent, repoData))
989  repoData.addParentRepoData(parentToAdd)
990 
991  @staticmethod
992  def _getParentRepoData(parent, repoDataList):
993  """get a parent RepoData from a cfg from a list of RepoData
994 
995  Parameters
996  ----------
997  parent : string or RepositoryCfg
998  cfgRoot of a repo or a cfg that describes the repo
999  repoDataList : list of RepoData
1000  list to search in
1001 
1002  Returns
1003  -------
1004  RepoData or None
1005  A RepoData if one can be found, else None
1006  """
1007  repoData = None
1008  for otherRepoData in repoDataList:
1009  if isinstance(parent, RepositoryCfg):
1010  if otherRepoData.cfg == parent:
1011  repoData = otherRepoData
1012  break
1013  elif otherRepoData.cfg.root == parent:
1014  repoData = otherRepoData
1015  break
1016  return repoData
1017 
1018  def _setRepoDataTags(self):
1019  """Set the tags from each repoArgs into all its parent repoArgs so that they can be included in tagged
1020  searches."""
1021  def setTags(repoData, tags, context):
1022  if id(repoData) in context:
1023  return
1024  repoData.addTags(tags)
1025  context.add(id(repoData))
1026  for parentRepoData in repoData.parentRepoDatas:
1027  setTags(parentRepoData, tags, context)
1028  for repoData in self._repos.outputs() + self._repos.inputs():
1029  setTags(repoData.repoData, repoData.repoArgs.tags, set())
1030 
1031  def _convertV1Args(self, root, mapper, mapperArgs):
1032  """Convert Old Butler RepositoryArgs (root, mapper, mapperArgs) to New Butler RepositoryArgs
1033  (inputs, outputs)
1034 
1035  Parameters
1036  ----------
1037  root : string
1038  Posix path to repository root
1039  mapper : class, class instance, or string
1040  Instantiated class, a class object to be instantiated, or a string that refers to a class that
1041  can be imported & used as the mapper.
1042  mapperArgs : dict
1043  RepositoryArgs & their values used when instantiating the mapper.
1044 
1045  Returns
1046  -------
1047  tuple
1048  (inputs, outputs) - values to be used for inputs and outputs in Butler.__init__
1049  """
1050  if (mapper and not isinstance(mapper, basestring) and
1051  not inspect.isclass(mapper)):
1052  self.log.warn(preinitedMapperWarning)
1053  inputs = None
1054  if root is None:
1055  if hasattr(mapper, 'root'):
1056  # in legacy repositories, the mapper may be given the root directly.
1057  root = mapper.root
1058  else:
1059  # in the past root="None" could be used to mean root='.'
1060  root = '.'
1061  outputs = RepositoryArgs(mode='rw',
1062  root=root,
1063  mapper=mapper,
1064  mapperArgs=mapperArgs)
1065  return inputs, outputs
1066 
1067  def __repr__(self):
1068  return 'Butler(datasetTypeAliasDict=%s, repos=%s, persistence=%s)' % (
1069  self.datasetTypeAliasDict, self._repos, self.persistence)
1070 
1071  def _getDefaultMapper(self):
1072 
1073  """Get the default mapper. Currently this means if all the repositories use exactly the same mapper,
1074  that mapper may be considered the default.
1075 
1076  This definition may be changing; mappers may be able to exclude themselves as candidates for default,
1077  and they may nominate a different mapper instead. Also, we may not want to look at *all* the
1078  repositories, but only a depth-first search on each of the input & output repositories, and use the
1079  first-found mapper for each of those. TBD.
1080 
1086  Returns
1087  -------
1088  Mapper class or None
1089  Returns the class type of the default mapper, or None if a default
1090  mapper can not be determined.
1091  """
1092  defaultMapper = None
1093 
1094  for inputRepoData in self._repos.inputs():
1095  mapper = None
1096  if inputRepoData.cfg.mapper is not None:
1097  mapper = inputRepoData.cfg.mapper
1098  # if the mapper is:
1099  # * a string, import it.
1100  # * a class instance, get its class type
1101  # * a class, do nothing; use it
1102  if isinstance(mapper, basestring):
1103  mapper = doImport(mapper)
1104  elif not inspect.isclass(mapper):
1105  mapper = mapper.__class__
1106  # If no mapper has been found, note the first found mapper.
1107  # Then, if a mapper has been found and each next mapper matches it,
1108  # continue looking for mappers.
1109  # If a mapper has been found and another non-matching mapper is
1110  # found then we have no default, return None.
1111  if defaultMapper is None:
1112  defaultMapper = mapper
1113  elif mapper == defaultMapper:
1114  continue
1115  elif mapper is not None:
1116  return None
1117  return defaultMapper
1118 
1119  def _assignDefaultMapper(self, defaultMapper):
 1120  for repoData in self._repos.all():
1121  if repoData.cfg.mapper is None and (repoData.isNewRepository or repoData.isV1Repository):
1122  if defaultMapper is None:
1123  raise RuntimeError(
1124  "No mapper specified for %s and no default mapper could be determined." %
 1125  repoData.repoArgs)
1126  repoData.cfg.mapper = defaultMapper
1127 
1128  @staticmethod
1129  def getMapperClass(root):
1130  """posix-only; gets the mapper class at the path specified by root (if a file _mapper can be found at
1131  that location or in a parent location.
1132 
1133  As we abstract the storage and support different types of storage locations this method will be
1134  moved entirely into Butler Access, or made more dynamic, and the API will very likely change."""
1135  return Storage.getMapperClass(root)
1136 
1137  def defineAlias(self, alias, datasetType):
1138  """Register an alias that will be substituted in datasetTypes.
1139 
1140  Parameters
1141  ----------
1142  alias - string
1143  The alias keyword. It may start with @ or not. It may not contain @ except as the first character.
1144  datasetType - string
1145  The string that will be substituted when @alias is passed into datasetType. It may not contain '@'
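
 For example (the dataset type name and dataId here are hypothetical):

     butler.defineAlias('@srcCat', 'deepCoadd_src')
     catalog = butler.get('@srcCat', dataId={'tract': 0, 'patch': '1,1', 'filter': 'r'})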
1146  """
1147  # verify formatting of alias:
1148  # it can have '@' as the first character (if not it's okay, we will add it) or not at all.
1149  atLoc = alias.rfind('@')
1150  if atLoc == -1:
1151  alias = "@" + str(alias)
1152  elif atLoc > 0:
1153  raise RuntimeError("Badly formatted alias string: %s" % (alias,))
1154 
1155  # verify that datasetType does not contain '@'
1156  if datasetType.count('@') != 0:
1157  raise RuntimeError("Badly formatted type string: %s" % (datasetType))
1158 
1159  # verify that the alias keyword does not start with another alias keyword,
1160  # and vice versa
1161  for key in self.datasetTypeAliasDict:
1162  if key.startswith(alias) or alias.startswith(key):
1163  raise RuntimeError("Alias: %s overlaps with existing alias: %s" % (alias, key))
1164 
1165  self.datasetTypeAliasDict[alias] = datasetType
1166 
1167  def getKeys(self, datasetType=None, level=None, tag=None):
1168  """Get the valid data id keys at or above the given level of hierarchy for the dataset type or the
1169  entire collection if None. The dict values are the basic Python types corresponding to the keys (int,
1170  float, string).
1171 
1172  Parameters
1173  ----------
1174  datasetType - string
1175  The type of dataset to get keys for, entire collection if None.
1176  level - string
1177  The hierarchy level to descend to. None if it should not be restricted. Use an empty string if the
1178  mapper should lookup the default level.
 1179  tag - any, or list of any
 1180  Any object that can be tested to be the same as the tag in a dataId passed into butler input
 1181  functions. Applies only to input repositories: if a tag is specified by the dataId then the repo
 1182  will only be read from if the tag in the dataId matches a tag used for that repository.
1183 
1184  Returns
1185  -------
1186  Returns a dict. The dict keys are the valid data id keys at or above the given level of hierarchy for
1187  the dataset type or the entire collection if None. The dict values are the basic Python types
1188  corresponding to the keys (int, float, string).
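
 For example (key names and types depend on the mapper; these are illustrative):

     keys = butler.getKeys('raw')
     # e.g. {'visit': int, 'ccd': int, 'filter': str}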
1189  """
1190  datasetType = self._resolveDatasetTypeAlias(datasetType)
1191 
1192  keys = None
1193  tag = setify(tag)
1194  for repoData in self._repos.inputs():
1195  if not tag or len(tag.intersection(repoData.tags)) > 0:
1196  keys = repoData.repo.getKeys(datasetType, level)
1197  # An empty dict is a valid "found" condition for keys. The only value for keys that should
1198  # cause the search to continue is None
1199  if keys is not None:
1200  break
1201  return keys
1202 
1203  def queryMetadata(self, datasetType, format, dataId={}, **rest):
1204  """Returns the valid values for one or more keys when given a partial
1205  input collection data id.
1206 
1207  Parameters
1208  ----------
1209  datasetType - string
1210  The type of dataset to inquire about.
1211  format - str, tuple
1212  Key or tuple of keys to be returned.
1213  dataId - DataId, dict
1214  The partial data id.
1215  **rest -
1216  Keyword arguments for the partial data id.
1217 
1218  Returns
1219  -------
1220  A list of valid values or tuples of valid values as specified by the
1221  format.
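
 For example, to list the available (visit, ccd) pairs for a hypothetical 'raw' dataset type in one
 filter band (key names depend on the mapper):

     pairs = butler.queryMetadata('raw', ('visit', 'ccd'), dataId={'filter': 'r'})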
1222  """
1223 
1224  datasetType = self._resolveDatasetTypeAlias(datasetType)
1225  dataId = DataId(dataId)
1226  dataId.update(**rest)
1227  format = sequencify(format)
1228 
1229  tuples = None
1230  for repoData in self._repos.inputs():
1231  if not dataId.tag or len(dataId.tag.intersection(repoData.tags)) > 0:
1232  tuples = repoData.repo.queryMetadata(datasetType, format, dataId)
1233  if tuples:
1234  break
1235 
1236  if not tuples:
1237  return []
1238 
1239  if len(format) == 1:
1240  ret = []
1241  for x in tuples:
1242  try:
1243  ret.append(x[0])
1244  except TypeError:
1245  ret.append(x)
1246  return ret
1247 
1248  return tuples
1249 
1250  def datasetExists(self, datasetType, dataId={}, write=False, **rest):
1251  """Determines if a dataset file exists.
1252 
1253  Parameters
1254  ----------
1255  datasetType - string
1256  The type of dataset to inquire about.
1257  dataId - DataId, dict
1258  The data id of the dataset.
1259  write - bool
1260  If True, look only in locations where the dataset could be written,
1261  and return True only if it is present in all of them.
 1262  **rest
 Keyword arguments for the data id.
1263 
1264  Returns
1265  -------
1266  exists - bool
1267  True if the dataset exists or is non-file-based.
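
 For example (dataset type and dataId keys are illustrative):

     if butler.datasetExists('calexp', dataId={'visit': 1, 'ccd': 2}):
         exposure = butler.get('calexp', dataId={'visit': 1, 'ccd': 2})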
1268  """
1269  datasetType = self._resolveDatasetTypeAlias(datasetType)
1270  dataId = DataId(dataId)
1271  dataId.update(**rest)
1272  locations = self._locate(datasetType, dataId, write=write)
1273  if not write: # when write=False, locations is not a sequence
1274  if locations is None:
1275  return False
1276  locations = [locations]
1277 
1278  if not locations: # empty list
1279  return False
1280 
1281  for location in locations:
1282  # If the location is a ButlerComposite (as opposed to a ButlerLocation),
1283  # verify the component objects exist.
1284  if isinstance(location, ButlerComposite):
1285  for name, componentInfo in location.componentInfo.items():
1286  if componentInfo.subset:
1287  subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
1288  exists = all([obj.datasetExists() for obj in subset])
1289  else:
1290  exists = self.datasetExists(componentInfo.datasetType, location.dataId)
1291  if exists is False:
1292  return False
1293  else:
1294  if not location.repository.exists(location):
1295  return False
1296  return True
1297 
1298  def _locate(self, datasetType, dataId, write):
1299  """Get one or more ButlerLocations and/or ButlercComposites.
1300 
1301  Parameters
1302  ----------
1303  datasetType : string
1304  The datasetType that is being searched for. The datasetType may be followed by a dot and
 1305  a component name (component names are specified in the policy), e.g. datasetType.componentName
1306 
1307  dataId : dict or DataId class instance
1308  The dataId
1309 
1310  write : bool
1311  True if this is a search to write an object. False if it is a search to read an object. This
1312  affects what type (an object or a container) is returned.
1313 
1314  Returns
1315  -------
1316  If write is False, will return either a single object or None. If write is True, will return a list
1317  (which may be empty)
1318  """
1319  repos = self._repos.outputs() if write else self._repos.inputs()
1320  locations = []
1321  for repoData in repos:
1322  # enforce dataId & repository tags when reading:
1323  if not write and dataId.tag and len(dataId.tag.intersection(repoData.tags)) == 0:
1324  continue
1325  components = datasetType.split('.')
1326  datasetType = components[0]
1327  components = components[1:]
1328  try:
1329  location = repoData.repo.map(datasetType, dataId, write=write)
1330  except NoResults:
1331  continue
1332  if location is None:
1333  continue
1334  location.datasetType = datasetType # todo is there a better way than monkey patching here?
1335  if len(components) > 0:
1336  if not isinstance(location, ButlerComposite):
1337  raise RuntimeError("The location for a dotted datasetType must be a composite.")
1338  # replace the first component name with the datasetType
1339  components[0] = location.componentInfo[components[0]].datasetType
1340  # join components back into a dot-delimited string
1341  datasetType = '.'.join(components)
1342  location = self._locate(datasetType, dataId, write)
1343  # if a component location is not found, we can not continue with this repo, move to next repo.
1344  if location is None:
1345  break
1346  # if reading, only one location is desired.
1347  if location:
1348  if not write:
1349  # If there is a bypass function for this dataset type, we can't test to see if the object
1350  # exists in storage, because the bypass function may not actually use the location
1351  # according to the template. Instead, execute the bypass function and include its results
1352  # in the bypass attribute of the location. The bypass function may fail for any reason,
1353  # the most common case being that a file does not exist. If it raises an exception
1354  # indicating such, we ignore the bypass function and proceed as though it does not exist.
1355  if hasattr(location.mapper, "bypass_" + location.datasetType):
1356  bypass = self._getBypassFunc(location, dataId)
1357  try:
1358  bypass = bypass()
1359  location.bypass = bypass
1360  except (NoResults, IOError):
1361  self.log.debug("Continuing dataset search while evaluating "
1362  "bypass function for Dataset type:{} Data ID:{} at "
1363  "location {}".format(datasetType, dataId, location))
1364  # If a location was found but the location does not exist, keep looking in input
 1365  # repositories (the registry may have had enough data for a lookup even though the object
1366  # exists in a different repository.)
1367  if (isinstance(location, ButlerComposite) or hasattr(location, 'bypass') or
1368  location.repository.exists(location)):
1369  return location
1370  else:
1371  try:
1372  locations.extend(location)
1373  except TypeError:
1374  locations.append(location)
1375  if not write:
1376  return None
1377  return locations
1378 
1379  @staticmethod
1380  def _getBypassFunc(location, dataId):
1381  pythonType = location.getPythonType()
1382  if pythonType is not None:
1383  if isinstance(pythonType, basestring):
1384  pythonType = doImport(pythonType)
1385  bypassFunc = getattr(location.mapper, "bypass_" + location.datasetType)
1386  return lambda: bypassFunc(location.datasetType, pythonType, location, dataId)
1387 
1388  def get(self, datasetType, dataId=None, immediate=True, **rest):
1389  """Retrieves a dataset given an input collection data id.
1390 
1391  Parameters
1392  ----------
1393  datasetType - string
1394  The type of dataset to retrieve.
1395  dataId - dict
1396  The data id.
1397  immediate - bool
1398  If False use a proxy for delayed loading.
1399  **rest
1400  keyword arguments for the data id.
1401 
1402  Returns
1403  -------
1404  An object retrieved from the dataset (or a proxy for one).
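
 For example (dataset type and dataId values are illustrative):

     exposure = butler.get('calexp', dataId={'visit': 1, 'ccd': 2})
     # or defer the read until the object is first used:
     proxy = butler.get('calexp', dataId={'visit': 1, 'ccd': 2}, immediate=False)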
1405  """
1406  datasetType = self._resolveDatasetTypeAlias(datasetType)
1407  dataId = DataId(dataId)
1408  dataId.update(**rest)
1409 
1410  location = self._locate(datasetType, dataId, write=False)
1411  if location is None:
1412  raise NoResults("No locations for get:", datasetType, dataId)
1413  self.log.debug("Get type=%s keys=%s from %s", datasetType, dataId, str(location))
1414 
1415  if hasattr(location, 'bypass'):
1416  # TODO: this type-loader block should be moved into a helper somewhere, and duplication removed.
1417  def callback():
1418  return location.bypass
1419  else:
1420  def callback():
1421  return self._read(location)
1422  if location.mapper.canStandardize(location.datasetType):
1423  innerCallback = callback
1424 
1425  def callback():
1426  return location.mapper.standardize(location.datasetType, innerCallback(), dataId)
1427  if immediate:
1428  return callback()
1429  return ReadProxy(callback)
1430 
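# Illustrative use of get(); the dataset type and data-id keys ('calexp', visit, ccd) are
# mapper-dependent examples, not fixed names:
#
#     exposure = butler.get('calexp', visit=123, ccd=45)                    # immediate read
#     proxy = butler.get('calexp', visit=123, ccd=45, immediate=False)      # lazy ReadProxy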
1431  def put(self, obj, datasetType, dataId={}, doBackup=False, **rest):
1432  """Persists a dataset given an output collection data id.
1433 
1434  Parameters
1435  ----------
1436  obj : object
1437  The object to persist.
1438  datasetType : `str`
1439  The type of dataset to persist.
1440  dataId : `dict`, optional
1441  The data id.
1442  doBackup : `bool`, optional
1443  If True, rename any existing dataset instead of overwriting it.
1444  WARNING: Setting doBackup=True is not safe for parallel processing, as it may be subject
1445  to race conditions.
1446  **rest
1447  Keyword arguments for the data id.
1448  """
1449  datasetType = self._resolveDatasetTypeAlias(datasetType)
1450  dataId = DataId(dataId)
1451  dataId.update(**rest)
1452 
1453  for location in self._locate(datasetType, dataId, write=True):
1454  if isinstance(location, ButlerComposite):
1455  disassembler = location.disassembler if location.disassembler else genericDisassembler
1456  disassembler(obj=obj, dataId=location.dataId, componentInfo=location.componentInfo)
1457  for name, info in location.componentInfo.items():
1458  if not info.inputOnly:
1459  self.put(info.obj, info.datasetType, location.dataId, doBackup=doBackup)
1460  else:
1461  if doBackup:
1462  location.getRepository().backup(location.datasetType, dataId)
1463  location.getRepository().write(location, obj)
1464 
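# Illustrative use of put(); 'src' and the data-id keys are mapper-dependent examples:
#
#     butler.put(catalog, 'src', dataId={'visit': 123, 'ccd': 45})
#     butler.put(catalog, 'src', visit=123, ccd=45, doBackup=True)  # back up any existing dataset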
1465  def subset(self, datasetType, level=None, dataId={}, **rest):
1466  """Return complete dataIds for a dataset type that match a partial (or empty) dataId.
1467 
1468  Given a partial (or empty) dataId specified in dataId and **rest, find all datasets that match the
1469  dataId. Optionally restrict the results to a given level specified by a dataId key (e.g. visit or
1470  sensor or amp for a camera). Return an iterable collection of complete dataIds as ButlerDataRefs.
1471  Datasets with the resulting dataIds may not exist; that needs to be tested with datasetExists().
1472 
1473  Parameters
1474  ----------
1475  datasetType : `str`
1476  The type of dataset collection to subset.
1477  level : `str`, optional
1478  The level of dataId at which to subset. Use an empty string if the mapper should look up the
1479  default level.
1480  dataId : `dict`, optional
1481  The data id.
1482  **rest
1483  Keyword arguments for the data id.
1484 
1485  Returns
1486  -------
1487  subset : ButlerSubset
1488  Collection of ButlerDataRefs for datasets matching the data id.
1489 
1490  Examples
1491  --------
1492  To print the full dataIds for all r-band measurements in a source catalog
1493  (note that the subset call is equivalent to: `butler.subset('src', dataId={'filter':'r'})`):
1494 
1495  >>> subset = butler.subset('src', filter='r')
1496  >>> for data_ref in subset: print(data_ref.dataId)
1497  """
1498  datasetType = self._resolveDatasetTypeAlias(datasetType)
1499 
1500  # Currently expected behavior of subset is that if specified level is None then the mapper's default
1501  # level should be used. Convention for level within Butler is that an empty string is used to indicate
1502  # 'get default'.
1503  if level is None:
1504  level = ''
1505 
1506  dataId = DataId(dataId)
1507  dataId.update(**rest)
1508  return ButlerSubset(self, datasetType, level, dataId)
1509 
1510  def dataRef(self, datasetType, level=None, dataId={}, **rest):
1511  """Returns a single ButlerDataRef.
1512 
1513  Given a complete dataId specified in dataId and **rest, find the unique dataset at the given level
1514  specified by a dataId key (e.g. visit or sensor or amp for a camera) and return a ButlerDataRef.
1515 
1516  Parameters
1517  ----------
1518  datasetType : `str`
1519  The type of dataset collection to reference.
1520  level : `str`, optional
1521  The level of dataId at which to reference.
1522  dataId : `dict`, optional
1523  The data id.
1524  **rest
1525  Keyword arguments for the data id.
1526 
1527  Returns
1528  -------
1529  dataRef : ButlerDataRef
1530  ButlerDataRef for the dataset matching the data id.
1531  """
1532 
1533  datasetType = self._resolveDatasetTypeAlias(datasetType)
1534  dataId = DataId(dataId)
1535  subset = self.subset(datasetType, level, dataId, **rest)
1536  if len(subset) != 1:
1537  raise RuntimeError("No unique dataset for: Dataset type:%s Level:%s Data ID:%s Keywords:%s" %
1538  (str(datasetType), str(level), str(dataId), str(rest)))
1539  return ButlerDataRef(subset, subset.cache[0])
1540 
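# Illustrative use of dataRef(); the data id must identify exactly one dataset at the requested
# level, otherwise a RuntimeError is raised:
#
#     ref = butler.dataRef('raw', visit=123, ccd=45)
#     exposure = ref.get('raw')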
1541  def getUri(self, datasetType, dataId=None, write=False, **rest):
1542  """Return the URI for a dataset
1543 
1544  .. warning:: This is intended only for debugging. The URI should
1545  never be used for anything other than printing.
1546 
1547  .. note:: In the event there are multiple URIs for read, we return only
1548  the first.
1549 
1550  .. note:: getUri() does not currently support composite datasets.
1551 
1552  Parameters
1553  ----------
1554  datasetType : `str`
1555  The dataset type of interest.
1556  dataId : `dict`, optional
1557  The data identifier.
1558  write : `bool`, optional
1559  Return the URI for writing?
1560  **rest
1561  Keyword arguments for the data id.
1562 
1563  Returns
1564  -------
1565  uri : `str`
1566  URI for dataset.
1567  """
1568  datasetType = self._resolveDatasetTypeAlias(datasetType)
1569  dataId = DataId(dataId)
1570  dataId.update(**rest)
1571  locations = self._locate(datasetType, dataId, write=write)
1572  if locations is None:
1573  raise NoResults("No locations for getUri: ", datasetType, dataId)
1574 
1575  if write:
1576  # Follow the write path
1577  # Return the first valid write location.
1578  for location in locations:
1579  if isinstance(location, ButlerComposite):
1580  for name, info in location.componentInfo.items():
1581  if not info.inputOnly:
1582  return self.getUri(info.datasetType, location.dataId, write=True)
1583  else:
1584  return location.getLocationsWithRoot()[0]
1585  # No valid write location was found; fall through and raise.
1586  raise NoResults("No locations for getUri(write=True): ", datasetType, dataId)
1587  else:
1588  # Follow the read path; return only the first valid read location.
1589  return locations.getLocationsWithRoot()[0]
1590 
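# Illustrative use of getUri(); for debugging/printing only, per the warning above:
#
#     print(butler.getUri('calexp', visit=123, ccd=45))              # where a read would come from
#     print(butler.getUri('calexp', visit=123, ccd=45, write=True))  # where a put would go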
1591  def _read(self, location):
1592  """Unpersist an object using data inside a ButlerLocation or ButlerComposite object.
1593 
1594  Parameters
1595  ----------
1596  location : ButlerLocation or ButlerComposite
1597  A ButlerLocation or ButlerComposite instance populated with data needed to read the object.
1598 
1599  Returns
1600  -------
1601  object
1602  An instance of the object specified by the location.
1603  """
1604  self.log.debug("Starting read from %s", location)
1605 
1606  if isinstance(location, ButlerComposite):
1607  for name, componentInfo in location.componentInfo.items():
1608  if componentInfo.subset:
1609  subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
1610  componentInfo.obj = [obj.get() for obj in subset]
1611  else:
1612  obj = self.get(componentInfo.datasetType, location.dataId, immediate=True)
1613  componentInfo.obj = obj
1614  assembler = location.assembler or genericAssembler
1615  results = assembler(dataId=location.dataId, componentInfo=location.componentInfo,
1616  cls=location.python)
1617  return results
1618  else:
1619  results = location.repository.read(location)
1620  if len(results) == 1:
1621  results = results[0]
1622  self.log.debug("Ending read from %s", location)
1623  return results
1624 
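# The assembler interface assumed by _read for composite datasets: any callable taking
# (dataId, componentInfo, cls) keyword arguments and returning the assembled object, with the
# symmetric disassembler(obj, dataId, componentInfo) used by put(). A minimal hypothetical
# assembler (component names are defined by the mapper's policy, not fixed):
#
#     def exampleAssembler(dataId, componentInfo, cls):
#         obj = cls()
#         obj.setWcs(componentInfo['wcs'].obj)
#         return obj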
1625  def __reduce__(self):
1626  ret = (_unreduce, (self._initArgs, self.datasetTypeAliasDict))
1627  return ret
1628 
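# __reduce__ makes Butler instances picklable by re-invoking __init__ with the original
# arguments and restoring any dataset-type aliases, e.g.:
#
#     import pickle
#     butler2 = pickle.loads(pickle.dumps(butler))  # same repos and aliases as the original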
1629  def _resolveDatasetTypeAlias(self, datasetType):
1630  """Replaces all the known alias keywords in the given string with the alias value.
1631 
1632  Parameters
1633  ----------
1634  datasetType : `str`
1635  A datasetType string to search and replace on.
1636 
1637  Returns
1638  -------
1639  datasetType : `str`
1640  The de-aliased string.
1641  """
1642  for key in self.datasetTypeAliasDict:
1643  # if all aliases have been replaced, bail out
1644  if datasetType.find('@') == -1:
1645  break
1646  datasetType = datasetType.replace(key, self.datasetTypeAliasDict[key])
1647 
1648  # If an alias specifier cannot be resolved, raise.
1649  if datasetType.find('@') != -1:
1650  raise RuntimeError("Unresolvable alias specifier in datasetType: %s" % (datasetType))
1651 
1652  return datasetType
1653 
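# Alias resolution example: alias keywords registered via defineAlias() begin with '@' (as the
# checks above imply) and are replaced textually:
#
#     butler.defineAlias('@myAlias', 'calexp')
#     exposure = butler.get('@myAlias', visit=123, ccd=45)  # resolves to 'calexp'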
1654 
1655 def _unreduce(initArgs, datasetTypeAliasDict):
1656  mapperArgs = initArgs.pop('mapperArgs')
1657  initArgs.update(mapperArgs)
1658  butler = Butler(**initArgs)
1659  butler.datasetTypeAliasDict = datasetTypeAliasDict
1660  return butler