butler.py
1 #!/usr/bin/env python
2 
3 #
4 # LSST Data Management System
5 # Copyright 2008-2015 LSST Corporation.
6 #
7 # This product includes software developed by the
8 # LSST Project (http://www.lsst.org/).
9 #
10 # This program is free software: you can redistribute it and/or modify
11 # it under the terms of the GNU General Public License as published by
12 # the Free Software Foundation, either version 3 of the License, or
13 # (at your option) any later version.
14 #
15 # This program is distributed in the hope that it will be useful,
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 # GNU General Public License for more details.
19 #
20 # You should have received a copy of the LSST License Statement and
21 # the GNU General Public License along with this program. If not,
22 # see <http://www.lsstcorp.org/LegalNotices/>.
23 #
24 
25 # -*- python -*-
26 
27 """This module defines the Butler class."""
28 from builtins import str
29 from past.builtins import basestring
30 from builtins import object
31 
32 import copy
33 import inspect
34 
35 import yaml
36 
37 from lsst.log import Log
38 import lsst.pex.policy as pexPolicy
39 from . import ReadProxy, ButlerSubset, ButlerDataRef, Persistence, \
40  Storage, Policy, NoResults, Repository, DataId, RepositoryCfg, \
41  RepositoryArgs, listify, setify, sequencify, doImport, ButlerComposite, genericAssembler, \
42  genericDisassembler, PosixStorage, ParentsMismatch
43 
44 preinitedMapperWarning = ("Passing an instantiated mapper into " +
45  "Butler.__init__ will prevent Butler from passing " +
46  "parentRegistry or repositoryCfg information to " +
47  "the mapper, which is done only at init time. " +
48  "It is better to pass a importable string or " +
49  "class object.")
50 
51 
52 class ButlerCfg(Policy, yaml.YAMLObject):
53  """Represents a Butler configuration.
54 
55  .. warning::
56 
57  cfg is 'wet paint' and very likely to change. Use of it in production
58  code other than via the 'old butler' API is strongly discouraged.
59  """
60  yaml_tag = u"!ButlerCfg"
61 
62  def __init__(self, cls, repoCfg):
63  super(ButlerCfg, self).__init__({'repoCfg': repoCfg, 'cls': cls})
64 
65 
66 class RepoData(object):
67  """Container object for repository data used by Butler
68 
69  Parameters
70  ----------
71  args : RepositoryArgs
72  The arguments that are used to find or create the RepositoryCfg.
73  role : string
74  "input", "output", or "parent", indicating why Butler loaded this repository.
75  * input: the Repository was passed as a Butler input.
76  * output: the Repository was passed as a Butler output.
77  * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository.
78 
79  Attributes
80  ----------
    cfg : RepositoryCfg
82  The configuration for the Repository.
83 
84  _cfgOrigin : string
85  "new", "existing", or "nested". Indicates the origin of the repository and its RepositoryCfg:
86  * new: it was created by this instance of Butler, and this instance of Butler will generate the
87  RepositoryCfg file.
        * existing: it was found (via the root or cfgRoot argument).
        * nested: the full RepositoryCfg was nested in another RepositoryCfg's parents list (this can happen
          if the parameters of an input specified by RepositoryArgs or dict do not entirely match an existing
          RepositoryCfg).
92 
93  cfgRoot : string
94  Path or URI to the location of the RepositoryCfg file.
95 
96  repo : lsst.daf.persistence.Repository
97  The Repository class instance.
98 
99  parentRepoDatas : list of RepoData
        The parents of this Repository, as indicated in this Repository's RepositoryCfg. If this is a new
        Repository then these are the inputs to this Butler (and will be saved in the RepositoryCfg). These
        RepoData objects are not owned by this RepoData; they are references to peer RepoData objects in the
        Butler's RepoDataContainer.
104 
105  isV1Repository : bool
106  True if this is an Old Butler repository. In this case the repository does not have a RepositoryCfg
107  file. It may have a _mapper file and may have a _parent symlink. It will never be treated as a "new"
108  repository, i.e. even though there is not a RepositoryCfg file, one will not be generated.
109  If False, this is a New Butler repository and is specified by RepositoryCfg file.
110 
111  tags : set
112  These are values that may be used to restrict the search of input repositories. Details are available
113  in the RepositoryArgs and DataId classes.
114 
115  role : string
116  "input", "output", or "parent", indicating why Butler loaded this repository.
117  * input: the Repository was passed as a Butler input.
118  * output: the Repository was passed as a Butler output.
119  * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository.
120 
121  _repoArgs : RepositoryArgs
122  Contains the arguments that were used to specify this Repository.
123  """
124 
125  def __init__(self, args, role):
126  self.cfg = None
127  self._cfgOrigin = None
128  self.cfgRoot = None
129  self.repo = None
130  self.parentRepoDatas = []
131  self.isV1Repository = False
132  self.tags = set()
133  self.role = role
134  self.parentRegistry = None
135  self._repoArgs = args
136 
137  @property
138  def repoArgs(self):
139  return self._repoArgs
140 
141  @property
142  def repoData(self):
143  return self
144 
145  def __repr__(self):
146  return ("{}(id={},"
147  "repoArgs={}"
148  "cfg={!r},"
149  "cfgOrigin={},"
150  "cfgRoot={}," +
151  "repo={},"
152  "parentRepoDatas={}," +
153  "isV1Repository={},"
154  "role={}," +
155  "parentRegistry={})").format(
156  self.__class__.__name__,
157  id(self),
158  self.repoArgs,
159  self.cfg,
160  self.cfgOrigin,
161  self.cfgRoot,
162  self.repo,
163  [id(p) for p in self.parentRepoDatas],
164  self.isV1Repository,
165  self.role,
166  self.parentRegistry)
167 
168  def setCfg(self, cfg, origin, root, isV1Repository):
169  """Set information about the cfg into the RepoData
170 
171  Parameters
172  ----------
173  cfg : RepositoryCfg
174  The RepositoryCfg for the repo.
175  origin : string
176  'new', 'existing', or 'nested'
        root : string
            URI or absolute path to the location of the RepositoryCfg.yaml file.
        isV1Repository : bool
            True if the cfg describes an Old Butler repository, False if it is a New Butler repository.

180  Returns
181  -------
182  None
183  """
184  if origin not in ('new', 'existing', 'nested'):
185  raise RuntimeError("Invalid value for origin:{}".format(origin))
186  self.cfg = cfg
187  self._cfgOrigin = origin
188  self.cfgRoot = root
189  self.isV1Repository = isV1Repository
190 
191  @property
192  def cfgOrigin(self):
193  return self._cfgOrigin
194 
195  @property
196  def isNewRepository(self):
197  return self.cfgOrigin == 'new'
198 
199  @property
200  def role(self):
201  return self._role
202 
203  @role.setter
204  def role(self, val):
205  if val not in ('input', 'output', 'parent'):
206  raise RuntimeError("Invalid value for role: {}".format(val))
207  self._role = val
208 
209  def getParentRepoDatas(self, context=None):
210  """Get the parents & grandparents etc of this repo data, in depth-first search order.
211 
212  Duplicate entries will be removed in cases where the same parent appears more than once in the parent
213  graph.
214 
215  Parameters
216  ----------
217  context : set, optional
218  Users should typically omit context and accept the default argument. Context is used to keep a set
219  of known RepoDatas when calling this function recursively, for duplicate elimination.
220 
221  Returns
222  -------
223  list of RepoData
224  A list of the parents & grandparents etc of a given repo data, in depth-first search order.
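
        Examples
        --------
        A minimal sketch of the depth-first ordering; the None arguments are
        stand-ins for RepositoryArgs instances:

        >>> a = RepoData(None, 'input')
        >>> b = RepoData(None, 'parent')
        >>> c = RepoData(None, 'parent')
        >>> a.addParentRepoData(b)
        >>> b.addParentRepoData(c)
        >>> a.getParentRepoDatas() == [b, c]
        True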
225  """
226  if context is None:
227  context = set()
228  parents = []
229  if id(self) in context:
230  return parents
231  context.add(id(self))
        for parent in self.parentRepoDatas:
            # skip subtrees already visited, so a repeated parent is listed only once
            if id(parent) in context:
                continue
            parents.append(parent)
            parents += parent.getParentRepoDatas(context)
235  return parents
236 
237  def addParentRepoData(self, parentRepoData):
238  self.parentRepoDatas.append(parentRepoData)
239 
240  def addTags(self, tags):
241  self.tags = self.tags.union(tags)
242 
243 
244 class RepoDataContainer(object):
245  """Container object for RepoData instances owned by a Butler instance.
246 
247  Parameters
248  ----------
    repoDataList : list of RepoData
        The RepoData instances this container will hold, in search order.
251  """
252 
253  def __init__(self, repoDataList):
254  self._inputs = None
255  self._outputs = None
256  self._all = repoDataList
257  self._buildLookupLists()
258 
259  def inputs(self):
260  """Get a list of RepoData that are used to as inputs to the Butler.
261  The list is created lazily as needed, and cached.
262 
263  Returns
264  -------
265  A list of RepoData with readable repositories, in the order to be used when searching.
266  """
267  if self._inputs is None:
268  raise RuntimeError("Inputs not yet initialized.")
269  return self._inputs
270 
271  def outputs(self):
272  """Get a list of RepoData that are used to as outputs to the Butler.
273  The list is created lazily as needed, and cached.
274 
275  Returns
276  -------
277  A list of RepoData with writable repositories, in the order to be use when searching.
278  """
279  if self._outputs is None:
280  raise RuntimeError("Outputs not yet initialized.")
281  return self._outputs
282 
283  def all(self):
284  """Get a list of all RepoData that are used to as by the Butler.
285  The list is created lazily as needed, and cached.
286 
287  Returns
288  -------
289  A list of RepoData with writable repositories, in the order to be use when searching.
290  """
291  return self._all
292 
293  def __repr__(self):
294  return "%s(_inputs=%r, \n_outputs=%s, \n_all=%s)" % (
295  self.__class__.__name__,
296  self._inputs,
297  self._outputs,
298  self._all)
299 
300  def _buildLookupLists(self):
301  """Build the inputs and outputs lists based on the order of self.all()."""
302 
303  def addToList(repoData, lst):
304  """Add a repoData and each of its parents (depth first) to a list"""
305  if id(repoData) in alreadyAdded:
306  return
307  lst.append(repoData)
308  alreadyAdded.add(id(repoData))
309  for parent in repoData.parentRepoDatas:
310  addToList(parent, lst)
311 
312  if self._inputs is not None or self._outputs is not None:
313  raise RuntimeError("Lookup lists are already built.")
314  inputs = [repoData for repoData in self.all() if repoData.role == 'input']
315  outputs = [repoData for repoData in self.all() if repoData.role == 'output']
316  self._inputs = []
317  alreadyAdded = set()
318  for repoData in outputs:
319  if 'r' in repoData.repoArgs.mode:
320  addToList(repoData.repoData, self._inputs)
321  for repoData in inputs:
322  addToList(repoData.repoData, self._inputs)
323  self._outputs = [repoData.repoData for repoData in outputs]
324 
325 
326 class Butler(object):
327  """Butler provides a generic mechanism for persisting and retrieving data using mappers.
328 
329  A Butler manages a collection of datasets known as a repository. Each dataset has a type representing its
330  intended usage and a location. Note that the dataset type is not the same as the C++ or Python type of the
331  object containing the data. For example, an ExposureF object might be used to hold the data for a raw
332  image, a post-ISR image, a calibrated science image, or a difference image. These would all be different
333  dataset types.
334 
335  A Butler can produce a collection of possible values for a key (or tuples of values for multiple keys) if
336  given a partial data identifier. It can check for the existence of a file containing a dataset given its
337  type and data identifier. The Butler can then retrieve the dataset. Similarly, it can persist an object to
338  an appropriate location when given its associated data identifier.
339 
    Note that the Butler has two more advanced features when retrieving a dataset. First, the retrieval is
    lazy: input does not occur until the dataset is actually accessed. This allows datasets to be retrieved
342  and placed on a clipboard prospectively with little cost, even if the algorithm of a stage ends up not
343  using them. Second, the Butler will call a standardization hook upon retrieval of the dataset. This
344  function, contained in the input mapper object, must perform any necessary manipulations to force the
345  retrieved object to conform to standards, including translating metadata.
346 
347  Public methods:
348 
349  __init__(self, root, mapper=None, **mapperArgs)
350 
351  defineAlias(self, alias, datasetType)
352 
353  getKeys(self, datasetType=None, level=None)
354 
355  queryMetadata(self, datasetType, format=None, dataId={}, **rest)
356 
357  datasetExists(self, datasetType, dataId={}, **rest)
358 
359  get(self, datasetType, dataId={}, immediate=False, **rest)
360 
361  put(self, obj, datasetType, dataId={}, **rest)
362 
363  subset(self, datasetType, level=None, dataId={}, **rest)
364 
365  dataRef(self, datasetType, level=None, dataId={}, **rest)
366 
367  Initialization:
368 
369  The preferred method of initialization is to use the `inputs` and `outputs` __init__ parameters. These
370  are described in the parameters section, below.
371 
372  For backward compatibility: this initialization method signature can take a posix root path, and
373  optionally a mapper class instance or class type that will be instantiated using the mapperArgs input
374  argument. However, for this to work in a backward compatible way it creates a single repository that is
375  used as both an input and an output repository. This is NOT preferred, and will likely break any
376  provenance system we have in place.
377 
378  Parameters
379  ----------
380  root : string
381  .. note:: Deprecated in 12_0
382  `root` will be removed in TBD, it is replaced by `inputs` and `outputs` for
383  multiple-repository support.
384  A file system path. Will only work with a PosixRepository.
385  mapper : string or instance
386  .. note:: Deprecated in 12_0
387  `mapper` will be removed in TBD, it is replaced by `inputs` and `outputs` for
388  multiple-repository support.
389  Provides a mapper to be used with Butler.
390  mapperArgs : dict
391  .. note:: Deprecated in 12_0
392  `mapperArgs` will be removed in TBD, it is replaced by `inputs` and `outputs` for
393  multiple-repository support.
394  Provides arguments to be passed to the mapper if the mapper input argument is a class type to be
395  instantiated by Butler.
396  inputs : RepositoryArgs, dict, or string
397  Can be a single item or a list. Provides arguments to load an existing repository (or repositories).
398  String is assumed to be a URI and is used as the cfgRoot (URI to the location of the cfg file). (Local
399  file system URI does not have to start with 'file://' and in this way can be a relative path). The
400  `RepositoryArgs` class can be used to provide more parameters with which to initialize a repository
401  (such as `mapper`, `mapperArgs`, `tags`, etc. See the `RepositoryArgs` documentation for more
402  details). A dict may be used as shorthand for a `RepositoryArgs` class instance. The dict keys must
403  match parameters to the `RepositoryArgs.__init__` function.
404  outputs : RepositoryArgs, dict, or string
405  Provides arguments to load one or more existing repositories or create new ones. The different types
406  are handled the same as for `inputs`.
407 
408  The Butler init sequence loads all of the input and output repositories.
409  This creates the object hierarchy to read from and write to them. Each
410  repository can have 0 or more parents, which also get loaded as inputs.
411  This becomes a DAG of repositories. Ultimately, Butler creates a list of
412  these Repositories in the order that they are used.
413 
414  Initialization Sequence
415  =======================
416 
417  During initialization Butler creates a Repository class instance & support structure for each object
418  passed to `inputs` and `outputs` as well as the parent repositories recorded in the `RepositoryCfg` of
419  each existing readable repository.
420 
421  This process is complex. It is explained below to shed some light on the intent of each step.
422 
423  1. Input Argument Standardization
424  ---------------------------------
425 
426  In `Butler._processInputArguments` the input arguments are verified to be legal (and a RuntimeError is
427  raised if not), and they are converted into an expected format that is used for the rest of the Butler
428  init sequence. See the docstring for `_processInputArguments`.
429 
430  2. Create RepoData Objects
431  --------------------------
432 
433  Butler uses an object, called `RepoData`, to keep track of information about each repository; each
434  repository is contained in a single `RepoData`. The attributes are explained in its docstring.
435 
436  After `_processInputArguments`, a RepoData is instantiated and put in a list for each repository in
437  `outputs` and `inputs`. This list of RepoData, the `repoDataList`, now represents all the output and input
438  repositories (but not parent repositories) that this Butler instance will use.
439 
440  3. Get `RepositoryCfg`s
441  -----------------------
442 
443  `Butler._getCfgs` gets the `RepositoryCfg` for each repository the `repoDataList`. The behavior is
444  described in the docstring.
445 
446  4. Add Parents
447  --------------
448 
449  `Butler._addParents` then considers the parents list in the `RepositoryCfg` of each `RepoData` in the
450  `repoDataList` and inserts new `RepoData` objects for each parent not represented in the proper location
451  in the `repoDataList`. Ultimately a flat list is built to represent the DAG of readable repositories
452  represented in depth-first order.
453 
454  5. Set and Verify Parents of Outputs
455  ------------------------------------
456 
457  To be able to load parent repositories when output repositories are used as inputs, the input repositories
    are recorded as parents in the `RepositoryCfg` file of new output repositories. When an output repository
    already exists, for consistency the Butler's inputs must match the list of parents specified in the
    already-existing output repository's `RepositoryCfg` file.
461 
462  In `Butler._setAndVerifyParentsLists`, the list of parents is recorded in the `RepositoryCfg` of new
463  repositories. For existing repositories the list of parents is compared with the `RepositoryCfg`'s parents
464  list, and if they do not match a `RuntimeError` is raised.
465 
466  6. Set the Default Mapper
467  -------------------------
468 
469  If all the input repositories use the same mapper then we can assume that mapper to be the
470  "default mapper". If there are new output repositories whose `RepositoryArgs` do not specify a mapper and
471  there is a default mapper then the new output repository will be set to use that default mapper.
472 
473  This is handled in `Butler._setDefaultMapper`.
474 
475  7. Cache References to Parent RepoDatas
476  ---------------------------------------
477 
478  In `Butler._connectParentRepoDatas`, in each `RepoData` in `repoDataList`, a list of `RepoData` object
479  references is built that matches the parents specified in that `RepoData`'s `RepositoryCfg`.
480 
    This list is used later to find things in that repository's parents without considering peer
    repositories' parents (e.g. finding the registry of a parent).
483 
484  8. Set Tags
485  -----------
486 
487  Tags are described at https://ldm-463.lsst.io/v/draft/#tagging
488 
489  In `Butler._setRepoDataTags`, for each `RepoData`, the tags specified by its `RepositoryArgs` are recorded
490  in a set, and added to the tags set in each of its parents, for ease of lookup when mapping.
491 
492  9. Find Parent Registry and Instantiate RepoData
493  ------------------------------------------------
494 
495  At this point there is enough information to instantiate the `Repository` instances. There is one final
496  step before instantiating the Repository, which is to try to get a parent registry that can be used by the
497  child repository. The criteria for "can be used" is spelled out in `Butler._setParentRegistry`. However,
498  to get the registry from the parent, the parent must be instantiated. The `repoDataList`, in depth-first
499  search order, is built so that the most-dependent repositories are first, and the least dependent
500  repositories are last. So the `repoDataList` is reversed and the Repositories are instantiated in that
501  order; for each RepoData a parent registry is searched for, and then the Repository is instantiated with
502  whatever registry could be found."""
503 
504  def __init__(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
505  self._initArgs = {'root': root, 'mapper': mapper, 'inputs': inputs, 'outputs': outputs,
506  'mapperArgs': mapperArgs}
507 
        self.log = Log.getLogger("daf.persistence.butler")
        # mapping of alias keyword -> datasetType, registered via defineAlias and
        # applied by _resolveDatasetTypeAlias
        self.datasetTypeAliasDict = {}
509  # Always use an empty Persistence policy until we can get rid of it
510  persistencePolicy = pexPolicy.Policy()
511  self.persistence = Persistence.getPersistence(persistencePolicy)
512 
513  inputs, outputs = self._processInputArguments(
514  root=root, mapper=mapper, inputs=inputs, outputs=outputs, **mapperArgs)
515 
516  # convert the RepoArgs into RepoData
517  inputs = [RepoData(args, 'input') for args in inputs]
518  outputs = [RepoData(args, 'output') for args in outputs]
519  repoDataList = outputs + inputs
520 
521  self._getCfgs(repoDataList)
522 
523  self._addParents(repoDataList)
524 
525  self._setAndVerifyParentsLists(repoDataList)
526 
527  self._setDefaultMapper(repoDataList)
528 
529  self._connectParentRepoDatas(repoDataList)
530 
531  self._repos = RepoDataContainer(repoDataList)
532 
533  self._setRepoDataTags()
534 
535  for repoData in reversed(repoDataList):
536  self._setParentRegistry(repoData)
537  repoData.repo = Repository(repoData)
538 
539  def _setParentRegistry(self, repoData):
540  """Try to get a parent registry that can be used by this repository. To be usable the repository must
541  "match", meaning the mapper in the passed-in repo is the same type as the mapper in the parent.
542  """
543 
        def getParentRegistry(repoData, context):
            """Get the first found registry that matches the passed-in repo.
546 
547  Parameters
548  ----------
549  repoData : RepoData
550  The RepoData for the repository for which we are searching for a
551  parent registry.
552 
553  Returns
554  -------
555  Registry or None
556  A registry from a parent if one can be found, or None.
557 
558  Raises
559  ------
560  RuntimeError
561  Indicates a butler init order problem, all parents should be initialized before child
562  repositories, so this function should be able to get any parent of any child repo.
563  """
            # track visited RepoDatas by id so that repeated parents do not
            # cause infinite recursion
            if id(repoData) in context:
                return None
            else:
                context.add(id(repoData))
568  for parentRepoData in repoData.getParentRepoDatas():
569  if parentRepoData.cfg.mapper == repoData.cfg.mapper:
570  if parentRepoData.repo is None:
571  self.log.debug(
572  "_getParentRegistry: Parent {} of new repo {} not yet created, ignoring.".format(
573  parentRepoData, repoData))
574  else:
575  parentRegistry = parentRepoData.repo.getRegistry()
576  if parentRegistry:
577  return parentRegistry
578  else:
                            parentRegistry = getParentRegistry(parentRepoData, context)
580  if parentRegistry:
581  return parentRegistry
582  return None
583 
        repoData.repoData.parentRegistry = getParentRegistry(repoData.repoData, set())
585 
586  def _processInputArguments(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
587  """Process, verify, and standardize the input arguments.
588  * Inputs can not be for Old Butler (root, mapper, mapperArgs) AND New Butler (inputs, outputs)
589  `root`, `mapper`, and `mapperArgs` are Old Butler init API.
590  `inputs` and `outputs` are New Butler init API.
          Old Butler and New Butler init API may not be mixed; Butler may be initialized with only the Old
          arguments or only the New arguments.
        * Verify that if there is a readable output, there is exactly one output. (This restriction is in
594  place because all readable repositories must be parents of writable repositories, and for
595  consistency the DAG of readable repositories must always be the same. Keeping the list of parents
596  becomes very complicated in the presence of multiple readable output repositories. It is better to
597  only write to output repositories, and then create a new Butler instance and use the outputs as
598  inputs, and write to new output repositories.)
599  * Make a copy of inputs & outputs so they may be modified without changing the passed-in arguments.
600  * Convert any input/output values that are URI strings to RepositoryArgs.
601  * Listify inputs & outputs.
602  * Set default RW mode on inputs & outputs as needed.
603 
604  Parameters
605  ----------
606  Same as Butler.__init__
607 
608  Returns
609  -------
610  (list of RepositoryArgs, list of RepositoryArgs)
611  First item is a list to use as inputs.
612  Second item is a list to use as outputs.
613 
614  Raises
615  ------
616  RuntimeError
            If Old Butler and New Butler arguments are both used, this will raise.
            If an output is readable and there is more than one output, this will raise.
619  """
620  # inputs and outputs may be modified, do not change the external value.
621  inputs = copy.deepcopy(inputs)
622  outputs = copy.deepcopy(outputs)
623 
624  isV1Args = inputs is None and outputs is None
625  if isV1Args:
626  inputs, outputs = self._convertV1Args(root=root,
627  mapper=mapper,
628  mapperArgs=mapperArgs or None)
629  elif root or mapper or mapperArgs:
630  raise RuntimeError(
631  'Butler version 1 API (root, mapper, **mapperArgs) may ' +
632  'not be used with version 2 API (inputs, outputs)')
634 
635  self.storage = Storage()
636 
        # make sure inputs and outputs are lists, and if a list item is a string convert it to RepositoryArgs.
638  inputs = listify(inputs)
639  outputs = listify(outputs)
640  inputs = [RepositoryArgs(cfgRoot=args)
641  if not isinstance(args, RepositoryArgs) else args for args in inputs]
642  outputs = [RepositoryArgs(cfgRoot=args)
643  if not isinstance(args, RepositoryArgs) else args for args in outputs]
644  # Set the default value of inputs & outputs, verify the required values ('r' for inputs, 'w' for
645  # outputs) and remove the 'w' from inputs if needed.
646  for args in inputs:
647  if args.mode is None:
648  args.mode = 'r'
649  elif 'rw' == args.mode:
650  args.mode = 'r'
651  elif 'r' != args.mode:
652  raise RuntimeError("The mode of an input should be readable.")
653  for args in outputs:
654  if args.mode is None:
655  args.mode = 'w'
656  elif 'w' not in args.mode:
657  raise RuntimeError("The mode of an output should be writable.")
658  # check for class instances in args.mapper (not allowed)
659  for args in inputs + outputs:
660  if (args.mapper and not isinstance(args.mapper, basestring) and
661  not inspect.isclass(args.mapper)):
662  self.log.warn(preinitedMapperWarning)
663  # if the output is readable, there must be only one output:
664  for o in outputs:
665  if 'r' in o.mode:
666  if len(outputs) > 1:
667  raise RuntimeError("Butler does not support multiple output repositories if any of the "
668  "outputs are readable.")
669 
670  # Handle the case where the output is readable and is also passed in as one of the inputs by removing
671  # the input. This supports a legacy use case in pipe_tasks where the input is also passed as the
672  # output, to the command line parser.
673  def inputIsInOutputs(inputArgs, outputArgsList):
674  for o in outputArgsList:
675  if ('r' in o.mode and
676  o.root == inputArgs.root and
677  o.mapper == inputArgs.mapper and
678  o.mapperArgs == inputArgs.mapperArgs and
679  o.tags == inputArgs.tags and
680  o.policy == inputArgs.policy):
681  self.log.debug(("Input repositoryArgs {} is also listed in outputs as readable; " +
682  "throwing away the input.").format(inputArgs))
683  return True
684  return False
685 
686  inputs = [args for args in inputs if not inputIsInOutputs(args, outputs)]
687  return inputs, outputs
688 
689  @staticmethod
690  def _getParentVal(repoData):
691  """Get the value of this repoData as it should appear in the parents
692  list of other repositories"""
693  if repoData.isV1Repository:
694  return repoData.cfg
695  if repoData.cfgOrigin == 'nested':
696  return repoData.cfg
697  else:
698  return repoData.cfg.root
699 
700  @staticmethod
701  def _getParents(ofRepoData, repoInfo):
702  """Create a parents list of repoData from inputs and (readable) outputs."""
703  parents = []
704  # get the parents list of repoData:
705  for repoData in repoInfo:
706  if repoData is ofRepoData:
707  continue
708  if 'r' not in repoData.repoArgs.mode:
709  continue
710  parents.append(Butler._getParentVal(repoData))
711  return parents
712 
713  @staticmethod
714  def _getOldButlerRepositoryCfg(repositoryArgs):
715  if not Storage.isPosix(repositoryArgs.cfgRoot):
716  return None
717  if not PosixStorage.v1RepoExists(repositoryArgs.cfgRoot):
718  return None
719  if not repositoryArgs.mapper:
720  repositoryArgs.mapper = PosixStorage.getMapperClass(repositoryArgs.cfgRoot)
721  cfg = RepositoryCfg.makeFromArgs(repositoryArgs)
722  parent = PosixStorage.getParentSymlinkPath(repositoryArgs.cfgRoot)
723  if parent:
724  parent = Butler._getOldButlerRepositoryCfg(RepositoryArgs(cfgRoot=parent, mode='r'))
725  if parent is not None:
726  cfg.addParents([parent])
727  return cfg
728 
729  def _getRepositoryCfg(self, repositoryArgs):
730  """Try to get a repository from the location described by cfgRoot.
731 
732  Parameters
733  ----------
734  repositoryArgs : RepositoryArgs or string
735  Provides arguments to load an existing repository (or repositories). String is assumed to be a URI
736  and is used as the cfgRoot (URI to the location of the cfg file).
737 
        Returns
        -------
740  (RepositoryCfg or None, bool)
741  The RepositoryCfg, or None if one cannot be found, and True if the RepositoryCfg was created by
742  reading an Old Butler repository, or False if it is a New Butler Repository.
743  """
744  if not isinstance(repositoryArgs, RepositoryArgs):
745  repositoryArgs = RepositoryArgs(cfgRoot=repositoryArgs, mode='r')
746 
747  cfg = self.storage.getRepositoryCfg(repositoryArgs.cfgRoot)
748  isOldButlerRepository = False
749  if cfg is None:
750  cfg = Butler._getOldButlerRepositoryCfg(repositoryArgs)
751  if cfg is not None:
752  isOldButlerRepository = True
753  return cfg, isOldButlerRepository
754 
755  def _getCfgs(self, repoDataList):
756  """Get or make a RepositoryCfg for each RepoData, and add the cfg to the RepoData.
757  If the cfg exists, compare values. If values match then use the cfg as an "existing" cfg. If the
758  values do not match, use the cfg as a "nested" cfg.
759  If the cfg does not exist, the RepositoryArgs must be for a writable repository.
760 
761  Parameters
762  ----------
763  repoDataList : list of RepoData
764  The RepoData that are output and inputs of this Butler
765 
766  Raises
767  ------
        RuntimeError
            If the passed-in RepositoryArgs indicate an existing repository but other cfg parameters in
            those RepositoryArgs do not match the existing repository's cfg, a RuntimeError will be raised.
772  """
773  def cfgMatchesArgs(args, cfg):
774  """Test if there are any values in an RepositoryArgs that conflict with the values in a cfg"""
775  if args.mapper is not None and cfg.mapper != args.mapper:
776  return False
777  if args.mapperArgs is not None and cfg.mapperArgs != args.mapperArgs:
778  return False
779  if args.policy is not None and cfg.policy != args.policy:
780  return False
781  return True
782 
783  for repoData in repoDataList:
784  cfg, isOldButlerRepository = self._getRepositoryCfg(repoData.repoArgs)
785  if cfg is None:
786  if 'w' not in repoData.repoArgs.mode:
787  raise RuntimeError(
788  "No cfg found for read-only input repository at {}".format(repoData.repoArgs.cfgRoot))
789  repoData.setCfg(cfg=RepositoryCfg.makeFromArgs(repoData.repoArgs),
790  origin='new',
791  root=repoData.repoArgs.cfgRoot,
792  isV1Repository=isOldButlerRepository)
793  else:
794  if 'w' in repoData.repoArgs.mode:
795  # if it's an output repository, the RepositoryArgs must match the existing cfg.
796  if not cfgMatchesArgs(repoData.repoArgs, cfg):
797  raise RuntimeError(("The RepositoryArgs and RepositoryCfg must match for writable " +
798  "repositories, RepositoryCfg:{}, RepositoryArgs:{}").format(
799  cfg, repoData.repoArgs))
800  repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
801  isV1Repository=isOldButlerRepository)
802  else:
803  # if it's an input repository, the cfg can overwrite the in-repo cfg.
804  if cfgMatchesArgs(repoData.repoArgs, cfg):
805  repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
806  isV1Repository=isOldButlerRepository)
807  else:
808  repoData.setCfg(cfg=cfg, origin='nested', root=None,
809  isV1Repository=isOldButlerRepository)
810 
811  def _addParents(self, repoDataList):
812  """For each repoData in the input list, see if its parents are the next items in the list, and if not
813  add the parent, so that the repoDataList includes parents and is in order to operate depth-first 0..n.
814 
815  Parameters
816  ----------
817  repoDataList : list of RepoData
818  The RepoData for the Butler outputs + inputs.
819 
820  Raises
821  ------
822  RuntimeError
823  Raised if a RepositoryCfg can not be found at a location where a parent repository should be.
824  """
825  repoDataIdx = 0
826  while True:
827  if repoDataIdx == len(repoDataList):
828  break
829  repoData = repoDataList[repoDataIdx]
830  if 'r' not in repoData.repoArgs.mode:
831  repoDataIdx += 1
832  continue # the repoData only needs parents if it's readable.
833  if repoData.isNewRepository:
834  repoDataIdx += 1
835  continue # if it's new the parents will be the inputs of this butler.
836  if repoData.cfg.parents is None:
837  repoDataIdx += 1
838  continue # if there are no parents then there's nothing to do.
839  for repoParentIdx, repoParent in enumerate(repoData.cfg.parents):
840  parentIdxInRepoDataList = repoDataIdx + repoParentIdx + 1
841  if not isinstance(repoParent, RepositoryCfg):
842  repoParentCfg, isOldButlerRepository = self._getRepositoryCfg(repoParent)
843  if repoParentCfg is not None:
844  cfgOrigin = 'existing'
845  else:
846  isOldButlerRepository = False
847  repoParentCfg = repoParent
848  cfgOrigin = 'nested'
849  if (parentIdxInRepoDataList < len(repoDataList) and
850  repoDataList[parentIdxInRepoDataList].cfg == repoParentCfg):
851  continue
852  args = RepositoryArgs(cfgRoot=repoParentCfg.root, mode='r')
853  role = 'input' if repoData.role == 'output' else 'parent'
854  newRepoInfo = RepoData(args, role)
855  newRepoInfo.repoData.setCfg(cfg=repoParentCfg, origin=cfgOrigin, root=args.cfgRoot,
856  isV1Repository=isOldButlerRepository)
857  repoDataList.insert(parentIdxInRepoDataList, newRepoInfo)
858  repoDataIdx += 1
859 
860  def _setAndVerifyParentsLists(self, repoDataList):
861  """Make a list of all the input repositories of this Butler, these are the parents of the outputs.
862  For new output repositories, set the parents in the RepositoryCfg. For existing output repositories
863  verify that the RepositoryCfg's parents match the parents list.
864 
865  Parameters
866  ----------
867  repoDataList : list of RepoData
868  All the RepoDatas loaded by this butler, in search order.
869 
870  Raises
871  ------
872  RuntimeError
873  If an existing output repository is loaded and its parents do not match the parents of this Butler
874  an error will be raised.
875  """
876  def getIOParents(ofRepoData, repoDataList):
877  """make a parents list for repo in `ofRepoData` that is comprised of inputs and readable
878  outputs (not parents-of-parents) of this butler"""
879  parents = []
880  for repoData in repoDataList:
881  if repoData.role == 'parent':
882  continue
883  if repoData is ofRepoData:
884  continue
885  if repoData.role == 'output':
886  if 'r' in repoData.repoArgs.mode:
887  raise RuntimeError("If an output is readable it must be the only output.")
888  # and if this is the only output, this should have continued in
889  # "if repoData is ofRepoData"
890  continue
891  parents.append(self._getParentVal(repoData))
892  return parents
893 
894  for repoData in repoDataList:
895  if repoData.role != 'output':
896  continue
897  parents = getIOParents(repoData, repoDataList)
898  # if repoData is new, add the parent RepositoryCfgs to it.
899  if repoData.cfgOrigin == 'new':
900  repoData.cfg.addParents(parents)
901  elif repoData.cfgOrigin in ('existing', 'nested'):
902  if repoData.cfg.parents != parents:
903  try:
904  repoData.cfg.extendParents(parents)
905  except ParentsMismatch as e:
                        raise RuntimeError(("Inputs of this Butler:{} do not match parents of existing "
                                            "writable cfg:{} (ParentsMismatch exception: {})").format(
                                                parents, repoData.cfg.parents, e))
909 
910  def _setDefaultMapper(self, repoDataList):
911  """Establish a default mapper if there is one and assign it to outputs that do not have a mapper
912  assigned.
913 
914  If all inputs have the same mapper it will be used as the default mapper.
915 
916  Parameters
917  ----------
918  repoDataList : list of RepoData
919  All the RepoDatas loaded by this butler, in search order.
920 
921  Raises
922  ------
923  RuntimeError
924  If a default mapper can not be established and there is an output that does not have a mapper.
925  """
926  needyOutputs = [rd for rd in repoDataList if rd.role == 'output' and rd.cfg.mapper is None]
        if len(needyOutputs) == 0:
928  return
929  mappers = set([rd.cfg.mapper for rd in repoDataList if rd.role == 'input'])
930  if len(mappers) != 1:
931  inputs = [rd for rd in repoDataList if rd.role == 'input']
932  raise RuntimeError(
933  ("No default mapper could be established from inputs:{} and no mapper specified " +
934  "for outputs:{}").format(inputs, needyOutputs))
935  defaultMapper = mappers.pop()
936  for repoData in needyOutputs:
937  repoData.cfg.mapper = defaultMapper
938 
939  def _connectParentRepoDatas(self, repoDataList):
940  """For each RepoData in repoDataList, find its parent in the repoDataList and cache a reference to it.
941 
942  Parameters
943  ----------
944  repoDataList : list of RepoData
945  All the RepoDatas loaded by this butler, in search order.
946 
947  Raises
948  ------
949  RuntimeError
950  When a parent is listed in the parents list but not found in the repoDataList. This is not
951  expected to ever happen and would indicate an internal Butler error.
952  """
953  for repoData in repoDataList:
954  for parent in repoData.cfg.parents:
955  parentToAdd = None
956  for otherRepoData in repoDataList:
957  if isinstance(parent, RepositoryCfg):
                        if otherRepoData.repoData.cfg == parent:
959  parentToAdd = otherRepoData.repoData
960  break
961  elif otherRepoData.repoData.cfg.root == parent:
962  parentToAdd = otherRepoData.repoData
963  break
964  if parentToAdd is None:
965  raise RuntimeError(
966  "Could not find a parent matching {} to add to {}".format(parent, repoData))
967  repoData.addParentRepoData(parentToAdd)
968 
969  @staticmethod
970  def _getParentRepoData(parent, repoDataList):
971  """get a parent RepoData from a cfg from a list of RepoData
972 
973  Parameters
974  ----------
975  parent : string or RepositoryCfg
976  cfgRoot of a repo or a cfg that describes the repo
977  repoDataList : list of RepoData
978  list to search in
979 
980  Returns
981  -------
982  RepoData or None
983  A RepoData if one can be found, else None
984  """
985  repoData = None
986  for otherRepoData in repoDataList:
987  if isinstance(parent, RepositoryCfg):
988  if otherRepoData.cfg == parent:
989  repoData = otherRepoData
990  break
991  elif otherRepoData.cfg.root == parent:
992  repoData = otherRepoData
993  break
994  return repoData
995 
996  def _setRepoDataTags(self):
997  """Set the tags from each repoArgs into all its parent repoArgs so that they can be included in tagged
998  searches."""
999  def setTags(repoData, tags, context):
1000  if id(repoData) in context:
1001  return
1002  repoData.addTags(tags)
1003  context.add(id(repoData))
1004  for parentRepoData in repoData.parentRepoDatas:
1005  setTags(parentRepoData, tags, context)
1006  for repoData in self._repos.outputs() + self._repos.inputs():
1007  setTags(repoData.repoData, repoData.repoArgs.tags, set())
1008 
1009  def _convertV1Args(self, root, mapper, mapperArgs):
1010  """Convert Old Butler RepositoryArgs (root, mapper, mapperArgs) to New Butler RepositoryArgs
1011  (inputs, outputs)
1012 
1013  Parameters
1014  ----------
1015  root : string
1016  Posix path to repository root
1017  mapper : class, class instance, or string
1018  Instantiated class, a class object to be instantiated, or a string that refers to a class that
1019  can be imported & used as the mapper.
1020  mapperArgs : dict
1021  RepositoryArgs & their values used when instantiating the mapper.
1022 
1023  Returns
1024  -------
1025  tuple
1026  (inputs, outputs) - values to be used for inputs and outputs in Butler.__init__
1027  """
1028  if (mapper and not isinstance(mapper, basestring) and
1029  not inspect.isclass(mapper)):
1030  self.log.warn(preinitedMapperWarning)
1031  inputs = None
1032  if root is None:
1033  if hasattr(mapper, 'root'):
1034  # in legacy repositories, the mapper may be given the root directly.
1035  root = mapper.root
1036  else:
1037  # in the past root="None" could be used to mean root='.'
1038  root = '.'
1039  outputs = RepositoryArgs(mode='rw',
1040  root=root,
1041  mapper=mapper,
1042  mapperArgs=mapperArgs)
1043  return inputs, outputs
1044 
1045  def __repr__(self):
1046  return 'Butler(datasetTypeAliasDict=%s, repos=%s, persistence=%s)' % (
1047  self.datasetTypeAliasDict, self._repos, self.persistence)
1048 
    def _getDefaultMapper(self):
        """Get the default mapper. Currently this means if all the repositories use exactly the same mapper,
1052  that mapper may be considered the default.
1053 
1054  This definition may be changing; mappers may be able to exclude themselves as candidates for default,
1055  and they may nominate a different mapper instead. Also, we may not want to look at *all* the
1056  repositories, but only a depth-first search on each of the input & output repositories, and use the
1057  first-found mapper for each of those. TBD.
1058 
1064  Returns
1065  -------
1066  Mapper class or None
1067  Returns the class type of the default mapper, or None if a default
1068  mapper can not be determined.
1069  """
1070  defaultMapper = None
1071 
1072  for inputRepoData in self._repos.inputs():
1073  mapper = None
1074  if inputRepoData.cfg.mapper is not None:
1075  mapper = inputRepoData.cfg.mapper
1076  # if the mapper is:
1077  # * a string, import it.
1078  # * a class instance, get its class type
1079  # * a class, do nothing; use it
1080  if isinstance(mapper, basestring):
1081  mapper = doImport(mapper)
1082  elif not inspect.isclass(mapper):
1083  mapper = mapper.__class__
1084  # If no mapper has been found, note the first found mapper.
1085  # Then, if a mapper has been found and each next mapper matches it,
1086  # continue looking for mappers.
1087  # If a mapper has been found and another non-matching mapper is
1088  # found then we have no default, return None.
1089  if defaultMapper is None:
1090  defaultMapper = mapper
1091  elif mapper == defaultMapper:
1092  continue
1093  elif mapper is not None:
1094  return None
1095  return defaultMapper
1096 
1097  def _assignDefaultMapper(self, defaultMapper):
        for repoData in self._repos.all():
1099  if repoData.cfg.mapper is None and (repoData.isNewRepository or repoData.isV1Repository):
1100  if defaultMapper is None:
1101  raise RuntimeError(
1102  "No mapper specified for %s and no default mapper could be determined." %
                    repoData.repoArgs)
1104  repoData.cfg.mapper = defaultMapper
1105 
1106  @staticmethod
1107  def getMapperClass(root):
1108  """posix-only; gets the mapper class at the path specified by root (if a file _mapper can be found at
1109  that location or in a parent location.
1110 
1111  As we abstract the storage and support different types of storage locations this method will be
1112  moved entirely into Butler Access, or made more dynamic, and the API will very likely change."""
1113  return Storage.getMapperClass(root)
1114 
1115  def defineAlias(self, alias, datasetType):
1116  """Register an alias that will be substituted in datasetTypes.
1117 
1118  Parameters
1119  ----------
1120  alias - string
1121  The alias keyword. It may start with @ or not. It may not contain @ except as the first character.
1122  datasetType - string
1123  The string that will be substituted when @alias is passed into datasetType. It may not contain '@'
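
        Examples
        --------
        A sketch; '@goodSeeing' and 'deepCoadd' are illustrative names:

        >>> butler.defineAlias('@goodSeeing', 'deepCoadd')
        >>> dataset = butler.get('@goodSeeing', dataId)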
1124  """
1125  # verify formatting of alias:
1126  # it can have '@' as the first character (if not it's okay, we will add it) or not at all.
1127  atLoc = alias.rfind('@')
1128  if atLoc == -1:
1129  alias = "@" + str(alias)
1130  elif atLoc > 0:
1131  raise RuntimeError("Badly formatted alias string: %s" % (alias,))
1132 
1133  # verify that datasetType does not contain '@'
1134  if datasetType.count('@') != 0:
1135  raise RuntimeError("Badly formatted type string: %s" % (datasetType))
1136 
1137  # verify that the alias keyword does not start with another alias keyword,
1138  # and vice versa
1139  for key in self.datasetTypeAliasDict:
1140  if key.startswith(alias) or alias.startswith(key):
1141  raise RuntimeError("Alias: %s overlaps with existing alias: %s" % (alias, key))
1142 
1143  self.datasetTypeAliasDict[alias] = datasetType
1144 
1145  def getKeys(self, datasetType=None, level=None, tag=None):
1146  """Get the valid data id keys at or above the given level of hierarchy for the dataset type or the
1147  entire collection if None. The dict values are the basic Python types corresponding to the keys (int,
1148  float, string).
1149 
1150  Parameters
1151  ----------
1152  datasetType - string
1153  The type of dataset to get keys for, entire collection if None.
1154  level - string
1155  The hierarchy level to descend to. None if it should not be restricted. Use an empty string if the
1156  mapper should lookup the default level.
        tag - any, or list of any
            Any object that can be tested to be the same as the tag in a dataId passed into butler input
            functions. Applies only to input repositories: if a tag is specified by the dataId then the
            repository will only be read from if the tag in the dataId matches a tag used for that
            repository.
1161 
1162  Returns
1163  -------
1164  Returns a dict. The dict keys are the valid data id keys at or above the given level of hierarchy for
1165  the dataset type or the entire collection if None. The dict values are the basic Python types
1166  corresponding to the keys (int, float, string).
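
        Examples
        --------
        A sketch; 'raw' and the returned keys are illustrative, the actual keys depend on the mapper:

        >>> keys = butler.getKeys('raw')  # e.g. {'visit': int, 'ccd': int, 'filter': str}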
1167  """
1168  datasetType = self._resolveDatasetTypeAlias(datasetType)
1169 
1170  keys = None
1171  tag = setify(tag)
1172  for repoData in self._repos.inputs():
1173  if not tag or len(tag.intersection(repoData.tags)) > 0:
1174  keys = repoData.repo.getKeys(datasetType, level)
1175  # An empty dict is a valid "found" condition for keys. The only value for keys that should
1176  # cause the search to continue is None
1177  if keys is not None:
1178  break
1179  return keys
1180 
1181  def queryMetadata(self, datasetType, format, dataId={}, **rest):
1182  """Returns the valid values for one or more keys when given a partial
1183  input collection data id.
1184 
1185  Parameters
1186  ----------
1187  datasetType - string
1188  The type of dataset to inquire about.
1189  format - str, tuple
1190  Key or tuple of keys to be returned.
1191  dataId - DataId, dict
1192  The partial data id.
1193  **rest -
1194  Keyword arguments for the partial data id.
1195 
1196  Returns
1197  -------
1198  A list of valid values or tuples of valid values as specified by the
1199  format.
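
        Examples
        --------
        A sketch; 'raw', 'visit', 'ccd', and 'filter' are illustrative, valid keys depend on the mapper:

        >>> visits = butler.queryMetadata('raw', 'visit', filter='r')
        >>> visitCcdPairs = butler.queryMetadata('raw', ('visit', 'ccd'), filter='r')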
1200  """
1201 
1202  datasetType = self._resolveDatasetTypeAlias(datasetType)
1203  dataId = DataId(dataId)
1204  dataId.update(**rest)
1205  format = sequencify(format)
1206 
1207  tuples = None
1208  for repoData in self._repos.inputs():
1209  if not dataId.tag or len(dataId.tag.intersection(repoData.tags)) > 0:
1210  tuples = repoData.repo.queryMetadata(datasetType, format, dataId)
1211  if tuples:
1212  break
1213 
1214  if not tuples:
1215  return []
1216 
1217  if len(format) == 1:
1218  ret = []
1219  for x in tuples:
1220  try:
1221  ret.append(x[0])
1222  except TypeError:
1223  ret.append(x)
1224  return ret
1225 
1226  return tuples
1227 
1228  def datasetExists(self, datasetType, dataId={}, **rest):
1229  """Determines if a dataset file exists.
1230 
1231  Parameters
1232  ----------
1233  datasetType - string
1234  The type of dataset to inquire about.
1235  dataId - DataId, dict
1236  The data id of the dataset.
1237  **rest keyword arguments for the data id.
1238 
1239  Returns
1240  -------
1241  exists - bool
1242  True if the dataset exists or is non-file-based.
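
        Examples
        --------
        A sketch; 'raw' and the dataId keys are illustrative:

        >>> if butler.datasetExists('raw', visit=85470982, ccd=0):
        ...     exposure = butler.get('raw', visit=85470982, ccd=0)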
1243  """
1244  datasetType = self._resolveDatasetTypeAlias(datasetType)
1245  dataId = DataId(dataId)
1246  dataId.update(**rest)
1247  location = self._locate(datasetType, dataId, write=False)
1248  if location is None:
1249  return False
1250 
1251  # If the location is a ButlerComposite (as opposed to a ButlerLocation), verify the component objects
1252  # exist.
1253  if isinstance(location, ButlerComposite):
1254  for name, componentInfo in location.componentInfo.items():
1255  if componentInfo.subset:
1256  subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
1257  exists = all([obj.datasetExists() for obj in subset])
1258  else:
1259  exists = self.datasetExists(componentInfo.datasetType, location.dataId)
1260  if exists is False:
1261  break
1262  else:
1263  exists = location.repository.exists(location)
1264  return exists
1265 
1266  def _locate(self, datasetType, dataId, write):
1267  """Get one or more ButlerLocations and/or ButlercComposites.
1268 
1269  Parameters
1270  ----------
1271  datasetType : string
1272  The datasetType that is being searched for. The datasetType may be followed by a dot and
            a component name (component names are specified in the policy), i.e. datasetType.componentName.
1274 
1275  dataId : dict or DataId class instance
1276  The dataId
1277 
1278  write : bool
1279  True if this is a search to write an object. False if it is a search to read an object. This
1280  affects what type (an object or a container) is returned.
1281 
1282  Returns
1283  -------
1284  If write is False, will return either a single object or None. If write is True, will return a list
1285  (which may be empty)
1286  """
1287  repos = self._repos.outputs() if write else self._repos.inputs()
1288  locations = []
1289  for repoData in repos:
1290  # enforce dataId & repository tags when reading:
1291  if not write and dataId.tag and len(dataId.tag.intersection(repoData.tags)) == 0:
1292  continue
1293  components = datasetType.split('.')
1294  datasetType = components[0]
1295  components = components[1:]
1296  try:
1297  location = repoData.repo.map(datasetType, dataId, write=write)
1298  except NoResults:
1299  continue
1300  if location is None:
1301  continue
1302  location.datasetType = datasetType # todo is there a better way than monkey patching here?
1303  if len(components) > 0:
1304  if not isinstance(location, ButlerComposite):
1305  raise RuntimeError("The location for a dotted datasetType must be a composite.")
1306  # replace the first component name with the datasetType
1307  components[0] = location.componentInfo[components[0]].datasetType
1308  # join components back into a dot-delimited string
1309  datasetType = '.'.join(components)
1310  location = self._locate(datasetType, dataId, write)
1311  # if a component location is not found, we can not continue with this repo, move to next repo.
1312  if location is None:
1313  break
1314  # if reading, only one location is desired.
1315  if location:
1316  if not write:
1317  # If there is a bypass function for this dataset type, we can't test to see if the object
1318  # exists in storage, because the bypass function may not actually use the location
1319  # according to the template. Instead, execute the bypass function and include its results
1320  # in the bypass attribute of the location. The bypass function may fail for any reason,
1321  # the most common case being that a file does not exist. If it raises an exception we
                    # ignore its existence and proceed as though it does not exist.
1323  if hasattr(location.mapper, "bypass_" + location.datasetType):
1324  bypass = self._getBypassFunc(location, dataId)
1325  try:
1326  bypass = bypass()
1327  location.bypass = bypass
                        except Exception:
1329  pass
                    # If a location was found but the location does not exist, keep looking in input
                    # repositories (the registry may have had enough data for a lookup even though the
                    # object exists in a different repository).
1333  if (isinstance(location, ButlerComposite) or hasattr(location, 'bypass') or
1334  location.repository.exists(location)):
1335  return location
1336  else:
1337  try:
1338  locations.extend(location)
1339  except TypeError:
1340  locations.append(location)
1341  if not write:
1342  return None
1343  return locations
1344 
1345  @staticmethod
1346  def _getBypassFunc(location, dataId):
1347  pythonType = location.getPythonType()
1348  if pythonType is not None:
1349  if isinstance(pythonType, basestring):
1350  pythonType = doImport(pythonType)
1351  bypassFunc = getattr(location.mapper, "bypass_" + location.datasetType)
1352  return lambda: bypassFunc(location.datasetType, pythonType, location, dataId)
1353 
1354  def get(self, datasetType, dataId=None, immediate=True, **rest):
1355  """Retrieves a dataset given an input collection data id.
1356 
1357  Parameters
1358  ----------
1359  datasetType - string
1360  The type of dataset to retrieve.
1361  dataId - dict
1362  The data id.
1363  immediate - bool
1364  If False use a proxy for delayed loading.
1365  **rest
1366  keyword arguments for the data id.
1367 
1368  Returns
1369  -------
1370  An object retrieved from the dataset (or a proxy for one).
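
        Examples
        --------
        A sketch; 'calexp' and the dataId keys are illustrative:

        >>> exposure = butler.get('calexp', visit=85470982, ccd=0)
        >>> proxy = butler.get('calexp', dataId={'visit': 85470982, 'ccd': 0}, immediate=False)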
1371  """
1372  datasetType = self._resolveDatasetTypeAlias(datasetType)
1373  dataId = DataId(dataId)
1374  dataId.update(**rest)
1375 
1376  location = self._locate(datasetType, dataId, write=False)
1377  if location is None:
1378  raise NoResults("No locations for get:", datasetType, dataId)
1379  self.log.debug("Get type=%s keys=%s from %s", datasetType, dataId, str(location))
1380 
1381  if hasattr(location, 'bypass'):
1382  # this type loader block should get moved into a helper someplace, and duplications removed.
1383  def callback():
1384  return location.bypass
1385  else:
1386  def callback():
1387  return self._read(location)
1388  if location.mapper.canStandardize(location.datasetType):
1389  innerCallback = callback
1390 
1391  def callback():
1392  return location.mapper.standardize(location.datasetType, innerCallback(), dataId)
1393  if immediate:
1394  return callback()
1395  return ReadProxy(callback)
1396 
1397  def put(self, obj, datasetType, dataId={}, doBackup=False, **rest):
1398  """Persists a dataset given an output collection data id.
1399 
1400  Parameters
1401  ----------
1402  obj -
1403  The object to persist.
1404  datasetType - string
1405  The type of dataset to persist.
1406  dataId - dict
1407  The data id.
1408  doBackup - bool
1409  If True, rename existing instead of overwriting.
1410  WARNING: Setting doBackup=True is not safe for parallel processing, as it may be subject to race
1411  conditions.
1412  **rest
1413  Keyword arguments for the data id.
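
        Examples
        --------
        A sketch; 'calexp' and the dataId keys are illustrative, and a writable output repository is assumed:

        >>> butler.put(exposure, 'calexp', visit=85470982, ccd=0)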
1414  """
1415  datasetType = self._resolveDatasetTypeAlias(datasetType)
1416  dataId = DataId(dataId)
1417  dataId.update(**rest)
1418 
1419  for location in self._locate(datasetType, dataId, write=True):
1420  if isinstance(location, ButlerComposite):
1421  disassembler = location.disassembler if location.disassembler else genericDisassembler
1422  disassembler(obj=obj, dataId=location.dataId, componentInfo=location.componentInfo)
1423  for name, info in location.componentInfo.items():
1424  if not info.inputOnly:
1425  self.put(info.obj, info.datasetType, location.dataId, doBackup=doBackup)
1426  else:
1427  if doBackup:
1428  location.getRepository().backup(location.datasetType, dataId)
1429  location.getRepository().write(location, obj)
1430 
1431  def subset(self, datasetType, level=None, dataId={}, **rest):
1432  """Return complete dataIds for a dataset type that match a partial (or empty) dataId.
1433 
1434  Given a partial (or empty) dataId specified in dataId and **rest, find all datasets that match the
1435  dataId. Optionally restrict the results to a given level specified by a dataId key (e.g. visit or
1436  sensor or amp for a camera). Return an iterable collection of complete dataIds as ButlerDataRefs.
1437  Datasets with the resulting dataIds may not exist; that needs to be tested with datasetExists().
1438 
1439  Parameters
1440  ----------
1441  datasetType - string
1442  The type of dataset collection to subset
1443  level - string
1444  The level of dataId at which to subset. Use an empty string if the mapper should look up the
1445  default level.
1446  dataId - dict
1447  The data id.
1448  **rest
1449  Keyword arguments for the data id.
1450 
1451  Returns
1452  -------
1453  subset - ButlerSubset
1454  Collection of ButlerDataRefs for datasets matching the data id.
1455 
1456  Examples
        --------
1458  To print the full dataIds for all r-band measurements in a source catalog
1459  (note that the subset call is equivalent to: `butler.subset('src', dataId={'filter':'r'})`):
1460 
1461  >>> subset = butler.subset('src', filter='r')
1462  >>> for data_ref in subset: print(data_ref.dataId)
1463  """
1464  datasetType = self._resolveDatasetTypeAlias(datasetType)
1465 
1466  # Currently expected behavior of subset is that if specified level is None then the mapper's default
1467  # level should be used. Convention for level within Butler is that an empty string is used to indicate
1468  # 'get default'.
1469  if level is None:
1470  level = ''
1471 
1472  dataId = DataId(dataId)
1473  dataId.update(**rest)
1474  return ButlerSubset(self, datasetType, level, dataId)
1475 
1476  def dataRef(self, datasetType, level=None, dataId={}, **rest):
1477  """Returns a single ButlerDataRef.
1478 
1479  Given a complete dataId specified in dataId and **rest, find the unique dataset at the given level
1480  specified by a dataId key (e.g. visit or sensor or amp for a camera) and return a ButlerDataRef.
1481 
1482  Parameters
1483  ----------
1484  datasetType - string
1485  The type of dataset collection to reference
1486  level - string
1487  The level of dataId at which to reference
1488  dataId - dict
1489  The data id.
1490  **rest
1491  Keyword arguments for the data id.
1492 
1493  Returns
1494  -------
1495  dataRef - ButlerDataRef
1496  ButlerDataRef for dataset matching the data id
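
        Examples
        --------
        A sketch; 'raw' and the dataId keys are illustrative:

        >>> ref = butler.dataRef('raw', visit=85470982, ccd=0)
        >>> exposure = ref.get('raw')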
1497  """
1498 
1499  datasetType = self._resolveDatasetTypeAlias(datasetType)
1500  dataId = DataId(dataId)
1501  subset = self.subset(datasetType, level, dataId, **rest)
1502  if len(subset) != 1:
1503  raise RuntimeError("No unique dataset for: Dataset type:%s Level:%s Data ID:%s Keywords:%s" %
1504  (str(datasetType), str(level), str(dataId), str(rest)))
1505  return ButlerDataRef(subset, subset.cache[0])
1506 
1507  def _read(self, location):
1508  """Unpersist an object using data inside a ButlerLocation or ButlerComposite object.
1509 
1510  Parameters
1511  ----------
1512  location : ButlerLocation or ButlerComposite
1513  A ButlerLocation or ButlerComposite instance populated with data needed to read the object.
1514 
1515  Returns
1516  -------
1517  object
1518  An instance of the object specified by the location.
1519  """
1520  self.log.debug("Starting read from %s", location)
1521 
1522  if isinstance(location, ButlerComposite):
1523  for name, componentInfo in location.componentInfo.items():
1524  if componentInfo.subset:
1525  subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
1526  componentInfo.obj = [obj.get() for obj in subset]
1527  else:
1528  obj = self.get(componentInfo.datasetType, location.dataId, immediate=True)
1529  componentInfo.obj = obj
1530  assembler = location.assembler or genericAssembler
1531  results = assembler(dataId=location.dataId, componentInfo=location.componentInfo,
1532  cls=location.python)
1533  return results
1534  else:
1535  results = location.repository.read(location)
1536  if len(results) == 1:
1537  results = results[0]
1538  self.log.debug("Ending read from %s", location)
1539  return results
1540 
1541  def __reduce__(self):
1542  ret = (_unreduce, (self._initArgs, self.datasetTypeAliasDict))
1543  return ret
1544 
1545  def _resolveDatasetTypeAlias(self, datasetType):
1546  """Replaces all the known alias keywords in the given string with the alias value.
1547 
1548  Parameters
1549  ----------
1550  datasetType - string
1551  A datasetType string to search & replace on
1552 
1553  Returns
1554  -------
1555  datasetType - string
1556  The de-aliased string
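
        Examples
        --------
        A sketch, assuming the alias was registered as shown:

        >>> butler.defineAlias('@goodSeeing', 'deepCoadd')
        >>> butler._resolveDatasetTypeAlias('@goodSeeing')
        'deepCoadd'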
1557  """
1558  for key in self.datasetTypeAliasDict:
1559  # if all aliases have been replaced, bail out
1560  if datasetType.find('@') == -1:
1561  break
1562  datasetType = datasetType.replace(key, self.datasetTypeAliasDict[key])
1563 
1564  # If an alias specifier can not be resolved then throw.
1565  if datasetType.find('@') != -1:
1566  raise RuntimeError("Unresolvable alias specifier in datasetType: %s" % (datasetType))
1567 
1568  return datasetType
1569 
1570 
1571 def _unreduce(initArgs, datasetTypeAliasDict):
1572  mapperArgs = initArgs.pop('mapperArgs')
1573  initArgs.update(mapperArgs)
1574  butler = Butler(**initArgs)
1575  butler.datasetTypeAliasDict = datasetTypeAliasDict
1576  return butler