#!/usr/bin/env python

#
# LSST Data Management System
# Copyright 2008-2015 LSST Corporation.
#
# This product includes software developed by the
# LSST Project (http://www.lsst.org/).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the LSST License Statement and
# the GNU General Public License along with this program. If not,
# see <http://www.lsstcorp.org/LegalNotices/>.
#

# -*- python -*-

"""This module defines the Butler class."""
from future import standard_library
standard_library.install_aliases()
from builtins import str
from past.builtins import basestring
from builtins import object

import collections
import copy
import inspect
import json
import os
import weakref

import yaml

from lsst.log import Log
import lsst.pex.policy as pexPolicy
from . import LogicalLocation, ReadProxy, ButlerSubset, ButlerDataRef, Persistence, \
    Storage, Policy, NoResults, Repository, DataId, RepositoryCfg, \
    RepositoryArgs, listify, setify, sequencify, doImport, ButlerComposite, genericAssembler, \
    genericDisassembler, PosixStorage, ParentsMismatch

preinitedMapperWarning = ("Passing an instantiated mapper into " +
                          "Butler.__init__ will prevent Butler from passing " +
                          "parentRegistry or repositoryCfg information to " +
                          "the mapper, which is done only at init time. " +
                          "It is better to pass an importable string or " +
                          "class object.")


class ButlerCfg(Policy, yaml.YAMLObject):
    """Represents a Butler configuration.

    .. warning::

        cfg is 'wet paint' and very likely to change. Use of it in production
        code other than via the 'old butler' API is strongly discouraged.
    """
    yaml_tag = u"!ButlerCfg"

    def __init__(self, cls, repoCfg):
        super(ButlerCfg, self).__init__({'repoCfg': repoCfg, 'cls': cls})

class RepoData(object):
    """Container object for repository data used by Butler

    Parameters
    ----------
    args : RepositoryArgs
        The arguments that are used to find or create the RepositoryCfg.
    role : string
        "input", "output", or "parent", indicating why Butler loaded this repository.
        * input: the Repository was passed as a Butler input.
        * output: the Repository was passed as a Butler output.
        * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository.

    Attributes
    ----------
    cfg : RepositoryCfg
        The configuration for the Repository.

    _cfgOrigin : string
        "new", "existing", or "nested". Indicates the origin of the repository and its RepositoryCfg:
        * new: it was created by this instance of Butler, and this instance of Butler will generate the
          RepositoryCfg file.
        * existing: it was found (via the root or cfgRoot argument).
        * nested: the full RepositoryCfg was nested in another RepositoryCfg's parents list (this can
          happen if the parameters of an input specified by RepositoryArgs or dict do not entirely match
          an existing RepositoryCfg).

    cfgRoot : string
        Path or URI to the location of the RepositoryCfg file.

    repo : lsst.daf.persistence.Repository
        The Repository class instance.

    parentRepoDatas : list of RepoData
        The parents of this Repository, as indicated in this Repository's RepositoryCfg. If this is a new
        Repository then these are the inputs to this Butler (and will be saved in the RepositoryCfg). These
        RepoData objects are not owned by this RepoData; they are references to peer RepoData objects in
        the Butler's RepoDataContainer.

    isV1Repository : bool
        True if this is an Old Butler repository. In this case the repository does not have a RepositoryCfg
        file. It may have a _mapper file and may have a _parent symlink. It will never be treated as a
        "new" repository, i.e. even though there is not a RepositoryCfg file, one will not be generated.
        If False, this is a New Butler repository and is specified by a RepositoryCfg file.

    tags : set
        These are values that may be used to restrict the search of input repositories. Details are
        available in the RepositoryArgs and DataId classes.

    role : string
        "input", "output", or "parent", indicating why Butler loaded this repository.
        * input: the Repository was passed as a Butler input.
        * output: the Repository was passed as a Butler output.
        * parent: the Repository was specified in the RepositoryCfg parents list of a readable repository.

    _repoArgs : RepositoryArgs
        Contains the arguments that were used to specify this Repository.
    """

    def __init__(self, args, role):
        self.cfg = None
        self._cfgOrigin = None
        self.cfgRoot = None
        self.repo = None
        self.parentRepoDatas = []
        self.isV1Repository = False
        self.tags = set()
        self.role = role
        self.parentRegistry = None
        self._repoArgs = args

    @property
    def repoArgs(self):
        return self._repoArgs

    @property
    def repoData(self):
        return self

    def __repr__(self):
        return ("{}(id={},"
                "repoArgs={},"
                "cfg={!r},"
                "cfgOrigin={},"
                "cfgRoot={},"
                "repo={},"
                "parentRepoDatas={},"
                "isV1Repository={},"
                "role={},"
                "parentRegistry={})").format(
                    self.__class__.__name__,
                    id(self),
                    self.repoArgs,
                    self.cfg,
                    self.cfgOrigin,
                    self.cfgRoot,
                    self.repo,
                    [id(p) for p in self.parentRepoDatas],
                    self.isV1Repository,
                    self.role,
                    self.parentRegistry)

    def setCfg(self, cfg, origin, root, isV1Repository):
        """Set information about the cfg into the RepoData

        Parameters
        ----------
        cfg : RepositoryCfg
            The RepositoryCfg for the repo.
        origin : string
            'new', 'existing', or 'nested'
        root : string
            URI or absolute path to the location of the RepositoryCfg.yaml file.
        isV1Repository : bool
            True if this is an Old Butler repository without a RepositoryCfg file, False otherwise.

        Returns
        -------
        None
        """
        if origin not in ('new', 'existing', 'nested'):
            raise RuntimeError("Invalid value for origin:{}".format(origin))
        self.cfg = cfg
        self._cfgOrigin = origin
        self.cfgRoot = root
        self.isV1Repository = isV1Repository

    @property
    def cfgOrigin(self):
        return self._cfgOrigin

    @property
    def isNewRepository(self):
        return self.cfgOrigin == 'new'

    @property
    def role(self):
        return self._role

    @role.setter
    def role(self, val):
        if val not in ('input', 'output', 'parent'):
            raise RuntimeError("Invalid value for role: {}".format(val))
        self._role = val

    def getParentRepoDatas(self, context=None):
        """Get the parents & grandparents etc. of this repo data, in depth-first search order.

        Duplicate entries will be removed in cases where the same parent appears more than once in the
        parent graph.

        Parameters
        ----------
        context : set, optional
            Users should typically omit context and accept the default argument. Context is used to keep a
            set of known RepoDatas when calling this function recursively, for duplicate elimination.

        Returns
        -------
        list of RepoData
            A list of the parents & grandparents etc. of a given repo data, in depth-first search order.
        """
        if context is None:
            context = set()
        parents = []
        if id(self) in context:
            return parents
        context.add(id(self))
        for parent in self.parentRepoDatas:
            parents.append(parent)
            parents += parent.getParentRepoDatas(context)
        return parents
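
    # Illustrative sketch (not part of the original file): for RepoData instances a, b, c wired
    # so that a.parentRepoDatas == [b] and b.parentRepoDatas == [c]:
    #
    #     a.getParentRepoDatas()  # -> [b, c]: parent first, then grandparent (depth-first)
    #
    # The context set of already-visited ids is what keeps a cyclic parent graph from recursing
    # forever.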

    def addParentRepoData(self, parentRepoData):
        self.parentRepoDatas.append(parentRepoData)

    def addTags(self, tags):
        self.tags = self.tags.union(tags)


class RepoDataContainer(object):
    """Container object for RepoData instances owned by a Butler instance.

    Parameters
    ----------
    repoDataList : list of RepoData
        The RepoData instances to be stored in this container, in search order.
    """

    def __init__(self, repoDataList):
        self._inputs = None
        self._outputs = None
        self._all = repoDataList
        self._buildLookupLists()

    def inputs(self):
        """Get a list of RepoData that are used as inputs to the Butler.
        The list is built during Butler initialization and cached in this container.

        Returns
        -------
        A list of RepoData with readable repositories, in the order to be used when searching.
        """
        if self._inputs is None:
            raise RuntimeError("Inputs not yet initialized.")
        return self._inputs

    def outputs(self):
        """Get a list of RepoData that are used as outputs of the Butler.
        The list is built during Butler initialization and cached in this container.

        Returns
        -------
        A list of RepoData with writable repositories, in the order to be used when searching.
        """
        if self._outputs is None:
            raise RuntimeError("Outputs not yet initialized.")
        return self._outputs

    def all(self):
        """Get a list of all RepoData used by the Butler.

        Returns
        -------
        A list of all RepoData, in the order to be used when searching.
        """
        return self._all

    def __repr__(self):
        return "%s(_inputs=%r, \n_outputs=%s, \n_all=%s)" % (
            self.__class__.__name__,
            self._inputs,
            self._outputs,
            self._all)

    def _buildLookupLists(self):
        """Build the inputs and outputs lists based on the order of self.all()."""

        def addToList(repoData, lst):
            """Add a repoData and each of its parents (depth first) to a list"""
            if id(repoData) in alreadyAdded:
                return
            lst.append(repoData)
            alreadyAdded.add(id(repoData))
            for parent in repoData.parentRepoDatas:
                addToList(parent, lst)

        if self._inputs is not None or self._outputs is not None:
            raise RuntimeError("Lookup lists are already built.")
        inputs = [repoData for repoData in self.all() if repoData.role == 'input']
        outputs = [repoData for repoData in self.all() if repoData.role == 'output']
        self._inputs = []
        alreadyAdded = set()
        for repoData in outputs:
            if 'r' in repoData.repoArgs.mode:
                addToList(repoData.repoData, self._inputs)
        for repoData in inputs:
            addToList(repoData.repoData, self._inputs)
        self._outputs = [repoData.repoData for repoData in outputs]
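
    # A minimal sketch (assumed names, not part of the original file) of the search order this
    # produces: with one output repo O opened 'rw' whose parents are inputs I1 and I2,
    # inputs() returns [O, I1, I2] (a readable output is searched first on read), and
    # outputs() returns [O].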


class Butler(object):
    """Butler provides a generic mechanism for persisting and retrieving data using mappers.

    A Butler manages a collection of datasets known as a repository. Each dataset has a type representing
    its intended usage and a location. Note that the dataset type is not the same as the C++ or Python type
    of the object containing the data. For example, an ExposureF object might be used to hold the data for
    a raw image, a post-ISR image, a calibrated science image, or a difference image. These would all be
    different dataset types.

    A Butler can produce a collection of possible values for a key (or tuples of values for multiple keys)
    if given a partial data identifier. It can check for the existence of a file containing a dataset given
    its type and data identifier. The Butler can then retrieve the dataset. Similarly, it can persist an
    object to an appropriate location when given its associated data identifier.

    Note that the Butler has two more advanced features when retrieving a dataset. First, the retrieval is
    lazy. Input does not occur until the dataset is actually accessed. This allows datasets to be retrieved
    and placed on a clipboard prospectively with little cost, even if the algorithm of a stage ends up not
    using them. Second, the Butler will call a standardization hook upon retrieval of the dataset. This
    function, contained in the input mapper object, must perform any necessary manipulations to force the
    retrieved object to conform to standards, including translating metadata.

    Public methods:

    __init__(self, root, mapper=None, **mapperArgs)

    defineAlias(self, alias, datasetType)

    getKeys(self, datasetType=None, level=None)

    queryMetadata(self, datasetType, format=None, dataId={}, **rest)

    datasetExists(self, datasetType, dataId={}, **rest)

    get(self, datasetType, dataId={}, immediate=True, **rest)

    put(self, obj, datasetType, dataId={}, **rest)

    subset(self, datasetType, level=None, dataId={}, **rest)

    dataRef(self, datasetType, level=None, dataId={}, **rest)

    Initialization:

    The preferred method of initialization is to use the `inputs` and `outputs` __init__ parameters. These
    are described in the parameters section, below.

    For backward compatibility: this initialization method signature can take a posix root path, and
    optionally a mapper class instance or class type that will be instantiated using the mapperArgs input
    argument. However, for this to work in a backward compatible way it creates a single repository that is
    used as both an input and an output repository. This is NOT preferred, and will likely break any
    provenance system we have in place.

    Parameters
    ----------
    root : string
        .. note:: Deprecated in 12_0
                  `root` will be removed in TBD, it is replaced by `inputs` and `outputs` for
                  multiple-repository support.
        A file system path. Will only work with a PosixRepository.
    mapper : string or instance
        .. note:: Deprecated in 12_0
                  `mapper` will be removed in TBD, it is replaced by `inputs` and `outputs` for
                  multiple-repository support.
        Provides a mapper to be used with Butler.
    mapperArgs : dict
        .. note:: Deprecated in 12_0
                  `mapperArgs` will be removed in TBD, it is replaced by `inputs` and `outputs` for
                  multiple-repository support.
        Provides arguments to be passed to the mapper if the mapper input argument is a class type to be
        instantiated by Butler.
    inputs : RepositoryArgs, dict, or string
        Can be a single item or a list. Provides arguments to load an existing repository (or
        repositories). String is assumed to be a URI and is used as the cfgRoot (URI to the location of
        the cfg file). (A local file system URI does not have to start with 'file://' and in this way can
        be a relative path.) The `RepositoryArgs` class can be used to provide more parameters with which
        to initialize a repository (such as `mapper`, `mapperArgs`, `tags`, etc. See the `RepositoryArgs`
        documentation for more details). A dict may be used as shorthand for a `RepositoryArgs` class
        instance. The dict keys must match parameters to the `RepositoryArgs.__init__` function.
    outputs : RepositoryArgs, dict, or string
        Provides arguments to load one or more existing repositories or create new ones. The different
        types are handled the same as for `inputs`.

    The Butler init sequence loads all of the input and output repositories.
    This creates the object hierarchy to read from and write to them. Each
    repository can have 0 or more parents, which also get loaded as inputs.
    This becomes a DAG of repositories. Ultimately, Butler creates a list of
    these Repositories in the order that they are used.

    Initialization Sequence
    =======================

    During initialization Butler creates a Repository class instance & support structure for each object
    passed to `inputs` and `outputs` as well as the parent repositories recorded in the `RepositoryCfg` of
    each existing readable repository.

    This process is complex. It is explained below to shed some light on the intent of each step.

    1. Input Argument Standardization
    ---------------------------------

    In `Butler._processInputArguments` the input arguments are verified to be legal (and a RuntimeError is
    raised if not), and they are converted into an expected format that is used for the rest of the Butler
    init sequence. See the docstring for `_processInputArguments`.

    2. Create RepoData Objects
    --------------------------

    Butler uses an object, called `RepoData`, to keep track of information about each repository; each
    repository is contained in a single `RepoData`. The attributes are explained in its docstring.

    After `_processInputArguments`, a RepoData is instantiated and put in a list for each repository in
    `outputs` and `inputs`. This list of RepoData, the `repoDataList`, now represents all the output and
    input repositories (but not parent repositories) that this Butler instance will use.

    3. Get `RepositoryCfg`s
    -----------------------

    `Butler._getCfgs` gets the `RepositoryCfg` for each repository in the `repoDataList`. The behavior is
    described in the docstring.

    4. Add Parents
    --------------

    `Butler._addParents` then considers the parents list in the `RepositoryCfg` of each `RepoData` in the
    `repoDataList` and inserts new `RepoData` objects for each parent not represented in the proper
    location in the `repoDataList`. Ultimately a flat list is built to represent the DAG of readable
    repositories represented in depth-first order.

    5. Set and Verify Parents of Outputs
    ------------------------------------

    To be able to load parent repositories when output repositories are used as inputs, the input
    repositories are recorded as parents in the `RepositoryCfg` file of new output repositories. When an
    output repository already exists, for consistency the Butler's inputs must match the list of parents
    specified in the already-existing output repository's `RepositoryCfg` file.

    In `Butler._setAndVerifyParentsLists`, the list of parents is recorded in the `RepositoryCfg` of new
    repositories. For existing repositories the list of parents is compared with the `RepositoryCfg`'s
    parents list, and if they do not match a `RuntimeError` is raised.

    6. Set the Default Mapper
    -------------------------

    If all the input repositories use the same mapper then we can assume that mapper to be the
    "default mapper". If there are new output repositories whose `RepositoryArgs` do not specify a mapper
    and there is a default mapper then the new output repository will be set to use that default mapper.

    This is handled in `Butler._setDefaultMapper`.

    7. Cache References to Parent RepoDatas
    ---------------------------------------

    In `Butler._connectParentRepoDatas`, in each `RepoData` in `repoDataList`, a list of `RepoData` object
    references is built that matches the parents specified in that `RepoData`'s `RepositoryCfg`.

    This list is used later to find things in that repository's parents, without considering peer
    repositories' parents. (e.g. finding the registry of a parent)

    8. Set Tags
    -----------

    Tags are described at https://ldm-463.lsst.io/v/draft/#tagging

    In `Butler._setRepoDataTags`, for each `RepoData`, the tags specified by its `RepositoryArgs` are
    recorded in a set, and added to the tags set in each of its parents, for ease of lookup when mapping.

    9. Find Parent Registry and Instantiate RepoData
    ------------------------------------------------

    At this point there is enough information to instantiate the `Repository` instances. There is one
    final step before instantiating the Repository, which is to try to get a parent registry that can be
    used by the child repository. The criteria for "can be used" are spelled out in
    `Butler._setParentRegistry`. However, to get the registry from the parent, the parent must be
    instantiated. The `repoDataList`, in depth-first search order, is built so that the most-dependent
    repositories are first, and the least dependent repositories are last. So the `repoDataList` is
    reversed and the Repositories are instantiated in that order; for each RepoData a parent registry is
    searched for, and then the Repository is instantiated with whatever registry could be found."""
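
    # Example construction (an illustrative sketch, not part of the original file; the repository
    # paths and the mapper name are assumptions):
    #
    #     from lsst.daf.persistence import Butler, RepositoryArgs
    #
    #     # New Butler API: read from one repository, write to another.
    #     butler = Butler(inputs='/data/inputRepo',
    #                     outputs=RepositoryArgs(root='/data/outputRepo',
    #                                            mapper='lsst.obs.test.TestMapper'))
    #
    #     # Old Butler API (deprecated): a single root used as both input and output.
    #     butler = Butler(root='/data/repo')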

    def __init__(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
        self._initArgs = {'root': root, 'mapper': mapper, 'inputs': inputs, 'outputs': outputs,
                          'mapperArgs': mapperArgs}

        self.log = Log.getLogger("daf.persistence.butler")
        # Always use an empty Persistence policy until we can get rid of it
        persistencePolicy = pexPolicy.Policy()
        self.persistence = Persistence.getPersistence(persistencePolicy)
        # The alias dict must exist before defineAlias or _resolveDatasetTypeAlias is called.
        self.datasetTypeAliasDict = {}

        inputs, outputs = self._processInputArguments(
            root=root, mapper=mapper, inputs=inputs, outputs=outputs, **mapperArgs)

        # convert the RepoArgs into RepoData
        inputs = [RepoData(args, 'input') for args in inputs]
        outputs = [RepoData(args, 'output') for args in outputs]
        repoDataList = outputs + inputs

        self._getCfgs(repoDataList)

        self._addParents(repoDataList)

        self._setAndVerifyParentsLists(repoDataList)

        self._setDefaultMapper(repoDataList)

        self._connectParentRepoDatas(repoDataList)

        self._repos = RepoDataContainer(repoDataList)

        self._setRepoDataTags()

        for repoData in reversed(repoDataList):
            self._setParentRegistry(repoData)
            repoData.repo = Repository(repoData)

    def _setParentRegistry(self, repoData):
        """Try to get a parent registry that can be used by this repository. To be usable the repository
        must "match", meaning the mapper in the passed-in repo is the same type as the mapper in the
        parent.
        """

        def getParentRegistry(repoData, context):
            """Get the first found registry that matches the passed-in repo.

            Parameters
            ----------
            repoData : RepoData
                The RepoData for the repository for which we are searching for a
                parent registry.
            context : set
                The ids of RepoDatas that have already been searched, to guard against cycles in the
                parent graph.

            Returns
            -------
            Registry or None
                A registry from a parent if one can be found, or None.

            Raises
            ------
            RuntimeError
                Indicates a butler init order problem, all parents should be initialized before child
                repositories, so this function should be able to get any parent of any child repo.
            """
            if id(repoData) in context:
                return None
            else:
                context.add(id(repoData))
            for parentRepoData in repoData.getParentRepoDatas():
                if parentRepoData.cfg.mapper == repoData.cfg.mapper:
                    if parentRepoData.repo is None:
                        self.log.debug(
                            "_getParentRegistry: Parent {} of new repo {} not yet created, ignoring.".format(
                                parentRepoData, repoData))
                    else:
                        parentRegistry = parentRepoData.repo.getRegistry()
                        if parentRegistry:
                            return parentRegistry
                        else:
                            parentRegistry = getParentRegistry(parentRepoData, context)
                            if parentRegistry:
                                return parentRegistry
            return None

        repoData.repoData.parentRegistry = getParentRegistry(repoData.repoData, set())

    def _processInputArguments(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):
        """Process, verify, and standardize the input arguments.

        * Inputs can not be for Old Butler (root, mapper, mapperArgs) AND New Butler (inputs, outputs):
          `root`, `mapper`, and `mapperArgs` are Old Butler init API.
          `inputs` and `outputs` are New Butler init API.
          Old Butler and New Butler init API may not be mixed; Butler may be initialized with only the Old
          arguments or the New arguments.
        * Verify that if there is a readable output that there is exactly one output. (This restriction is
          in place because all readable repositories must be parents of writable repositories, and for
          consistency the DAG of readable repositories must always be the same. Keeping the list of parents
          becomes very complicated in the presence of multiple readable output repositories. It is better
          to only write to output repositories, and then create a new Butler instance and use the outputs
          as inputs, and write to new output repositories.)
        * Make a copy of inputs & outputs so they may be modified without changing the passed-in arguments.
        * Convert any input/output values that are URI strings to RepositoryArgs.
        * Listify inputs & outputs.
        * Set default RW mode on inputs & outputs as needed.

        Parameters
        ----------
        Same as Butler.__init__

        Returns
        -------
        (list of RepositoryArgs, list of RepositoryArgs)
            First item is a list to use as inputs.
            Second item is a list to use as outputs.

        Raises
        ------
        RuntimeError
            If Old Butler and New Butler arguments are both used this will raise.
            If an output is readable and there is more than one output this will raise.
        """
        # inputs and outputs may be modified, do not change the external value.
        inputs = copy.deepcopy(inputs)
        outputs = copy.deepcopy(outputs)

        isV1Args = inputs is None and outputs is None
        if isV1Args:
            inputs, outputs = self._convertV1Args(root=root,
                                                  mapper=mapper,
                                                  mapperArgs=mapperArgs or None)
        elif root or mapper or mapperArgs:
            raise RuntimeError(
                'Butler version 1 API (root, mapper, **mapperArgs) may ' +
                'not be used with version 2 API (inputs, outputs)')

        self.storage = Storage()

        # make sure inputs and outputs are lists, and if a list item is a string convert it to
        # RepositoryArgs.
        inputs = listify(inputs)
        outputs = listify(outputs)
        inputs = [RepositoryArgs(cfgRoot=args)
                  if not isinstance(args, RepositoryArgs) else args for args in inputs]
        outputs = [RepositoryArgs(cfgRoot=args)
                   if not isinstance(args, RepositoryArgs) else args for args in outputs]
        # Set the default value of inputs & outputs, verify the required values ('r' for inputs, 'w' for
        # outputs) and remove the 'w' from inputs if needed.
        for args in inputs:
            if args.mode is None:
                args.mode = 'r'
            elif 'rw' == args.mode:
                args.mode = 'r'
            elif 'r' != args.mode:
                raise RuntimeError("The mode of an input should be readable.")
        for args in outputs:
            if args.mode is None:
                args.mode = 'w'
            elif 'w' not in args.mode:
                raise RuntimeError("The mode of an output should be writable.")
        # check for class instances in args.mapper (not allowed)
        for args in inputs + outputs:
            if (args.mapper and not isinstance(args.mapper, basestring) and
                    not inspect.isclass(args.mapper)):
                self.log.warn(preinitedMapperWarning)
        # if the output is readable, there must be only one output:
        for o in outputs:
            if 'r' in o.mode:
                if len(outputs) > 1:
                    raise RuntimeError("Butler does not support multiple output repositories if any of "
                                       "the outputs are readable.")

        # Handle the case where the output is readable and is also passed in as one of the inputs by
        # removing the input. This supports a legacy use case in pipe_tasks where the input is also passed
        # as the output, to the command line parser.
        def inputIsInOutputs(inputArgs, outputArgsList):
            for o in outputArgsList:
                if ('r' in o.mode and
                        o.root == inputArgs.root and
                        o.mapper == inputArgs.mapper and
                        o.mapperArgs == inputArgs.mapperArgs and
                        o.tags == inputArgs.tags and
                        o.policy == inputArgs.policy):
                    self.log.debug(("Input repositoryArgs {} is also listed in outputs as readable; " +
                                    "throwing away the input.").format(inputArgs))
                    return True
            return False

        inputs = [args for args in inputs if not inputIsInOutputs(args, outputs)]
        return inputs, outputs
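
    # Illustrative sketch (not part of the original file; the paths are assumptions) of the
    # standardization performed above. Plain URI strings become RepositoryArgs with default modes:
    #
    #     inputs, outputs = butler._processInputArguments(inputs='/data/repoA',
    #                                                     outputs='/data/repoB')
    #     # inputs  -> [RepositoryArgs with cfgRoot='/data/repoA', mode 'r']
    #     # outputs -> [RepositoryArgs with cfgRoot='/data/repoB', mode 'w']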

    @staticmethod
    def _getParentVal(repoData):
        """Get the value of this repoData as it should appear in the parents
        list of other repositories"""
        if repoData.isV1Repository:
            return repoData.cfg
        if repoData.cfgOrigin == 'nested':
            return repoData.cfg
        else:
            return repoData.cfg.root

    @staticmethod
    def _getParents(ofRepoData, repoInfo):
        """Create a parents list of repoData from inputs and (readable) outputs."""
        parents = []
        # get the parents list of repoData:
        for repoData in repoInfo:
            if repoData is ofRepoData:
                continue
            if 'r' not in repoData.repoArgs.mode:
                continue
            parents.append(Butler._getParentVal(repoData))
        return parents

    @staticmethod
    def _getOldButlerRepositoryCfg(repositoryArgs):
        if not Storage.isPosix(repositoryArgs.cfgRoot):
            return None
        if not PosixStorage.v1RepoExists(repositoryArgs.cfgRoot):
            return None
        if not repositoryArgs.mapper:
            repositoryArgs.mapper = PosixStorage.getMapperClass(repositoryArgs.cfgRoot)
        cfg = RepositoryCfg.makeFromArgs(repositoryArgs)
        parent = PosixStorage.getParentSymlinkPath(repositoryArgs.cfgRoot)
        if parent:
            parent = Butler._getOldButlerRepositoryCfg(RepositoryArgs(cfgRoot=parent, mode='r'))
            if parent is not None:
                cfg.addParents([parent])
        return cfg

    def _getRepositoryCfg(self, repositoryArgs):
        """Try to get a repository from the location described by cfgRoot.

        Parameters
        ----------
        repositoryArgs : RepositoryArgs or string
            Provides arguments to load an existing repository (or repositories). String is assumed to be a
            URI and is used as the cfgRoot (URI to the location of the cfg file).

        Returns
        -------
        (RepositoryCfg or None, bool)
            The RepositoryCfg, or None if one cannot be found, and True if the RepositoryCfg was created by
            reading an Old Butler repository, or False if it is a New Butler Repository.
        """
        if not isinstance(repositoryArgs, RepositoryArgs):
            repositoryArgs = RepositoryArgs(cfgRoot=repositoryArgs, mode='r')

        cfg = self.storage.getRepositoryCfg(repositoryArgs.cfgRoot)
        isOldButlerRepository = False
        if cfg is None:
            cfg = Butler._getOldButlerRepositoryCfg(repositoryArgs)
            if cfg is not None:
                isOldButlerRepository = True
        return cfg, isOldButlerRepository

    def _getCfgs(self, repoDataList):
        """Get or make a RepositoryCfg for each RepoData, and add the cfg to the RepoData.

        If the cfg exists, compare values. If the values match then use the cfg as an "existing" cfg. If
        the values do not match, use the cfg as a "nested" cfg.
        If the cfg does not exist, the RepositoryArgs must be for a writable repository.

        Parameters
        ----------
        repoDataList : list of RepoData
            The RepoData that are outputs and inputs of this Butler

        Raises
        ------
        RuntimeError
            If the passed-in RepositoryArgs indicate an existing repository but other cfg parameters in
            those RepositoryArgs don't match the existing repository's cfg, a RuntimeError will be raised.
        """
        def cfgMatchesArgs(args, cfg):
            """Test if there are any values in a RepositoryArgs that conflict with the values in a cfg"""
            if args.mapper is not None and cfg.mapper != args.mapper:
                return False
            if args.mapperArgs is not None and cfg.mapperArgs != args.mapperArgs:
                return False
            if args.policy is not None and cfg.policy != args.policy:
                return False
            return True

        for repoData in repoDataList:
            cfg, isOldButlerRepository = self._getRepositoryCfg(repoData.repoArgs)
            if cfg is None:
                if 'w' not in repoData.repoArgs.mode:
                    raise RuntimeError(
                        "No cfg found for read-only input repository at {}".format(repoData.repoArgs.cfgRoot))
                repoData.setCfg(cfg=RepositoryCfg.makeFromArgs(repoData.repoArgs),
                                origin='new',
                                root=repoData.repoArgs.cfgRoot,
                                isV1Repository=isOldButlerRepository)
            else:
                if 'w' in repoData.repoArgs.mode:
                    # if it's an output repository, the RepositoryArgs must match the existing cfg.
                    if not cfgMatchesArgs(repoData.repoArgs, cfg):
                        raise RuntimeError(("The RepositoryArgs and RepositoryCfg must match for writable " +
                                            "repositories, RepositoryCfg:{}, RepositoryArgs:{}").format(
                                                cfg, repoData.repoArgs))
                    repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
                                    isV1Repository=isOldButlerRepository)
                else:
                    # if it's an input repository, the cfg can overwrite the in-repo cfg.
                    if cfgMatchesArgs(repoData.repoArgs, cfg):
                        repoData.setCfg(cfg=cfg, origin='existing', root=repoData.repoArgs.cfgRoot,
                                        isV1Repository=isOldButlerRepository)
                    else:
                        repoData.setCfg(cfg=cfg, origin='nested', root=None,
                                        isV1Repository=isOldButlerRepository)

    def _addParents(self, repoDataList):
        """For each repoData in the input list, see if its parents are the next items in the list, and if
        not, add the parent, so that the repoDataList includes parents and is ordered for depth-first
        traversal (0..n).

        Parameters
        ----------
        repoDataList : list of RepoData
            The RepoData for the Butler outputs + inputs.

        Raises
        ------
        RuntimeError
            Raised if a RepositoryCfg can not be found at a location where a parent repository should be.
        """
        repoDataIdx = 0
        while True:
            if repoDataIdx == len(repoDataList):
                break
            repoData = repoDataList[repoDataIdx]
            if 'r' not in repoData.repoArgs.mode:
                repoDataIdx += 1
                continue  # the repoData only needs parents if it's readable.
            if repoData.isNewRepository:
                repoDataIdx += 1
                continue  # if it's new the parents will be the inputs of this butler.
            if repoData.cfg.parents is None:
                repoDataIdx += 1
                continue  # if there are no parents then there's nothing to do.
            for repoParentIdx, repoParent in enumerate(repoData.cfg.parents):
                parentIdxInRepoDataList = repoDataIdx + repoParentIdx + 1
                if not isinstance(repoParent, RepositoryCfg):
                    repoParentCfg, isOldButlerRepository = self._getRepositoryCfg(repoParent)
                    if repoParentCfg is not None:
                        cfgOrigin = 'existing'
                else:
                    isOldButlerRepository = False
                    repoParentCfg = repoParent
                    cfgOrigin = 'nested'
                if (parentIdxInRepoDataList < len(repoDataList) and
                        repoDataList[parentIdxInRepoDataList].cfg == repoParentCfg):
                    continue
                args = RepositoryArgs(cfgRoot=repoParentCfg.root, mode='r')
                role = 'input' if repoData.role == 'output' else 'parent'
                newRepoInfo = RepoData(args, role)
                newRepoInfo.repoData.setCfg(cfg=repoParentCfg, origin=cfgOrigin, root=args.cfgRoot,
                                            isV1Repository=isOldButlerRepository)
                repoDataList.insert(parentIdxInRepoDataList, newRepoInfo)
            repoDataIdx += 1

    def _setAndVerifyParentsLists(self, repoDataList):
        """Make a list of all the input repositories of this Butler, these are the parents of the outputs.
        For new output repositories, set the parents in the RepositoryCfg. For existing output repositories
        verify that the RepositoryCfg's parents match the parents list.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            If an existing output repository is loaded and its parents do not match the parents of this
            Butler an error will be raised.
        """
        def getIOParents(ofRepoData, repoDataList):
            """make a parents list for the repo in `ofRepoData` that is composed of inputs and readable
            outputs (not parents-of-parents) of this butler"""
            parents = []
            for repoData in repoDataList:
                if repoData.role == 'parent':
                    continue
                if repoData is ofRepoData:
                    continue
                if repoData.role == 'output':
                    if 'r' in repoData.repoArgs.mode:
                        raise RuntimeError("If an output is readable it must be the only output.")
                    # and if this is the only output, this should have continued in
                    # "if repoData is ofRepoData"
                    continue
                parents.append(self._getParentVal(repoData))
            return parents

        for repoData in repoDataList:
            if repoData.role != 'output':
                continue
            parents = getIOParents(repoData, repoDataList)
            # if repoData is new, add the parent RepositoryCfgs to it.
            if repoData.cfgOrigin == 'new':
                repoData.cfg.addParents(parents)
            elif repoData.cfgOrigin in ('existing', 'nested'):
                if repoData.cfg.parents != parents:
                    try:
                        repoData.cfg.extendParents(parents)
                    except ParentsMismatch as e:
                        raise RuntimeError(("Inputs of this Butler:{} do not match parents of existing " +
                                            "writable cfg:{} (ParentsMismatch exception: {})").format(
                                                parents, repoData.cfg.parents, e))

    def _setDefaultMapper(self, repoDataList):
        """Establish a default mapper if there is one and assign it to outputs that do not have a mapper
        assigned.

        If all inputs have the same mapper it will be used as the default mapper.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            If a default mapper can not be established and there is an output that does not have a mapper.
        """
        needyOutputs = [rd for rd in repoDataList if rd.role == 'output' and rd.cfg.mapper is None]
        if len(needyOutputs) == 0:
            return
        mappers = set([rd.cfg.mapper for rd in repoDataList if rd.role == 'input'])
        if len(mappers) != 1:
            inputs = [rd for rd in repoDataList if rd.role == 'input']
            raise RuntimeError(
                ("No default mapper could be established from inputs:{} and no mapper specified " +
                 "for outputs:{}").format(inputs, needyOutputs))
        defaultMapper = mappers.pop()
        for repoData in needyOutputs:
            repoData.cfg.mapper = defaultMapper

    def _connectParentRepoDatas(self, repoDataList):
        """For each RepoData in repoDataList, find its parent in the repoDataList and cache a reference to
        it.

        Parameters
        ----------
        repoDataList : list of RepoData
            All the RepoDatas loaded by this butler, in search order.

        Raises
        ------
        RuntimeError
            When a parent is listed in the parents list but not found in the repoDataList. This is not
            expected to ever happen and would indicate an internal Butler error.
        """
        for repoData in repoDataList:
            for parent in repoData.cfg.parents:
                parentToAdd = None
                for otherRepoData in repoDataList:
                    if isinstance(parent, RepositoryCfg):
                        if otherRepoData.repoData.cfg == parent:
                            parentToAdd = otherRepoData.repoData
                            break
                    elif otherRepoData.repoData.cfg.root == parent:
                        parentToAdd = otherRepoData.repoData
                        break
                if parentToAdd is None:
                    raise RuntimeError(
                        "Could not find a parent matching {} to add to {}".format(parent, repoData))
                repoData.addParentRepoData(parentToAdd)

    @staticmethod
    def _getParentRepoData(parent, repoDataList):
        """get a parent RepoData from a cfg from a list of RepoData

        Parameters
        ----------
        parent : string or RepositoryCfg
            cfgRoot of a repo or a cfg that describes the repo
        repoDataList : list of RepoData
            list to search in

        Returns
        -------
        RepoData or None
            A RepoData if one can be found, else None
        """
        repoData = None
        for otherRepoData in repoDataList:
            if isinstance(parent, RepositoryCfg):
                if otherRepoData.cfg == parent:
                    repoData = otherRepoData
                    break
            elif otherRepoData.cfg.root == parent:
                repoData = otherRepoData
                break
        return repoData

    def _setRepoDataTags(self):
        """Set the tags from each repoArgs into all its parent repoArgs so that they can be included in
        tagged searches."""
        def setTags(repoData, tags, context):
            if id(repoData) in context:
                return
            repoData.addTags(tags)
            context.add(id(repoData))
            for parentRepoData in repoData.parentRepoDatas:
                setTags(parentRepoData, tags, context)
        for repoData in self._repos.outputs() + self._repos.inputs():
            setTags(repoData.repoData, repoData.repoArgs.tags, set())

    def _convertV1Args(self, root, mapper, mapperArgs):
        """Convert Old Butler RepositoryArgs (root, mapper, mapperArgs) to New Butler RepositoryArgs
        (inputs, outputs)

        Parameters
        ----------
        root : string
            Posix path to repository root
        mapper : class, class instance, or string
            Instantiated class, a class object to be instantiated, or a string that refers to a class that
            can be imported & used as the mapper.
        mapperArgs : dict
            RepositoryArgs & their values used when instantiating the mapper.

        Returns
        -------
        tuple
            (inputs, outputs) - values to be used for inputs and outputs in Butler.__init__
        """
        if (mapper and not isinstance(mapper, basestring) and
                not inspect.isclass(mapper)):
            self.log.warn(preinitedMapperWarning)
        inputs = None
        if root is None:
            if hasattr(mapper, 'root'):
                # in legacy repositories, the mapper may be given the root directly.
                root = mapper.root
            else:
                # in the past root=None could be used to mean root='.'
                root = '.'
        outputs = RepositoryArgs(mode='rw',
                                 root=root,
                                 mapper=mapper,
                                 mapperArgs=mapperArgs)
        return inputs, outputs

    def __repr__(self):
        return 'Butler(datasetTypeAliasDict=%s, repos=%s, persistence=%s)' % (
            self.datasetTypeAliasDict, self._repos, self.persistence)

    def _getDefaultMapper(self):
        """Get the default mapper. Currently this means if all the repositories use exactly the same
        mapper, that mapper may be considered the default.

        This definition may be changing; mappers may be able to exclude themselves as candidates for
        default, and they may nominate a different mapper instead. Also, we may not want to look at *all*
        the repositories, but only a depth-first search on each of the input & output repositories, and
        use the first-found mapper for each of those. TBD.

        Returns
        -------
        Mapper class or None
            Returns the class type of the default mapper, or None if a default
            mapper can not be determined.
        """
        defaultMapper = None

        for inputRepoData in self._repos.inputs():
            mapper = None
            if inputRepoData.cfg.mapper is not None:
                mapper = inputRepoData.cfg.mapper
                # if the mapper is:
                # * a string, import it.
                # * a class instance, get its class type
                # * a class, do nothing; use it
                if isinstance(mapper, basestring):
                    mapper = doImport(mapper)
                elif not inspect.isclass(mapper):
                    mapper = mapper.__class__
            # If no mapper has been found, note the first found mapper.
            # Then, if a mapper has been found and each next mapper matches it,
            # continue looking for mappers.
            # If a mapper has been found and another non-matching mapper is
            # found then we have no default, return None.
            if defaultMapper is None:
                defaultMapper = mapper
            elif mapper == defaultMapper:
                continue
            elif mapper is not None:
                return None
        return defaultMapper

    def _assignDefaultMapper(self, defaultMapper):
        for repoData in self._repos.all():
            if repoData.cfg.mapper is None and (repoData.isNewRepository or repoData.isV1Repository):
                if defaultMapper is None:
                    raise RuntimeError(
                        "No mapper specified for %s and no default mapper could be determined." %
                        repoData.repoArgs)
                repoData.cfg.mapper = defaultMapper

    @staticmethod
    def getMapperClass(root):
        """posix-only; gets the mapper class at the path specified by root (if a file _mapper can be found
        at that location or in a parent location).

        As we abstract the storage and support different types of storage locations this method will be
        moved entirely into Butler Access, or made more dynamic, and the API will very likely change."""
        return Storage.getMapperClass(root)

    def defineAlias(self, alias, datasetType):
        """Register an alias that will be substituted in datasetTypes.

        Parameters
        ----------
        alias - string
            The alias keyword. It may start with @ or not. It may not contain @ except as the first
            character.
        datasetType - string
            The string that will be substituted when @alias is passed into datasetType. It may not contain
            '@'.
        """
        # verify formatting of alias:
        # it can have '@' as the first character (if not it's okay, we will add it) or not at all.
        atLoc = alias.rfind('@')
        if atLoc == -1:
            alias = "@" + str(alias)
        elif atLoc > 0:
            raise RuntimeError("Badly formatted alias string: %s" % (alias,))

        # verify that datasetType does not contain '@'
        if datasetType.count('@') != 0:
            raise RuntimeError("Badly formatted type string: %s" % (datasetType))

        # verify that the alias keyword does not start with another alias keyword,
        # and vice versa
        for key in self.datasetTypeAliasDict:
            if key.startswith(alias) or alias.startswith(key):
                raise RuntimeError("Alias: %s overlaps with existing alias: %s" % (alias, key))

        self.datasetTypeAliasDict[alias] = datasetType
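
    # Example (illustrative, not part of the original file; the dataset type is an assumption):
    #
    #     butler.defineAlias('@calexp', 'deepCoadd_calexp')
    #     exposure = butler.get('@calexp', dataId={'tract': 0, 'patch': '1,1', 'filter': 'r'})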

    def getKeys(self, datasetType=None, level=None, tag=None):
        """Get the valid data id keys at or above the given level of hierarchy for the dataset type or the
        entire collection if None. The dict values are the basic Python types corresponding to the keys
        (int, float, string).

        Parameters
        ----------
        datasetType - string
            The type of dataset to get keys for, entire collection if None.
        level - string
            The hierarchy level to descend to. None if it should not be restricted. Use an empty string if
            the mapper should lookup the default level.
        tag - any, or list of any
            Any object that can be tested to be the same as the tag in a dataId passed into butler input
            functions. Applies only to input repositories: If tag is specified by the dataId then the repo
            will only be read from if the tag in the dataId matches a tag used for that repository.

        Returns
        -------
        Returns a dict. The dict keys are the valid data id keys at or above the given level of hierarchy
        for the dataset type or the entire collection if None. The dict values are the basic Python types
        corresponding to the keys (int, float, string).
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)

        keys = None
        tag = setify(tag)
        for repoData in self._repos.inputs():
            if not tag or len(tag.intersection(repoData.tags)) > 0:
                keys = repoData.repo.getKeys(datasetType, level)
                # An empty dict is a valid "found" condition for keys. The only value for keys that should
                # cause the search to continue is None
                if keys is not None:
                    break
        return keys

    def queryMetadata(self, datasetType, format, dataId={}, **rest):
        """Returns the valid values for one or more keys when given a partial
        input collection data id.

        Parameters
        ----------
        datasetType - string
            The type of dataset to inquire about.
        format - str, tuple
            Key or tuple of keys to be returned.
        dataId - DataId, dict
            The partial data id.
        **rest
            Keyword arguments for the partial data id.

        Returns
        -------
        A list of valid values or tuples of valid values as specified by the
        format.
        """

        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)
        format = sequencify(format)

        tuples = None
        for repoData in self._repos.inputs():
            if not dataId.tag or len(dataId.tag.intersection(repoData.tags)) > 0:
                tuples = repoData.repo.queryMetadata(datasetType, format, dataId)
                if tuples:
                    break

        if not tuples:
            return []

        if len(format) == 1:
            ret = []
            for x in tuples:
                try:
                    ret.append(x[0])
                except TypeError:
                    ret.append(x)
            return ret

        return tuples
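
    # Example (illustrative, not part of the original file; dataset type and keys are assumptions):
    #
    #     # single key: returns a flat list of values
    #     visits = butler.queryMetadata('raw', 'visit', dataId={'filter': 'r'})
    #     # multiple keys: returns a list of tuples
    #     pairs = butler.queryMetadata('raw', ('visit', 'ccd'))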

    def datasetExists(self, datasetType, dataId={}, **rest):
        """Determines if a dataset file exists.

        Parameters
        ----------
        datasetType - string
            The type of dataset to inquire about.
        dataId - DataId, dict
            The data id of the dataset.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        exists - bool
            True if the dataset exists or is non-file-based.
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)
        location = self._locate(datasetType, dataId, write=False)
        if location is None:
            return False

        # If the location is a ButlerComposite (as opposed to a ButlerLocation), verify the component
        # objects exist.
        if isinstance(location, ButlerComposite):
            for name, componentInfo in location.componentInfo.items():
                if componentInfo.subset:
                    subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
                    exists = all([obj.datasetExists() for obj in subset])
                else:
                    exists = self.datasetExists(componentInfo.datasetType, location.dataId)
                if exists is False:
                    break
        else:
            exists = location.repository.exists(location)
        return exists

    def _locate(self, datasetType, dataId, write):
        """Get one or more ButlerLocations and/or ButlerComposites.

        Parameters
        ----------
        datasetType : string
            The datasetType that is being searched for. The datasetType may be followed by a dot and
            a component name (component names are specified in the policy), i.e. datasetType.componentName

        dataId : dict or DataId class instance
            The dataId

        write : bool
            True if this is a search to write an object. False if it is a search to read an object. This
            affects what type (an object or a container) is returned.

        Returns
        -------
        If write is False, will return either a single object or None. If write is True, will return a
        list (which may be empty)
        """
        repos = self._repos.outputs() if write else self._repos.inputs()
        locations = []
        for repoData in repos:
            # enforce dataId & repository tags when reading:
            if not write and dataId.tag and len(dataId.tag.intersection(repoData.tags)) == 0:
                continue
            components = datasetType.split('.')
            datasetType = components[0]
            components = components[1:]
            try:
                location = repoData.repo.map(datasetType, dataId, write=write)
            except NoResults:
                continue
            if location is None:
                continue
            location.datasetType = datasetType  # todo is there a better way than monkey patching here?
            if len(components) > 0:
                if not isinstance(location, ButlerComposite):
                    raise RuntimeError("The location for a dotted datasetType must be a composite.")
                # replace the first component name with the datasetType
                components[0] = location.componentInfo[components[0]].datasetType
                # join components back into a dot-delimited string
                datasetType = '.'.join(components)
                location = self._locate(datasetType, dataId, write)
                # if a component location is not found, we can not continue with this repo, move to next
                # repo.
                if location is None:
                    break
            # if reading, only one location is desired.
            if location:
                if not write:
                    # If there is a bypass function for this dataset type, we can't test to see if the
                    # object exists in storage, because the bypass function may not actually use the
                    # location according to the template. Instead, execute the bypass function and include
                    # its results in the bypass attribute of the location. The bypass function may fail
                    # for any reason, the most common case being that a file does not exist. If it raises
                    # an exception we ignore its existence and proceed as though it does not exist.
                    if hasattr(location.mapper, "bypass_" + location.datasetType):
                        bypass = self._getBypassFunc(location, dataId)
                        try:
                            bypass = bypass()
                            location.bypass = bypass
                        except Exception:
                            pass
                    # If a location was found but the location does not exist, keep looking in input
                    # repositories (the registry may have had enough data for a lookup even though the
                    # object exists in a different repository.)
                    if (isinstance(location, ButlerComposite) or hasattr(location, 'bypass') or
                            location.repository.exists(location)):
                        return location
                else:
                    try:
                        locations.extend(location)
                    except TypeError:
                        locations.append(location)
        if not write:
            return None
        return locations

    @staticmethod
    def _getBypassFunc(location, dataId):
        pythonType = location.getPythonType()
        if pythonType is not None:
            if isinstance(pythonType, basestring):
                pythonType = doImport(pythonType)
        bypassFunc = getattr(location.mapper, "bypass_" + location.datasetType)
        return lambda: bypassFunc(location.datasetType, pythonType, location, dataId)

    def get(self, datasetType, dataId=None, immediate=True, **rest):
        """Retrieves a dataset given an input collection data id.

        Parameters
        ----------
        datasetType - string
            The type of dataset to retrieve.
        dataId - dict
            The data id.
        immediate - bool
            If False use a proxy for delayed loading.
        **rest
            keyword arguments for the data id.

        Returns
        -------
        An object retrieved from the dataset (or a proxy for one).
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        location = self._locate(datasetType, dataId, write=False)
        if location is None:
            raise NoResults("No locations for get:", datasetType, dataId)
        self.log.debug("Get type=%s keys=%s from %s", datasetType, dataId, str(location))

        if hasattr(location, 'bypass'):
            # this type loader block should get moved into a helper someplace, and duplications removed.
            callback = lambda: location.bypass
        else:
            callback = lambda: self._read(location)
        if location.mapper.canStandardize(location.datasetType):
            innerCallback = callback
            callback = lambda: location.mapper.standardize(location.datasetType, innerCallback(), dataId)
        if immediate:
            return callback()
        return ReadProxy(callback)
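
    # Example (illustrative, not part of the original file; dataset type and dataId are assumptions):
    #
    #     # immediate (the default): read now
    #     exposure = butler.get('raw', visit=1, ccd=3)
    #     # deferred: returns a ReadProxy; the file is read on first access
    #     proxy = butler.get('raw', dataId={'visit': 1, 'ccd': 3}, immediate=False)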

    def put(self, obj, datasetType, dataId={}, doBackup=False, **rest):
        """Persists a dataset given an output collection data id.

        Parameters
        ----------
        obj -
            The object to persist.
        datasetType - string
            The type of dataset to persist.
        dataId - dict
            The data id.
        doBackup - bool
            If True, rename existing instead of overwriting.
            WARNING: Setting doBackup=True is not safe for parallel processing, as it may be subject to
            race conditions.
        **rest
            Keyword arguments for the data id.
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        for location in self._locate(datasetType, dataId, write=True):
            if isinstance(location, ButlerComposite):
                disassembler = location.disassembler if location.disassembler else genericDisassembler
                disassembler(obj=obj, dataId=location.dataId, componentInfo=location.componentInfo)
                for name, info in location.componentInfo.items():
                    if not info.inputOnly:
                        self.put(info.obj, info.datasetType, location.dataId, doBackup=doBackup)
            else:
                if doBackup:
                    location.getRepository().backup(location.datasetType, dataId)
                location.getRepository().write(location, obj)
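
    # Example (illustrative, not part of the original file; dataset type and dataId are assumptions):
    #
    #     butler.put(exposure, 'calexp', dataId={'visit': 1, 'ccd': 3})
    #     # keep the previous version, if any, by renaming instead of overwriting:
    #     butler.put(exposure, 'calexp', dataId={'visit': 1, 'ccd': 3}, doBackup=True)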

    def subset(self, datasetType, level=None, dataId={}, **rest):
        """Return complete dataIds for a dataset type that match a partial (or empty) dataId.

        Given a partial (or empty) dataId specified in dataId and **rest, find all datasets that match the
        dataId. Optionally restrict the results to a given level specified by a dataId key (e.g. visit or
        sensor or amp for a camera). Return an iterable collection of complete dataIds as ButlerDataRefs.
        Datasets with the resulting dataIds may not exist; that needs to be tested with datasetExists().

        Parameters
        ----------
        datasetType - string
            The type of dataset collection to subset
        level - string
            The level of dataId at which to subset. Use an empty string if the mapper should look up the
            default level.
        dataId - dict
            The data id.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        subset - ButlerSubset
            Collection of ButlerDataRefs for datasets matching the data id.

        Examples
        --------
        To print the full dataIds for all r-band measurements in a source catalog
        (note that the subset call is equivalent to: `butler.subset('src', dataId={'filter':'r'})`):

        >>> subset = butler.subset('src', filter='r')
        >>> for data_ref in subset: print(data_ref.dataId)
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)

        # Currently expected behavior of subset is that if specified level is None then the mapper's
        # default level should be used. Convention for level within Butler is that an empty string is used
        # to indicate 'get default'.
        if level is None:
            level = ''

        dataId = DataId(dataId)
        dataId.update(**rest)
        return ButlerSubset(self, datasetType, level, dataId)

    def dataRef(self, datasetType, level=None, dataId={}, **rest):
        """Returns a single ButlerDataRef.

        Given a complete dataId specified in dataId and **rest, find the unique dataset at the given level
        specified by a dataId key (e.g. visit or sensor or amp for a camera) and return a ButlerDataRef.

        Parameters
        ----------
        datasetType - string
            The type of dataset collection to reference
        level - string
            The level of dataId at which to reference
        dataId - dict
            The data id.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        dataRef - ButlerDataRef
            ButlerDataRef for dataset matching the data id
        """

        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        subset = self.subset(datasetType, level, dataId, **rest)
        if len(subset) != 1:
            raise RuntimeError("No unique dataset for: Dataset type:%s Level:%s Data ID:%s Keywords:%s" %
                               (str(datasetType), str(level), str(dataId), str(rest)))
        return ButlerDataRef(subset, subset.cache[0])
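
    # Example (illustrative, not part of the original file; dataset type and dataId are assumptions):
    #
    #     ref = butler.dataRef('raw', dataId={'visit': 1, 'ccd': 3})
    #     exposure = ref.get('raw')  # read via the data reference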

    def _read(self, location):
        """Unpersist an object using data inside a ButlerLocation or ButlerComposite object.

        Parameters
        ----------
        location : ButlerLocation or ButlerComposite
            A ButlerLocation or ButlerComposite instance populated with data needed to read the object.

        Returns
        -------
        object
            An instance of the object specified by the location.
        """
        self.log.debug("Starting read from %s", location)

        if isinstance(location, ButlerComposite):
            for name, componentInfo in location.componentInfo.items():
                if componentInfo.subset:
                    subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
                    componentInfo.obj = [obj.get() for obj in subset]
                else:
                    obj = self.get(componentInfo.datasetType, location.dataId, immediate=True)
                    componentInfo.obj = obj
            assembler = location.assembler or genericAssembler
            results = assembler(dataId=location.dataId, componentInfo=location.componentInfo,
                                cls=location.python)
            return results
        else:
            results = location.repository.read(location)
            if len(results) == 1:
                results = results[0]
            self.log.debug("Ending read from %s", location)
            return results

    def __reduce__(self):
        ret = (_unreduce, (self._initArgs, self.datasetTypeAliasDict))
        return ret

    def _resolveDatasetTypeAlias(self, datasetType):
        """Replaces all the known alias keywords in the given string with the alias value.

        Parameters
        ----------
        datasetType - string
            A datasetType string to search & replace on

        Returns
        -------
        datasetType - string
            The de-aliased string
        """
        for key in self.datasetTypeAliasDict:
            # if all aliases have been replaced, bail out
            if datasetType.find('@') == -1:
                break
            datasetType = datasetType.replace(key, self.datasetTypeAliasDict[key])

        # If an alias specifier can not be resolved then throw.
        if datasetType.find('@') != -1:
            raise RuntimeError("Unresolvable alias specifier in datasetType: %s" % (datasetType))

        return datasetType
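
    # Example (illustrative, not part of the original file): with the alias registered in the
    # defineAlias example above,
    #
    #     butler._resolveDatasetTypeAlias('@calexp_md')
    #
    # substitutes '@calexp' and returns 'deepCoadd_calexp_md'.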


def _unreduce(initArgs, datasetTypeAliasDict):
    mapperArgs = initArgs.pop('mapperArgs')
    initArgs.update(mapperArgs)
    butler = Butler(**initArgs)
    butler.datasetTypeAliasDict = datasetTypeAliasDict
    return butler