#!/usr/bin/env python

#
# LSST Data Management System
# Copyright 2008-2015 LSST Corporation.
#
# This product includes software developed by the
# LSST Project (http://www.lsst.org/).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the LSST License Statement and
# the GNU General Public License along with this program. If not,
# see <http://www.lsstcorp.org/LegalNotices/>.
#

# -*- python -*-

"""This module defines the Butler class."""
from future import standard_library
standard_library.install_aliases()
from builtins import str
from past.builtins import basestring
from builtins import object

import collections
import copy
import inspect
import json
import os
import weakref

import yaml

from lsst.log import Log
import lsst.pex.policy as pexPolicy
from . import LogicalLocation, ReadProxy, ButlerSubset, ButlerDataRef, Persistence, \
    Storage, Policy, NoResults, Repository, DataId, RepositoryCfg, \
    RepositoryArgs, listify, setify, sequencify, doImport, ButlerComposite, genericAssembler, \
    genericDisassembler, PosixStorage

preinitedMapperWarning = ("Passing an instantiated mapper into "
                          "Butler.__init__ will prevent Butler from passing "
                          "parentRegistry or repositoryCfg information to "
                          "the mapper, which is done only at init time. "
                          "It is better to pass an importable string or "
                          "class object.")


class ButlerCfg(Policy, yaml.YAMLObject):
    """Represents a Butler configuration.

    .. warning::

        cfg is 'wet paint' and very likely to change. Use of it in production
        code other than via the 'old butler' API is strongly discouraged.
    """
    yaml_tag = u"!ButlerCfg"

    def __init__(self, cls, repoCfg):
        super(ButlerCfg, self).__init__({'repoCfg': repoCfg, 'cls': cls})


class RepoData(object):
    """Container object for repository data used by Butler

    Parameters
    ----------
    args - RepositoryArgs
        Arguments used to initialize self.repo
    cfg - RepositoryCfg
        Configuration of repository
    storedCfg - RepositoryCfg or None
        If the cfg at root and the RepositoryArgs don't match then a new cfg is kept in cfg and the cfg
        that was read from root is kept in storedCfg.
    isNewRepository - bool
        True if the repository is being created by this Butler, False if it already existed.
    isV1Repository - bool
        True if the repository is an 'old butler' V1 repository (one without a RepositoryCfg file).
    repo - Repository
        The repository class instance
    tags - set
        The tags that apply to this repository, if any
    """

    def __init__(self, args, cfg, storedCfg=None, isNewRepository=False, isV1Repository=True):
        self.args = args
        self.cfg = cfg
        self.storedCfg = storedCfg
        self.repo = None
        self.mode = args.mode
        # self.tags is used to keep track of *all* the applicable tags to the Repo, not just the tags in
        # the cfg (e.g. parents inherit their children's tags)
        self.tags = set()
        self.isNewRepository = isNewRepository
        self.isV1Repository = isV1Repository
        self.parentRegistry = None

    def __reduce__(self):
        # Pickle only the constructor arguments; repo, mode, and tags are
        # derived state that is rebuilt when the owning Butler is recreated.
        return (RepoData, (self.args, self.cfg, self.storedCfg, self.isNewRepository,
                           self.isV1Repository))

    def __repr__(self):
        s = "RepoData(args=%s cfg=%s repo=%s tags=%s isNewRepository=%s isV1Repository:%s parentRegistry:%s)"
        return s % (self.args, self.cfg, self.repo, self.tags, self.isNewRepository, self.isV1Repository,
                    self.parentRegistry)

    def addTags(self, tags):
        self.tags = self.tags.union(tags)


class RepoDataContainer(object):
    """Container object for RepoData instances owned by a Butler instance."""

    def __init__(self):
        self.byRepoRoot = {}  # {args.root: RepoData}
        self.byCfgRoot = {}  # {args.cfgRoot: RepoData}
        self._inputs = None
        self._outputs = None
        self._all = None  # {cfg.root: RepoData}

    def add(self, repoData):
        """Add a RepoData to the container

        Parameters
        ----------
        repoData - RepoData instance to add
        """
        self.byRepoRoot[repoData.cfg.root] = repoData
        self.byCfgRoot[repoData.args.cfgRoot] = repoData

    def inputs(self):
        """Get a list of RepoData that are used as inputs to the Butler.
        The list must have been built by _buildLookupLists before use.

        Returns
        -------
        A list of RepoData with readable repositories, in the order to be used when searching.
        """
        if self._inputs is None:
            raise RuntimeError("Inputs not yet initialized.")
        return self._inputs

    def outputs(self):
        """Get a list of RepoData that are used as outputs by the Butler.
        The list must have been built by _buildLookupLists before use.

        Returns
        -------
        A list of RepoData with writable repositories, in the order to be used when searching.
        """
        if self._outputs is None:
            raise RuntimeError("Outputs not yet initialized.")
        return self._outputs

    def all(self):
        """Get a dict of all RepoData used by the Butler.
        The dict must have been built by _buildLookupLists before use.

        Returns
        -------
        An OrderedDict of all RepoData, keyed by cfg.root, in the order to be used when searching.
        """
        if self._all is None:
            raise RuntimeError("The all list is not yet initialized.")
        return self._all

    def __repr__(self):
        return "%s(\nbyRepoRoot=%r, \nbyCfgRoot=%r, \n_inputs=%r, \n_outputs=%s, \n_all=%s)" % (
            self.__class__.__name__,
            self.byRepoRoot,
            self.byCfgRoot,
            self._inputs,
            self._outputs,
            self._all)

    def _buildLookupLists(self, inputs, outputs):
        """Build the lists of input, output, and all repo datas in lookup
        order.

        Parameters
        ----------
        inputs : list of RepositoryArgs
            The input RepositoryArgs, in order.
        outputs : list of RepositoryArgs
            The output RepositoryArgs, in order.

        Returns
        -------
        None
        """
        def addRepoDataToLists(repoData, inout):
            """Adds the cfg represented by repoData to the _all dict, as
            well as the _inputs or _outputs list, as indicated by inout. Then,
            adds all the parents of the cfg to the lists."""
            if inout not in ('in', 'out', 'ref'):
                raise RuntimeError("'inout' must be 'in', 'out', or 'ref', not '%s'" % inout)
            if repoData.cfg.root not in self._all:
                self._all[repoData.cfg.root] = repoData
            if inout == 'in' and repoData not in self._inputs:
                self._inputs.append(repoData)
            elif inout == 'out' and repoData not in self._outputs:
                self._outputs.append(repoData)
                if 'r' in repoData.args.mode:
                    self._inputs.append(repoData)
            for parent in repoData.cfg.parents:
                addParentAs = 'in' if 'r' in repoData.args.mode and inout != 'ref' else 'ref'
                addRepoDataToLists(self.byRepoRoot[parent], addParentAs)

        self._all = collections.OrderedDict()
        self._inputs = []
        self._outputs = []

        for repoArgs in outputs:
            repoData = self.byCfgRoot[repoArgs.cfgRoot]
            addRepoDataToLists(repoData, 'out')
        for repoArgs in inputs:
            repoData = self.byCfgRoot[repoArgs.cfgRoot]
            addRepoDataToLists(repoData, 'in')


class Butler(object):
    """Butler provides a generic mechanism for persisting and retrieving data using mappers.

    A Butler manages a collection of datasets known as a repository. Each dataset has a type representing
    its intended usage and a location. Note that the dataset type is not the same as the C++ or Python type
    of the object containing the data. For example, an ExposureF object might be used to hold the data for
    a raw image, a post-ISR image, a calibrated science image, or a difference image. These would all be
    different dataset types.

    A Butler can produce a collection of possible values for a key (or tuples of values for multiple keys)
    if given a partial data identifier. It can check for the existence of a file containing a dataset given
    its type and data identifier. The Butler can then retrieve the dataset. Similarly, it can persist an
    object to an appropriate location when given its associated data identifier.

    Note that the Butler has two more advanced features when retrieving a data set. First, the retrieval is
    lazy. Input does not occur until the data set is actually accessed. This allows datasets to be retrieved
    and placed on a clipboard prospectively with little cost, even if the algorithm of a stage ends up not
    using them. Second, the Butler will call a standardization hook upon retrieval of the dataset. This
    function, contained in the input mapper object, must perform any necessary manipulations to force the
    retrieved object to conform to standards, including translating metadata.

    Public methods:

    __init__(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs)

    defineAlias(self, alias, datasetType)

    getKeys(self, datasetType=None, level=None, tag=None)

    queryMetadata(self, datasetType, format=None, dataId={}, **rest)

    datasetExists(self, datasetType, dataId={}, **rest)

    get(self, datasetType, dataId=None, immediate=True, **rest)

    put(self, obj, datasetType, dataId={}, doBackup=False, **rest)

    subset(self, datasetType, level=None, dataId={}, **rest)

    dataRef(self, datasetType, level=None, dataId={}, **rest)

    Initialization:

    The preferred method of initialization is to pass in a RepositoryArgs instance, or a list of
    RepositoryArgs, to inputs and/or outputs.

    For backward compatibility: this initialization method signature can take a posix root path, and
    optionally a mapper class instance or class type that will be instantiated using the mapperArgs input
    argument. However, for this to work in a backward compatible way it creates a single repository that is
    used as both an input and an output repository. This is NOT preferred, and will likely break any
    provenance system we have in place.

    Parameters
    ----------
    root - string
        .. note:: Deprecated in 12_0
            `root` will be removed in TBD, it is replaced by `inputs` and `outputs` for
            multiple-repository support.
        A filesystem path. Will only work with a PosixRepository.
    mapper - string or instance
        .. note:: Deprecated in 12_0
            `mapper` will be removed in TBD, it is replaced by `inputs` and `outputs` for
            multiple-repository support.
        Provides a mapper to be used with Butler.
    mapperArgs - dict
        .. note:: Deprecated in 12_0
            `mapperArgs` will be removed in TBD, it is replaced by `inputs` and `outputs` for
            multiple-repository support.
        Provides arguments to be passed to the mapper if the mapper input arg is a class type to be
        instantiated by Butler.
    inputs - RepositoryArgs or string
        Can be a single item or a list. Provides arguments to load an existing repository (or
        repositories). A string is assumed to be a URI and is used as the cfgRoot (URI to the location of
        the cfg file). (A local filesystem URI does not have to start with 'file://' and in this way can
        be a relative path.)
    outputs - RepositoryArgs or string
        Can be a single item or a list. Provides arguments to load one or more existing repositories or
        create new ones. A string is assumed to be a URI and is used as the repository root.
    """

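    # Usage sketch (illustrative, not part of the original source; the
    # repository paths below are hypothetical): construct a Butler with the
    # preferred V2 arguments, reading from one repository and writing to
    # another.
    #
    #     from lsst.daf.persistence import Butler, RepositoryArgs
    #
    #     butler = Butler(inputs='/data/input_repo',
    #                     outputs=RepositoryArgs(root='/data/output_repo', mode='w'))
    #
    # The deprecated V1 form Butler(root='/data/repo') still works, but creates
    # a single repository used as both input and output.
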
    def __init__(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):

        self.log = Log.getLogger("daf.persistence.butler")

        self._initArgs = {'root': root, 'mapper': mapper, 'inputs': inputs, 'outputs': outputs,
                          'mapperArgs': mapperArgs}
        # inputs and outputs may be modified, do not change the external value.
        inputs = copy.deepcopy(inputs)
        outputs = copy.deepcopy(outputs)

        isV1Args = inputs is None and outputs is None
        if isV1Args:
            inputs, outputs = self._convertV1Args(root=root, mapper=mapper, mapperArgs=mapperArgs)
        elif root or mapper or mapperArgs:
            raise RuntimeError(
                'Butler version 1 API (root, mapper, **mapperArgs) may ' +
                'not be used with version 2 API (inputs, outputs)')

        # map of alias keyword to datasetType, used by defineAlias and
        # _resolveDatasetTypeAlias.
        self.datasetTypeAliasDict = {}

        # make sure inputs and outputs are lists, and if a list item is a string convert it to
        # RepositoryArgs.
        inputs = listify(inputs)
        outputs = listify(outputs)
        inputs = [RepositoryArgs(cfgRoot=args)
                  if not isinstance(args, RepositoryArgs) else args for args in inputs]
        outputs = [RepositoryArgs(cfgRoot=args)
                   if not isinstance(args, RepositoryArgs) else args for args in outputs]
        # Set default rw modes on input and output args as needed
        for args in inputs:
            if args.mode is None:
                args.mode = 'r'
            elif 'r' not in args.mode:
                raise RuntimeError("The mode of an input should be readable.")
        for args in outputs:
            if args.mode is None:
                args.mode = 'w'
            elif 'w' not in args.mode:
                raise RuntimeError("The mode of an output should be writable.")
        # check for class instances in args.mapper (not allowed)
        for args in inputs + outputs:
            if (args.mapper and not isinstance(args.mapper, basestring) and
                    not inspect.isclass(args.mapper)):
                self.log.warn(preinitedMapperWarning)

        # Always use an empty Persistence policy until we can get rid of it
        persistencePolicy = pexPolicy.Policy()
        self.persistence = Persistence.getPersistence(persistencePolicy)

        self._createRepoDatas(inputs, outputs)

        self._repos._buildLookupLists(inputs, outputs)

        self._setRepoDataTags()

        defaultMapper = self._getDefaultMapper()
        self._assignDefaultMapper(defaultMapper)

        for repoData in reversed(list(self._repos.all().values())):
            parentRegistry = self._getParentRegistry(repoData)
            repoData.parentRegistry = parentRegistry
            repoData.repo = Repository(repoData)

    def _getParentRegistry(self, repoData):
        """Get the first found registry that matches the passed-in repo.

        "Matches" means the mapper in the passed-in repo is the same type as
        the mapper in the parent.

        Parameters
        ----------
        repoData : RepoData
            The RepoData for the repository for which we are searching for a
            parent registry.

        Returns
        -------
        Registry or None
            A registry from a parent if one can be found, or None.

        Raises
        ------
        RuntimeError
            Indicates a butler init order problem; all parents should be initialized before child
            repositories, so this function should be able to get any parent of any child repo.
        """
        registry = None
        for parentRepoData in self._getParentRepoDatas(repoData):
            if parentRepoData.cfg.mapper == repoData.cfg.mapper:
                if not parentRepoData.repo:
                    raise RuntimeError("Parent repo should be initialized before child repos.")
                registry = parentRepoData.repo.getRegistry()
                if registry:
                    break
        return registry

    def _getParentRepoDatas(self, repoData):
        """Get the parents, grandparents, etc. of a given repo data, in depth-first search order.

        Parameters
        ----------
        repoData : RepoData instance
            The RepoData whose parents should be retrieved.

        Returns
        -------
        generator of RepoData
            The parents, grandparents, etc. of the given repo data, in depth-first search order.
        """
        for parentCfgRoot in repoData.cfg.parents:
            parentRepoData = self._repos.byCfgRoot[parentCfgRoot]
            yield parentRepoData
            for parentRepoData in self._getParentRepoDatas(parentRepoData):
                yield parentRepoData

    def _setRepoDataTags(self):
        """Set the tags from each repoArgs into all its parent repoArgs so that they can be included in
        tagged searches.

        Returns
        -------
        None
        """
        def setTags(butler, repoData, tags):
            tags.update(repoData.args.tags)
            repoData.addTags(tags)
            for parent in repoData.cfg.parents:
                setTags(butler, butler._repos.byRepoRoot[parent], copy.copy(tags))

        for repoData in self._repos.all().values():
            setTags(self, repoData, set())

    def _createRepoData(self, args, inout, instanceParents):
        """Make a RepoData object for args, adding it to the RepoDataContainer.

        Parameters
        ----------
        args : RepositoryArgs
            A RepositoryArgs that describes a new or existing Repository.
        inout : 'in' or 'out'
            Indicates if this Repository should be used by the Butler as an input or an output.
        instanceParents : list of string
            URI/path to the RepositoryCfg of parents in this instance of Butler; inputs and readable
            outputs (but not their parents; grandparents are looked up when the parents are loaded).

        Returns
        -------
        None
        """
        def parentListWithoutThis(root, instanceParents):
            """instanceParents is typically all the inputs to butler. If 'this' root is in that list
            (because this repo is writable) then remove it, as a repo is never its own parent."""
            parents = copy.copy(instanceParents)
            try:
                parents.remove(args.cfgRoot)
            except ValueError:
                pass
            return parents

        # if only a string is passed for inputs or outputs, the assumption is that it's a URI;
        # place it in a RepositoryArgs instance; cfgRoot for inputs, root for outputs.
        if inout not in ('in', 'out'):
            raise RuntimeError("inout must be either 'in' or 'out'")
        # if we already have RepoData for these repoArgs, we're done with that repo and its parents.
        if args.cfgRoot in self._repos.byCfgRoot:
            return
        # Get the RepositoryCfg, if it exists:
        cfg = Storage.getRepositoryCfg(args.cfgRoot)
        # Handle the case where the Repository exists and contains a RepositoryCfg file:
        if cfg:
            if not cfg.matchesArgs(args):
                raise RuntimeError("Persisted repo cfg does not match input args. cfg:%s, args:%s"
                                   % (cfg, args))
            # need to fix intermediate cfgs
            # storedCfg = cfg
            # cfg = RepositoryCfg.makeFromArgs(args)
            parents = parentListWithoutThis(args.cfgRoot, instanceParents)
            if inout == 'out' and cfg.parents != parents:
                raise RuntimeError(
                    "Persisted repo cfg parents do not match butler parents: cfg:%s, parents:%s"
                    % (cfg, instanceParents))
            else:
                storedCfg = None
            repoData = RepoData(args=args, cfg=cfg, storedCfg=storedCfg)
            self._repos.add(repoData)
            for parentArgs in cfg.parents:
                self._createRepoData(RepositoryArgs(parentArgs, mode='r'), 'in', instanceParents)
        # Handle the case where a RepositoryCfg file does not exist:
        else:
            # Posix repos might be Butler V1 Repos, which requires special handling:
            if Storage.isPosix(args.cfgRoot):
                v1RepoExists = PosixStorage.v1RepoExists(args.cfgRoot)
                if not v1RepoExists and inout == 'in':
                    msg = "Input repositories must exist; no repo found at " \
                          "%s. (A Butler V1 Repository 'exists' if the root " \
                          "folder exists AND contains items.)" % args.cfgRoot
                    raise RuntimeError(msg)
                if inout == 'out' and not v1RepoExists:
                    parents = parentListWithoutThis(args.cfgRoot, instanceParents)
                else:
                    parents = None
                if v1RepoExists:
                    if not args.mapper:
                        args.mapper = PosixStorage.getMapperClass(args.cfgRoot)
                cfg = RepositoryCfg.makeFromArgs(args, parents)
                repoData = RepoData(args=args, cfg=cfg, isNewRepository=not v1RepoExists,
                                    isV1Repository=v1RepoExists)
                self._repos.add(repoData)
                if v1RepoExists:
                    parent = PosixStorage.getParentSymlinkPath(args.cfgRoot)
                    if parent:
                        parent = PosixStorage.absolutePath(args.cfgRoot, parent)
                        cfg.addParents(parent)
                        self._createRepoData(RepositoryArgs(parent, mode='r'), 'in', instanceParents)
            # Do not need to check for Butler V1 Repos in non-posix Storages:
            else:
                if inout == 'in':
                    msg = "Input repositories must exist; no repo found at " \
                          "%s." % args.cfgRoot
                    raise RuntimeError(msg)
                parents = parentListWithoutThis(args.cfgRoot, instanceParents)
                cfg = RepositoryCfg.makeFromArgs(args, parents)
                repoData = RepoData(args=args, cfg=cfg, isNewRepository=True)
                self._repos.add(repoData)

    @staticmethod
    def _getParentsList(inputs, outputs):
        parents = []
        # The parents of readable output repositories are handled as though they were passed to butler
        # as inputs.
        # When we add remote storage types, getting the repositoryCfg here and again later in
        # _createRepoData may be slow. We could fetch & cache if needed.
        for args in outputs:
            if 'r' in args.mode and args.cfgRoot not in parents:
                parents.append(args.cfgRoot)
                cfg = Storage.getRepositoryCfg(args.cfgRoot)
                if cfg:
                    for parent in cfg.parents:
                        if parent not in parents:
                            parents.append(parent)
        for args in inputs:
            if args.cfgRoot not in parents:
                parents.append(args.cfgRoot)
        return parents

    def _createRepoDatas(self, inputs, outputs):
        """Create the RepoDataContainer and put a RepoData object in it for each repository listed in
        inputs and outputs, as well as each parent of each repository.

        After this function runs, there will be a RepoData for any Repository that may be used by this
        Butler instance.

        Parameters
        ----------
        inputs : list of RepositoryArgs
            Repositories to be used by the Butler as input repositories.
        outputs : list of RepositoryArgs
            Repositories to be used by the Butler as output repositories.

        Returns
        -------
        None
        """
        try:
            if self._repos:
                raise RuntimeError("Must not call _createRepoDatas twice.")
        except AttributeError:
            pass
        self._repos = RepoDataContainer()
        parents = self._getParentsList(inputs, outputs)

        for outputArgs in outputs:
            self._createRepoData(outputArgs, 'out', parents)
        for inputArgs in inputs:
            self._createRepoData(inputArgs, 'in', parents)

    def _convertV1Args(self, root, mapper, mapperArgs):
        """Convert Butler V1 args (root, mapper, mapperArgs) to V2 args (inputs, outputs).

        Parameters
        ----------
        root : string
            Posix path to repository root
        mapper : class, class instance, or string
            Instantiated class, a class object to be instantiated, or a string that refers to a class
            that can be imported & used as the mapper.
        mapperArgs : dict
            Args & their values used when instantiating the mapper.

        Returns
        -------
        tuple
            (inputs, outputs) - values to be used for inputs and outputs in Butler.__init__
        """
        if (mapper and not isinstance(mapper, basestring) and
                not inspect.isclass(mapper)):
            self.log.warn(preinitedMapperWarning)
        inputs = None
        if root is None:
            if hasattr(mapper, 'root'):
                # in legacy repos, the mapper may be given the root directly.
                root = mapper.root
            else:
                # in the past root="None" could be used to mean root='.'
                root = '.'
        outputs = RepositoryArgs(mode='rw',
                                 root=root,
                                 mapper=mapper,
                                 mapperArgs=mapperArgs)
        return inputs, outputs

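    # Illustrative equivalence (not in the original source; the path is
    # hypothetical): the deprecated V1 call
    #
    #     Butler(root='/data/repo')
    #
    # is converted by _convertV1Args into the V2 form
    #
    #     Butler(inputs=None,
    #            outputs=RepositoryArgs(mode='rw', root='/data/repo',
    #                                   mapper=None, mapperArgs={}))
    #
    # i.e. a single read-write repository used as both input and output.
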
    def __repr__(self):
        return 'Butler(datasetTypeAliasDict=%s, repos=%s, persistence=%s)' % (
            self.datasetTypeAliasDict, self._repos, self.persistence)

    def _getDefaultMapper(self):
        """Get the default mapper. Currently this means if all the repos use
        exactly the same mapper, that mapper may be considered the default.

        This definition may be changing; mappers may be able to exclude
        themselves as candidates for default, and they may nominate a different
        mapper instead. Also, we may not want to look at *all* the repos, but
        only a depth-first search on each of the input & output repos, and
        use the first-found mapper for each of those. TBD.

        Returns
        -------
        Mapper class or None
            Returns the class type of the default mapper, or None if a default
            mapper can not be determined.
        """
        defaultMapper = None

        for inputRepoData in self._repos.inputs():
            mapper = None
            if inputRepoData.cfg.mapper is not None:
                mapper = inputRepoData.cfg.mapper
                # if the mapper is:
                # * a string, import it.
                # * a class instance, get its class type
                # * a class, do nothing; use it
                if isinstance(mapper, basestring):
                    mapper = doImport(mapper)
                elif not inspect.isclass(mapper):
                    mapper = mapper.__class__
            # If no mapper has been found, note the first found mapper.
            # Then, if a mapper has been found and each next mapper matches it,
            # continue looking for mappers.
            # If a mapper has been found and another non-matching mapper is
            # found then we have no default, return None.
            if defaultMapper is None:
                defaultMapper = mapper
            elif mapper == defaultMapper:
                continue
            elif mapper is not None:
                return None
        return defaultMapper

    def _assignDefaultMapper(self, defaultMapper):
        for repoData in self._repos.all().values():
            if repoData.cfg.mapper is None and (repoData.isNewRepository or repoData.isV1Repository):
                if defaultMapper is None:
                    raise RuntimeError(
                        "No mapper specified for %s and no default mapper could be determined." %
                        repoData.args)
                repoData.cfg.mapper = defaultMapper

    @staticmethod
    def getMapperClass(root):
        """Posix-only; gets the mapper class at the path specified by root (if a file _mapper can be
        found at that location or in a parent location).

        As we abstract the storage and support different types of storage locations this method will be
        moved entirely into Butler Access, or made more dynamic, and the API will very likely change."""
        return Storage.getMapperClass(root)

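    # Usage sketch (illustrative; the path is hypothetical): look up the
    # mapper class recorded in a posix repository before constructing a Butler.
    #
    #     mapperClass = Butler.getMapperClass('/data/repo')
    #     butler = Butler(inputs=RepositoryArgs(root='/data/repo', mapper=mapperClass))
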
    def defineAlias(self, alias, datasetType):
        """Register an alias that will be substituted in datasetTypes.

        Parameters
        ----------
        alias - str
            The alias keyword. It may start with @ or not. It may not contain @ except as the first
            character.
        datasetType - str
            The string that will be substituted when @alias is passed into datasetType. It may not
            contain '@'.
        """
        # verify formatting of alias:
        # it may have '@' as the first character (if not, we will add it), but nowhere else.
        atLoc = alias.rfind('@')
        if atLoc == -1:
            alias = "@" + str(alias)
        elif atLoc > 0:
            raise RuntimeError("Badly formatted alias string: %s" % (alias,))

        # verify that datasetType does not contain '@'
        if datasetType.count('@') != 0:
            raise RuntimeError("Badly formatted type string: %s" % (datasetType))

        # verify that the alias keyword does not start with another alias keyword,
        # and vice versa
        for key in self.datasetTypeAliasDict:
            if key.startswith(alias) or alias.startswith(key):
                raise RuntimeError("Alias: %s overlaps with existing alias: %s" % (alias, key))

        self.datasetTypeAliasDict[alias] = datasetType

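    # Usage sketch (illustrative; the dataset type names are hypothetical):
    # once an alias is defined, '@' keywords in datasetType arguments are
    # expanded by _resolveDatasetTypeAlias in every Butler method.
    #
    #     butler.defineAlias('cal', 'calexp')
    #     exposure = butler.get('@cal', dataId={'visit': 1})   # reads 'calexp'
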
    def getKeys(self, datasetType=None, level=None, tag=None):
        """Get the valid data id keys at or above the given level of hierarchy for the dataset type or
        the entire collection if None. The dict values are the basic Python types corresponding to the
        keys (int, float, str).

        Parameters
        ----------
        datasetType - str
            The type of dataset to get keys for, entire collection if None.
        level - str
            The hierarchy level to descend to. None if it should not be restricted. Use an empty string
            if the mapper should lookup the default level.
        tag - any, or list of any
            Any object that can be tested to be the same as the tag in a dataId passed into butler input
            functions. Applies only to input repositories: if a tag is specified then a repo will only be
            read from if the tag matches a tag used for that repository.

        Returns
        -------
        Returns a dict. The dict keys are the valid data id keys at or above the given level of hierarchy
        for the dataset type or the entire collection if None. The dict values are the basic Python types
        corresponding to the keys (int, float, str).
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)

        keys = None
        tag = setify(tag)
        for repoData in self._repos.inputs():
            if not tag or len(tag.intersection(repoData.tags)) > 0:
                keys = repoData.repo.getKeys(datasetType, level)
                # An empty dict is a valid "found" condition for keys. The only value for keys that
                # should cause the search to continue is None
                if keys is not None:
                    break
        return keys

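    # Usage sketch (illustrative; the dataset type and key names depend on the
    # mapper in use):
    #
    #     keys = butler.getKeys('raw')
    #     # e.g. {'visit': int, 'ccd': int, 'filter': str}
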
    def queryMetadata(self, datasetType, format=None, dataId={}, **rest):
        """Returns the valid values for one or more keys when given a partial
        input collection data id.

        Parameters
        ----------
        datasetType - str
            The type of dataset to inquire about.
        format - str, tuple
            A key or tuple of keys whose values are to be returned.
        dataId - DataId, dict
            The partial data id.
        **rest -
            Keyword arguments for the partial data id.

        Returns
        -------
        A list of valid values or tuples of valid values as specified by the format, at the granularity
        of the given keys.
        """

        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        if format is None:
            raise ValueError("queryMetadata requires 'format' to name the key(s) whose values "
                             "are to be returned.")
        format = sequencify(format)

        tuples = None
        for repoData in self._repos.inputs():
            if not dataId.tag or len(dataId.tag.intersection(repoData.tags)) > 0:
                tuples = repoData.repo.queryMetadata(datasetType, format, dataId)
                if tuples:
                    break

        if not tuples:
            return []

        if len(format) == 1:
            ret = []
            for x in tuples:
                try:
                    ret.append(x[0])
                except TypeError:
                    ret.append(x)
            return ret

        return tuples

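    # Usage sketch (illustrative; the dataset type, keys, and dataId values
    # depend on the mapper and registry in use):
    #
    #     visits = butler.queryMetadata('raw', format='visit',
    #                                   dataId={'filter': 'r'})
    #     # e.g. [902386, 902388, ...]
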
    def datasetExists(self, datasetType, dataId={}, **rest):
        """Determines if a dataset file exists.

        Parameters
        ----------
        datasetType - str
            The type of dataset to inquire about.
        dataId - DataId, dict
            The data id of the dataset.
        **rest -
            Keyword arguments for the data id.

        Returns
        -------
        exists - bool
            True if the dataset exists or is non-file-based.
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        location = None
        for repoData in self._repos.inputs():
            if not dataId.tag or len(dataId.tag.intersection(repoData.tags)) > 0:
                location = repoData.repo.map(datasetType, dataId)
                if location and location.repository.exists(location):
                    break
                else:
                    location = None

        return bool(location)

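    # Usage sketch (illustrative; the dataId keys are hypothetical):
    #
    #     if butler.datasetExists('calexp', visit=1, ccd=2):
    #         exposure = butler.get('calexp', visit=1, ccd=2)
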
    def _locate(self, datasetType, dataId, write):
        """Get one or more ButlerLocations and/or ButlerComposites.

        Parameters
        ----------
        datasetType : string
            The datasetType that is being searched for. The datasetType may be followed by a dot and
            a component name (component names are specified in the policy), i.e.
            datasetType.componentName

        dataId : dict or DataId class instance
            The dataId

        write : bool
            True if this is a search to write an object. False if it is a search to read an object. This
            affects what type (an object or a container) is returned.

        Returns
        -------
        If write is False, will return either a single object or None. If write is True, will return a
        list (which may be empty)
        """
        repos = self._repos.outputs() if write else self._repos.inputs()
        locations = []
        for repoData in repos:
            # enforce dataId & repository tags when reading:
            if not write and dataId.tag and len(dataId.tag.intersection(repoData.tags)) == 0:
                continue
            components = datasetType.split('.')
            datasetType = components[0]
            components = components[1:]
            try:
                location = repoData.repo.map(datasetType, dataId, write=write)
            except NoResults:
                continue
            if location is None:
                continue
            location.datasetType = datasetType  # todo is there a better way than monkey patching here?
            if len(components) > 0:
                if not isinstance(location, ButlerComposite):
                    raise RuntimeError("The location for a dotted datasetType must be a composite.")
                # replace the first component name with the datasetType
                components[0] = location.componentInfo[components[0]].datasetType
                # join components back into a dot-delimited string
                datasetType = '.'.join(components)
                location = self._locate(datasetType, dataId, write)
                # if a component location is not found, we can not continue with this repo; move to
                # the next repo.
                if location is None:
                    break
            # if reading, only one location is desired.
            if location:
                if not write:
                    # If there is a bypass function for this dataset type, we can't test to see if the
                    # object exists in storage. Just return the location.
                    if hasattr(location.mapper, "bypass_" + location.datasetType):
                        try:
                            # The dataset for the location may or may not exist
                            # and may or may not be needed. Right now the only
                            # way to know is to call the bypass function.
                            location.bypass = self._getBypassFunc(location, dataId)()
                            return location
                        except Exception:
                            continue
                    # If a location was found but the location does not exist, keep looking in input
                    # repositories (the registry may have had enough data for a lookup even though the
                    # object exists in a different repository.)
                    if isinstance(location, ButlerComposite) or location.repository.exists(location):
                        return location
                else:
                    try:
                        locations.extend(location)
                    except TypeError:
                        locations.append(location)
        if not write:
            return None
        return locations

    @staticmethod
    def _getBypassFunc(location, dataId):
        pythonType = location.getPythonType()
        if pythonType is not None:
            if isinstance(pythonType, basestring):
                pythonType = doImport(pythonType)
        bypassFunc = getattr(location.mapper, "bypass_" + location.datasetType)
        return lambda: bypassFunc(location.datasetType, pythonType, location, dataId)

    def get(self, datasetType, dataId=None, immediate=True, **rest):
        """Retrieves a dataset given an input collection data id.

        Parameters
        ----------
        datasetType - str
            The type of dataset to retrieve.
        dataId - dict
            The data id.
        immediate - bool
            If False use a proxy for delayed loading.
        **rest -
            Keyword arguments for the data id.

        Returns
        -------
        An object retrieved from the dataset (or a proxy for one).
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        location = self._locate(datasetType, dataId, write=False)
        if location is None:
            raise NoResults("No locations for get:", datasetType, dataId)
        self.log.debug("Get type=%s keys=%s from %s", datasetType, dataId, str(location))

        if isinstance(location, ButlerComposite):
            for name, componentInfo in location.componentInfo.items():
                if componentInfo.subset:
                    subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
                    componentInfo.obj = [obj.get() for obj in subset]
                else:
                    obj = self.get(componentInfo.datasetType, location.dataId, immediate=True)
                    componentInfo.obj = obj
            assembler = location.assembler or genericAssembler
            obj = assembler(dataId=location.dataId, componentInfo=location.componentInfo,
                            cls=location.python)
            return obj

        if hasattr(location, 'bypass'):
            # this type of loader block should get moved into a helper someplace, and duplications
            # removed.
            callback = lambda: location.bypass
        else:
            callback = lambda: self._read(location)
        if location.mapper.canStandardize(location.datasetType):
            innerCallback = callback
            callback = lambda: location.mapper.standardize(location.datasetType, innerCallback(), dataId)
        if immediate:
            return callback()
        return ReadProxy(callback)

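    # Usage sketch (illustrative; the dataset type and dataId keys are
    # hypothetical): immediate and lazy retrieval.
    #
    #     calexp = butler.get('calexp', visit=1, ccd=2)  # read now
    #     proxy = butler.get('calexp', visit=1, ccd=2, immediate=False)
    #     # 'proxy' is a ReadProxy; the file is read when it is first accessed.
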
    def put(self, obj, datasetType, dataId={}, doBackup=False, **rest):
        """Persists a dataset given an output collection data id.

        Parameters
        ----------
        obj -
            The object to persist.
        datasetType - str
            The type of dataset to persist.
        dataId - dict
            The data id.
        doBackup - bool
            If True, rename existing instead of overwriting.
            WARNING: Setting doBackup=True is not safe for parallel processing, as it may be subject to
            race conditions.
        **rest -
            Keyword arguments for the data id.
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        for location in self._locate(datasetType, dataId, write=True):
            if isinstance(location, ButlerComposite):
                disassembler = location.disassembler if location.disassembler else genericDisassembler
                disassembler(obj=obj, dataId=location.dataId, componentInfo=location.componentInfo)
                for name, info in location.componentInfo.items():
                    if not info.inputOnly:
                        self.put(info.obj, info.datasetType, location.dataId, doBackup=doBackup)
            else:
                if doBackup:
                    location.getRepository().backup(location.datasetType, dataId)
                location.getRepository().write(location, obj)

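    # Usage sketch (illustrative; the dataset type and dataId keys are
    # hypothetical): persist an object to every writable output repository
    # that maps this dataset type.
    #
    #     butler.put(exposure, 'calexp', visit=1, ccd=2)
    #     butler.put(exposure, 'calexp', visit=1, ccd=2, doBackup=True)  # keep a backup copy
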
    def subset(self, datasetType, level=None, dataId={}, **rest):
        """Return complete dataIds for a dataset type that match a partial (or empty) dataId.

        Given a partial (or empty) dataId specified in dataId and **rest, find all datasets that match
        the dataId. Optionally restrict the results to a given level specified by a dataId key (e.g.
        visit or sensor or amp for a camera). Return an iterable collection of complete dataIds as
        ButlerDataRefs. Datasets with the resulting dataIds may not exist; that needs to be tested with
        datasetExists().

        Parameters
        ----------
        datasetType - str
            The type of dataset collection to subset
        level - str
            The level of dataId at which to subset. Use an empty string if the mapper should look up the
            default level.
        dataId - dict
            The data id.
        **rest -
            Keyword arguments for the data id.

        Returns
        -------
        subset - ButlerSubset
            Collection of ButlerDataRefs for datasets matching the data id.

        Examples
        --------
        To print the full dataIds for all r-band measurements in a source catalog
        (note that the subset call is equivalent to: `butler.subset('src', dataId={'filter':'r'})`):

        >>> subset = butler.subset('src', filter='r')
        >>> for data_ref in subset: print(data_ref.dataId)
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)

        # Currently expected behavior of subset is that if specified level is None then the mapper's
        # default level should be used. Convention for level within Butler is that an empty string is
        # used to indicate 'get default'.
        if level is None:
            level = ''

        dataId = DataId(dataId)
        dataId.update(**rest)
        return ButlerSubset(self, datasetType, level, dataId)

    def dataRef(self, datasetType, level=None, dataId={}, **rest):
        """Returns a single ButlerDataRef.

        Given a complete dataId specified in dataId and **rest, find the unique dataset at the given
        level specified by a dataId key (e.g. visit or sensor or amp for a camera) and return a
        ButlerDataRef.

        Parameters
        ----------
        datasetType - str
            The type of dataset collection to reference
        level - str
            The level of dataId at which to reference
        dataId - dict
            The data id.
        **rest -
            Keyword arguments for the data id.

        Returns
        -------
        dataRef - ButlerDataRef
            ButlerDataRef for dataset matching the data id
        """

        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        subset = self.subset(datasetType, level, dataId, **rest)
        if len(subset) != 1:
            raise RuntimeError("No unique dataset for: Dataset type:%s Level:%s Data ID:%s Keywords:%s" %
                               (str(datasetType), str(level), str(dataId), str(rest)))
        return ButlerDataRef(subset, subset.cache[0])

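    # Usage sketch (illustrative; the dataset type and dataId keys are
    # hypothetical): a ButlerDataRef bundles a complete dataId with get/put.
    #
    #     ref = butler.dataRef('raw', visit=1, ccd=2)
    #     raw = ref.get('raw')
    #     ref.put(processed, 'calexp')
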
    def _read(self, location):
        """Unpersist an object using data inside a butlerLocation object.

        Parameters
        ----------
        location - ButlerLocation
            A butlerLocation instance populated with data needed to read the object.

        Returns
        -------
        object - an instance of the object specified by the butlerLocation.
        """
        self.log.debug("Starting read from %s", location)
        results = location.repository.read(location)
        if len(results) == 1:
            results = results[0]
        self.log.debug("Ending read from %s", location)
        return results

    def __reduce__(self):
        ret = (_unreduce, (self._initArgs, self.datasetTypeAliasDict))
        return ret

    def _resolveDatasetTypeAlias(self, datasetType):
        """Replaces all the known alias keywords in the given string with the alias value.

        Parameters
        ----------
        datasetType - str
            A datasetType string to search & replace on

        Returns
        -------
        datasetType - str
            The de-aliased string
        """
        for key in self.datasetTypeAliasDict:
            # if all aliases have been replaced, bail out
            if datasetType.find('@') == -1:
                break
            datasetType = datasetType.replace(key, self.datasetTypeAliasDict[key])

        # If an alias specifier can not be resolved then throw.
        if datasetType.find('@') != -1:
            raise RuntimeError("Unresolvable alias specifier in datasetType: %s" % (datasetType))

        return datasetType


def _unreduce(initArgs, datasetTypeAliasDict):
    mapperArgs = initArgs.pop('mapperArgs')
    initArgs.update(mapperArgs)
    butler = Butler(**initArgs)
    butler.datasetTypeAliasDict = datasetTypeAliasDict
    return butler