lsst.daf.persistence  13.0-17-gd5d205a+1
butler.py

#!/usr/bin/env python

#
# LSST Data Management System
# Copyright 2008-2015 LSST Corporation.
#
# This product includes software developed by the
# LSST Project (http://www.lsst.org/).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the LSST License Statement and
# the GNU General Public License along with this program. If not,
# see <http://www.lsstcorp.org/LegalNotices/>.
#

# -*- python -*-

"""This module defines the Butler class."""
from future import standard_library
standard_library.install_aliases()
from builtins import str
from past.builtins import basestring
from builtins import object

import collections
import copy
import inspect
import json
import os
import weakref

import yaml

from lsst.log import Log
import lsst.pex.policy as pexPolicy
from . import LogicalLocation, ReadProxy, ButlerSubset, ButlerDataRef, Persistence, \
    Storage, Policy, NoResults, Repository, DataId, RepositoryCfg, \
    RepositoryArgs, listify, setify, sequencify, doImport, ButlerComposite, genericAssembler, \
    genericDisassembler, PosixStorage

preinitedMapperWarning = ("Passing an instantiated mapper into "
                          "Butler.__init__ will prevent Butler from passing "
                          "parentRegistry or repositoryCfg information to "
                          "the mapper, which is done only at init time. "
                          "It is better to pass an importable string or "
                          "class object.")


class ButlerCfg(Policy, yaml.YAMLObject):
    """Represents a Butler configuration.

    .. warning::

        cfg is 'wet paint' and very likely to change. Use of it in production
        code other than via the 'old butler' API is strongly discouraged.
    """
    yaml_tag = u"!ButlerCfg"

    def __init__(self, cls, repoCfg):
        super(ButlerCfg, self).__init__({'repoCfg': repoCfg, 'cls': cls})


class RepoData(object):
    """Container object for repository data used by Butler

    Parameters
    ----------
    args - RepositoryArgs
        Arguments used to initialize self.repo
    cfg - RepositoryCfg
        Configuration of repository
    storedCfg - RepositoryCfg or None
        If the cfg at root and the RepositoryArgs don't match then a new cfg is kept in cfg and the cfg that
        was read from root is kept in storedCfg.
    isNewRepository - bool
        True if the repository is being created and did not previously exist.
    isV1Repository - bool
        True if the repository is an 'old butler' (V1) repository with no persisted RepositoryCfg.
    repo - Repository
        The repository class instance
    tags - set
        The tags that apply to this repository, if any
    """

    def __init__(self, args, cfg, storedCfg=None, isNewRepository=False, isV1Repository=True):
        self.args = args
        self.cfg = cfg
        self.storedCfg = storedCfg
        self.repo = None
        self.mode = args.mode
        # self.tags is used to keep track of *all* the tags that apply to the Repo, not just the tags in
        # the cfg (e.g. parents inherit their children's tags)
        self.tags = set()
        self.isNewRepository = isNewRepository
        self.isV1Repository = isV1Repository
        self.parentRegistry = None

    def __reduce__(self):
        # Reconstruct with arguments that match __init__'s signature.
        return (RepoData, (self.args, self.cfg, self.storedCfg, self.isNewRepository, self.isV1Repository))

    def __repr__(self):
        s = "RepoData(args=%s cfg=%s repo=%s tags=%s isNewRepository=%s isV1Repository:%s parentRegistry:%s)"
        return s % (self.args, self.cfg, self.repo, self.tags, self.isNewRepository, self.isV1Repository,
                    self.parentRegistry)

    def addTags(self, tags):
        self.tags = self.tags.union(tags)


class RepoDataContainer(object):
    """Container object for RepoData instances owned by a Butler instance."""

    def __init__(self):
        self.byRepoRoot = {}  # {args.root: RepoData}
        self.byCfgRoot = {}  # {args.cfgRoot: RepoData}
        self._inputs = None
        self._outputs = None
        self._all = None  # {cfg.root: RepoData}

    def add(self, repoData):
        """Add a RepoData to the container.

        Parameters
        ----------
        repoData - RepoData
            The RepoData instance to add.
        """
        self.byRepoRoot[repoData.cfg.root] = repoData
        self.byCfgRoot[repoData.args.cfgRoot] = repoData

    def inputs(self):
        """Get the list of RepoData that are used as inputs to the Butler.
        The list is built by _buildLookupLists and cached; it must be built before this is called.

        Returns
        -------
        A list of RepoData with readable repositories, in the order to be used when searching.
        """
        if self._inputs is None:
            raise RuntimeError("Inputs not yet initialized.")
        return self._inputs

    def outputs(self):
        """Get the list of RepoData that are used as outputs by the Butler.
        The list is built by _buildLookupLists and cached; it must be built before this is called.

        Returns
        -------
        A list of RepoData with writable repositories, in the order to be used when searching.
        """
        if self._outputs is None:
            raise RuntimeError("Outputs not yet initialized.")
        return self._outputs

    def all(self):
        """Get all the RepoData used by the Butler, keyed by cfg root.
        The container is built by _buildLookupLists and cached; it must be built before this is called.

        Returns
        -------
        An OrderedDict of RepoData keyed by repository cfg root, in the order to be used when searching.
        """
        if self._all is None:
            raise RuntimeError("The all list is not yet initialized.")
        return self._all

    def __repr__(self):
        return "%s(\nbyRepoRoot=%r, \nbyCfgRoot=%r, \n_inputs=%r, \n_outputs=%s, \n_all=%s)" % (
            self.__class__.__name__,
            self.byRepoRoot,
            self.byCfgRoot,
            self._inputs,
            self._outputs,
            self._all)

    def _buildLookupLists(self, inputs, outputs):
        """Build the lists of input, output, and all repo datas in lookup
        order.

        Parameters
        ----------
        inputs : list of RepositoryArgs
            The input RepositoryArgs, in order.
        outputs : list of RepositoryArgs
            The output RepositoryArgs, in order.

        Returns
        -------
        None
        """
        def addRepoDataToLists(repoData, inout):
            """Add the cfg represented by repoData to the _all dict, as
            well as the _inputs or _outputs list, as indicated by inout. Then,
            add all the parents of the cfg to the lists."""
            if inout not in ('in', 'out', 'ref'):
                raise RuntimeError("'inout' must be 'in', 'out', or 'ref', not '%s'" % inout)
            if repoData.cfg.root not in self._all:
                self._all[repoData.cfg.root] = repoData
            if inout == 'in' and repoData not in self._inputs:
                self._inputs.append(repoData)
            elif inout == 'out' and repoData not in self._outputs:
                self._outputs.append(repoData)
                if 'r' in repoData.args.mode:
                    self._inputs.append(repoData)
            for parent in repoData.cfg.parents:
                addParentAs = 'in' if 'r' in repoData.args.mode and inout != 'ref' else 'ref'
                addRepoDataToLists(self.byRepoRoot[parent], addParentAs)

        self._all = collections.OrderedDict()
        self._inputs = []
        self._outputs = []

        for repoArgs in outputs:
            repoData = self.byCfgRoot[repoArgs.cfgRoot]
            addRepoDataToLists(repoData, 'out')
        for repoArgs in inputs:
            repoData = self.byCfgRoot[repoArgs.cfgRoot]
            addRepoDataToLists(repoData, 'in')
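
    # An illustrative sketch (not part of the original source) of the resulting lookup
    # order, for hypothetical repositories: an output repo 'out' opened with mode 'rw'
    # whose cfg lists the parent 'in1', and an explicit input 'in1':
    #
    #     container._buildLookupLists(inputs=[in1Args], outputs=[outArgs])
    #     container.outputs()  # [RepoData for 'out']
    #     container.inputs()   # [RepoData for 'out', RepoData for 'in1'];
    #                          # a readable output is searched before the inputs.
    #     container.all()      # OrderedDict: 'out' first, then 'in1'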


class Butler(object):
    """Butler provides a generic mechanism for persisting and retrieving data using mappers.

    A Butler manages a collection of datasets known as a repository. Each dataset has a type representing its
    intended usage and a location. Note that the dataset type is not the same as the C++ or Python type of the
    object containing the data. For example, an ExposureF object might be used to hold the data for a raw
    image, a post-ISR image, a calibrated science image, or a difference image. These would all be different
    dataset types.

    A Butler can produce a collection of possible values for a key (or tuples of values for multiple keys) if
    given a partial data identifier. It can check for the existence of a file containing a dataset given its
    type and data identifier. The Butler can then retrieve the dataset. Similarly, it can persist an object to
    an appropriate location when given its associated data identifier.

    Note that the Butler has two more advanced features when retrieving a data set. First, the retrieval is
    lazy. Input does not occur until the data set is actually accessed. This allows datasets to be retrieved
    and placed on a clipboard prospectively with little cost, even if the algorithm of a stage ends up not
    using them. Second, the Butler will call a standardization hook upon retrieval of the dataset. This
    function, contained in the input mapper object, must perform any necessary manipulations to force the
    retrieved object to conform to standards, including translating metadata.

    Public methods:

    __init__(self, root, mapper=None, **mapperArgs)

    defineAlias(self, alias, datasetType)

    getKeys(self, datasetType=None, level=None)

    queryMetadata(self, datasetType, format=None, dataId={}, **rest)

    datasetExists(self, datasetType, dataId={}, **rest)

    get(self, datasetType, dataId={}, immediate=False, **rest)

    put(self, obj, datasetType, dataId={}, **rest)

    subset(self, datasetType, level=None, dataId={}, **rest)

    dataRef(self, datasetType, level=None, dataId={}, **rest)

    Initialization:

    The preferred method of initialization is to pass in a RepositoryArgs instance, or a list of
    RepositoryArgs, to inputs and/or outputs.

    For backward compatibility: this initialization method signature can take a posix root path, and
    optionally a mapper class instance or class type that will be instantiated using the mapperArgs input
    argument. However, for this to work in a backward compatible way it creates a single repository that is
    used as both an input and an output repository. This is NOT preferred, and will likely break any
    provenance system we have in place.

    Parameters
    ----------
    root - string
        .. note:: Deprecated in 12_0
            `root` will be removed in TBD, it is replaced by `inputs` and `outputs` for
            multiple-repository support.
        A filesystem path. Will only work with a PosixRepository.
    mapper - string or instance
        .. note:: Deprecated in 12_0
            `mapper` will be removed in TBD, it is replaced by `inputs` and `outputs` for
            multiple-repository support.
        Provides a mapper to be used with Butler.
    mapperArgs - dict
        .. note:: Deprecated in 12_0
            `mapperArgs` will be removed in TBD, it is replaced by `inputs` and `outputs` for
            multiple-repository support.
        Provides arguments to be passed to the mapper if the mapper input arg is a class type to be
        instantiated by Butler.
    inputs - RepositoryArgs or string
        Can be a single item or a list. Provides arguments to load an existing repository (or repositories).
        String is assumed to be a URI and is used as the cfgRoot (URI to the location of the cfg file). (Local
        filesystem URI does not have to start with 'file://' and in this way can be a relative path).
    outputs - RepositoryArgs or string
        Can be a single item or a list. Provides arguments to load one or more existing repositories or create
        new ones. String is assumed to be a URI and is used as the repository root.
    """
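
    # A usage sketch (illustrative, not part of the original source). The repository
    # paths and the mapper name 'lsst.obs.test.TestMapper' are hypothetical
    # placeholders:
    #
    #     from lsst.daf.persistence import Butler, RepositoryArgs
    #
    #     # Preferred V2 form: explicit inputs and outputs.
    #     butler = Butler(inputs=RepositoryArgs(cfgRoot='/data/input_repo', mode='r'),
    #                     outputs=RepositoryArgs(root='/data/output_repo', mode='rw',
    #                                            mapper='lsst.obs.test.TestMapper'))
    #
    #     # Deprecated V1 form: a single root used as both input and output.
    #     butler = Butler(root='/data/repo')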

    def __init__(self, root=None, mapper=None, inputs=None, outputs=None, **mapperArgs):

        self.log = Log.getLogger("daf.persistence.butler")

        self.datasetTypeAliasDict = {}

        self._initArgs = {'root': root, 'mapper': mapper, 'inputs': inputs, 'outputs': outputs,
                          'mapperArgs': mapperArgs}
        # inputs and outputs may be modified; do not change the external value.
        inputs = copy.deepcopy(inputs)
        outputs = copy.deepcopy(outputs)

        isV1Args = inputs is None and outputs is None
        if isV1Args:
            inputs, outputs = self._convertV1Args(root=root, mapper=mapper, mapperArgs=mapperArgs)
        elif root or mapper or mapperArgs:
            raise RuntimeError(
                'Butler version 1 API (root, mapper, **mapperArgs) may '
                'not be used with version 2 API (inputs, outputs)')

        self.storage = Storage()

        # make sure inputs and outputs are lists, and convert any string list item to RepositoryArgs.
        inputs = listify(inputs)
        outputs = listify(outputs)
        inputs = [RepositoryArgs(cfgRoot=args)
                  if not isinstance(args, RepositoryArgs) else args for args in inputs]
        outputs = [RepositoryArgs(cfgRoot=args)
                   if not isinstance(args, RepositoryArgs) else args for args in outputs]
        # Set default rw modes on input and output args as needed
        for args in inputs:
            if args.mode is None:
                args.mode = 'r'
            elif 'r' not in args.mode:
                raise RuntimeError("The mode of an input should be readable.")
        for args in outputs:
            if args.mode is None:
                args.mode = 'w'
            elif 'w' not in args.mode:
                raise RuntimeError("The mode of an output should be writable.")
        # check for class instances in args.mapper (not allowed)
        for args in inputs + outputs:
            if (args.mapper and not isinstance(args.mapper, basestring) and
                    not inspect.isclass(args.mapper)):
                self.log.warn(preinitedMapperWarning)

        # Always use an empty Persistence policy until we can get rid of it
        persistencePolicy = pexPolicy.Policy()
        self.persistence = Persistence.getPersistence(persistencePolicy)

        self._createRepoDatas(inputs, outputs)

        self._repos._buildLookupLists(inputs, outputs)

        self._setRepoDataTags()

        defaultMapper = self._getDefaultMapper()
        self._assignDefaultMapper(defaultMapper)

        # Parents appear after their children in the lookup order, so iterate in reverse to
        # initialize parents before child repositories.
        for repoData in reversed(self._repos.all().values()):
            parentRegistry = self._getParentRegistry(repoData)
            repoData.parentRegistry = parentRegistry
            repoData.repo = Repository(repoData)

    def _getParentRegistry(self, repoData):
        """Get the first found registry that matches the passed-in repo.

        "Matches" means the mapper in the passed-in repo is the same type as
        the mapper in the parent.

        Parameters
        ----------
        repoData : RepoData
            The RepoData for the repository for which we are searching for a
            parent registry.

        Returns
        -------
        Registry or None
            A registry from a parent if one can be found, or None.

        Raises
        ------
        RuntimeError
            Indicates a butler init order problem; all parents should be initialized before child
            repositories, so this function should be able to get any parent of any child repo.
        """
        registry = None
        for parentRepoData in self._getParentRepoDatas(repoData):
            if parentRepoData.cfg.mapper == repoData.cfg.mapper:
                if not parentRepoData.repo:
                    raise RuntimeError("Parent repo should be initialized before child repos.")
                registry = parentRepoData.repo.getRegistry()
                if registry:
                    break
        return registry

    def _getParentRepoDatas(self, repoData):
        """Get the parents, grandparents, etc. of a given repo data, in depth-first search order.

        Parameters
        ----------
        repoData : RepoData instance
            The RepoData whose parents should be retrieved.

        Yields
        ------
        RepoData
            The parents, grandparents, etc. of the given repo data, in depth-first search order.
        """
        for parentCfgRoot in repoData.cfg.parents:
            parentRepoData = self._repos.byCfgRoot[parentCfgRoot]
            yield parentRepoData
            for grandparentRepoData in self._getParentRepoDatas(parentRepoData):
                yield grandparentRepoData

    def _setRepoDataTags(self):
        """Set the tags from each repoArgs into all its parent repoArgs so that they can be included in
        tagged searches.

        Returns
        -------
        None
        """
        def setTags(butler, repoData, tags):
            tags.update(repoData.args.tags)
            repoData.addTags(tags)
            for parent in repoData.cfg.parents:
                setTags(butler, butler._repos.byRepoRoot[parent], copy.copy(tags))

        for repoData in self._repos.all().values():
            setTags(self, repoData, set())

    def _createRepoData(self, args, inout, instanceParents):
        """Make a RepoData object for args, adding it to the RepoDataContainer.

        Parameters
        ----------
        args : RepoArgs
            A RepositoryArgs that describes a new or existing Repository.
        inout : 'in' or 'out'
            Indicates if this Repository should be used by the Butler as an input or an output.
        instanceParents : list of string
            URI/path to the RepositoryCfg of parents in this instance of Butler; inputs and readable outputs
            (but not their parents; grandparents are looked up when the parents are loaded).

        Returns
        -------
        None
        """
        def parentListWithoutThis(root, instanceParents):
            """instanceParents is typically all the inputs to butler. If 'this' root is in that list (because
            this repo is writable) then remove it, as a repo is never its own parent."""
            parents = copy.copy(instanceParents)
            try:
                parents.remove(args.cfgRoot)
            except ValueError:
                pass
            return parents

        # if only a string is passed for inputs or outputs, the assumption is that it's a URI; it is
        # placed in a RepositoryArgs instance: cfgRoot for inputs, root for outputs.
        if inout not in ('in', 'out'):
            raise RuntimeError("inout must be either 'in' or 'out'")
        # if we already have RepoData for these repoArgs, we're done with that repo and its parents.
        if args.cfgRoot in self._repos.byCfgRoot:
            return
        # Get the RepositoryCfg, if it exists:
        cfg = self.storage.getRepositoryCfg(args.cfgRoot)
        # Handle the case where the Repository exists and contains a RepositoryCfg file:
        if cfg:
            if not cfg.matchesArgs(args):
                raise RuntimeError("Persisted repo cfg does not match input args. cfg:%s, args:%s"
                                   % (cfg, args))
            # need to fix intermediate cfgs
            # storedCfg = cfg
            # cfg = RepositoryCfg.makeFromArgs(args)
            parents = parentListWithoutThis(args.cfgRoot, instanceParents)
            if inout == 'out' and cfg.parents != parents:
                raise RuntimeError(
                    "Persisted repo cfg parents do not match butler parents: cfg:%s, parents:%s"
                    % (cfg, instanceParents))
            else:
                storedCfg = None
            repoData = RepoData(args=args, cfg=cfg, storedCfg=storedCfg)
            self._repos.add(repoData)
            for parentArgs in cfg.parents:
                self._createRepoData(RepositoryArgs(parentArgs, mode='r'), 'in', instanceParents)
        # Handle the case where a RepositoryCfg file does not exist:
        else:
            # Posix repos might be Butler V1 repos, which require special handling:
            if Storage.isPosix(args.cfgRoot):
                v1RepoExists = PosixStorage.v1RepoExists(args.cfgRoot)
                if not v1RepoExists and inout == 'in':
                    msg = "Input repositories must exist; no repo found at " \
                          "%s. (A Butler V1 repository 'exists' if the root " \
                          "folder exists AND contains items.)" % args.cfgRoot
                    raise RuntimeError(msg)
                if inout == 'out' and not v1RepoExists:
                    parents = parentListWithoutThis(args.cfgRoot, instanceParents)
                else:
                    parents = None
                if v1RepoExists:
                    if not args.mapper:
                        args.mapper = PosixStorage.getMapperClass(args.cfgRoot)
                cfg = RepositoryCfg.makeFromArgs(args, parents)
                repoData = RepoData(args=args, cfg=cfg, isNewRepository=not v1RepoExists,
                                    isV1Repository=v1RepoExists)
                self._repos.add(repoData)
                if v1RepoExists:
                    parent = PosixStorage.getParentSymlinkPath(args.cfgRoot)
                    if parent:
                        parent = PosixStorage.absolutePath(args.cfgRoot, parent)
                        cfg.addParents(parent)
                        self._createRepoData(RepositoryArgs(parent, mode='r'), 'in', instanceParents)
            # Do not need to check for Butler V1 repos in non-posix Storages:
            else:
                if inout == 'in':
                    msg = "Input repositories must exist; no repo found at " \
                          "%s." % args.cfgRoot
                    raise RuntimeError(msg)
                # inout must be 'out' here; a new repository's parents are the butler's inputs.
                parents = parentListWithoutThis(args.cfgRoot, instanceParents)
                cfg = RepositoryCfg.makeFromArgs(args, parents)
                repoData = RepoData(args=args, cfg=cfg, isNewRepository=True)
                self._repos.add(repoData)

    def _getParentsList(self, inputs, outputs):
        parents = []
        # The parents of readable output repositories are handled as though they were passed to butler as
        # inputs.
        # When we add remote storage types, getting the repositoryCfg here and again later in
        # _createRepoData may be slow. We could fetch & cache if needed.
        for args in outputs:
            if 'r' in args.mode and args.cfgRoot not in parents:
                parents.append(args.cfgRoot)
                cfg = self.storage.getRepositoryCfg(args.cfgRoot)
                if cfg:
                    for parent in cfg.parents:
                        if parent not in parents:
                            parents.append(parent)
        for args in inputs:
            if args.cfgRoot not in parents:
                parents.append(args.cfgRoot)
        return parents

    def _createRepoDatas(self, inputs, outputs):
        """Create the RepoDataContainer and put a RepoData object in it for each repository listed in inputs
        and outputs, as well as each parent of each repository.

        After this function runs, there will be a RepoData for any Repository that may be used by this Butler
        instance.

        Parameters
        ----------
        inputs : list of RepoArgs
            Repositories to be used by the Butler as input repositories.
        outputs : list of RepoArgs
            Repositories to be used by the Butler as output repositories.

        Returns
        -------
        None
        """
        try:
            if self._repos:
                raise RuntimeError("Must not call _createRepoDatas twice.")
        except AttributeError:
            pass
        self._repos = RepoDataContainer()
        parents = self._getParentsList(inputs, outputs)

        for outputArgs in outputs:
            self._createRepoData(outputArgs, 'out', parents)
        for inputArgs in inputs:
            self._createRepoData(inputArgs, 'in', parents)

    def _convertV1Args(self, root, mapper, mapperArgs):
        """Convert Butler V1 args (root, mapper, mapperArgs) to V2 args (inputs, outputs).

        Parameters
        ----------
        root : string
            Posix path to repository root
        mapper : class, class instance, or string
            Instantiated class, a class object to be instantiated, or a string that refers to a class that
            can be imported & used as the mapper.
        mapperArgs : dict
            Args & their values used when instantiating the mapper.

        Returns
        -------
        tuple
            (inputs, outputs) - values to be used for inputs and outputs in Butler.__init__
        """
        if (mapper and not isinstance(mapper, basestring) and
                not inspect.isclass(mapper)):
            self.log.warn(preinitedMapperWarning)
        inputs = None
        if root is None:
            if hasattr(mapper, 'root'):
                # in legacy repos, the mapper may be given the root directly.
                root = mapper.root
            else:
                # in the past root="None" could be used to mean root='.'
                root = '.'
        outputs = RepositoryArgs(mode='rw',
                                 root=root,
                                 mapper=mapper,
                                 mapperArgs=mapperArgs)
        return inputs, outputs

    def __repr__(self):
        return 'Butler(datasetTypeAliasDict=%s, repos=%s, persistence=%s)' % (
            self.datasetTypeAliasDict, self._repos, self.persistence)

    def _getDefaultMapper(self):
        """Get the default mapper. Currently this means if all the repos use
        exactly the same mapper, that mapper may be considered the default.

        This definition may be changing; mappers may be able to exclude
        themselves as candidates for default, and they may nominate a different
        mapper instead. Also, we may not want to look at *all* the repos, but
        only a depth-first search on each of the input & output repos, and
        use the first-found mapper for each of those. TBD.

        Returns
        -------
        Mapper class or None
            Returns the class type of the default mapper, or None if a default
            mapper can not be determined.
        """
        defaultMapper = None

        for inputRepoData in self._repos.inputs():
            mapper = None
            if inputRepoData.cfg.mapper is not None:
                mapper = inputRepoData.cfg.mapper
                # if the mapper is:
                # * a string, import it.
                # * a class instance, get its class type
                # * a class, do nothing; use it
                if isinstance(mapper, basestring):
                    mapper = doImport(mapper)
                elif not inspect.isclass(mapper):
                    mapper = mapper.__class__
            # If no mapper has been found, note the first found mapper.
            # Then, if a mapper has been found and each next mapper matches it,
            # continue looking for mappers.
            # If a mapper has been found and another non-matching mapper is
            # found then we have no default; return None.
            if defaultMapper is None:
                defaultMapper = mapper
            elif mapper == defaultMapper:
                continue
            elif mapper is not None:
                return None
        return defaultMapper

    def _assignDefaultMapper(self, defaultMapper):
        for repoData in self._repos.all().values():
            if repoData.cfg.mapper is None and (repoData.isNewRepository or repoData.isV1Repository):
                if defaultMapper is None:
                    raise RuntimeError(
                        "No mapper specified for %s and no default mapper could be determined." %
                        repoData.args)
                repoData.cfg.mapper = defaultMapper

    @staticmethod
    def getMapperClass(root):
        """Posix-only; gets the mapper class at the path specified by root (if a file _mapper can be found
        at that location or in a parent location).

        As we abstract the storage and support different types of storage locations this method will be
        moved entirely into Butler Access, or made more dynamic, and the API will very likely change."""
        return Storage.getMapperClass(root)

    def defineAlias(self, alias, datasetType):
        """Register an alias that will be substituted in datasetTypes.

        Parameters
        ----------
        alias - str
            The alias keyword. It may start with @ or not. It may not contain @ except as the first character.
        datasetType - str
            The string that will be substituted when @alias is passed into datasetType. It may not contain '@'.
        """
        # verify formatting of alias:
        # it can have '@' as the first character (if not, that's okay, we will add it) or not at all.
        atLoc = alias.rfind('@')
        if atLoc == -1:
            alias = "@" + str(alias)
        elif atLoc > 0:
            raise RuntimeError("Badly formatted alias string: %s" % (alias,))

        # verify that datasetType does not contain '@'
        if datasetType.count('@') != 0:
            raise RuntimeError("Badly formatted type string: %s" % (datasetType))

        # verify that the alias keyword does not start with another alias keyword,
        # and vice versa
        for key in self.datasetTypeAliasDict:
            if key.startswith(alias) or alias.startswith(key):
                raise RuntimeError("Alias: %s overlaps with existing alias: %s" % (alias, key))

        self.datasetTypeAliasDict[alias] = datasetType
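
    # A usage sketch (illustrative, not part of the original source); the alias name
    # and the dataset type 'calexp' are hypothetical:
    #
    #     butler.defineAlias('calexpAlias', 'calexp')
    #     exposure = butler.get('@calexpAlias', visit=101, ccd=3)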

    def getKeys(self, datasetType=None, level=None, tag=None):
        """Get the valid data id keys at or above the given level of hierarchy for the dataset type or the
        entire collection if None.

        Parameters
        ----------
        datasetType - str
            The type of dataset to get keys for, entire collection if None.
        level - str
            The hierarchy level to descend to. None if it should not be restricted. Use an empty string if the
            mapper should lookup the default level.
        tag - any, or list of any
            Any object that can be tested to be the same as the tag in a dataId passed into butler input
            functions. Applies only to input repositories: if tag is specified, a repo will only be read
            from if the tag matches a tag used for that repository.

        Returns
        -------
        Returns a dict. The dict keys are the valid data id keys at or above the given level of hierarchy for
        the dataset type or the entire collection if None. The dict values are the basic Python types
        corresponding to the keys (int, float, str).
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)

        keys = None
        tag = setify(tag)
        for repoData in self._repos.inputs():
            if not tag or len(tag.intersection(repoData.tags)) > 0:
                keys = repoData.repo.getKeys(datasetType, level)
                # An empty dict is a valid "found" condition for keys. The only value for keys that should
                # cause the search to continue is None
                if keys is not None:
                    break
        return keys
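
    # A usage sketch (illustrative, not part of the original source); the dataset type
    # 'raw' and the returned keys are hypothetical:
    #
    #     keys = butler.getKeys('raw')
    #     # e.g. {'visit': int, 'ccd': int, 'filter': str}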

    def queryMetadata(self, datasetType, format, dataId={}, **rest):
        """Returns the valid values for one or more keys when given a partial
        input collection data id.

        Parameters
        ----------
        datasetType - str
            The type of dataset to inquire about.
        format - str, tuple
            Key or tuple of keys to be returned.
        dataId - DataId, dict
            The partial data id.
        **rest -
            Keyword arguments for the partial data id.

        Returns
        -------
        A list of valid values or tuples of valid values as specified by the
        format.
        """

        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)
        format = sequencify(format)

        tuples = None
        for repoData in self._repos.inputs():
            if not dataId.tag or len(dataId.tag.intersection(repoData.tags)) > 0:
                tuples = repoData.repo.queryMetadata(datasetType, format, dataId)
                if tuples:
                    break

        if not tuples:
            return []

        if len(format) == 1:
            ret = []
            for x in tuples:
                try:
                    ret.append(x[0])
                except TypeError:
                    ret.append(x)
            return ret

        return tuples
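
    # A usage sketch (illustrative, not part of the original source); 'raw' and the
    # key names are hypothetical:
    #
    #     # All (visit, filter) pairs consistent with ccd=3:
    #     pairs = butler.queryMetadata('raw', ('visit', 'filter'), ccd=3)
    #     # A single key yields a flat list rather than 1-tuples:
    #     visits = butler.queryMetadata('raw', 'visit')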

    def datasetExists(self, datasetType, dataId={}, **rest):
        """Determines if a dataset file exists.

        Parameters
        ----------
        datasetType - str
            The type of dataset to inquire about.
        dataId - DataId, dict
            The data id of the dataset.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        exists - bool
            True if the dataset exists or is non-file-based.
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        location = None
        for repoData in self._repos.inputs():
            if not dataId.tag or len(dataId.tag.intersection(repoData.tags)) > 0:
                location = repoData.repo.map(datasetType, dataId)
                if location and location.repository.exists(location):
                    break
                else:
                    location = None

        return bool(location)
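
    # A usage sketch (illustrative, not part of the original source); 'calexp' and
    # the data id keys are hypothetical:
    #
    #     if butler.datasetExists('calexp', visit=101, ccd=3):
    #         exposure = butler.get('calexp', visit=101, ccd=3)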

    def _locate(self, datasetType, dataId, write):
        """Get one or more ButlerLocations and/or ButlerComposites.

        Parameters
        ----------
        datasetType : string
            The datasetType that is being searched for. The datasetType may be followed by a dot and
            a component name (component names are specified in the policy), i.e. datasetType.componentName

        dataId : dict or DataId class instance
            The dataId

        write : bool
            True if this is a search to write an object. False if it is a search to read an object. This
            affects what type (an object or a container) is returned.

        Returns
        -------
        If write is False, will return either a single object or None. If write is True, will return a list
        (which may be empty)
        """
        repos = self._repos.outputs() if write else self._repos.inputs()
        locations = []
        for repoData in repos:
            # enforce dataId & repository tags when reading:
            if not write and dataId.tag and len(dataId.tag.intersection(repoData.tags)) == 0:
                continue
            components = datasetType.split('.')
            datasetType = components[0]
            components = components[1:]
            try:
                location = repoData.repo.map(datasetType, dataId, write=write)
            except NoResults:
                continue
            if location is None:
                continue
            location.datasetType = datasetType  # todo is there a better way than monkey patching here?
            if len(components) > 0:
                if not isinstance(location, ButlerComposite):
                    raise RuntimeError("The location for a dotted datasetType must be a composite.")
                # replace the first component name with the datasetType
                components[0] = location.componentInfo[components[0]].datasetType
                # join components back into a dot-delimited string
                datasetType = '.'.join(components)
                location = self._locate(datasetType, dataId, write)
                # if a component location is not found, we can not continue with this repo; move to the
                # next repo.
                if location is None:
                    break
            # if reading, only one location is desired.
            if location:
                if not write:
                    # If there is a bypass function for this dataset type, we can't test to see if the object
                    # exists in storage. Just return the location.
                    if hasattr(location.mapper, "bypass_" + location.datasetType):
                        try:
                            # The dataset for the location may or may not exist
                            # and may or may not be needed. Right now the only
                            # way to know is to call the bypass function.
                            location.bypass = self._getBypassFunc(location, dataId)()
                            return location
                        except Exception:
                            continue
                    # If a location was found but the location does not exist, keep looking in input
                    # repositories (the registry may have had enough data for a lookup even though the object
                    # exists in a different repository.)
                    if isinstance(location, ButlerComposite) or location.repository.exists(location):
                        return location
                else:
                    try:
                        locations.extend(location)
                    except TypeError:
                        locations.append(location)
        if not write:
            return None
        return locations

    @staticmethod
    def _getBypassFunc(location, dataId):
        pythonType = location.getPythonType()
        if pythonType is not None:
            if isinstance(pythonType, basestring):
                pythonType = doImport(pythonType)
        bypassFunc = getattr(location.mapper, "bypass_" + location.datasetType)
        return lambda: bypassFunc(location.datasetType, pythonType, location, dataId)

    def get(self, datasetType, dataId=None, immediate=True, **rest):
        """Retrieves a dataset given an input collection data id.

        Parameters
        ----------
        datasetType - str
            The type of dataset to retrieve.
        dataId - dict
            The data id.
        immediate - bool
            If False use a proxy for delayed loading.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        An object retrieved from the dataset (or a proxy for one).
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        location = self._locate(datasetType, dataId, write=False)
        if location is None:
            raise NoResults("No locations for get:", datasetType, dataId)
        self.log.debug("Get type=%s keys=%s from %s", datasetType, dataId, str(location))

        if isinstance(location, ButlerComposite):
            for name, componentInfo in location.componentInfo.items():
                if componentInfo.subset:
                    subset = self.subset(datasetType=componentInfo.datasetType, dataId=location.dataId)
                    componentInfo.obj = [obj.get() for obj in subset]
                else:
                    obj = self.get(componentInfo.datasetType, location.dataId, immediate=True)
                    componentInfo.obj = obj
            assembler = location.assembler or genericAssembler
            obj = assembler(dataId=location.dataId, componentInfo=location.componentInfo, cls=location.python)
            return obj

        if hasattr(location, 'bypass'):
            # this type loader block should get moved into a helper someplace, and duplications removed.
            callback = lambda: location.bypass
        else:
            callback = lambda: self._read(location)
        if location.mapper.canStandardize(location.datasetType):
            innerCallback = callback
            callback = lambda: location.mapper.standardize(location.datasetType, innerCallback(), dataId)
        if immediate:
            return callback()
        return ReadProxy(callback)
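
    # A usage sketch (illustrative, not part of the original source); 'calexp' and
    # the data id keys are hypothetical:
    #
    #     exposure = butler.get('calexp', visit=101, ccd=3)  # read immediately
    #     proxy = butler.get('calexp', dataId={'visit': 101, 'ccd': 3},
    #                        immediate=False)  # returns a lazy ReadProxy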

    def put(self, obj, datasetType, dataId={}, doBackup=False, **rest):
        """Persists a dataset given an output collection data id.

        Parameters
        ----------
        obj -
            The object to persist.
        datasetType - str
            The type of dataset to persist.
        dataId - dict
            The data id.
        doBackup - bool
            If True, rename existing instead of overwriting.
            WARNING: Setting doBackup=True is not safe for parallel processing, as it may be subject to race
            conditions.
        **rest
            Keyword arguments for the data id.
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        dataId.update(**rest)

        for location in self._locate(datasetType, dataId, write=True):
            if isinstance(location, ButlerComposite):
                disassembler = location.disassembler if location.disassembler else genericDisassembler
                disassembler(obj=obj, dataId=location.dataId, componentInfo=location.componentInfo)
                for name, info in location.componentInfo.items():
                    if not info.inputOnly:
                        self.put(info.obj, info.datasetType, location.dataId, doBackup=doBackup)
            else:
                if doBackup:
                    location.getRepository().backup(location.datasetType, dataId)
                location.getRepository().write(location, obj)
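
    # A usage sketch (illustrative, not part of the original source); 'calexp' and
    # the data id keys are hypothetical:
    #
    #     butler.put(exposure, 'calexp', visit=101, ccd=3)
    #     # rename any existing file instead of overwriting it:
    #     butler.put(exposure, 'calexp', dataId={'visit': 101, 'ccd': 3}, doBackup=True)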

    def subset(self, datasetType, level=None, dataId={}, **rest):
        """Return complete dataIds for a dataset type that match a partial (or empty) dataId.

        Given a partial (or empty) dataId specified in dataId and **rest, find all datasets that match the
        dataId. Optionally restrict the results to a given level specified by a dataId key (e.g. visit or
        sensor or amp for a camera). Return an iterable collection of complete dataIds as ButlerDataRefs.
        Datasets with the resulting dataIds may not exist; that needs to be tested with datasetExists().

        Parameters
        ----------
        datasetType - str
            The type of dataset collection to subset
        level - str
            The level of dataId at which to subset. Use an empty string if the mapper should look up the
            default level.
        dataId - dict
            The data id.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        subset - ButlerSubset
            Collection of ButlerDataRefs for datasets matching the data id.

        Examples
        --------
        To print the full dataIds for all r-band measurements in a source catalog
        (note that the subset call is equivalent to: `butler.subset('src', dataId={'filter':'r'})`):

        >>> subset = butler.subset('src', filter='r')
        >>> for data_ref in subset: print(data_ref.dataId)
        """
        datasetType = self._resolveDatasetTypeAlias(datasetType)

        # Currently expected behavior of subset is that if specified level is None then the mapper's default
        # level should be used. Convention for level within Butler is that an empty string is used to indicate
        # 'get default'.
        if level is None:
            level = ''

        dataId = DataId(dataId)
        dataId.update(**rest)
        return ButlerSubset(self, datasetType, level, dataId)

    def dataRef(self, datasetType, level=None, dataId={}, **rest):
        """Returns a single ButlerDataRef.

        Given a complete dataId specified in dataId and **rest, find the unique dataset at the given level
        specified by a dataId key (e.g. visit or sensor or amp for a camera) and return a ButlerDataRef.

        Parameters
        ----------
        datasetType - str
            The type of dataset collection to reference
        level - str
            The level of dataId at which to reference
        dataId - dict
            The data id.
        **rest
            Keyword arguments for the data id.

        Returns
        -------
        dataRef - ButlerDataRef
            ButlerDataRef for dataset matching the data id
        """

        datasetType = self._resolveDatasetTypeAlias(datasetType)
        dataId = DataId(dataId)
        subset = self.subset(datasetType, level, dataId, **rest)
        if len(subset) != 1:
            raise RuntimeError("No unique dataset for: Dataset type:%s Level:%s Data ID:%s Keywords:%s" %
                               (str(datasetType), str(level), str(dataId), str(rest)))
        return ButlerDataRef(subset, subset.cache[0])
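
    # A usage sketch (illustrative, not part of the original source); 'raw' and the
    # data id keys are hypothetical. The dataId must resolve to exactly one dataset:
    #
    #     ref = butler.dataRef('raw', visit=101, ccd=3)
    #     image = ref.get('raw')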

    def _read(self, location):
        """Unpersist an object using data inside a butlerLocation object.

        Parameters
        ----------
        location - ButlerLocation
            A butlerLocation instance populated with data needed to read the object.

        Returns
        -------
        object - an instance of the object specified by the butlerLocation.
        """
        self.log.debug("Starting read from %s", location)
        results = location.repository.read(location)
        if len(results) == 1:
            results = results[0]
        self.log.debug("Ending read from %s", location)
        return results

    def __reduce__(self):
        ret = (_unreduce, (self._initArgs, self.datasetTypeAliasDict))
        return ret

    def _resolveDatasetTypeAlias(self, datasetType):
        """Replaces all the known alias keywords in the given string with the alias value.

        Parameters
        ----------
        datasetType - str
            A datasetType string to search & replace on

        Returns
        -------
        datasetType - str
            The de-aliased string
        """
        for key in self.datasetTypeAliasDict:
            # if all aliases have been replaced, bail out
            if datasetType.find('@') == -1:
                break
            datasetType = datasetType.replace(key, self.datasetTypeAliasDict[key])

        # If an alias specifier can not be resolved then throw.
        if datasetType.find('@') != -1:
            raise RuntimeError("Unresolvable alias specifier in datasetType: %s" % (datasetType))

        return datasetType


def _unreduce(initArgs, datasetTypeAliasDict):
    mapperArgs = initArgs.pop('mapperArgs')
    initArgs.update(mapperArgs)
    butler = Butler(**initArgs)
    butler.datasetTypeAliasDict = datasetTypeAliasDict
    return butler