Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 13%
118 statements
coverage.py v6.4.1, created at 2022-07-09 06:14 -0700

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
import itertools
from collections import defaultdict
from typing import Callable, DefaultDict, Iterable, List, Mapping, Optional, Set, Tuple, Union

from lsst.daf.butler import Butler, Config, DataCoordinate, DatasetRef, DatasetType
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.daf.butler.transfers import RepoExportContext
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph, QuantumNode
from .pipeline import PipelineDatasetTypes
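
# Type alias for the mapping from dataset type to the set of data IDs of the
# datasets that are expected to be produced when the graph is executed.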
DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]


def _validate_dataset_type(
    candidate: DatasetType, previous: dict[Union[str, DatasetType], DatasetType]
) -> DatasetType:
    """Check the dataset types and return a consistent variant if there are
    different compatible options.

    Parameters
    ----------
    candidate : `lsst.daf.butler.DatasetType`
        The candidate dataset type.
    previous : `dict` [Union[`str`, `DatasetType`], `DatasetType`]
        Previous dataset types found, indexed by name and also by
        dataset type. The latter provides a quick way of returning a
        previously checked dataset type.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The dataset type to be used. This can be different from the
        given ``candidate`` if a previous dataset type was encountered
        with the same name and this one is compatible with it.

    Raises
    ------
    ConflictingDefinitionError
        Raised if a candidate dataset type has the same name as one
        previously encountered but is not compatible with it.

    Notes
    -----
    This function ensures that if a dataset type is given that has the
    same name as a previously encountered dataset type but differs solely
    in a way that is interchangeable (through a supported storage class)
    then we will always return the first dataset type encountered instead
    of the new variant. We assume that the butler will handle the
    type conversion itself later.
    """
    # First check whether we have previously vetted this dataset type.
    # Return the vetted form immediately if we have.
    checked = previous.get(candidate)
    if checked:
        return checked

    # Have not previously encountered this dataset type.
    name = candidate.name
    if prevDsType := previous.get(name):
        # Check compatibility. For now assume both directions have to
        # be acceptable.
        if prevDsType.is_compatible_with(candidate) and candidate.is_compatible_with(prevDsType):
            # Ensure that if this dataset type is used again we will return
            # the version that we were first given with this name. Store
            # it for next time and return the previous one.
            previous[candidate] = prevDsType
            return prevDsType
        else:
            raise ConflictingDefinitionError(
                f"Dataset type incompatibility in graph: {prevDsType} not compatible with {candidate}"
            )

    # New dataset type encountered. Store it by name and by dataset type
    # so it will be validated immediately next time it comes up.
    previous[name] = candidate
    previous[candidate] = candidate
    return candidate
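

# Illustrative behaviour of _validate_dataset_type (the dataset type variables
# below are hypothetical; assume both carry the same name but storage classes
# that can convert to each other):
#
#     seen: dict[Union[str, DatasetType], DatasetType] = {}
#     first = _validate_dataset_type(dsTypeA, seen)   # new name: returned as-is
#     second = _validate_dataset_type(dsTypeB, seen)  # same name, compatible
#     assert second is dsTypeA                        # first variant wins
#
# If dsTypeB shared the name but was not compatible in both directions,
# ConflictingDefinitionError would be raised instead.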


def _accumulate(
    graph: QuantumGraph,
    dataset_types: PipelineDatasetTypes,
) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # Accumulate the DatasetRefs that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    # It is possible to end up with a graph that has different storage
    # classes attached to the same dataset type name. This is okay, but
    # we must ensure that only a single dataset type definition is
    # accumulated in the loop below. This data structure caches every dataset
    # type encountered and stores the compatible alternative.
    datasetTypes: dict[Union[str, DatasetType], DatasetType] = {}

    # Add inserts for initOutputs (including initIntermediates); these are
    # defined fully by their DatasetType, because they have no dimensions, and
    # they are by definition not resolved. initInputs are part of Quantum and
    # that's the only place the graph stores the dataset IDs, so we process
    # them there even though each Quantum for a task has the same ones.
    for dataset_type in itertools.chain(dataset_types.initIntermediates, dataset_types.initOutputs):
        dataset_type = _validate_dataset_type(dataset_type, datasetTypes)
        inserts[dataset_type].add(DataCoordinate.makeEmpty(dataset_type.dimensions.universe))

    n: QuantumNode
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)

            for type, refs in attr.items():
                # This if block is needed because initInputs has a different
                # signature for its items.
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references. If a reference has an id,
                # it already exists and should be exported; if not, it should
                # be inserted into the new registry.
                for ref in refs:
                    if ref.id is not None:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component
                            # will be part of some other upstream dataset, so
                            # it should be safe to skip them here.
                            continue
                        type = _validate_dataset_type(type, datasetTypes)
                        inserts[type].add(ref.dataId)

    return exports, inserts
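

# Illustrative shape of the _accumulate() return value (dataset type names and
# dimensions below are hypothetical):
#
#     exports = {DatasetRef(calexp, {instrument, visit, detector}, id=...), ...}
#     inserts = {DatasetType(deepCoadd, ...): {DataCoordinate(...), ...}, ...}
#
# ``exports`` holds resolved refs that already exist in the input butler, while
# ``inserts`` maps each dataset type to the data IDs expected to be produced
# when the graph is executed.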


def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method may be able to be removed in the future if collection export
    # logic changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections
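

# For example (collection names hypothetical): if "HSC/defaults" is a CHAINED
# collection whose children are "HSC/raw/all" and "HSC/calib", then passing
# {"HSC/defaults"} to _discoverCollections() returns all three names, so both
# the chain and every collection it points to get exported.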


def _export(
    butler: Butler, collections: Optional[Iterable[str]], exports: Set[DatasetRef], inserts: DataSetTypeMap
) -> io.StringIO:
    # This exports the datasets that exist in the input butler using
    # daf_butler objects; however, it reaches in deep and does not use the
    # public methods so that it can export to a string buffer and skip
    # disk access.
    yamlBuffer = io.StringIO()
    # YAML is hard coded, since the class controls both ends of the
    # export/import.
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer, universe=butler.registry.dimensions)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)

    # Need to ensure that the dimension records for inputs are transferred.
    # Butler.transfer_from() does not (yet) transfer records.
    dataIds = set(ref.dataId for ref in exports)
    exporter.saveDataIds(dataIds)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, dataIds in inserts.items():
        exporter.saveDataIds(dataIds)

    # Look for any explicitly given collections; if there are none, fall back
    # to the registry defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer


def _setupNewButler(butler: Butler, outputLocation: ResourcePath, dirExists: bool) -> Butler:
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry database; if the code got this far and
        # it exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["allow_put_of_predefined_dataset"] = True
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in the main registry.
    config.pop(("registry", "namespace"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if config.get(("datastore", "root")) == BUTLER_ROOT_TAG and butler._config.configDir is not None:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # Requires that we use the dimension configuration from the original
    # butler and not use the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.registry.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler.
    return Butler(config, writeable=True)


def _import(
    yamlBuffer: io.StringIO,
    newButler: Butler,
    inserts: DataSetTypeMap,
    run: Optional[str],
    butlerModifier: Optional[Callable[[Butler], Butler]],
) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets using "split" mode. "split" is safe
    # because the execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True, transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register datasets to be produced and insert them into the registry.
    for dsType, dataIds in inserts.items():
        # There may be inconsistencies with storage class definitions,
        # so those differences must be checked.
        try:
            newButler.registry.registerDatasetType(dsType)
        except ConflictingDefinitionError:
            # We do not at this point know whether the dataset type is
            # an intermediate (and so must be able to support conversion
            # from the registry storage class to an input) or solely an output
            # dataset type. Test both compatibilities.
            registryDsType = newButler.registry.getDatasetType(dsType.name)
            if registryDsType.is_compatible_with(dsType) and dsType.is_compatible_with(registryDsType):
                # Ensure that we use the registry type when inserting.
                dsType = registryDsType
            else:
                # Not compatible, so re-raise the original exception.
                raise

        newButler.registry.insertDatasets(dsType, dataIds, run)

    return newButler


def buildExecutionButler(
    butler: Butler,
    graph: QuantumGraph,
    outputLocation: ResourcePathExpression,
    run: Optional[str],
    *,
    clobber: bool = False,
    butlerModifier: Optional[Callable[[Butler], Butler]] = None,
    collections: Optional[Iterable[str]] = None,
) -> Butler:
    r"""buildExecutionButler is a function that is responsible for exporting
    input `QuantumGraph`\ s into a new minimal `~lsst.daf.butler.Butler` which
    only contains datasets specified by the `QuantumGraph`. These datasets are
    both those that already exist in the input `~lsst.daf.butler.Butler`, and
    those that are expected to be produced during the execution of the
    `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any
        `QuantumGraph`\ s that will be converted with this object.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : convertible to `ResourcePath`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `ResourcePath` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in registry.defaults will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        what is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied this should be a callable that accepts a
        `~lsst.daf.butler.Butler`, and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. This
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting Datasets expected
        to be produced. Examples of what this method could do include
        things such as creating collections/runs, etc.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
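
    Examples
    --------
    A minimal sketch of typical usage; the repository path, graph file, output
    location, and run name below are hypothetical:

    >>> butler = Butler("/repo/main")  # doctest: +SKIP
    >>> qgraph = QuantumGraph.loadUri("pipeline.qgraph")  # doctest: +SKIP
    >>> executionButler = buildExecutionButler(  # doctest: +SKIP
    ...     butler, qgraph, "/scratch/execution_butler", run="u/user/demo_run"
    ... )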
384 """
385 # We know this must refer to a directory.
386 outputLocation = ResourcePath(outputLocation, forceDirectory=True)
388 # Do this first to Fail Fast if the output exists
389 if (dirExists := outputLocation.exists()) and not clobber:
390 raise FileExistsError("Cannot create a butler at specified location, location exists")
391 if not outputLocation.isdir():
392 raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")
394 # Gather all DatasetTypes from the Python and check any that already exist
395 # in the registry for consistency. This does not check that all dataset
396 # types here exist, because they might want to register dataset types
397 # later. It would be nice to also check that, but to that we would need to
398 # be told whether they plan to register dataset types later (DM-30845).
399 dataset_types = PipelineDatasetTypes.fromPipeline(graph.iterTaskGraph(), registry=butler.registry)
401 exports, inserts = _accumulate(graph, dataset_types)
402 yamlBuffer = _export(butler, collections, exports, inserts)
404 newButler = _setupNewButler(butler, outputLocation, dirExists)
406 newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier)
408 # Transfer the existing datasets directly from the source butler.
409 newButler.transfer_from(
410 butler,
411 exports,
412 transfer="auto", # No transfers should be happening.
413 skip_missing=False, # Everything should exist.
414 register_dataset_types=True,
415 )
417 return newButler