Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 9%
148 statements
coverage.py v6.5.0, created at 2023-06-06 10:05 +0000
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
from collections import defaultdict
from typing import Callable, Iterable, List, Mapping, Optional, Set, Tuple, Union

from lsst.daf.butler import Butler, Config, DatasetRef, DatasetType, Registry
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingDatasetTypeError
from lsst.daf.butler.transfers import RepoExportContext
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph

DataSetTypeRefMap = Mapping[DatasetType, Set[DatasetRef]]
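# DataSetTypeRefMap groups resolved DatasetRefs by their DatasetType; below it
# is used for the datasets that are expected to be produced and therefore only
# need to be registered (not transferred) in the execution butler.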
def _validate_dataset_type(
    candidate: DatasetType, previous: dict[Union[str, DatasetType], DatasetType], registry: Registry
) -> DatasetType:
    """Check the candidate dataset type and return a consistent variant if
    there are different compatible options.

    Parameters
    ----------
    candidate : `lsst.daf.butler.DatasetType`
        The candidate dataset type.
    previous : `dict` [`str` or `DatasetType`, `DatasetType`]
        Previous dataset types found, indexed by name and also by
        dataset type. The latter provides a quick way of returning a
        previously checked dataset type.
    registry : `lsst.daf.butler.Registry`
        Main registry whose dataset type registration should override the
        given one if it exists.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The dataset type to be used. This can be different from the
        given ``candidate`` if a previous dataset type was encountered
        with the same name and this one is compatible with it.

    Raises
    ------
    ConflictingDefinitionError
        Raised if a candidate dataset type has the same name as one
        previously encountered but is not compatible with it.

    Notes
    -----
    This function ensures that if a dataset type is given that has the
    same name as a previously encountered dataset type but differs solely
    in a way that is interchangeable (through a supported storage class),
    then we will always return the first dataset type encountered instead
    of the new variant. We assume that the butler will handle the
    type conversion itself later.
    """
    # First check whether we have previously vetted this dataset type.
    # Return the vetted form immediately if we have.
    checked = previous.get(candidate)
    if checked:
        return checked

    # Have not previously encountered this dataset type.
    name = candidate.name
    if prevDsType := previous.get(name):
        # Check compatibility. For now assume both directions have to
        # be acceptable.
        if prevDsType.is_compatible_with(candidate) and candidate.is_compatible_with(prevDsType):
            # Ensure that if this dataset type is used again we will return
            # the version that we were first given with this name. Store
            # it for next time and return the previous one.
            previous[candidate] = prevDsType
            return prevDsType
        else:
            raise ConflictingDefinitionError(
                f"Dataset type incompatibility in graph: {prevDsType} not compatible with {candidate}"
            )

    # We haven't seen this dataset type in this graph before, but it may
    # already be in the registry.
    try:
        registryDsType = registry.getDatasetType(name)
        previous[candidate] = registryDsType
        return registryDsType
    except MissingDatasetTypeError:
        pass
    # Dataset type is totally new. Store it by name and by dataset type so
    # it will be validated immediately next time it comes up.
    previous[name] = candidate
    previous[candidate] = candidate
    return candidate
def _accumulate(
    butler: Butler,
    graph: QuantumGraph,
) -> Tuple[Set[DatasetRef], DataSetTypeRefMap]:
    # Accumulate the DatasetRefs that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to the refs that are to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DataSetTypeRefMap = defaultdict(set)

    # It is possible to end up with a graph that has different storage
    # classes attached to the same dataset type name. This is okay, but
    # we must ensure that only a single dataset type definition is
    # accumulated in the loop below. This data structure caches every dataset
    # type encountered and stores the compatible alternative.
    datasetTypes: dict[Union[str, DatasetType], DatasetType] = {}
    # Find the initOutput refs.
    initOutputRefs = list(graph.globalInitOutputRefs())
    for task_def in graph.iterTaskGraph():
        task_refs = graph.initOutputRefs(task_def)
        if task_refs:
            initOutputRefs.extend(task_refs)
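    # All init-output refs are treated as datasets to be produced: they are
    # added to ``inserts``, with component refs converted to their parent
    # composite dataset type first.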
    for ref in initOutputRefs:
        dataset_type = ref.datasetType
        if dataset_type.component() is not None:
            dataset_type = dataset_type.makeCompositeDatasetType()
        dataset_type = _validate_dataset_type(dataset_type, datasetTypes, butler.registry)
        inserts[dataset_type].add(ref)
    # Output references may be resolved even if they do not exist. Find all
    # refs that actually exist.
    check_refs: Set[DatasetRef] = set()
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)
            for type, refs in attr.items():
                # Normalize to a list: initInputs stores a single ref per
                # dataset type, while inputs and outputs store lists.
                if not isinstance(refs, list):
                    refs = [refs]
                for ref in refs:
                    if ref.isComponent():
                        ref = ref.makeCompositeRef()
                    check_refs.add(ref)
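    # Ask the datastore which of these refs it actually knows about; dataset
    # IDs found here will be exported below, while everything else is treated
    # as a product still to be made and is inserted into the new registry.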
    exist_map = butler.datastore.knows_these(check_refs)
    existing_ids = set(ref.id for ref, exists in exist_map.items() if exists)
    del exist_map
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr = getattr(quantum, attrName)

            for type, refs in attr.items():
                if not isinstance(refs, list):
                    refs = [refs]
                if type.component() is not None:
                    type = type.makeCompositeDatasetType()
                type = _validate_dataset_type(type, datasetTypes, butler.registry)
                # Iterate over all the references: those that already exist
                # should be exported, while the rest should be inserted into
                # the new registry.
                for ref in refs:
                    # A component dataset ID is the same as its parent ID, so
                    # checking a component against existing_ids works OK.
                    if ref.id in existing_ids:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        # Make sure we export this with the registry's dataset
                        # type, since transfer_from doesn't handle storage
                        # class differences (maybe it should, but it's not
                        # bad to be defensive here even if that changes).
                        if type != ref.datasetType:
                            ref = ref.overrideStorageClass(type.storageClass)
                            assert ref.datasetType == type, "Dataset types should not differ in other ways."
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component will
                            # be part of some other upstream dataset, so it
                            # should be safe to skip it here.
                            continue
                        inserts[type].add(ref)

    return exports, inserts
def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method may become unnecessary in the future if collection export
    # logic changes.
    collections = set(collections)
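    # Iterate to a fixed point: each pass can flatten newly discovered CHAINED
    # collections, so stop only when no new collections are found.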
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections
def _export(butler: Butler, collections: Optional[Iterable[str]], inserts: DataSetTypeRefMap) -> io.StringIO:
    # This exports relevant dimension records and collections using daf butler
    # objects; however, it reaches in deep and does not use the public methods
    # so that it can export to a string buffer and skip disk access. This
    # does not export the datasets themselves, since we use transfer_from for
    # that.
    yamlBuffer = io.StringIO()
    # Yaml is hard coded, since the class controls both ends of the
    # export/import.
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer, universe=butler.registry.dimensions)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, refs in inserts.items():
        exporter.saveDataIds([ref.dataId for ref in refs])

    # Use the given collections if supplied; otherwise fall back to the
    # registry defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer
def _setupNewButler(
    butler: Butler,
    outputLocation: ResourcePath,
    dirExists: bool,
    datastoreRoot: Optional[ResourcePath] = None,
) -> Butler:
    """Set up the execution butler.

    Parameters
    ----------
    butler : `Butler`
        The original butler, upon which the execution butler is based.
    outputLocation : `ResourcePath`
        Location of the execution butler.
    dirExists : `bool`
        Whether the ``outputLocation`` already exists. If it does, the caller
        has already determined that it should be clobbered.
    datastoreRoot : `ResourcePath`, optional
        Path for the execution butler datastore. If not specified, then the
        original butler's datastore will be used.

    Returns
    -------
    execution_butler : `Butler`
        Execution butler.
    """
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry database; if the code got this far and
        # the location exists, clobber must be `True`.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in the main registry.
    config.pop(("registry", "namespace"), None)

    # The obscore manager cannot be used with an execution butler.
    config.pop(("registry", "managers", "obscore"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if datastoreRoot is not None:
        config["datastore", "root"] = datastoreRoot.geturl()
    elif config.get(("datastore", "root")) == BUTLER_ROOT_TAG and butler._config.configDir is not None:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # We must use the dimension configuration from the original butler and
    # not the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.registry.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler.
    return Butler(config, writeable=True)
def _import(
    yamlBuffer: io.StringIO,
    newButler: Butler,
    inserts: DataSetTypeRefMap,
    run: Optional[str],
    butlerModifier: Optional[Callable[[Butler], Butler]],
) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets using "split" mode. "split" is safe
    # because the execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register dataset types to be produced and insert the expected datasets
    # into the registry.
    for dsType, refs in inserts.items():
        # Storage class differences should have already been resolved by calls
        # to _validate_dataset_type in _accumulate, resulting in the registry
        # dataset type whenever one exists.
        newButler.registry.registerDatasetType(dsType)
        newButler.registry._importDatasets(refs)

    return newButler
def buildExecutionButler(
    butler: Butler,
    graph: QuantumGraph,
    outputLocation: ResourcePathExpression,
    run: Optional[str],
    *,
    clobber: bool = False,
    butlerModifier: Optional[Callable[[Butler], Butler]] = None,
    collections: Optional[Iterable[str]] = None,
    datastoreRoot: Optional[ResourcePathExpression] = None,
    transfer: str = "auto",
) -> Butler:
    r"""Export the datasets referenced by a `QuantumGraph` into a new, minimal
    `~lsst.daf.butler.Butler` which only contains datasets specified by the
    `QuantumGraph`. These datasets are both those that already exist in the
    input `~lsst.daf.butler.Butler`, and those that are expected to be
    produced during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any `QuantumGraphs`
        that will be converted with this function.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : convertible to `ResourcePath`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `ResourcePath` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in ``registry.defaults`` will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        whatever is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied, this should be a callable that accepts a
        `~lsst.daf.butler.Butler` and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. It
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting datasets expected
        to be produced. Examples of what this callable could do include
        things such as creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied, the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.
    datastoreRoot : convertible to `ResourcePath`, optional
        Root directory for the datastore of the execution butler. If `None`,
        then the original butler's datastore will be used.
    transfer : `str`
        How (and whether) the input datasets should be added to the execution
        butler datastore. This should be a ``transfer`` string recognized by
        :func:`lsst.resources.ResourcePath.transfer_from`.
        ``"auto"`` is converted to ``"copy"`` if ``datastoreRoot`` is
        specified.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
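
    Examples
    --------
    A minimal usage sketch; the repository path, graph file, and output
    location below are hypothetical, and the graph is assumed to have been
    built against the given repository::

        from lsst.daf.butler import Butler
        from lsst.pipe.base import QuantumGraph
        from lsst.pipe.base.executionButlerBuilder import buildExecutionButler

        butler = Butler("/repo/main")
        # Load a previously saved QuantumGraph (hypothetical path).
        qgraph = QuantumGraph.loadUri("pipeline.qgraph")
        execButler = buildExecutionButler(
            butler,
            qgraph,
            outputLocation="/scratch/execution_butler",
            run=None,
        )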
444 """
    # Now require that if run is given it must match the graph run.
    if run and graph.metadata and run != (graph_run := graph.metadata.get("output_run")):
        raise ValueError(f"The given run, {run!r}, does not match that specified in the graph, {graph_run!r}")

    # We know this must refer to a directory.
    outputLocation = ResourcePath(outputLocation, forceDirectory=True)
    if datastoreRoot is not None:
        datastoreRoot = ResourcePath(datastoreRoot, forceDirectory=True)

    # Do this first to fail fast if the output exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    exports, inserts = _accumulate(butler, graph)
    yamlBuffer = _export(butler, collections, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists, datastoreRoot)

    newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier)

    if transfer == "auto" and datastoreRoot is not None:
        transfer = "copy"

    # Transfer the existing datasets directly from the source butler.
    newButler.transfer_from(
        butler,
        exports,
        transfer=transfer,
        skip_missing=False,  # Everything should exist.
        register_dataset_types=True,
        transfer_dimensions=True,
    )

    return newButler