Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 9%
148 statements
coverage.py v7.2.7, created at 2023-06-16 09:02 +0000
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
from collections import defaultdict
from collections.abc import Callable, Iterable, Mapping

from lsst.daf.butler import Butler, Config, DatasetRef, DatasetType, Registry
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingDatasetTypeError
from lsst.daf.butler.transfers import RepoExportContext
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph

DataSetTypeRefMap = Mapping[DatasetType, set[DatasetRef]]


def _validate_dataset_type(
    candidate: DatasetType, previous: dict[str | DatasetType, DatasetType], registry: Registry
) -> DatasetType:
    """Check the dataset types and return a consistent variant if there are
    different compatible options.

    Parameters
    ----------
    candidate : `lsst.daf.butler.DatasetType`
        The candidate dataset type.
    previous : `dict` [ `str` | `~lsst.daf.butler.DatasetType`, \
            `~lsst.daf.butler.DatasetType`]
        Previous dataset types found, indexed by name and also by
        dataset type. The latter provides a quick way of returning a
        previously checked dataset type.
    registry : `lsst.daf.butler.Registry`
        Main registry whose dataset type registration should override the
        given one if it exists.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The dataset type to be used. This can be different from the
        given ``candidate`` if a previous dataset type was encountered
        with the same name and this one is compatible with it.

    Raises
    ------
    ConflictingDefinitionError
        Raised if a candidate dataset type has the same name as one
        previously encountered but is not compatible with it.

    Notes
    -----
    This function ensures that if a dataset type is given that has the
    same name as a previously encountered dataset type but differs solely
    in a way that is interchangeable (through a supported storage class)
    then we will always return the first dataset type encountered instead
    of the new variant. We assume that the butler will handle the
    type conversion itself later.
    """
    # First check whether we have previously vetted this dataset type.
    # Return the vetted form immediately if we have.
    checked = previous.get(candidate)
    if checked:
        return checked

    # Have not previously encountered this dataset type.
    name = candidate.name
    if prevDsType := previous.get(name):
        # Check compatibility. For now assume both directions have to
        # be acceptable.
        if prevDsType.is_compatible_with(candidate) and candidate.is_compatible_with(prevDsType):
            # Ensure that if this dataset type is used again we will return
            # the version that we were first given with this name. Store
            # it for next time and return the previous one.
            previous[candidate] = prevDsType
            return prevDsType
        else:
            raise ConflictingDefinitionError(
                f"Dataset type incompatibility in graph: {prevDsType} not compatible with {candidate}"
            )

    # We haven't seen this dataset type in this graph before, but it may
    # already be in the registry.
    try:
        registryDsType = registry.getDatasetType(name)
        previous[candidate] = registryDsType
        return registryDsType
    except MissingDatasetTypeError:
        pass
    # Dataset type is totally new. Store it by name and by dataset type so
    # it will be validated immediately next time it comes up.
    previous[name] = candidate
    previous[candidate] = candidate
    return candidate
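
# Hedged illustration of the vetting behaviour above (kept as a comment so
# nothing runs at import time). Assuming ``candidate_a`` and ``candidate_b``
# are hypothetical DatasetType instances that share a name and mutually
# compatible storage classes, and that the name is not yet registered, the
# first variant seen always wins:
#
#     previous: dict[str | DatasetType, DatasetType] = {}
#     first = _validate_dataset_type(candidate_a, previous, butler.registry)
#     second = _validate_dataset_type(candidate_b, previous, butler.registry)
#     assert second is first  # candidate_b collapses onto the first variant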


def _accumulate(
    butler: Butler,
    graph: QuantumGraph,
) -> tuple[set[DatasetRef], DataSetTypeRefMap]:
    # Accumulate the DatasetRefs that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to the refs that are to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DataSetTypeRefMap = defaultdict(set)

    # It is possible to end up with a graph that has different storage
    # classes attached to the same dataset type name. This is okay, but
    # we must ensure that only a single dataset type definition is
    # accumulated in the loop below. This data structure caches every dataset
    # type encountered and stores the compatible alternative.
    datasetTypes: dict[str | DatasetType, DatasetType] = {}

    # Find the initOutput refs.
    initOutputRefs = list(graph.globalInitOutputRefs())
    for task_def in graph.iterTaskGraph():
        task_refs = graph.initOutputRefs(task_def)
        if task_refs:
            initOutputRefs.extend(task_refs)

    for ref in initOutputRefs:
        dataset_type = ref.datasetType
        if dataset_type.component() is not None:
            dataset_type = dataset_type.makeCompositeDatasetType()
        dataset_type = _validate_dataset_type(dataset_type, datasetTypes, butler.registry)
        inserts[dataset_type].add(ref)

    # Output references may be resolved even if they do not exist. Find all
    # actually existing refs.
    check_refs: set[DatasetRef] = set()
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, DatasetRef | list[DatasetRef]] = getattr(quantum, attrName)
            for type, refs in attr.items():
                # This if block is needed because the items of initInputs are
                # single refs rather than lists of refs.
                if not isinstance(refs, list):
                    refs = [refs]
                for ref in refs:
                    if ref.isComponent():
                        ref = ref.makeCompositeRef()
                    check_refs.add(ref)
    exist_map = butler.datastore.knows_these(check_refs)
    existing_ids = set(ref.id for ref, exists in exist_map.items() if exists)
    del exist_map

    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr = getattr(quantum, attrName)

            for type, refs in attr.items():
                if not isinstance(refs, list):
                    refs = [refs]
                if type.component() is not None:
                    type = type.makeCompositeDatasetType()
                type = _validate_dataset_type(type, datasetTypes, butler.registry)
                # Iterate over all the references: if a ref already exists it
                # should be exported; if not, it should be inserted into the
                # new registry.
                for ref in refs:
                    # Component dataset ID is the same as its parent ID, so
                    # checking component in existing_ids works OK.
                    if ref.id in existing_ids:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        # Make sure we export this with the registry's dataset
                        # type, since transfer_from doesn't handle storage
                        # class differences (maybe it should, but it's not
                        # bad to be defensive here even if that changes).
                        if type != ref.datasetType:
                            ref = ref.overrideStorageClass(type.storageClass)
                            assert ref.datasetType == type, "Dataset types should not differ in other ways."
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component will
                            # be part of some other upstream dataset, so it
                            # should be safe to skip them here.
                            continue
                        inserts[type].add(ref)

    return exports, inserts
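
# Hedged usage sketch of the helper above (comment only; ``butler`` and
# ``graph`` are assumed to already exist):
#
#     exports, inserts = _accumulate(butler, graph)
#     # ``exports`` holds refs the datastore already knows about, later copied
#     # with Butler.transfer_from; ``inserts`` maps each output DatasetType to
#     # the refs the QuantumGraph is expected to produce.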


def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method may become removable in the future if the collection export
    # logic changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections
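
# Hedged illustration of the fixed-point loop above, assuming a hypothetical
# chained collection "HSC/defaults" that chains to "HSC/raw/all" and
# "HSC/calib":
#
#     found = _discoverCollections(butler, ["HSC/defaults"])
#     # ``found`` now also contains "HSC/raw/all" and "HSC/calib" (and any
#     # chains they in turn reference), because queryCollections is re-run
#     # until the set of collections stops growing.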


def _export(butler: Butler, collections: Iterable[str] | None, inserts: DataSetTypeRefMap) -> io.StringIO:
    # This exports relevant dimension records and collections using daf butler
    # objects. It reaches in deep and does not use the public methods, so that
    # it can export to a string buffer and skip disk access. This does not
    # export the datasets themselves, since we use transfer_from for that.
    yamlBuffer = io.StringIO()
    # Yaml is hard coded, since the class controls both ends of the
    # export/import.
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer, universe=butler.dimensions)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, refs in inserts.items():
        exporter.saveDataIds([ref.dataId for ref in refs])

    # Look for any defined collections; if none were given, use the defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer
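
# Hedged sketch of how the buffer produced above is consumed (see _import
# below); ``inserts`` comes from _accumulate:
#
#     yamlBuffer = _export(butler, collections=None, inserts=inserts)
#     # The buffer is rewound to offset 0 and contains a YAML export of
#     # dimension records and collections only; the datasets themselves are
#     # transferred later with Butler.transfer_from.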


def _setupNewButler(
    butler: Butler,
    outputLocation: ResourcePath,
    dirExists: bool,
    datastoreRoot: ResourcePath | None = None,
) -> Butler:
    """Set up the execution butler.

    Parameters
    ----------
    butler : `Butler`
        The original butler, upon which the execution butler is based.
    outputLocation : `~lsst.resources.ResourcePath`
        Location of the execution butler.
    dirExists : `bool`
        Does the ``outputLocation`` exist, and if so, should it be clobbered?
    datastoreRoot : `~lsst.resources.ResourcePath`, optional
        Path for the execution butler datastore. If not specified, then the
        original butler's datastore will be used.

    Returns
    -------
    execution_butler : `Butler`
        Execution butler.
    """
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry database; if the code got this far and
        # it exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in the main registry.
    config.pop(("registry", "namespace"), None)

    # Obscore manager cannot be used with execution butler.
    config.pop(("registry", "managers", "obscore"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if datastoreRoot is not None:
        config["datastore", "root"] = datastoreRoot.geturl()
    elif config.get(("datastore", "root")) == BUTLER_ROOT_TAG and butler._config.configDir is not None:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # Requires that we use the dimension configuration from the original
    # butler and not use the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler.
    return Butler(config, writeable=True)
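
# Hedged example of what the helper above produces, assuming
# ``outputLocation`` is an empty directory:
#
#     newButler = _setupNewButler(butler, outputLocation, dirExists=False)
#     # The new repository reuses the original dimension configuration, points
#     # its registry at <outputLocation>/gen3.sqlite3, and enables
#     # trust_get_request so files that still live in the original datastore
#     # remain readable.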


def _import(
    yamlBuffer: io.StringIO,
    newButler: Butler,
    inserts: DataSetTypeRefMap,
    run: str | None,
    butlerModifier: Callable[[Butler], Butler] | None,
) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets using "split" mode. "split" is safe
    # because the execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register datasets to be produced and insert them into the registry.
    for dsType, refs in inserts.items():
        # Storage class differences should have already been resolved by calls
        # to _validate_dataset_type in _accumulate, resulting in the Registry
        # dataset type whenever that exists.
        newButler.registry.registerDatasetType(dsType)
        newButler.registry._importDatasets(refs)

    return newButler
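
# Hedged sketch of the call sequence (mirrors buildExecutionButler below):
#
#     newButler = _import(yamlBuffer, newButler, inserts, run=None,
#                         butlerModifier=None)
#     # Dimension records and collections arrive via import_ with
#     # transfer="split"; the predicted outputs are then registered and
#     # inserted as dataset refs with no datastore artifacts yet.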


def buildExecutionButler(
    butler: Butler,
    graph: QuantumGraph,
    outputLocation: ResourcePathExpression,
    run: str | None,
    *,
    clobber: bool = False,
    butlerModifier: Callable[[Butler], Butler] | None = None,
    collections: Iterable[str] | None = None,
    datastoreRoot: ResourcePathExpression | None = None,
    transfer: str = "auto",
) -> Butler:
    r"""Create an execution butler.

    Responsible for exporting
    input `QuantumGraph`\s into a new minimal `~lsst.daf.butler.Butler` which
    only contains datasets specified by the `QuantumGraph`.

    These datasets are both those that already exist in the input
    `~lsst.daf.butler.Butler`, and those that are expected to be produced
    during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any `QuantumGraphs`
        that will be converted with this object.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : convertible to `~lsst.resources.ResourcePath`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `~lsst.resources.ResourcePath` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in registry.defaults will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        what is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied this should be a callable that accepts a
        `~lsst.daf.butler.Butler`, and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. This
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting Datasets expected
        to be produced. Examples of what this callable could do include
        things such as creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.
    datastoreRoot : convertible to `~lsst.resources.ResourcePath`, optional
        Root directory for the datastore of the execution butler. If `None`,
        the original butler's datastore will be used.
    transfer : `str`
        How (and whether) the input datasets should be added to the execution
        butler datastore. This should be a ``transfer`` string recognized by
        :func:`lsst.resources.ResourcePath.transfer_from`.
        ``"auto"`` means ``"copy"`` if ``datastoreRoot`` is specified.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and clobber is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
    """
    # Now require that if run is given it must match the graph run.
    if run and graph.metadata and run != (graph_run := graph.metadata.get("output_run")):
        raise ValueError(f"The given run, {run!r}, does not match that specified in the graph, {graph_run!r}")

    # We know this must refer to a directory.
    outputLocation = ResourcePath(outputLocation, forceDirectory=True)
    if datastoreRoot is not None:
        datastoreRoot = ResourcePath(datastoreRoot, forceDirectory=True)

    # Do this first to fail fast if the output exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    exports, inserts = _accumulate(butler, graph)
    yamlBuffer = _export(butler, collections, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists, datastoreRoot)

    newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier)

    if transfer == "auto" and datastoreRoot is not None:
        transfer = "copy"

    # Transfer the existing datasets directly from the source butler.
    newButler.transfer_from(
        butler,
        exports,
        transfer=transfer,
        skip_missing=False,  # Everything should exist.
        register_dataset_types=True,
        transfer_dimensions=True,
    )

    return newButler
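
# Hedged end-to-end usage sketch (kept as a comment so nothing runs at import
# time). The repository path, graph file, and collection name below are
# hypothetical:
#
#     from lsst.daf.butler import Butler
#     from lsst.pipe.base import QuantumGraph
#
#     butler = Butler("/repo/main", collections=["HSC/defaults"])
#     graph = QuantumGraph.loadUri("pipeline.qgraph", butler.dimensions)
#     execButler = buildExecutionButler(
#         butler,
#         graph,
#         outputLocation="/scratch/exec_butler",
#         run=None,          # fall back to the registry default run
#         clobber=True,      # replace anything already at outputLocation
#         transfer="auto",   # becomes "copy" only if datastoreRoot is given
#     )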