Coverage for python/lsst/daf/butler/tests/_testRepo.py: 12% (141 statements)
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = [
    "makeTestRepo",
    "makeTestCollection",
    "addDatasetType",
    "expandUniqueId",
    "DatastoreMock",
    "addDataIdValue",
]

import random
from typing import Any, Iterable, Mapping, Optional, Set, Tuple, Union
from unittest.mock import MagicMock

import sqlalchemy
from lsst.daf.butler import (
    Butler,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionUniverse,
    FileDataset,
)


def makeTestRepo(
    root: str, dataIds: Optional[Mapping[str, Iterable]] = None, *, config: Config = None, **kwargs
) -> Butler:
    """Create an empty test repository.

    Parameters
    ----------
    root : `str`
        The location of the root directory for the repository.
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`], optional
        A mapping keyed by the dimensions used in the test. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`). Related dimensions (e.g., instruments and detectors)
        are linked arbitrarily, with values created for implied dimensions
        only when needed. This parameter is provided for compatibility with
        old code; newer code should make the repository, then call
        `~lsst.daf.butler.tests.addDataIdValue`.
    config : `lsst.daf.butler.Config`, optional
        A configuration for the repository (for details, see
        `lsst.daf.butler.Butler.makeRepo`). If omitted, creates a repository
        with default dataset and storage types, but optimized for speed. The
        defaults set ``.datastore.cls``, ``.datastore.checksum`` and
        ``.registry.db``. If a supplied config does not specify these values
        the internal defaults will be used to ensure that we have a usable
        configuration.
    **kwargs
        Extra arguments to `lsst.daf.butler.Butler.makeRepo`.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to the new repository. This Butler is provided
        only for additional setup; to keep test cases isolated, it is highly
        recommended that each test create its own Butler with a unique
        run/collection. See `makeTestCollection`.

    Notes
    -----
    This function provides a "quick and dirty" repository for simple unit
    tests that don't depend on complex data relationships. It is ill-suited
    for tests where the structure of the data matters. If you need such a
    dataset, create it directly or use a saved test dataset.
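
    Examples
    --------
    A minimal sketch of typical use (not a doctest because it writes to
    disk; the temporary-directory handling shown is illustrative only):

    .. code-block:: py

        >>> import tempfile
        >>> root = tempfile.mkdtemp()
        >>> butler = makeTestRepo(
                root, {"instrument": ["notACam"], "detector": [1]})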
92 """
93 defaults = Config()
94 defaults["datastore", "cls"] = "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"
95 defaults["datastore", "checksum"] = False # In case of future changes
96 defaults["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"
98 if config:
99 defaults.update(config)
101 if not dataIds:
102 dataIds = {}
104 # Disable config root by default so that our registry override will
105 # not be ignored.
106 # newConfig guards against location-related keywords like outfile
107 newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs)
108 butler = Butler(newConfig, writeable=True)
109 dimensionRecords = _makeRecords(dataIds, butler.registry.dimensions)
110 for dimension, records in dimensionRecords.items():
111 if butler.registry.dimensions[dimension].viewOf is None:
112 butler.registry.insertDimensionData(dimension, *records)
113 return butler


def makeTestCollection(repo: Butler, uniqueId: Optional[str] = None) -> Butler:
    """Create a read/write Butler to a fresh collection.

    Parameters
    ----------
    repo : `lsst.daf.butler.Butler`
        A previously existing Butler to a repository, such as that returned
        by `~lsst.daf.butler.Butler.makeRepo` or `makeTestRepo`.
    uniqueId : `str`, optional
        A collection ID guaranteed by external code to be unique across all
        calls to ``makeTestCollection`` for the same repository.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to a new collection in the repository ``repo``.
        The collection is (almost) guaranteed to be new.

    Notes
    -----
    This function creates a single run collection that does not necessarily
    conform to any repository conventions. It is only suitable for creating
    an isolated test area, and not for repositories intended for real data
    processing or analysis.
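
    Examples
    --------
    One possible pattern in a test's ``setUp`` (a sketch only; ``repo`` is
    assumed to come from `makeTestRepo`):

    .. code-block:: py

        >>> butler = makeTestCollection(repo, uniqueId=self.id())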
140 """
141 if not uniqueId:
142 # Create a "random" collection name
143 # Speed matters more than cryptographic guarantees
144 uniqueId = str(random.randrange(1_000_000_000))
145 collection = "test_" + uniqueId
146 return Butler(butler=repo, run=collection)


def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]:
    """Create cross-linked dimension records from a collection of
    data ID values.

    Parameters
    ----------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`).
    universe : `lsst.daf.butler.DimensionUniverse`
        Set of all known dimensions and their relationships.

    Returns
    -------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest, giving one
        `~lsst.daf.butler.DimensionRecord` for each input name. Related
        dimensions (e.g., instruments and detectors) are linked arbitrarily.
    """
    # Create values for all dimensions that are (recursive) required or implied
    # dependencies of the given ones.
    complete_data_id_values = {}
    for dimension in universe.extract(dataIds.keys()):
        if dimension.name in dataIds:
            complete_data_id_values[dimension.name] = list(dataIds[dimension.name])
        if dimension.name not in complete_data_id_values:
            complete_data_id_values[dimension.name] = [_makeRandomDataIdValue(dimension)]

    # Start populating dicts that will become DimensionRecords by providing
    # alternate keys like detector names
    record_dicts_by_dimension_name = {}
    for name, values in complete_data_id_values.items():
        record_dicts_by_dimension_name[name] = []
        dimension = universe[name]
        for value in values:
            record_dicts_by_dimension_name[name].append(_fillAllKeys(dimension, value))

    # Pick cross-relationships arbitrarily
    for name, record_dicts in record_dicts_by_dimension_name.items():
        dimension = universe[name]
        for record_dict in record_dicts:
            for other in dimension.dimensions:
                if other != dimension:
                    relation = record_dicts_by_dimension_name[other.name][0]
                    record_dict[other.name] = relation[other.primaryKey.name]

    return {
        dimension: [universe[dimension].RecordClass(**record_dict) for record_dict in record_dicts]
        for dimension, record_dicts in record_dicts_by_dimension_name.items()
    }


def _fillAllKeys(dimension: Dimension, value: Union[str, int]) -> Mapping[str, Union[str, int]]:
    """Create an arbitrary mapping of all required keys for a given dimension
    that do not refer to other dimensions.

    Parameters
    ----------
    dimension : `lsst.daf.butler.Dimension`
        The dimension for which to generate a set of keys (e.g., detector).
    value
        The value assigned to ``dimension`` (e.g., detector ID).

    Returns
    -------
    expandedValue : `dict` [`str`]
        A mapping of dimension keys to values. ``dimension``'s primary key
        maps to ``value``, but all other mappings (e.g., detector name)
        are arbitrary.
    """
    expandedValue = {}
    for key in dimension.uniqueKeys:
        if key.nbytes:
            # For `bytes` fields, we want something that casts at least `str`
            # and `int` values to bytes and yields b'' when called with no
            # arguments (as in the except block below). Unfortunately, the
            # `bytes` type itself fails for both `str` and `int`, but this
            # lambda does what we need. This is particularly important for
            # the skymap dimension's bytes 'hash' field, which has a unique
            # constraint; without this, all skymaps would get a hash of b''
            # and end up conflicting.
            castType = lambda *args: str(*args).encode()  # noqa: E731
        else:
            castType = key.dtype().python_type
        try:
            castValue = castType(value)
        except TypeError:
            castValue = castType()
        expandedValue[key.name] = castValue
    for key in dimension.metadata:
        if not key.nullable:
            expandedValue[key.name] = key.dtype().python_type(value)
    return expandedValue


def _makeRandomDataIdValue(dimension: Dimension) -> Union[int, str]:
    """Generate a random value of the appropriate type for a data ID key.

    Parameters
    ----------
    dimension : `Dimension`
        Dimension the value corresponds to.

    Returns
    -------
    value : `int` or `str`
        Random value.
    """
    if dimension.primaryKey.getPythonType() is str:
        return str(random.randrange(1000))
    else:
        return random.randrange(1000)


def expandUniqueId(butler: Butler, partialId: Mapping[str, Any]) -> DataCoordinate:
    """Return a complete data ID matching some criterion.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to query.
    partialId : `~collections.abc.Mapping` [`str`]
        A mapping of known dimensions and values.

    Returns
    -------
    dataId : `lsst.daf.butler.DataCoordinate`
        The unique data ID that matches ``partialId``.

    Raises
    ------
    ValueError
        Raised if ``partialId`` does not uniquely identify a data ID.

    Notes
    -----
    This method will only work correctly if all dimensions attached to the
    target dimension (e.g., "physical_filter" for "visit") are known to the
    repository, even if they're not needed to identify a dataset. This
    function is only suitable for certain kinds of test repositories, and
    not for repositories intended for real data processing or analysis.

    Examples
    --------
    .. code-block:: py

        >>> butler = makeTestRepo(
                "testdir", {"instrument": ["notACam"], "detector": [1]})
        >>> expandUniqueId(butler, {"detector": 1})
        DataCoordinate({instrument, detector}, ('notACam', 1))
    """
    # The example is *not* a doctest because it requires dangerous I/O
    registry = butler.registry
    dimensions = registry.dimensions.extract(partialId.keys()).required

    query = " AND ".join(f"{dimension} = {value!r}" for dimension, value in partialId.items())

    # Much of the purpose of this function is to do something we explicitly
    # reject most of the time: query for a governor dimension (e.g. instrument)
    # given something that depends on it (e.g. visit), hence check=False.
    dataId = list(registry.queryDataIds(dimensions, where=query, check=False))
    if len(dataId) == 1:
        return dataId[0]
    else:
        raise ValueError(f"Found {len(dataId)} matches for {partialId}, expected 1.")


def _findOrInventDataIdValue(
    butler: Butler, data_id: dict[str, Union[str, int]], dimension: Dimension
) -> tuple[Union[str, int], bool]:
    """Look up an arbitrary value for a dimension that is consistent with a
    partial data ID that does not specify that dimension, or invent one if no
    such value exists.

    Parameters
    ----------
    butler : `Butler`
        Butler to use to look up data ID values.
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary of possibly-related data ID values.
    dimension : `Dimension`
        Dimension to obtain a value for.

    Returns
    -------
    value : `int` or `str`
        Value for this dimension.
    invented : `bool`
        `True` if the value had to be invented, `False` if a compatible value
        already existed.
    """
    # No values given by caller for this dimension. See if any exist
    # in the registry that are consistent with the values of dimensions
    # we do have:
    match_data_id = {key: data_id[key] for key in data_id.keys() & dimension.dimensions.names}
    matches = list(butler.registry.queryDimensionRecords(dimension, dataId=match_data_id).limit(1))
    if not matches:
        # Nothing in the registry matches: invent a data ID value
        # with the right type (actual value does not matter).
        # We may or may not actually make a record with this; that's
        # easier to check later.
        dimension_value = _makeRandomDataIdValue(dimension)
        return dimension_value, True
    else:
        # A record does exist in the registry. Use its data ID value.
        return matches[0].dataId[dimension.name], False


def _makeDimensionRecordDict(data_id: dict[str, Union[str, int]], dimension: Dimension) -> dict[str, Any]:
    """Create a dictionary that can be used to build a `DimensionRecord` that
    is consistent with the given data ID.

    Parameters
    ----------
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary that contains values for at least all of
        ``dimension.dimensions.names`` (the main dimension, its recursive
        required dependencies, and its non-recursive implied dependencies).
    dimension : `Dimension`
        Dimension to build a record dictionary for.

    Returns
    -------
    record_dict : `dict` [ `str`, `object` ]
        Dictionary that can be passed as ``**kwargs`` to this dimension's
        record class constructor.
    """
    # Add the primary key field for this dimension.
    record_dict: dict[str, Any] = {dimension.primaryKey.name: data_id[dimension.name]}
    # Define secondary keys (e.g., detector name given detector id)
    record_dict.update(_fillAllKeys(dimension, data_id[dimension.name]))
    # Set the foreign key values for any related dimensions that should
    # appear in the record.
    for related_dimension in dimension.dimensions:
        if related_dimension.name != dimension.name:
            record_dict[related_dimension.name] = data_id[related_dimension.name]
    return record_dict


def addDataIdValue(butler: Butler, dimension: str, value: Union[str, int], **related: Union[str, int]):
    """Add the records that back a new data ID to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    dimension : `str`
        The name of the dimension to gain a new value.
    value
        The value to register for the dimension.
    **related
        Any existing dimensions to be linked to ``value``.

    Notes
    -----
    Related dimensions (e.g., the instrument associated with a detector) may
    be specified using ``related``, which requires a value for those
    dimensions to have been added to the repository already (generally with a
    previous call to `addDataIdValue`). Any dependencies of the given
    dimension that are not included in ``related`` will be linked to existing
    values arbitrarily, and (for implied dependencies only) created and also
    inserted into the registry if they do not exist. Values for required
    dimensions and those given in ``related`` are never created.

    Because this function creates filler data, it is only suitable for test
    repositories. It should not be used for repositories intended for real
    data processing or analysis, which have known dimension values.

    Examples
    --------
    See the guide on :ref:`using-butler-in-tests-make-repo` for usage
    examples.
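
    A minimal sketch of the intended call pattern (the dimension names and
    values shown are illustrative only):

    .. code-block:: py

        >>> addDataIdValue(butler, "instrument", "notACam")
        >>> addDataIdValue(butler, "detector", 1, instrument="notACam")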
422 """
423 # Example is not doctest, because it's probably unsafe to create even an
424 # in-memory butler in that environment.
425 try:
426 fullDimension = butler.registry.dimensions[dimension]
427 except KeyError as e:
428 raise ValueError from e
429 # Bad keys ignored by registry code
430 extraKeys = related.keys() - fullDimension.graph.dimensions.names
431 if extraKeys:
432 raise ValueError(
433 f"Unexpected keywords {extraKeys} not found in {fullDimension.graph.dimensions.names}"
434 )
436 # Assemble a dictionary data ID holding the given primary dimension value
437 # and all of the related ones.
438 data_id: dict[str, Union[int, str]] = {dimension: value}
439 data_id.update(related)
441 # Compute the set of all dimensions that these recursively depend on.
442 all_dimensions = butler.registry.dimensions.extract(data_id.keys())
444 # Create dicts that will become DimensionRecords for all of these data IDs.
445 # This iteration is guaranteed to be in topological order, so we can count
446 # on new data ID values being invented before they are needed.
447 record_dicts_by_dimension: dict[Dimension, dict[str, Any]] = {}
448 for dimension_obj in all_dimensions:
449 dimension_value = data_id.get(dimension_obj.name)
450 if dimension_value is None:
451 data_id[dimension_obj.name], invented = _findOrInventDataIdValue(butler, data_id, dimension_obj)
452 if not invented:
453 # No need to make a new record; one already exists.
454 continue
455 if dimension_obj.name in related:
456 # Caller passed in a value of this dimension explicitly, but it
457 # isn't the primary dimension they asked to have a record created
458 # for. That means they expect this record to already exist.
459 continue
460 if dimension_obj != fullDimension and dimension_obj in all_dimensions.required:
461 # We also don't want to automatically create new dimension records
462 # for required dimensions (except for the main dimension the caller
463 # asked for); those are also asserted by the caller to already
464 # exist.
465 continue
466 if dimension_obj.viewOf is not None:
467 # Don't need to bother generating full records for dimensions whose
468 # records are just a view into some other's records anyway.
469 continue
470 record_dicts_by_dimension[dimension_obj] = _makeDimensionRecordDict(data_id, dimension_obj)
472 # Sync those dimension record dictionaries with the database.
473 for dimension_obj, record_dict in record_dicts_by_dimension.items():
474 record = dimension_obj.RecordClass(**record_dict)
475 try:
476 butler.registry.syncDimensionData(dimension_obj, record)
477 except sqlalchemy.exc.IntegrityError as e:
478 raise RuntimeError(
479 "Could not create data ID value. Automatic relationship generation "
480 "may have failed; try adding keywords to assign a specific instrument, "
481 "physical_filter, etc. based on the nested exception message."
482 ) from e


def addDatasetType(butler: Butler, name: str, dimensions: Set[str], storageClass: str) -> DatasetType:
    """Add a new dataset type to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    name : `str`
        The name of the dataset type.
    dimensions : `set` [`str`]
        The dimensions of the new dataset type.
    storageClass : `str`
        The storage class the dataset will use.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The new type.

    Raises
    ------
    ValueError
        Raised if the dimensions or storage class is invalid.

    Notes
    -----
    Dataset types are shared across all collections in a repository, so this
    function does not need to be run for each collection.
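
    Examples
    --------
    A sketch of typical use (the dataset type name, dimensions, and storage
    class shown here are placeholders; the storage class must be known to the
    repository):

    .. code-block:: py

        >>> datasetType = addDatasetType(
                butler, "dict_data", {"instrument", "visit"}, "StructuredDataDict")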
513 """
514 try:
515 datasetType = DatasetType(name, dimensions, storageClass, universe=butler.registry.dimensions)
516 butler.registry.registerDatasetType(datasetType)
517 return datasetType
518 except KeyError as e:
519 raise ValueError from e


class DatastoreMock:
    """Mocks a butler datastore.

    Has functions that mock the datastore in a butler. Provides an `apply`
    function to replace the relevant butler datastore functions with the mock
    functions.
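
    Examples
    --------
    Intended use is a single call on a test butler (a sketch; ``butler`` is
    assumed to come from `makeTestCollection`):

    .. code-block:: py

        >>> DatastoreMock.apply(butler)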
528 """
530 @staticmethod
531 def apply(butler):
532 """Apply datastore mocks to a butler."""
533 butler.datastore.export = DatastoreMock._mock_export
534 butler.datastore.get = DatastoreMock._mock_get
535 butler.datastore.ingest = MagicMock()

    @staticmethod
    def _mock_export(
        refs: Iterable[DatasetRef], *, directory: Optional[str] = None, transfer: Optional[str] = None
    ) -> Iterable[FileDataset]:
        """A mock of `Datastore.export` that satisfies the requirement that
        the refs passed in are included in the `FileDataset` objects
        returned.

        This can be used to construct a `Datastore` mock that can be used
        in repository export via::

            datastore = unittest.mock.Mock(spec=Datastore)
            datastore.export = DatastoreMock._mock_export

        """
        for ref in refs:
            yield FileDataset(
                refs=[ref], path="mock/path", formatter="lsst.daf.butler.formatters.json.JsonFormatter"
            )

    @staticmethod
    def _mock_get(
        ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None
    ) -> Tuple[int, Optional[Mapping[str, Any]]]:
        """A mock of `Datastore.get` that just returns the integer dataset ID
        value and parameters it was given.
        """
        return (ref.id, parameters)