Coverage for python/lsst/daf/butler/tests/_testRepo.py: 12% (147 statements)
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = [
    "makeTestRepo",
    "makeTestCollection",
    "addDatasetType",
    "expandUniqueId",
    "DatastoreMock",
    "addDataIdValue",
]

import random
from collections.abc import Iterable, Mapping
from typing import TYPE_CHECKING, Any
from unittest.mock import MagicMock

import sqlalchemy
from lsst.daf.butler import (
    Butler,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionUniverse,
    FileDataset,
    StorageClass,
)

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetId


def makeTestRepo(
    root: str, dataIds: Mapping[str, Iterable] | None = None, *, config: Config | None = None, **kwargs: Any
) -> Butler:
    """Create an empty test repository.

    Parameters
    ----------
    root : `str`
        The location of the root directory for the repository.
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`], optional
        A mapping keyed by the dimensions used in the test. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`). Related dimensions (e.g., instruments and detectors) are
        linked arbitrarily, with values created for implied dimensions only
        when needed. This parameter is provided for compatibility with old
        code; newer code should make the repository, then call
        `~lsst.daf.butler.tests.addDataIdValue`.
    config : `lsst.daf.butler.Config`, optional
        A configuration for the repository (for details, see
        `lsst.daf.butler.Butler.makeRepo`). If omitted, creates a repository
        with default dataset and storage types, but optimized for speed. The
        defaults set ``.datastore.cls``, ``.datastore.checksum`` and
        ``.registry.db``. If a supplied config does not specify these values
        the internal defaults will be used to ensure that we have a usable
        configuration.
    **kwargs
        Extra arguments to `lsst.daf.butler.Butler.makeRepo`.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to the new repository. This Butler is provided only
        for additional setup; to keep test cases isolated, it is highly
        recommended that each test create its own Butler with a unique
        run/collection. See `makeTestCollection`.

    Notes
    -----
    This function provides a "quick and dirty" repository for simple unit tests
    that don't depend on complex data relationships. It is ill-suited for tests
    where the structure of the data matters. If you need such a dataset, create
    it directly or use a saved test dataset.
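
    Examples
    --------
    A minimal usage sketch (not a doctest; it writes to disk). The directory
    name and dimension values below are placeholders:

    .. code-block:: py

       import tempfile

       root = tempfile.mkdtemp()
       repo = makeTestRepo(root, {"instrument": ["notACam"], "detector": [1]})
       butler = makeTestCollection(repo)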
97 """
98 defaults = Config()
99 defaults["datastore", "cls"] = "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"
100 defaults["datastore", "checksum"] = False # In case of future changes
101 defaults["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"
103 if config:
104 defaults.update(config)
106 if not dataIds:
107 dataIds = {}
109 # Disable config root by default so that our registry override will
110 # not be ignored.
111 # newConfig guards against location-related keywords like outfile
112 newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs)
113 butler = Butler(newConfig, writeable=True)
114 dimensionRecords = _makeRecords(dataIds, butler.registry.dimensions)
115 for dimension, records in dimensionRecords.items():
116 if butler.registry.dimensions[dimension].viewOf is None:
117 butler.registry.insertDimensionData(dimension, *records)
118 return butler


def makeTestCollection(repo: Butler, uniqueId: str | None = None) -> Butler:
    """Create a read/write Butler to a fresh collection.

    Parameters
    ----------
    repo : `lsst.daf.butler.Butler`
        A previously existing Butler to a repository, such as that returned by
        `~lsst.daf.butler.Butler.makeRepo` or `makeTestRepo`.
    uniqueId : `str`, optional
        A collection ID guaranteed by external code to be unique across all
        calls to ``makeTestCollection`` for the same repository.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to a new collection in the repository at ``root``.
        The collection is (almost) guaranteed to be new.

    Notes
    -----
    This function creates a single run collection that does not necessarily
    conform to any repository conventions. It is only suitable for creating an
    isolated test area, and not for repositories intended for real data
    processing or analysis.
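
    Examples
    --------
    A minimal sketch of the intended pattern (``tempdir`` is a placeholder
    path; the repository is shared, the collection is per-test):

    .. code-block:: py

       repo = makeTestRepo(tempdir)       # once per test suite
       butler = makeTestCollection(repo)  # once per test case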
    """
    if not uniqueId:
        # Create a "random" collection name
        # Speed matters more than cryptographic guarantees
        uniqueId = str(random.randrange(1_000_000_000))
    collection = "test_" + uniqueId
    return Butler(butler=repo, run=collection)


def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]:
    """Create cross-linked dimension records from a collection of
    data ID values.

    Parameters
    ----------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`).
    universe : `lsst.daf.butler.DimensionUniverse`
        Set of all known dimensions and their relationships.

    Returns
    -------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest, giving one
        `~lsst.daf.butler.DimensionRecord` for each input name. Related
        dimensions (e.g., instruments and detectors) are linked arbitrarily.
    """

    # Create values for all dimensions that are (recursive) required or implied
    # dependencies of the given ones.
    complete_data_id_values = {}
    for dimension in universe.extract(dataIds.keys()):
        if dimension.name in dataIds:
            complete_data_id_values[dimension.name] = list(dataIds[dimension.name])
        if dimension.name not in complete_data_id_values:
            complete_data_id_values[dimension.name] = [_makeRandomDataIdValue(dimension)]

    # Start populating dicts that will become DimensionRecords by providing
    # alternate keys like detector names
    record_dicts_by_dimension_name: dict[str, list[dict[str, str | int | bytes]]] = {}
    for name, values in complete_data_id_values.items():
        record_dicts_by_dimension_name[name] = []
        dimension_el = universe[name]
        for value in values:
            # _fillAllKeys wants Dimension and not DimensionElement.
            # universe.__getitem__ says it returns DimensionElement but this
            # really does also seem to be a Dimension here.
            record_dicts_by_dimension_name[name].append(
                _fillAllKeys(dimension_el, value)  # type: ignore[arg-type]
            )

    # Pick cross-relationships arbitrarily
    for name, record_dicts in record_dicts_by_dimension_name.items():
        dimension_el = universe[name]
        for record_dict in record_dicts:
            for other in dimension_el.dimensions:
                if other != dimension_el:
                    relation = record_dicts_by_dimension_name[other.name][0]
                    record_dict[other.name] = relation[other.primaryKey.name]

    return {
        dimension: [universe[dimension].RecordClass(**record_dict) for record_dict in record_dicts]
        for dimension, record_dicts in record_dicts_by_dimension_name.items()
    }


def _fillAllKeys(dimension: Dimension, value: str | int) -> dict[str, str | int | bytes]:
    """Create an arbitrary mapping of all required keys for a given dimension
    that do not refer to other dimensions.

    Parameters
    ----------
    dimension : `lsst.daf.butler.Dimension`
        The dimension for which to generate a set of keys (e.g., detector).
    value
        The value assigned to ``dimension`` (e.g., detector ID).

    Returns
    -------
    expandedValue : `dict` [`str`]
        A mapping of dimension keys to values. The primary key of
        ``dimension`` maps to ``value``, but all other mappings (e.g.,
        detector name) are arbitrary.
    """
    expandedValue: dict[str, str | int | bytes] = {}
    for key in dimension.uniqueKeys:
        if key.nbytes:
            # For `bytes` fields, we want something that casts at least `str`
            # and `int` values to bytes and yields b'' when called with no
            # arguments (as in the except block below). Unfortunately, the
            # `bytes` type itself fails for both `str` and `int`, but this
            # lambda does what we need. This is particularly important for the
            # skymap dimension's bytes 'hash' field, which has a unique
            # constraint; without this, all skymaps would get a hash of b''
            # and end up conflicting.
            castType = lambda *args: str(*args).encode()  # noqa: E731
        else:
            castType = key.dtype().python_type
        try:
            castValue = castType(value)
        except TypeError:
            castValue = castType()
        expandedValue[key.name] = castValue
    for key in dimension.metadata:
        if not key.nullable:
            expandedValue[key.name] = key.dtype().python_type(value)
    return expandedValue


def _makeRandomDataIdValue(dimension: Dimension) -> int | str:
    """Generate a random value of the appropriate type for a data ID key.

    Parameters
    ----------
    dimension : `Dimension`
        Dimension the value corresponds to.

    Returns
    -------
    value : `int` or `str`
        Random value.
    """
    if dimension.primaryKey.getPythonType() is str:
        return str(random.randrange(1000))
    else:
        return random.randrange(1000)


def expandUniqueId(butler: Butler, partialId: Mapping[str, Any]) -> DataCoordinate:
    """Return a complete data ID matching some criterion.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to query.
    partialId : `~collections.abc.Mapping` [`str`]
        A mapping of known dimensions and values.

    Returns
    -------
    dataId : `lsst.daf.butler.DataCoordinate`
        The unique data ID that matches ``partialId``.

    Raises
    ------
    ValueError
        Raised if ``partialId`` does not uniquely identify a data ID.

    Notes
    -----
    This function will only work correctly if all dimensions attached to the
    target dimension (e.g., "physical_filter" for "visit") are known to the
    repository, even if they're not needed to identify a dataset. This
    function is only suitable for certain kinds of test repositories, and not
    for repositories intended for real data processing or analysis.

    Examples
    --------
    .. code-block:: py

       >>> butler = makeTestRepo(
               "testdir", {"instrument": ["notACam"], "detector": [1]})
       >>> expandUniqueId(butler, {"detector": 1})
       DataCoordinate({instrument, detector}, ('notACam', 1))
    """
    # The example is *not* a doctest because it requires dangerous I/O
    registry = butler.registry
    dimensions = registry.dimensions.extract(partialId.keys()).required

    query = " AND ".join(f"{dimension} = {value!r}" for dimension, value in partialId.items())

    # Much of the purpose of this function is to do something we explicitly
    # reject most of the time: query for a governor dimension (e.g. instrument)
    # given something that depends on it (e.g. visit), hence check=False.
    dataId = list(registry.queryDataIds(dimensions, where=query, check=False))
    if len(dataId) == 1:
        return dataId[0]
    else:
        raise ValueError(f"Found {len(dataId)} matches for {partialId}, expected 1.")


def _findOrInventDataIdValue(
    butler: Butler, data_id: dict[str, str | int], dimension: Dimension
) -> tuple[str | int, bool]:
    """Look up an arbitrary value for a dimension that is consistent with a
    partial data ID that does not specify that dimension, or invent one if no
    such value exists.

    Parameters
    ----------
    butler : `Butler`
        Butler to use to look up data ID values.
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary of possibly-related data ID values.
    dimension : `Dimension`
        Dimension to obtain a value for.

    Returns
    -------
    value : `int` or `str`
        Value for this dimension.
    invented : `bool`
        `True` if the value had to be invented, `False` if a compatible value
        already existed.
    """
    # No values given by caller for this dimension. See if any exist
    # in the registry that are consistent with the values of dimensions
    # we do have:
    match_data_id = {key: data_id[key] for key in data_id.keys() & dimension.dimensions.names}
    matches = list(butler.registry.queryDimensionRecords(dimension, dataId=match_data_id).limit(1))
    if not matches:
        # Nothing in the registry matches: invent a data ID value
        # with the right type (actual value does not matter).
        # We may or may not actually make a record with this; that's
        # easier to check later.
        dimension_value = _makeRandomDataIdValue(dimension)
        return dimension_value, True
    else:
        # A record does exist in the registry. Use its data ID value.
        dim_value = matches[0].dataId[dimension.name]
        assert dim_value is not None
        return dim_value, False


def _makeDimensionRecordDict(data_id: dict[str, str | int], dimension: Dimension) -> dict[str, Any]:
    """Create a dictionary that can be used to build a `DimensionRecord` that
    is consistent with the given data ID.

    Parameters
    ----------
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary that contains values for at least all of
        ``dimension.dimensions.names`` (the main dimension, its recursive
        required dependencies, and its non-recursive implied dependencies).
    dimension : `Dimension`
        Dimension to build a record dictionary for.

    Returns
    -------
    record_dict : `dict` [ `str`, `object` ]
        Dictionary that can be passed as ``**kwargs`` to this dimension's
        record class constructor.
    """
    # Add the primary key field for this dimension.
    record_dict: dict[str, Any] = {dimension.primaryKey.name: data_id[dimension.name]}
    # Define secondary keys (e.g., detector name given detector id)
    record_dict.update(_fillAllKeys(dimension, data_id[dimension.name]))
    # Set the foreign key values for any related dimensions that should
    # appear in the record.
    for related_dimension in dimension.dimensions:
        if related_dimension.name != dimension.name:
            record_dict[related_dimension.name] = data_id[related_dimension.name]
    return record_dict


def addDataIdValue(butler: Butler, dimension: str, value: str | int, **related: str | int) -> None:
    """Add the records that back a new data ID to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    dimension : `str`
        The name of the dimension to gain a new value.
    value
        The value to register for the dimension.
    **related
        Any existing dimensions to be linked to ``value``.

    Notes
    -----
    Related dimensions (e.g., the instrument associated with a detector) may be
    specified using ``related``, which requires a value for those dimensions to
    have been added to the repository already (generally with a previous call
    to `addDataIdValue`). Any dependencies of the given dimension that are not
    included in ``related`` will be linked to existing values arbitrarily, and
    (for implied dependencies only) created and also inserted into the registry
    if they do not exist. Values for required dimensions and those given in
    ``related`` are never created.

    Because this function creates filler data, it is only suitable for test
    repositories. It should not be used for repositories intended for real data
    processing or analysis, which have known dimension values.

    Examples
    --------
    See the guide on :ref:`using-butler-in-tests-make-repo` for usage examples.
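
    A minimal sketch of typical use (the instrument, filter, and detector
    values are placeholders invented for illustration):

    .. code-block:: py

       addDataIdValue(butler, "instrument", "notACam")
       addDataIdValue(butler, "physical_filter", "k2022", instrument="notACam")
       addDataIdValue(butler, "detector", 101)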
    """
    # Example is not doctest, because it's probably unsafe to create even an
    # in-memory butler in that environment.
    try:
        fullDimension = butler.registry.dimensions[dimension]
    except KeyError as e:
        raise ValueError from e
    # Bad keys ignored by registry code
    extraKeys = related.keys() - fullDimension.graph.dimensions.names
    if extraKeys:
        raise ValueError(
            f"Unexpected keywords {extraKeys} not found in {fullDimension.graph.dimensions.names}"
        )

    # Assemble a dictionary data ID holding the given primary dimension value
    # and all of the related ones.
    data_id: dict[str, int | str] = {dimension: value}
    data_id.update(related)

    # Compute the set of all dimensions that these recursively depend on.
    all_dimensions = butler.registry.dimensions.extract(data_id.keys())

    # Create dicts that will become DimensionRecords for all of these data IDs.
    # This iteration is guaranteed to be in topological order, so we can count
    # on new data ID values being invented before they are needed.
    record_dicts_by_dimension: dict[Dimension, dict[str, Any]] = {}
    for dimension_obj in all_dimensions:
        dimension_value = data_id.get(dimension_obj.name)
        if dimension_value is None:
            data_id[dimension_obj.name], invented = _findOrInventDataIdValue(butler, data_id, dimension_obj)
            if not invented:
                # No need to make a new record; one already exists.
                continue
        if dimension_obj.name in related:
            # Caller passed in a value of this dimension explicitly, but it
            # isn't the primary dimension they asked to have a record created
            # for. That means they expect this record to already exist.
            continue
        if dimension_obj != fullDimension and dimension_obj in all_dimensions.required:
            # We also don't want to automatically create new dimension records
            # for required dimensions (except for the main dimension the caller
            # asked for); those are also asserted by the caller to already
            # exist.
            continue
        if dimension_obj.viewOf is not None:
            # Don't need to bother generating full records for dimensions whose
            # records are just a view into some other's records anyway.
            continue
        record_dicts_by_dimension[dimension_obj] = _makeDimensionRecordDict(data_id, dimension_obj)

    # Sync those dimension record dictionaries with the database.
    for dimension_obj, record_dict in record_dicts_by_dimension.items():
        record = dimension_obj.RecordClass(**record_dict)
        try:
            butler.registry.syncDimensionData(dimension_obj, record)
        except sqlalchemy.exc.IntegrityError as e:
            raise RuntimeError(
                "Could not create data ID value. Automatic relationship generation "
                "may have failed; try adding keywords to assign a specific instrument, "
                "physical_filter, etc. based on the nested exception message."
            ) from e


def addDatasetType(butler: Butler, name: str, dimensions: set[str], storageClass: str) -> DatasetType:
    """Add a new dataset type to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    name : `str`
        The name of the dataset type.
    dimensions : `set` [`str`]
        The dimensions of the new dataset type.
    storageClass : `str`
        The storage class the dataset will use.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The new type.

    Raises
    ------
    ValueError
        Raised if the dimensions or storage class is invalid.

    Notes
    -----
    Dataset types are shared across all collections in a repository, so this
    function does not need to be run for each collection.
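
    Examples
    --------
    A minimal sketch (the dataset type name, dimensions, and storage class
    below are illustrative, not required by this function):

    .. code-block:: py

       datasetType = addDatasetType(
           butler, "DataType1", {"instrument", "detector"}, "NumpyArray"
       )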
    """
    try:
        datasetType = DatasetType(name, dimensions, storageClass, universe=butler.registry.dimensions)
        butler.registry.registerDatasetType(datasetType)
        return datasetType
    except KeyError as e:
        raise ValueError from e


class DatastoreMock:
    """Mocks a butler datastore.

    Has functions that mock the datastore in a butler. Provides an `apply`
    function to replace the relevant butler datastore functions with the mock
    functions.
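
    A minimal usage sketch; ``butler`` is assumed to come from
    `makeTestCollection`. After `apply`, datastore reads return the dataset
    ID and parameters rather than real data:

    .. code-block:: py

       DatastoreMock.apply(butler)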
    """

    @staticmethod
    def apply(butler: Butler) -> None:
        """Apply datastore mocks to a butler."""
        butler.datastore.export = DatastoreMock._mock_export  # type: ignore
        butler.datastore.get = DatastoreMock._mock_get  # type: ignore
        butler.datastore.ingest = MagicMock()  # type: ignore

    @staticmethod
    def _mock_export(
        refs: Iterable[DatasetRef], *, directory: str | None = None, transfer: str | None = None
    ) -> Iterable[FileDataset]:
        """A mock of `Datastore.export` that satisfies the requirement that
        the refs passed in are included in the `FileDataset` objects
        returned.

        This can be used to construct a `Datastore` mock that can be used
        in repository export via::

            datastore = unittest.mock.Mock(spec=Datastore)
            datastore.export = DatastoreMock._mock_export

        """
        for ref in refs:
            yield FileDataset(
                refs=[ref], path="mock/path", formatter="lsst.daf.butler.formatters.json.JsonFormatter"
            )

    @staticmethod
    def _mock_get(
        ref: DatasetRef,
        parameters: Mapping[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> tuple[DatasetId, Mapping[str, Any] | None]:
        """A mock of `Datastore.get` that just returns the integer dataset ID
        value and parameters it was given.
        """
        assert ref.id is not None
        return (ref.id, parameters)