Coverage for python/lsst/daf/butler/tests/_testRepo.py: 14%
144 statements
coverage.py v7.3.1, created at 2023-10-02 08:00 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = [
    "makeTestRepo",
    "makeTestCollection",
    "addDatasetType",
    "expandUniqueId",
    "DatastoreMock",
    "addDataIdValue",
]

import random
from collections.abc import Iterable, Mapping
from typing import TYPE_CHECKING, Any
from unittest.mock import MagicMock

import sqlalchemy
from lsst.daf.butler import (
    Butler,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionUniverse,
    FileDataset,
    StorageClass,
)

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetId


def makeTestRepo(
    root: str, dataIds: Mapping[str, Iterable] | None = None, *, config: Config | None = None, **kwargs: Any
) -> Butler:
    """Create an empty test repository.

    Parameters
    ----------
    root : `str`
        The location of the root directory for the repository.
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`], optional
        A mapping keyed by the dimensions used in the test. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`). Related dimensions (e.g., instruments and detectors) are
        linked arbitrarily, with values created for implied dimensions only
        when needed. This parameter is provided for compatibility with old
        code; newer code should make the repository, then call
        `~lsst.daf.butler.tests.addDataIdValue`.
    config : `lsst.daf.butler.Config`, optional
        A configuration for the repository (for details, see
        `lsst.daf.butler.Butler.makeRepo`). If omitted, creates a repository
        with default dataset and storage types, but optimized for speed. The
        defaults set ``.datastore.cls``, ``.datastore.checksum`` and
        ``.registry.db``. If a supplied config does not specify these values,
        the internal defaults will be used to ensure that we have a usable
        configuration.
    **kwargs
        Extra arguments to `lsst.daf.butler.Butler.makeRepo`.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to the new repository. This Butler is provided only
        for additional setup; to keep test cases isolated, it is highly
        recommended that each test create its own Butler with a unique
        run/collection. See `makeTestCollection`.

    Notes
    -----
    This function provides a "quick and dirty" repository for simple unit tests
    that don't depend on complex data relationships. It is ill-suited for tests
    where the structure of the data matters. If you need such a dataset, create
    it directly or use a saved test dataset.
    """
    defaults = Config()
    defaults["datastore", "cls"] = "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"
    defaults["datastore", "checksum"] = False  # In case of future changes
    defaults["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    if config:
        defaults.update(config)

    if not dataIds:
        dataIds = {}

    # Disable config root by default so that our registry override will
    # not be ignored.
    # newConfig guards against location-related keywords like outfile
    newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs)
    butler = Butler(newConfig, writeable=True)
    dimensionRecords = _makeRecords(dataIds, butler.dimensions)
    for dimension, records in dimensionRecords.items():
        if butler.dimensions[dimension].viewOf is None:
            butler.registry.insertDimensionData(dimension, *records)
    return butler
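

# A minimal usage sketch for ``makeTestRepo`` together with
# ``makeTestCollection`` (illustrative only; the temporary-directory handling
# is an assumption, not part of this module):
#
#     import tempfile
#     from lsst.daf.butler.tests import makeTestRepo, makeTestCollection
#
#     root = tempfile.mkdtemp()
#     repo = makeTestRepo(root)          # shared, empty repository
#     butler = makeTestCollection(repo)  # per-test Butler with its own run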


def makeTestCollection(repo: Butler, uniqueId: str | None = None) -> Butler:
    """Create a read/write Butler to a fresh collection.

    Parameters
    ----------
    repo : `lsst.daf.butler.Butler`
        A previously existing Butler to a repository, such as that returned by
        `~lsst.daf.butler.Butler.makeRepo` or `makeTestRepo`.
    uniqueId : `str`, optional
        A collection ID guaranteed by external code to be unique across all
        calls to ``makeTestCollection`` for the same repository.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to a new collection in the repository ``repo``.
        The collection is (almost) guaranteed to be new.

    Notes
    -----
    This function creates a single run collection that does not necessarily
    conform to any repository conventions. It is only suitable for creating an
    isolated test area, and not for repositories intended for real data
    processing or analysis.
    """
    if not uniqueId:
        # Create a "random" collection name
        # Speed matters more than cryptographic guarantees
        uniqueId = str(random.randrange(1_000_000_000))
    collection = "test_" + uniqueId
    return Butler(butler=repo, run=collection)
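

# Sketch of how a test case might isolate each test with a deterministic
# collection name (the test-class structure below is an assumption chosen for
# illustration; ``self.id()`` is the standard `unittest` test identifier):
#
#     class MyButlerTestCase(unittest.TestCase):
#         @classmethod
#         def setUpClass(cls):
#             cls.root = tempfile.mkdtemp()
#             cls.repo = makeTestRepo(cls.root)
#
#         def setUp(self):
#             # A unique run per test keeps datasets from leaking across tests.
#             self.butler = makeTestCollection(self.repo, uniqueId=self.id())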


def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]:
    """Create cross-linked dimension records from a collection of
    data ID values.

    Parameters
    ----------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`).
    universe : `lsst.daf.butler.DimensionUniverse`
        Set of all known dimensions and their relationships.

    Returns
    -------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest, giving one
        `~lsst.daf.butler.DimensionRecord` for each input name. Related
        dimensions (e.g., instruments and detectors) are linked arbitrarily.
    """
    # Create values for all dimensions that are (recursive) required or implied
    # dependencies of the given ones.
    complete_data_id_values = {}
    for dimension in universe.extract(dataIds.keys()):
        if dimension.name in dataIds:
            complete_data_id_values[dimension.name] = list(dataIds[dimension.name])
        if dimension.name not in complete_data_id_values:
            complete_data_id_values[dimension.name] = [_makeRandomDataIdValue(dimension)]

    # Start populating dicts that will become DimensionRecords by providing
    # alternate keys like detector names
    record_dicts_by_dimension_name: dict[str, list[dict[str, str | int | bytes]]] = {}
    for name, values in complete_data_id_values.items():
        record_dicts_by_dimension_name[name] = []
        dimension_el = universe[name]
        for value in values:
            # _fillAllKeys wants Dimension and not DimensionElement.
            # universe.__getitem__ says it returns DimensionElement but this
            # really does also seem to be a Dimension here.
            record_dicts_by_dimension_name[name].append(
                _fillAllKeys(dimension_el, value)  # type: ignore[arg-type]
            )

    # Pick cross-relationships arbitrarily
    for name, record_dicts in record_dicts_by_dimension_name.items():
        dimension_el = universe[name]
        for record_dict in record_dicts:
            for other in dimension_el.dimensions:
                if other != dimension_el:
                    relation = record_dicts_by_dimension_name[other.name][0]
                    record_dict[other.name] = relation[other.primaryKey.name]

    return {
        dimension: [universe[dimension].RecordClass(**record_dict) for record_dict in record_dicts]
        for dimension, record_dicts in record_dicts_by_dimension_name.items()
    }


def _fillAllKeys(dimension: Dimension, value: str | int) -> dict[str, str | int | bytes]:
    """Create an arbitrary mapping of all required keys for a given dimension
    that do not refer to other dimensions.

    Parameters
    ----------
    dimension : `lsst.daf.butler.Dimension`
        The dimension for which to generate a set of keys (e.g., detector).
    value : `str` or `int`
        The value assigned to ``dimension`` (e.g., detector ID).

    Returns
    -------
    expandedValue : `dict` [`str`]
        A mapping of dimension keys to values. The primary key of
        ``dimension`` maps to ``value``, but all other mappings (e.g.,
        detector name) are arbitrary.
    """
    expandedValue: dict[str, str | int | bytes] = {}
    for key in dimension.uniqueKeys:
        if key.nbytes:
            # For `bytes` fields, we want something that casts at least `str`
            # and `int` values to bytes and yields b'' when called with no
            # arguments (as in the except block below). Unfortunately, the
            # `bytes` type itself fails for both `str` and `int`, but this
            # lambda does what we need. This is particularly important for the
            # skymap dimension's bytes 'hash' field, which has a unique
            # constraint; without this, all skymaps would get a hash of b''
            # and end up conflicting.
            castType = lambda *args: str(*args).encode()  # noqa: E731
        else:
            castType = key.dtype().python_type
        try:
            castValue = castType(value)
        except TypeError:
            castValue = castType()
        expandedValue[key.name] = castValue
    for key in dimension.metadata:
        if not key.nullable:
            expandedValue[key.name] = key.dtype().python_type(value)
    return expandedValue
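

# For illustration: given a hypothetical "detector" dimension whose unique
# keys are an integer ``id`` and a string ``full_name``, a call such as
# ``_fillAllKeys(universe["detector"], 101)`` would yield something like
# ``{"id": 101, "full_name": "101"}`` -- the primary key gets the value itself
# and every other unique key (and non-nullable metadata field) gets a
# type-cast copy of it. The exact keys depend on the dimension universe.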


def _makeRandomDataIdValue(dimension: Dimension) -> int | str:
    """Generate a random value of the appropriate type for a data ID key.

    Parameters
    ----------
    dimension : `Dimension`
        Dimension the value corresponds to.

    Returns
    -------
    value : `int` or `str`
        Random value.
    """
    if dimension.primaryKey.getPythonType() is str:
        return str(random.randrange(1000))
    else:
        return random.randrange(1000)


def expandUniqueId(butler: Butler, partialId: Mapping[str, Any]) -> DataCoordinate:
    """Return a complete data ID matching some criterion.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to query.
    partialId : `~collections.abc.Mapping` [`str`]
        A mapping of known dimensions and values.

    Returns
    -------
    dataId : `lsst.daf.butler.DataCoordinate`
        The unique data ID that matches ``partialId``.

    Raises
    ------
    ValueError
        Raised if ``partialId`` does not uniquely identify a data ID.

    Notes
    -----
    This method will only work correctly if all dimensions attached to the
    target dimension (e.g., "physical_filter" for "visit") are known to the
    repository, even if they're not needed to identify a dataset. This function
    is only suitable for certain kinds of test repositories, and not for
    repositories intended for real data processing or analysis.

    Examples
    --------
    .. code-block:: py

       >>> butler = makeTestRepo(
               "testdir", {"instrument": ["notACam"], "detector": [1]})
       >>> expandUniqueId(butler, {"detector": 1})
       DataCoordinate({instrument, detector}, ('notACam', 1))
    """
    # The example is *not* a doctest because it requires dangerous I/O
    registry = butler.registry
    dimensions = registry.dimensions.extract(partialId.keys()).required

    query = " AND ".join(f"{dimension} = {value!r}" for dimension, value in partialId.items())

    # Much of the purpose of this function is to do something we explicitly
    # reject most of the time: query for a governor dimension (e.g. instrument)
    # given something that depends on it (e.g. visit), hence check=False.
    dataId = list(registry.queryDataIds(dimensions, where=query, check=False))
    if len(dataId) == 1:
        return dataId[0]
    else:
        raise ValueError(f"Found {len(dataId)} matches for {partialId}, expected 1.")


def _findOrInventDataIdValue(
    butler: Butler, data_id: dict[str, str | int], dimension: Dimension
) -> tuple[str | int, bool]:
    """Look up an arbitrary value for a dimension that is consistent with a
    partial data ID that does not specify that dimension, or invent one if no
    such value exists.

    Parameters
    ----------
    butler : `Butler`
        Butler to use to look up data ID values.
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary of possibly-related data ID values.
    dimension : `Dimension`
        Dimension to obtain a value for.

    Returns
    -------
    value : `int` or `str`
        Value for this dimension.
    invented : `bool`
        `True` if the value had to be invented, `False` if a compatible value
        already existed.
    """
    # No values given by caller for this dimension. See if any exist
    # in the registry that are consistent with the values of dimensions
    # we do have:
    match_data_id = {key: data_id[key] for key in data_id.keys() & dimension.dimensions.names}
    matches = list(butler.registry.queryDimensionRecords(dimension, dataId=match_data_id).limit(1))
    if not matches:
        # Nothing in the registry matches: invent a data ID value
        # with the right type (actual value does not matter).
        # We may or may not actually make a record with this; that's
        # easier to check later.
        dimension_value = _makeRandomDataIdValue(dimension)
        return dimension_value, True
    else:
        # A record does exist in the registry. Use its data ID value.
        dim_value = matches[0].dataId[dimension.name]
        assert dim_value is not None
        return dim_value, False


def _makeDimensionRecordDict(data_id: dict[str, str | int], dimension: Dimension) -> dict[str, Any]:
    """Create a dictionary that can be used to build a `DimensionRecord` that
    is consistent with the given data ID.

    Parameters
    ----------
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary that contains values for at least all of
        ``dimension.dimensions.names`` (the main dimension, its recursive
        required dependencies, and its non-recursive implied dependencies).
    dimension : `Dimension`
        Dimension to build a record dictionary for.

    Returns
    -------
    record_dict : `dict` [ `str`, `object` ]
        Dictionary that can be passed as ``**kwargs`` to this dimension's
        record class constructor.
    """
    # Add the primary key field for this dimension.
    record_dict: dict[str, Any] = {dimension.primaryKey.name: data_id[dimension.name]}
    # Define secondary keys (e.g., detector name given detector id)
    record_dict.update(_fillAllKeys(dimension, data_id[dimension.name]))
    # Set the foreign key values for any related dimensions that should
    # appear in the record.
    for related_dimension in dimension.dimensions:
        if related_dimension.name != dimension.name:
            record_dict[related_dimension.name] = data_id[related_dimension.name]
    return record_dict


def addDataIdValue(butler: Butler, dimension: str, value: str | int, **related: str | int) -> None:
    """Add the records that back a new data ID to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    dimension : `str`
        The name of the dimension to gain a new value.
    value : `str` or `int`
        The value to register for the dimension.
    **related
        Any existing dimensions to be linked to ``value``.

    Notes
    -----
    Related dimensions (e.g., the instrument associated with a detector) may be
    specified using ``related``, which requires a value for those dimensions to
    have been added to the repository already (generally with a previous call
    to `addDataIdValue`). Any dependencies of the given dimension that are not
    included in ``related`` will be linked to existing values arbitrarily, and
    (for implied dependencies only) created and also inserted into the registry
    if they do not exist. Values for required dimensions and those given in
    ``related`` are never created.

    Because this function creates filler data, it is only suitable for test
    repositories. It should not be used for repositories intended for real data
    processing or analysis, which have known dimension values.

    Examples
    --------
    See the guide on :ref:`using-butler-in-tests-make-repo` for usage examples.
    """
    # Example is not doctest, because it's probably unsafe to create even an
    # in-memory butler in that environment.
    try:
        fullDimension = butler.dimensions[dimension]
    except KeyError as e:
        raise ValueError from e
    # Bad keys ignored by registry code
    extraKeys = related.keys() - fullDimension.graph.dimensions.names
    if extraKeys:
        raise ValueError(
            f"Unexpected keywords {extraKeys} not found in {fullDimension.graph.dimensions.names}"
        )

    # Assemble a dictionary data ID holding the given primary dimension value
    # and all of the related ones.
    data_id: dict[str, int | str] = {dimension: value}
    data_id.update(related)

    # Compute the set of all dimensions that these recursively depend on.
    all_dimensions = butler.dimensions.extract(data_id.keys())

    # Create dicts that will become DimensionRecords for all of these data IDs.
    # This iteration is guaranteed to be in topological order, so we can count
    # on new data ID values being invented before they are needed.
    record_dicts_by_dimension: dict[Dimension, dict[str, Any]] = {}
    for dimension_obj in all_dimensions:
        dimension_value = data_id.get(dimension_obj.name)
        if dimension_value is None:
            data_id[dimension_obj.name], invented = _findOrInventDataIdValue(butler, data_id, dimension_obj)
            if not invented:
                # No need to make a new record; one already exists.
                continue
        if dimension_obj.name in related:
            # Caller passed in a value of this dimension explicitly, but it
            # isn't the primary dimension they asked to have a record created
            # for. That means they expect this record to already exist.
            continue
        if dimension_obj != fullDimension and dimension_obj in all_dimensions.required:
            # We also don't want to automatically create new dimension records
            # for required dimensions (except for the main dimension the caller
            # asked for); those are also asserted by the caller to already
            # exist.
            continue
        if dimension_obj.viewOf is not None:
            # Don't need to bother generating full records for dimensions whose
            # records are just a view into some other's records anyway.
            continue
        record_dicts_by_dimension[dimension_obj] = _makeDimensionRecordDict(data_id, dimension_obj)

    # Sync those dimension record dictionaries with the database.
    for dimension_obj, record_dict in record_dicts_by_dimension.items():
        record = dimension_obj.RecordClass(**record_dict)
        try:
            butler.registry.syncDimensionData(dimension_obj, record)
        except sqlalchemy.exc.IntegrityError as e:
            raise RuntimeError(
                "Could not create data ID value. Automatic relationship generation "
                "may have failed; try adding keywords to assign a specific instrument, "
                "physical_filter, etc. based on the nested exception message."
            ) from e
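

# A minimal sketch of building up a data ID with ``addDataIdValue`` (the
# dimension values below are invented for illustration):
#
#     addDataIdValue(butler, "instrument", "notACam")
#     addDataIdValue(butler, "detector", 5, instrument="notACam")
#     addDataIdValue(butler, "visit", 101)  # related values picked arbitrarily
#
# Passing ``instrument="notACam"`` explicitly pins the relationship instead of
# letting it be chosen arbitrarily from existing records.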


def addDatasetType(butler: Butler, name: str, dimensions: set[str], storageClass: str) -> DatasetType:
    """Add a new dataset type to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    name : `str`
        The name of the dataset type.
    dimensions : `set` [`str`]
        The dimensions of the new dataset type.
    storageClass : `str`
        The storage class the dataset will use.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The new type.

    Raises
    ------
    ValueError
        Raised if the dimensions or storage class is invalid.

    Notes
    -----
    Dataset types are shared across all collections in a repository, so this
    function does not need to be run for each collection.
    """
    try:
        datasetType = DatasetType(name, dimensions, storageClass, universe=butler.dimensions)
        butler.registry.registerDatasetType(datasetType)
        return datasetType
    except KeyError as e:
        raise ValueError from e
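

# Illustrative sketch of registering a dataset type for a test (the name,
# dimensions, and storage class are assumptions chosen for the example):
#
#     datasetType = addDatasetType(
#         butler, "DataType1", {"instrument", "visit", "detector"}, "NumpyArray"
#     )
#
# The returned ``DatasetType`` can then be used with ``Butler.put`` and
# ``Butler.get`` in the test.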


class DatastoreMock:
    """Mocks a butler datastore.

    Has functions that mock the datastore in a butler. Provides an `apply`
    function to replace the relevant butler datastore functions with the mock
    functions.
    """

    @staticmethod
    def apply(butler: Butler) -> None:
        """Apply datastore mocks to a butler."""
        butler._datastore.export = DatastoreMock._mock_export  # type: ignore
        butler._datastore.get = DatastoreMock._mock_get  # type: ignore
        butler._datastore.ingest = MagicMock()  # type: ignore

    @staticmethod
    def _mock_export(
        refs: Iterable[DatasetRef], *, directory: str | None = None, transfer: str | None = None
    ) -> Iterable[FileDataset]:
        """Mock of `Datastore.export` that satisfies the requirement that
        the refs passed in are included in the `FileDataset` objects
        returned.

        This can be used to construct a `Datastore` mock that can be used
        in repository export via::

            datastore = unittest.mock.Mock(spec=Datastore)
            datastore.export = DatastoreMock._mock_export

        """
        for ref in refs:
            yield FileDataset(
                refs=[ref], path="mock/path", formatter="lsst.daf.butler.formatters.json.JsonFormatter"
            )

    @staticmethod
    def _mock_get(
        ref: DatasetRef,
        parameters: Mapping[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> tuple[DatasetId, Mapping[str, Any] | None]:
        """Mock of `Datastore.get` that just returns the dataset ID
        value and parameters it was given.
        """
        return (ref.id, parameters)
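

# A short sketch of using the mock in a test that does not need real file I/O
# (the export file name and the pre-existing ``ref`` are assumptions for
# illustration):
#
#     DatastoreMock.apply(butler)
#     butler.import_(filename="export.yaml")  # ingest is a MagicMock, no files touched
#     dataset_id, params = butler.get(ref)    # _mock_get returns (ref.id, parameters)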