Coverage for python/lsst/daf/butler/registries/sqlRegistry.py: 94%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""Registry backed by a SQL database.
Parameters ---------- registryConfig : `SqlRegistryConfig` or `str` Load configuration schemaConfig : `SchemaConfig` or `str` Definition of the schema to use. create : `bool` Assume registry is empty and create a new one. """
"""Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or absolute path. Can be None if no defaults specified. """
def transaction(self):
    """Context manager that implements SQL transactions.

    Will roll back any changes to the `SqlRegistry` database
    in case an exception is raised in the enclosed block.

    This context manager may be nested.
    """
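# Usage sketch (illustrative, not part of the original source; ``registry``,
# ``datasetType``, ``dataId`` and ``run`` are assumed to already exist):
#
#     with registry.transaction():
#         ref = registry.addDataset(datasetType, dataId, run=run)
#         # An exception raised before the block exits rolls the insert back.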
"""Check if given `DatasetType` instance is valid for this `Registry`.
.. todo::
Insert checks for `storageClass`, `dataUnits` and `template`. """
"""Check if a dataId is valid for a particular `DatasetType`.
.. todo::
Move this function to some other place once DataUnit relations are implemented.
datasetType : `DatasetType` The `DatasetType`. dataId : `dict` A `dict` of `DataUnit` link name, value pairs that label the `DatasetRef` within a collection.
Raises ------ ValueError If the dataId is invalid for the given datasetType. """
"""Construct a DatabaseDict backed by a table in the same database as this Registry.
Parameters ---------- table : `table` Name of the table that backs the returned DatabaseDict. If this table already exists, its schema must include at least everything in `types`. types : `dict` A dictionary mapping `str` field names to type objects, containing all fields to be held in the database. key : `str` The name of the field to be used as the dictionary key. Must not be present in ``value._fields``. value : `type` The type used for the dictionary's values, typically a `~collections.namedtuple`. Must have a ``_fields`` class attribute that is a tuple of field names (i.e. as defined by `~collections.namedtuple`); these field names must also appear in the ``types`` arg, and a `_make` attribute to construct it from a sequence of values (again, as defined by `~collections.namedtuple`). """ # We need to construct a temporary config for the table value because # SqlRegistryDatabaseDict.__init__ is required to take a config so it # can be called by DatabaseDict.fromConfig as well. # I suppose we could have Registry.makeDatabaseDict take a config as # well, since it"ll also usually be called by DatabaseDict.fromConfig, # but I strongly believe in having signatures that only take what they # really need.
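# Usage sketch (illustrative only; the table name and fields below are
# hypothetical, not taken from this file):
#
#     from collections import namedtuple
#     import sqlalchemy
#
#     FileInfo = namedtuple("FileInfo", ["path", "size"])
#     d = registry.makeDatabaseDict(
#         table="ExampleFileInfo",
#         types={"dataset_id": sqlalchemy.Integer,
#                "path": sqlalchemy.String,
#                "size": sqlalchemy.Integer},
#         key="dataset_id",
#         value=FileInfo)
#     d[42] = FileInfo(path="a/b/c.fits", size=1024)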
"""Lookup a dataset.
This can be used to obtain a `DatasetRef` that permits the dataset to be read from a `Datastore`.
Parameters ---------- collection : `str` Identifies the collection to search. datasetType : `DatasetType` The `DatasetType`. dataId : `dict` A `dict` of `DataUnit` link name, value pairs that label the `DatasetRef` within a collection.
Returns ------- ref : `DatasetRef` A ref to the Dataset, or `None` if no matching Dataset was found.
Raises ------ ValueError If dataId is invalid. """ for name in self._schema.dataUnits.getPrimaryKeyNames( datasetType.dataUnits))) datasetTable.join(datasetCollectionTable)).where(and_( datasetTable.c.dataset_type_name == datasetType.name, datasetCollectionTable.c.collection == collection, dataIdExpression))).fetchone() # TODO update unit values and add Run, Quantum and assembler? else:
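# Usage sketch (illustrative; the collection name and the "camera"/"visit"
# link names are hypothetical):
#
#     ref = registry.find("ingest", datasetType, {"camera": "HSC", "visit": 12})
#     if ref is None:
#         print("no matching dataset in this collection")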
"""Execute a SQL SELECT statement directly.
Named parameters are specified in the SQL query string by preceeding them with a colon. Parameter values are provided as additional keyword arguments. For example:
registry.query("SELECT * FROM Camera WHERE camera=:name", name="HSC")
Parameters ---------- sql : `str` SQL query string. Must be a SELECT statement. **params Parameter name-value pairs to insert into the query.
Yields ------- row : `dict` The next row result from executing the query.
""" # TODO: make this guard against non-SELECT queries.
def registerDatasetType(self, datasetType):
    """Add a new `DatasetType` to the SqlRegistry.

    It is not an error to register the same `DatasetType` twice.

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` to be added.

    Raises
    ------
    ValueError
        If the `DatasetType` is not valid for this registry or is
        already registered but not identical.

    Returns
    -------
    inserted : `bool`
        ``True`` if ``datasetType`` was inserted, ``False`` if an
        identical existing `DatasetType` was found.
    """
        raise ValueError("DatasetType is not valid for this registry")
    # If a DatasetType is already registered it must be identical.
    # A DatasetType entry with this name may exist, get it first.
    # Note that we can't just look in the cache, because it may not be
    # there yet.
    # No registered DatasetType with this name exists, move on to
    # inserting it.
    else:
        # A DatasetType with this name exists, check if it is equal.
    else:
        # Insert it
            "storage_class": datasetType.storageClass.name}
        [{"dataset_type_name": datasetType.name, "unit_name": dataUnitName}
         for dataUnitName in datasetType.dataUnits])
    # Also register component DatasetTypes (if any)
        datasetType.dataUnits, compStorageClass)
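# Usage sketch (illustrative; the DatasetType constructor arguments shown
# here are assumptions, not taken from this file):
#
#     datasetType = DatasetType(name="calexp",
#                               dataUnits=("Camera", "Visit", "Sensor"),
#                               storageClass=storageClass)
#     inserted = registry.registerDatasetType(datasetType)
#     # Registering an identical type again returns False instead of raising.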
"""Get the `DatasetType`.
Parameters ---------- name : `str` Name of the type.
Returns ------- type : `DatasetType` The `DatasetType` associated with the given name.
Raises ------ KeyError Requested named DatasetType could not be found in registry. """ # Get StorageClass from DatasetType table datasetTypeTable.c.dataset_type_name == name)).fetchone()
# Get DataUnits (if any) from DatasetTypeUnits table datasetTypeUnitsTable.c.dataset_type_name == name)).fetchall() storageClass=storageClass, dataUnits=dataUnits)
"""Adds a Dataset entry to the `Registry`
This always adds a new Dataset; to associate an existing Dataset with a new collection, use ``associate``.
Parameters ---------- datasetType : `DatasetType` Type of the Dataset. dataId : `dict` A `dict` of `DataUnit` link name, value pairs that label the `DatasetRef` within a collection. run : `Run` The `Run` instance that produced the Dataset. Ignored if ``producer`` is passed (`producer.run` is then used instead). A Run must be provided by one of the two arguments. producer : `Quantum` Unit of work that produced the Dataset. May be ``None`` to store no provenance information, but if present the `Quantum` must already have been added to the SqlRegistry.
Returns ------- ref : `DatasetRef` A newly-created `DatasetRef` instance.
Raises ------ ValueError If a Dataset with the given `DatasetRef` already exists in the given collection.
Exception If ``dataId`` contains unknown or invalid `DataUnit` entries. """ # TODO this is obviously not the most efficient way to check # for existence. # TODO also note that this check is not safe # in the presence of concurrent calls to addDataset. # Then again, it is undoubtedly not the only place where # this problem occurs. Needs some serious thought. datasetType, dataId, run.collection)) # TODO add producer run_id=run.id, quantum_id=None, **dataId)) # A dataset is always associated with its Run collection
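# Usage sketch (illustrative; the collection name and link names are
# hypothetical):
#
#     run = registry.makeRun(collection="ingest/run1")
#     ref = registry.addDataset(datasetType,
#                               dataId={"camera": "HSC", "visit": 12},
#                               run=run)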
"""Retrieve a Dataset entry.
Parameters ---------- id : `int` The unique identifier for the Dataset.
Returns ------- ref : `DatasetRef` A ref to the Dataset, or `None` if no matching Dataset was found. """ select([datasetTable]).where(datasetTable.c.dataset_id == id)).fetchone() # dataUnitName gives a `str` key which which is used to lookup # the corresponding sqlalchemy.core.Column entry to index the result # because the name of the key may not be the name of the name of the # DataUnit link. for dataUnitName in self._schema.dataUnits.getPrimaryKeyNames(datasetType.dataUnits)} # Get components (if present) # TODO check against expected components select([datasetCompositionTable.c.component_name, datasetCompositionTable.c.component_dataset_id]).where( datasetCompositionTable.c.parent_dataset_id == id)).fetchall() else:
def attachComponent(self, name, parent, component):
    """Attach a component to a dataset.

    Parameters
    ----------
    name : `str`
        Name of the component.
    parent : `DatasetRef`
        A reference to the parent dataset.  Will be updated to
        reference the component.
    component : `DatasetRef`
        A reference to the component dataset.
    """
    # TODO Insert check for component name and type against
    # parent.storageClass specified components
        parent_dataset_id=parent.id, component_dataset_id=component.id)
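# Usage sketch (illustrative; assumes ``parentRef`` and ``wcsRef`` were both
# created earlier with addDataset):
#
#     registry.attachComponent("wcs", parentRef, wcsRef)
#     # parentRef is updated in place to reference the new component.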
def associate(self, collection, refs):
    """Add existing Datasets to a collection, possibly creating the
    collection in the process.

    Parameters
    ----------
    collection : `str`
        Indicates the collection the Datasets should be associated with.
    refs : `list` of `DatasetRef`
        A `list` of `DatasetRef` instances that already exist in this
        `SqlRegistry`.
    """
        [{"dataset_id": ref.id, "collection": collection} for ref in refs])
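# Usage sketch (illustrative; the collection names are hypothetical):
#
#     refs = [registry.find("ingest", datasetType, dataId)
#             for dataId in dataIds]
#     registry.associate("best-seeing", refs)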
r"""Remove existing Datasets from a collection.
``collection`` and ``ref`` combinations that are not currently associated are silently ignored.
Parameters ---------- collection : `str` The collection the Datasets should no longer be associated with. refs : `list` of `DatasetRef` A `list` of `DatasetRef` instances that already exist in this `SqlRegistry`. remove : `bool` If `True`, remove Datasets from the `SqlRegistry` if they are not associated with any collection (including via any composites).
Returns ------- removed : `list` of `DatasetRef` If `remove` is `True`, the `list` of `DatasetRef`\ s that were removed. """ raise NotImplementedError("Cleanup of datasets not yet implemented") and_(datasetCollectionTable.c.dataset_id == ref.id, datasetCollectionTable.c.collection == collection)))
def addStorageInfo(self, ref, storageInfo):
    """Add storage information for a given dataset.

    Typically used by `Datastore`.

    Parameters
    ----------
    ref : `DatasetRef`
        A reference to the dataset for which to add storage information.
    storageInfo : `StorageInfo`
        Storage information about the dataset.
    """
        datastore_name=storageInfo.datastoreName,
        checksum=storageInfo.checksum,
        size=storageInfo.size)
def updateStorageInfo(self, ref, datastoreName, storageInfo):
    """Update storage information for a given dataset.

    Typically used by `Datastore`.

    Parameters
    ----------
    ref : `DatasetRef`
        A reference to the dataset for which to update storage
        information.
    datastoreName : `str`
        What datastore association to update.
    storageInfo : `StorageInfo`
        Storage information about the dataset.
    """
        datasetStorageTable.c.dataset_id == ref.id,
        datasetStorageTable.c.datastore_name == datastoreName)).values(
            datastore_name=storageInfo.datastoreName,
            checksum=storageInfo.checksum,
            size=storageInfo.size))
"""Retrieve storage information for a given dataset.
Typically used by `Datastore`.
Parameters ---------- ref : `DatasetRef` A reference to the dataset for which to add storage information. datastoreName : `str` What datastore association to update.
Returns ------- info : `StorageInfo` Storage information about the dataset.
Raises ------ KeyError The requested Dataset does not exist. """ select([datasetStorageTable.c.datastore_name, datasetStorageTable.c.checksum, datasetStorageTable.c.size]).where( and_(datasetStorageTable.c.dataset_id == ref.id, datasetStorageTable.c.datastore_name == datastoreName))).fetchone()
"Dataset {} in datastore {}".format(ref.id, datastoreName))
checksum=result["checksum"], size=result["size"])
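# Usage sketch (illustrative; the datastore name is hypothetical):
#
#     info = registry.getStorageInfo(ref, datastoreName="PosixDatastore")
#     print(info.checksum, info.size)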
def removeStorageInfo(self, datastoreName, ref):
    """Remove storage information associated with this dataset.

    Typically used by `Datastore` when a dataset is removed.

    Parameters
    ----------
    datastoreName : `str`
        Name of this `Datastore`.
    ref : `DatasetRef`
        A reference to the dataset for which information is to be
        removed.
    """
        and_(datasetStorageTable.c.dataset_id == ref.id,
             datasetStorageTable.c.datastore_name == datastoreName)))
def addExecution(self, execution):
    """Add a new `Execution` to the `SqlRegistry`.

    If ``execution.id`` is `None` the `SqlRegistry` will update it to
    that of the newly inserted entry.

    Parameters
    ----------
    execution : `Execution`
        Instance to add to the `SqlRegistry`.  The given `Execution`
        must not already be present in the `SqlRegistry`.

    Raises
    ------
    Exception
        If `Execution` is already present in the `SqlRegistry`.
    """
        start_time=execution.startTime,
        end_time=execution.endTime,
        host=execution.host))
    # Reassign id, may have been `None`
"""Retrieve an Execution.
Parameters ---------- id : `int` The unique identifier for the Execution. """ select([executionTable.c.start_time, executionTable.c.end_time, executionTable.c.host]).where(executionTable.c.execution_id == id)).fetchone() endTime=result["end_time"], host=result["host"], id=id) else: return None
def makeRun(self, collection):
    """Create a new `Run` in the `SqlRegistry` and return it.

    If a run with this collection already exists, return that instead.

    Parameters
    ----------
    collection : `str`
        The collection used to identify all inputs and outputs of the
        `Run`.

    Returns
    -------
    run : `Run`
        A new `Run` instance.
    """
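# Usage sketch (illustrative; the collection name is hypothetical):
#
#     run = registry.makeRun("ingest/run1")
#     # Calling makeRun again with the same collection returns the
#     # existing Run rather than creating a new one.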
def ensureRun(self, run):
    """Conditionally add a new `Run` to the `SqlRegistry`.

    If ``run.id`` is `None`, or a `Run` with this `id` doesn't exist
    in the `Registry` yet, add it.  Otherwise, ensure the provided run
    is identical to the one already in the registry.

    Parameters
    ----------
    run : `Run`
        Instance to add to the `SqlRegistry`.

    Raises
    ------
    ValueError
        If ``run`` already exists, but is not identical.
    """
def addRun(self, run):
    """Add a new `Run` to the `SqlRegistry`.

    Parameters
    ----------
    run : `Run`
        Instance to add to the `SqlRegistry`.  The given `Run` must
        not already be present in the `SqlRegistry` (or any other).
        Therefore its `id` must be `None` and its `collection` must
        not be associated with any existing `Run`.

    Raises
    ------
    ValueError
        If a run already exists with this collection.
    """
    # TODO: this check is probably undesirable, as we may want to have
    # multiple Runs output to the same collection.  Fixing this requires
    # (at least) modifying getRun() accordingly.
    # First add the Execution part
    # Then the Run specific part
        collection=run.collection,
        environment_id=None,  # TODO add environment
        pipeline_id=None))  # TODO add pipeline
    # TODO: set given Run's "id" attribute.
""" Get a `Run` corresponding to its collection or id
Parameters ---------- id : `int`, optional Lookup by run `id`, or: collection : `str` If given, lookup by `collection` name instead.
Returns ------- run : `Run` The `Run` instance.
Raises ------ ValueError Must supply one of ``collection`` or ``id``. """ # Retrieve by id executionTable.c.start_time, executionTable.c.end_time, executionTable.c.host, runTable.c.collection, runTable.c.environment_id, runTable.c.pipeline_id]).select_from( runTable.join(executionTable)).where( runTable.c.execution_id == id)).fetchone() # Retrieve by collection executionTable.c.start_time, executionTable.c.end_time, executionTable.c.host, runTable.c.collection, runTable.c.environment_id, runTable.c.pipeline_id]).select_from( runTable.join(executionTable)).where( runTable.c.collection == collection)).fetchone() else: raise ValueError("Either collection or id must be given") startTime=result["start_time"], endTime=result["end_time"], host=result["host"], collection=result["collection"], environment=None, # TODO add environment pipeline=None) # TODO add pipeline
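# Usage sketch (illustrative; assumes ``run`` was returned by makeRun):
#
#     sameRun = registry.getRun(id=run.id)
#     alsoSameRun = registry.getRun(collection=run.collection)
#     registry.getRun()  # raises ValueError: neither id nor collection given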
def addQuantum(self, quantum):
    r"""Add a new `Quantum` to the `SqlRegistry`.

    Parameters
    ----------
    quantum : `Quantum`
        Instance to add to the `SqlRegistry`.  The given `Quantum`
        must not already be present in the `SqlRegistry` (or any
        other), therefore:

        - its `run` attribute must be set to an existing `Run`;
        - its `predictedInputs` attribute must be fully populated with
          `DatasetRef`\ s;
        - its `actualInputs` and `outputs` will be ignored.
    """
    # First add the Execution part
    # Then the Quantum specific part
        task=quantum.task, run_id=quantum.run.id))
    # Attach dataset consumers
    # We use itertools.chain here because quantum.predictedInputs is a
    # dict of ``name : [DatasetRef, ...]`` and we need to flatten it
    # for inserting.
        [{"quantum_id": quantum.id, "dataset_id": ref.id, "actual": False}
         for ref in flatInputs])
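# The flattening pattern described in the comment above, as a standalone
# sketch (the input dict and refs are hypothetical):
#
#     import itertools
#
#     predictedInputs = {"raw": [ref1, ref2], "bias": [ref3]}
#     flatInputs = itertools.chain.from_iterable(predictedInputs.values())
#     rows = [{"quantum_id": quantumId, "dataset_id": ref.id, "actual": False}
#             for ref in flatInputs]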
"""Retrieve an Quantum.
Parameters ---------- id : `int` The unique identifier for the Quantum. """ select([quantumTable.c.task, quantumTable.c.run_id, executionTable.c.start_time, executionTable.c.end_time, executionTable.c.host]).select_from(quantumTable.join(executionTable)).where( quantumTable.c.execution_id == id)).fetchone() run=run, startTime=result["start_time"], endTime=result["end_time"], host=result["host"], id=id) # Add predicted and actual inputs to quantum datasetConsumersTable.c.actual]).where( datasetConsumersTable.c.quantum_id == id)): else: return None
def markInputUsed(self, quantum, ref):
    """Record the given `DatasetRef` as an actual (not just predicted)
    input of the given `Quantum`.

    This updates both the `SqlRegistry`'s `Quantum` table and the
    Python `Quantum.actualInputs` attribute.

    Parameters
    ----------
    quantum : `Quantum`
        Producer to update.  Will be updated in this call.
    ref : `DatasetRef`
        To set as actually used input.

    Raises
    ------
    KeyError
        If ``quantum`` is not a predicted consumer for ``ref``.
    """
        datasetConsumersTable.c.quantum_id == quantum.id,
        datasetConsumersTable.c.dataset_id == ref.id)).values(actual=True))
def addDataUnitEntry(self, dataUnitName, values):
    """Add a new `DataUnit` entry.

    Parameters
    ----------
    dataUnitName : `str`
        Name of the `DataUnit` (e.g. ``"Camera"``).
    values : `dict`
        Dictionary of ``columnName, columnValue`` pairs.

    If ``values`` includes a "region" key, `setDataUnitRegion` will
    automatically be called to set it in any associated spatial join
    tables.  Region fields associated with a combination of DataUnits
    must be explicitly set separately.

    Raises
    ------
    TypeError
        If the given `DataUnit` does not have explicit entries in the
        registry.
    ValueError
        If an entry with the primary key defined in ``values`` is
        already present.
    """
        raise TypeError("DataUnit '{}' has no table.".format(dataUnitName))
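# Usage sketch (illustrative; the column names and values are hypothetical):
#
#     registry.addDataUnitEntry("Camera", {"camera": "HSC"})
#     registry.addDataUnitEntry("Visit", {"camera": "HSC", "visit": 12,
#                                         "physical_filter": "HSC-R"})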
"""Return a `DataUnit` entry corresponding to a `value`.
Parameters ---------- dataUnitName : `str` Name of a `DataUnit` value : `dict` A dictionary of values that uniquely identify the `DataUnit`.
Returns ------- dataUnitEntry : `dict` Dictionary with all `DataUnit` values, or `None` if no matching entry is found. """ and_((primaryKeyColumns[name] == value[name] for name in primaryKeyColumns)))).fetchone() else:
"""Set the region field for a DataUnit instance or a combination thereof and update associated spatial join tables.
Parameters ---------- dataUnitNames : sequence A sequence of DataUnit names whose instances are jointly associated with a region on the sky. This must not include dependencies that are implied, e.g. "Patch" must not include "Tract", but "Sensor" needs to add "Visit". value : `dict` A dictionary of values that uniquely identify the DataUnits. region : `sphgeom.ConvexPolygon` Region on the sky. update : `bool` If True, existing region information for these DataUnits is being replaced. This is usually required because DataUnit entries are assumed to be pre-inserted prior to calling this function. """ raise TypeError("No region table found for '{}'.".format(dataUnitNames)) # Update the region for an existing entry table.update().where( and_((table.columns[name] == value[name] for name in primaryKey)) ).values( region=region.encode() ) ) raise ValueError("No records were updated when setting region, did you forget update=False?") else: # Insert rather than update. table.insert().values( region=region.encode(), **value ) ) # Delete any old SkyPix join entries for this DataUnit join.table.delete().where( and_((join.table.columns[name] == value[name] for name in primaryKey)) ) )
"""Get region associated with a dataId.
Parameters ---------- dataId : `dict` A `dict` of `DataUnit` link name, value pairs that label the `DatasetRef` within a collection.
Returns ------- region : `lsst.sphgeom.ConvexPolygon` The region associated with a ``dataId`` or ``None`` if not present.
Raises ------ KeyError If the set of dataunits for the ``dataId`` does not correspond to a unique spatial lookup. """ # Skypix does not have a table to lookup the region in, instead generate it # Lookup region and_((primaryKeyColumns[name] == dataId[name] for name in primaryKeyColumns)))).fetchone() else:
r"""Evaluate a filter expression and lists of `DatasetType`\ s and return a set of data unit values.
Returned set consists of combinations of units participating in data transformation from ``neededDatasetTypes`` to ``futureDatasetTypes``, restricted by existing data and filter expression.
Parameters ---------- collections : `list` of `str` An ordered `list` of collections indicating the collections to search for Datasets. expr : `str` An expression that limits the `DataUnit`\ s and (indirectly) the Datasets returned. neededDatasetTypes : `list` of `DatasetType` The `list` of `DatasetType`\ s whose instances should be included in the graph and limit its extent. futureDatasetTypes : `list` of `DatasetType` The `list` of `DatasetType`\ s whose instances may be added to the graph later, which requires that their `DataUnit` types must be present in the graph.
Returns ------- header : `tuple` of `tuple` Length of tuple equals the number of columns in the returned result set. Each item is a tuple with two elements - DataUnit name (e.g. "Visit") and unit value name (e.g. "visit"). rows : sequence of `tuple` Result set, this can be a single-pass iterator. Each tuple contains unit values corresponding to units in a header. """ return self._preFlight.selectDataUnits(collections, expr, neededDatasetTypes, futureDatasetTypes)
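# Usage sketch (illustrative; assumes the two documented return values
# arrive as a tuple, and the expression and dataset types are hypothetical):
#
#     header, rows = registry.selectDataUnits(["ingest"], "Visit.visit = 12",
#                                             [rawType], [calexpType])
#     columns = [unitValueName for _, unitValueName in header]
#     for row in rows:
#         print(dict(zip(columns, row)))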
"""Make a `QuantumGraph` that contains the full provenance of all Datasets matching an expression.
Parameters ---------- expr : `str` An expression (SQL query that evaluates to a list of Dataset primary keys) that selects the Datasets.
Returns ------- graph : `QuantumGraph` Instance (with `units` set to `None`). """ raise NotImplementedError("Must be implemented by subclass")
"""Export contents of the `SqlRegistry`, limited to those reachable from the Datasets identified by the expression `expr`, into a `TableSet` format such that it can be imported into a different database.
Parameters ---------- expr : `str` An expression (SQL query that evaluates to a list of Dataset primary keys) that selects the `Datasets, or a `QuantumGraph` that can be similarly interpreted.
Returns ------- ts : `TableSet` Containing all rows, from all tables in the `SqlRegistry` that are reachable from the selected Datasets. """ raise NotImplementedError("Must be implemented by subclass")
def import_(self, tables, collection):
    """Import (previously exported) contents into the (possibly empty)
    `SqlRegistry`.

    Parameters
    ----------
    tables : `TableSet`
        Contains the previously exported content.
    collection : `str`
        An additional collection assigned to the newly imported
        Datasets.
    """
    raise NotImplementedError("Must be implemented by subclass")
def subset(self, collection, expr, datasetTypes):
    r"""Create a new collection by subsetting an existing one.

    Parameters
    ----------
    collection : `str`
        Indicates the input collection to subset.
    expr : `str`
        An expression that limits the `DataUnit`\ s and (indirectly)
        Datasets in the subset.
    datasetTypes : `list` of `DatasetType`
        The `list` of `DatasetType`\ s whose instances should be
        included in the subset.

    Returns
    -------
    collection : `str`
        The newly created collection.
    """
    raise NotImplementedError("Must be implemented by subclass")
def merge(self, outputCollection, inputCollections):
    """Create a new collection from a series of existing ones.

    Entries earlier in the list will be used in preference to later
    entries when both contain Datasets with the same `DatasetRef`.

    Parameters
    ----------
    outputCollection : `str`
        Name of the output collection to create.
    inputCollections : `list` of `str`
        A `list` of collections to combine.
    """
    raise NotImplementedError("Must be implemented by subclass")