Coverage for python/lsst/daf/butler/registry/collections/_base.py : 88%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ()
25from abc import abstractmethod
26from collections import namedtuple
27import itertools
28from typing import (
29 Any,
30 Dict,
31 Generic,
32 Iterable,
33 Iterator,
34 Optional,
35 Type,
36 TYPE_CHECKING,
37 TypeVar,
38)
40import sqlalchemy
42from ...core import DimensionUniverse, TimespanDatabaseRepresentation, ddl, Timespan
43from .._collectionType import CollectionType
44from .._exceptions import MissingCollectionError
45from ..interfaces import (
46 ChainedCollectionRecord,
47 CollectionManager,
48 CollectionRecord,
49 RunRecord,
50)
51from ..wildcards import CollectionSearch
53if TYPE_CHECKING: 53 ↛ 54line 53 didn't jump to line 54, because the condition on line 53 was never true
54 from ..interfaces import Database, DimensionRecordStorageManager
def _makeCollectionForeignKey(sourceColumnName: str, collectionIdName: str,
                              **kwargs: Any) -> ddl.ForeignKeySpec:
    """Build a foreign key specification that points at the collections
    table.

    Parameters
    ----------
    sourceColumnName : `str`
        Name of the column in the referring table.
    collectionIdName : `str`
        Name of the column in collections table that identifies it (PK).
    **kwargs
        Additional keyword arguments forwarded unchanged to
        `ddl.ForeignKeySpec`.

    Returns
    -------
    spec : `ddl.ForeignKeySpec`
        Foreign key specification.

    Notes
    -----
    The name of the collections table is fixed to "collection", and the
    collection primary key is assumed to be a single column.
    """
    # Both sides of the constraint are one-element tuples because the
    # collection primary key is a single column.
    referringColumns = (sourceColumnName,)
    referencedColumns = (collectionIdName,)
    return ddl.ForeignKeySpec("collection", source=referringColumns, target=referencedColumns,
                              **kwargs)
# Bundle of the three tables used by the collection managers in this module,
# in the fixed order (collection, run, collection_chain).
CollectionTablesTuple = namedtuple(
    "CollectionTablesTuple",
    ("collection", "run", "collection_chain"),
)
def makeRunTableSpec(collectionIdName: str, collectionIdType: type,
                     TimespanReprClass: Type[TimespanDatabaseRepresentation]) -> ddl.TableSpec:
    """Build the table specification for the "run" table.

    Parameters
    ----------
    collectionIdName : `str`
        Name of the column in collections table that identifies it (PK).
    collectionIdType
        Type of the PK column in the collections table, one of the
        `sqlalchemy` types.
    TimespanReprClass : `type` [ `TimespanDatabaseRepresentation` ]
        Subclass of `TimespanDatabaseRepresentation` that encapsulates how
        timespans are stored in this database.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for run table.

    Notes
    -----
    The identifying column carries the same name in the collections table
    and in the run table.  The remaining run metadata columns ("host" and
    the timespan fields) have fixed names.
    """
    idField = ddl.FieldSpec(collectionIdName, dtype=collectionIdType, primaryKey=True)
    hostField = ddl.FieldSpec("host", dtype=sqlalchemy.String, length=128)
    # The run primary key doubles as a foreign key into the collections
    # table, declared with ON DELETE CASCADE.
    collectionKey = _makeCollectionForeignKey(collectionIdName, collectionIdName, onDelete="CASCADE")
    spec = ddl.TableSpec(
        fields=[idField, hostField],
        foreignKeys=[collectionKey],
    )
    # Timespan columns come from the database-specific representation and
    # are requested as nullable.
    for timespanField in TimespanReprClass.makeFieldSpecs(nullable=True):
        spec.fields.add(timespanField)
    return spec
def makeCollectionChainTableSpec(collectionIdName: str, collectionIdType: type) -> ddl.TableSpec:
    """Build the table specification for the "collection_chain" table.

    Parameters
    ----------
    collectionIdName : `str`
        Name of the column in collections table that identifies it (PK).
    collectionIdType
        Type of the PK column in the collections table, one of the
        `sqlalchemy` types.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for collection chain table.

    Notes
    -----
    A collection chain is an ordered one-to-many relation between
    collections.  The column names ("parent", "position", "child") are fixed
    here and are relied upon by the record classes in this module.
    """
    # (parent, position) is the composite primary key; each row names one
    # child at one position of the parent's chain.
    chainFields = [
        ddl.FieldSpec("parent", dtype=collectionIdType, primaryKey=True),
        ddl.FieldSpec("position", dtype=sqlalchemy.SmallInteger, primaryKey=True),
        ddl.FieldSpec("child", dtype=collectionIdType, nullable=False),
    ]
    # Only the parent link is declared with ON DELETE CASCADE; the child
    # link is a plain foreign key.
    chainForeignKeys = [
        _makeCollectionForeignKey("parent", collectionIdName, onDelete="CASCADE"),
        _makeCollectionForeignKey("child", collectionIdName),
    ]
    return ddl.TableSpec(fields=chainFields, foreignKeys=chainForeignKeys)
class DefaultRunRecord(RunRecord):
    """Default `RunRecord` implementation.

    This class assumes the same run table definition as produced by
    `makeRunTableSpec`. The only non-fixed name in the schema is the PK
    column name, which needs to be passed to the constructor.

    Parameters
    ----------
    db : `Database`
        Registry database.
    key
        Unique collection ID, can be the same as ``name`` if ``name`` is used
        for identification. Usually this is an integer or string, but can be
        other database-specific type.
    name : `str`
        Run collection name.
    table : `sqlalchemy.schema.Table`
        Table for run records.
    idColumnName : `str`
        Name of the identifying column in run table.
    host : `str`, optional
        Name of the host where run was produced.
    timespan : `Timespan`, optional
        Timespan for this run.  If not provided, a timespan with both
        bounds set to `None` is used.
    """
    def __init__(self, db: Database, key: Any, name: str, *, table: sqlalchemy.schema.Table,
                 idColumnName: str, host: Optional[str] = None,
                 timespan: Optional[Timespan] = None):
        super().__init__(key=key, name=name, type=CollectionType.RUN)
        self._db = db
        self._table = table
        self._host = host
        if timespan is None:
            timespan = Timespan(begin=None, end=None)
        self._timespan = timespan
        self._idName = idColumnName

    def update(self, host: Optional[str] = None,
               timespan: Optional[Timespan] = None) -> None:
        # Docstring inherited from RunRecord.
        if timespan is None:
            timespan = Timespan(begin=None, end=None)
        # The value used to select the row is stored under the neutral
        # name "key", and the ``where`` mapping points the ID column at that
        # name.  This mirrors how `DefaultCollectionManager.setDocumentation`
        # calls `Database.update`; the previous code passed the key *value*
        # where the name of a row entry is expected.
        # NOTE(review): confirm against `Database.update` that ``where`` maps
        # column names to keys of the row dictionaries.
        row = {
            "key": self.key,
            "host": host,
        }
        # Adds the database-specific timespan column values to ``row``.
        self._db.getTimespanRepresentation().update(timespan, result=row)
        count = self._db.update(self._table, {self._idName: "key"}, row)
        if count != 1:
            raise RuntimeError(f"Run update affected {count} records; expected exactly one.")
        # Only touch the in-memory state once the database update succeeded.
        self._host = host
        self._timespan = timespan

    @property
    def host(self) -> Optional[str]:
        # Docstring inherited from RunRecord.
        return self._host

    @property
    def timespan(self) -> Timespan:
        # Docstring inherited from RunRecord.
        return self._timespan
class DefaultChainedCollectionRecord(ChainedCollectionRecord):
    """Default `ChainedCollectionRecord` implementation.

    This class relies on the chain table layout produced by
    `makeCollectionChainTableSpec`; the column names "parent", "child" and
    "position" are hard-coded in the methods below.

    Parameters
    ----------
    db : `Database`
        Registry database.
    key
        Unique collection ID, can be the same as ``name`` if ``name`` is used
        for identification. Usually this is an integer or string, but can be
        other database-specific type.
    name : `str`
        Collection name.
    table : `sqlalchemy.schema.Table`
        Table for chain relationship records.
    universe : `DimensionUniverse`
        Object managing all known dimensions.
    """
    def __init__(self, db: Database, key: Any, name: str, *, table: sqlalchemy.schema.Table,
                 universe: DimensionUniverse):
        super().__init__(key=key, name=name, universe=universe)
        self._universe = universe
        self._table = table
        self._db = db

    def _update(self, manager: CollectionManager, children: CollectionSearch) -> None:
        # Docstring inherited from ChainedCollectionRecord.
        # One row per direct child, numbered from zero in iteration order.
        childRecords = children.iter(manager, flattenChains=False)
        rows = [
            {"parent": self.key, "child": child.key, "position": position}
            for position, child in enumerate(childRecords)
        ]
        # Replace the whole chain inside a single transaction: drop the
        # existing rows for this parent, then insert the new ones.
        with self._db.transaction():
            self._db.delete(self._table, ["parent"], {"parent": self.key})
            self._db.insert(self._table, *rows)

    def _load(self, manager: CollectionManager) -> CollectionSearch:
        # Docstring inherited from ChainedCollectionRecord.
        chainTable = self._table
        childColumn = chainTable.columns.child
        # Select the children of this chain, ordered by position.
        query = sqlalchemy.sql.select([childColumn]).select_from(chainTable)
        query = query.where(chainTable.columns.parent == self.key)
        query = query.order_by(chainTable.columns.position)
        # Translate child IDs into collection names via the manager.
        childNames = []
        for row in self._db.query(query):
            childNames.append(manager[row[childColumn]].name)
        return CollectionSearch.fromExpression(childNames)
# Type of the collection ID values used as keys of the record cache.
K = TypeVar("K")


class DefaultCollectionManager(Generic[K], CollectionManager):
    """Default `CollectionManager` implementation.

    This implementation uses record classes defined in this module and is
    based on the same assumptions about schema outlined in the record classes.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    tables : `CollectionTablesTuple`
        Named tuple of SQLAlchemy table objects.
    collectionIdName : `str`
        Name of the column in collections table that identifies it (PK).
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.

    Notes
    -----
    Implementation uses "aggressive" pre-fetching and caching of the records
    in memory. Memory cache is synchronized from database when `refresh`
    method is called.

    Subclasses must implement `_getByName` to look up records by collection
    name.
    """
    def __init__(self, db: Database, tables: CollectionTablesTuple, collectionIdName: str, *,
                 dimensions: DimensionRecordStorageManager):
        self._db = db
        self._tables = tables
        self._collectionIdName = collectionIdName
        self._records: Dict[K, CollectionRecord] = {}  # indexed by record ID
        self._dimensions = dimensions

    def refresh(self) -> None:
        # Docstring inherited from CollectionManager.
        # Outer join so that non-RUN collections (no row in the run table)
        # are returned as well.
        sql = sqlalchemy.sql.select(
            list(self._tables.collection.columns) + list(self._tables.run.columns)
        ).select_from(
            self._tables.collection.join(self._tables.run, isouter=True)
        )
        # Put found records into a temporary instead of updating self._records
        # in place, for exception safety.
        records = []
        chains = []
        TimespanReprClass = self._db.getTimespanRepresentation()
        for row in self._db.query(sql).fetchall():
            collection_id = row[self._tables.collection.columns[self._collectionIdName]]
            name = row[self._tables.collection.columns.name]
            # Named ``collectionType`` to avoid shadowing the ``type``
            # builtin.
            collectionType = CollectionType(row["type"])
            record: CollectionRecord
            if collectionType is CollectionType.RUN:
                record = DefaultRunRecord(
                    key=collection_id,
                    name=name,
                    db=self._db,
                    table=self._tables.run,
                    idColumnName=self._collectionIdName,
                    host=row[self._tables.run.columns.host],
                    timespan=TimespanReprClass.extract(row),
                )
            elif collectionType is CollectionType.CHAINED:
                record = DefaultChainedCollectionRecord(db=self._db,
                                                        key=collection_id,
                                                        table=self._tables.collection_chain,
                                                        name=name,
                                                        universe=self._dimensions.universe)
                chains.append(record)
            else:
                record = CollectionRecord(key=collection_id, name=name, type=collectionType)
            records.append(record)
        self._setRecordCache(records)
        # Chains are refreshed only after the cache holds every record.
        for chain in chains:
            chain.refresh(self)

    def register(self, name: str, type: CollectionType, doc: Optional[str] = None) -> CollectionRecord:
        # Docstring inherited from CollectionManager.
        # NOTE: the ``type`` parameter shadows the builtin, but it is part of
        # the public signature and is kept as is.
        record = self._getByName(name)
        if record is None:
            # Not cached: make sure the row exists in the collection table
            # and obtain its ID.
            row, _ = self._db.sync(
                self._tables.collection,
                keys={"name": name},
                compared={"type": int(type)},
                extra={"doc": doc},
                returning=[self._collectionIdName],
            )
            assert row is not None
            collection_id = row[self._collectionIdName]
            if type is CollectionType.RUN:
                # RUN collections have a companion row in the run table.
                TimespanReprClass = self._db.getTimespanRepresentation()
                row, _ = self._db.sync(
                    self._tables.run,
                    keys={self._collectionIdName: collection_id},
                    returning=("host",) + TimespanReprClass.getFieldNames(),
                )
                assert row is not None
                record = DefaultRunRecord(
                    db=self._db,
                    key=collection_id,
                    name=name,
                    table=self._tables.run,
                    idColumnName=self._collectionIdName,
                    host=row["host"],
                    timespan=TimespanReprClass.extract(row),
                )
            elif type is CollectionType.CHAINED:
                record = DefaultChainedCollectionRecord(db=self._db, key=collection_id, name=name,
                                                        table=self._tables.collection_chain,
                                                        universe=self._dimensions.universe)
            else:
                record = CollectionRecord(key=collection_id, name=name, type=type)
            self._addCachedRecord(record)
        return record

    def remove(self, name: str) -> None:
        # Docstring inherited from CollectionManager.
        record = self._getByName(name)
        if record is None:
            raise MissingCollectionError(f"No collection with name '{name}' found.")
        # This may raise; the cache is only updated after a successful
        # delete.
        self._db.delete(self._tables.collection, [self._collectionIdName],
                        {self._collectionIdName: record.key})
        self._removeCachedRecord(record)

    def find(self, name: str) -> CollectionRecord:
        # Docstring inherited from CollectionManager.
        result = self._getByName(name)
        if result is None:
            raise MissingCollectionError(f"No collection with name '{name}' found.")
        return result

    def __getitem__(self, key: Any) -> CollectionRecord:
        # Docstring inherited from CollectionManager.
        try:
            return self._records[key]
        except KeyError as err:
            raise MissingCollectionError(f"Collection with key '{key}' not found.") from err

    def __iter__(self) -> Iterator[CollectionRecord]:
        # Iterates over the cached records only; no database access.
        yield from self._records.values()

    def getDocumentation(self, key: Any) -> Optional[str]:
        # Docstring inherited from CollectionManager.
        sql = sqlalchemy.sql.select(
            [self._tables.collection.columns.doc]
        ).select_from(
            self._tables.collection
        ).where(
            self._tables.collection.columns[self._collectionIdName] == key
        )
        return self._db.query(sql).scalar()

    def setDocumentation(self, key: Any, doc: Optional[str]) -> None:
        # Docstring inherited from CollectionManager.
        # The ID value travels in the row under the name "key"; the ``where``
        # mapping points the ID column at that row entry.
        self._db.update(self._tables.collection, {self._collectionIdName: "key"}, {"key": key, "doc": doc})

    def _setRecordCache(self, records: Iterable[CollectionRecord]) -> None:
        """Set internal record cache to contain given records,
        old cached records will be removed.

        Parameters
        ----------
        records : `Iterable` [ `CollectionRecord` ]
            Records that will make up the new cache, indexed by their
            ``key`` attribute.
        """
        # Build the new mapping completely before swapping it in, so that an
        # exception raised while iterating leaves the old cache untouched.
        self._records = {record.key: record for record in records}

    def _addCachedRecord(self, record: CollectionRecord) -> None:
        """Add single record to cache.

        Parameters
        ----------
        record : `CollectionRecord`
            Record to store under its ``key``; replaces any record already
            cached under that key.
        """
        self._records[record.key] = record

    def _removeCachedRecord(self, record: CollectionRecord) -> None:
        """Remove single record from cache.

        Parameters
        ----------
        record : `CollectionRecord`
            Record whose ``key`` is removed from the cache.

        Raises
        ------
        KeyError
            Raised if no record is cached under ``record.key``.
        """
        del self._records[record.key]

    @abstractmethod
    def _getByName(self, name: str) -> Optional[CollectionRecord]:
        """Find collection record given collection name.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Returns
        -------
        record : `CollectionRecord` or `None`
            Matching record; callers in this class treat `None` as
            "no such collection".
        """
        raise NotImplementedError()