Coverage for python/lsst/daf/butler/registry/collections/_base.py: 88%
150 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-01 19:54 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-01 19:54 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ()
25from abc import abstractmethod
26from collections import namedtuple
27import itertools
28from typing import (
29 Any,
30 Dict,
31 Generic,
32 Iterable,
33 Iterator,
34 Optional,
35 Tuple,
36 Type,
37 TYPE_CHECKING,
38 TypeVar,
39)
41import sqlalchemy
43from ...core import DimensionUniverse, TimespanDatabaseRepresentation, ddl, Timespan
44from .._collectionType import CollectionType
45from .._exceptions import MissingCollectionError
46from ..interfaces import (
47 ChainedCollectionRecord,
48 CollectionManager,
49 CollectionRecord,
50 RunRecord,
51)
52from ..wildcards import CollectionSearch
54if TYPE_CHECKING: 54 ↛ 55line 54 didn't jump to line 55, because the condition on line 54 was never true
55 from ..interfaces import Database, DimensionRecordStorageManager
def _makeCollectionForeignKey(sourceColumnName: str, collectionIdName: str,
                              **kwargs: Any) -> ddl.ForeignKeySpec:
    """Build a foreign key specification that points at the collections table.

    Parameters
    ----------
    sourceColumnName : `str`
        Name of the column in the referring table.
    collectionIdName : `str`
        Name of the primary key column of the collections table.
    **kwargs
        Extra keyword arguments forwarded unchanged to `ddl.ForeignKeySpec`.

    Returns
    -------
    spec : `ddl.ForeignKeySpec`
        Foreign key specification.

    Notes
    -----
    The collections table name is hard-coded as ``"collection"`` and its
    primary key is assumed to consist of a single column.
    """
    # Both sides of the key are single-column tuples.
    sourceColumns = (sourceColumnName,)
    targetColumns = (collectionIdName,)
    return ddl.ForeignKeySpec("collection", source=sourceColumns, target=targetColumns, **kwargs)
# Named tuple grouping the ``collection``, ``run`` and ``collection_chain``
# table objects, in that positional order.
CollectionTablesTuple = namedtuple(
    "CollectionTablesTuple",
    ("collection", "run", "collection_chain"),
)
def makeRunTableSpec(collectionIdName: str, collectionIdType: type,
                     TimespanReprClass: Type[TimespanDatabaseRepresentation]) -> ddl.TableSpec:
    """Build the table specification for the "run" table.

    Parameters
    ----------
    collectionIdName : `str`
        Name of the primary key column of the collections table.
    collectionIdType
        Type of the primary key column of the collections table, one of the
        `sqlalchemy` types.
    TimespanReprClass : `type` [ `TimespanDatabaseRepresentation` ]
        Subclass of `TimespanDatabaseRepresentation` that encapsulates how
        timespans are stored in this database.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for the run table.

    Notes
    -----
    The identifying column carries the same name in both the collections and
    run tables.  The names of the remaining run metadata columns are fixed.
    """
    keyField = ddl.FieldSpec(collectionIdName, dtype=collectionIdType, primaryKey=True)
    hostField = ddl.FieldSpec("host", dtype=sqlalchemy.String, length=128)
    # The run row is tied to its collection row and goes away with it.
    collectionKey = _makeCollectionForeignKey(collectionIdName, collectionIdName, onDelete="CASCADE")
    spec = ddl.TableSpec(fields=[keyField, hostField], foreignKeys=[collectionKey])
    # Timespan columns depend on the database-specific representation, so
    # they are appended after the fixed columns.
    for timespanField in TimespanReprClass.makeFieldSpecs(nullable=True):
        spec.fields.add(timespanField)
    return spec
def makeCollectionChainTableSpec(collectionIdName: str, collectionIdType: type) -> ddl.TableSpec:
    """Build the table specification for the "collection_chain" table.

    Parameters
    ----------
    collectionIdName : `str`
        Name of the primary key column of the collections table.
    collectionIdType
        Type of the primary key column of the collections table, one of the
        `sqlalchemy` types.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for the collection chain table.

    Notes
    -----
    A collection chain is an ordered one-to-many relation between
    collections.  The column names (``parent``, ``position``, ``child``) are
    fixed here and hard-coded in the code that uses this table.
    """
    # (parent, position) together form the primary key.
    parentField = ddl.FieldSpec("parent", dtype=collectionIdType, primaryKey=True)
    positionField = ddl.FieldSpec("position", dtype=sqlalchemy.SmallInteger, primaryKey=True)
    childField = ddl.FieldSpec("child", dtype=collectionIdType, nullable=False)
    # Only the parent link cascades on delete; the child link does not.
    parentKey = _makeCollectionForeignKey("parent", collectionIdName, onDelete="CASCADE")
    childKey = _makeCollectionForeignKey("child", collectionIdName)
    return ddl.TableSpec(
        fields=[parentField, positionField, childField],
        foreignKeys=[parentKey, childKey],
    )
class DefaultRunRecord(RunRecord):
    """Default `RunRecord` implementation.

    This class assumes the run table definition produced by
    `makeRunTableSpec`.  The only name in that schema that is not fixed is
    the primary key column name, which must be passed to the constructor.

    Parameters
    ----------
    db : `Database`
        Registry database.
    key
        Unique collection ID, can be the same as ``name`` if ``name`` is used
        for identification.  Usually this is an integer or string, but can be
        other database-specific type.
    name : `str`
        Run collection name.
    table : `sqlalchemy.schema.Table`
        Table for run records.
    idColumnName : `str`
        Name of the identifying column in run table.
    host : `str`, optional
        Name of the host where run was produced.
    timespan : `Timespan`, optional
        Timespan for this run.  When omitted, a timespan with both bounds
        set to `None` is used.
    """

    def __init__(self, db: Database, key: Any, name: str, *, table: sqlalchemy.schema.Table,
                 idColumnName: str, host: Optional[str] = None,
                 timespan: Optional[Timespan] = None):
        super().__init__(key=key, name=name, type=CollectionType.RUN)
        self._db = db
        self._table = table
        self._host = host
        self._timespan = Timespan(begin=None, end=None) if timespan is None else timespan
        self._idName = idColumnName

    def update(self, host: Optional[str] = None,
               timespan: Optional[Timespan] = None) -> None:
        # Docstring inherited from RunRecord.
        newTimespan = Timespan(begin=None, end=None) if timespan is None else timespan
        row = {self._idName: self.key, "host": host}
        # Let the database-specific representation add its timespan columns
        # to the row.
        self._db.getTimespanRepresentation().update(newTimespan, result=row)
        # NOTE(review): the ``where`` mapping below has the key *value*
        # (``self.key``) on its right-hand side, whereas
        # `DefaultCollectionManager.setDocumentation` in this file passes a
        # row-dictionary key name there (``"key"``).  Confirm against
        # `Database.update` which form is intended.
        nUpdated = self._db.update(self._table, {self._idName: self.key}, row)
        if nUpdated != 1:
            raise RuntimeError(f"Run update affected {nUpdated} records; expected exactly one.")
        # In-memory state changes only after the database accepted the update.
        self._host = host
        self._timespan = newTimespan

    @property
    def host(self) -> Optional[str]:
        # Docstring inherited from RunRecord.
        return self._host

    @property
    def timespan(self) -> Timespan:
        # Docstring inherited from RunRecord.
        return self._timespan
class DefaultChainedCollectionRecord(ChainedCollectionRecord):
    """Default `ChainedCollectionRecord` implementation.

    This class assumes the chain table definition produced by
    `makeCollectionChainTableSpec`.  All column names in that table are fixed
    and hard-coded in the methods below.

    Parameters
    ----------
    db : `Database`
        Registry database.
    key
        Unique collection ID, can be the same as ``name`` if ``name`` is used
        for identification.  Usually this is an integer or string, but can be
        other database-specific type.
    name : `str`
        Collection name.
    table : `sqlalchemy.schema.Table`
        Table for chain relationship records.
    universe : `DimensionUniverse`
        Object managing all known dimensions.
    """

    def __init__(self, db: Database, key: Any, name: str, *, table: sqlalchemy.schema.Table,
                 universe: DimensionUniverse):
        super().__init__(key=key, name=name, universe=universe)
        self._db = db
        self._table = table
        self._universe = universe

    def _update(self, manager: CollectionManager, children: CollectionSearch) -> None:
        # Docstring inherited from ChainedCollectionRecord.
        # One row per direct child, numbered from zero in iteration order.
        rows = [
            {
                "parent": self.key,
                "child": child.key,
                "position": position,
            }
            for position, child in enumerate(children.iter(manager, flattenChains=False))
        ]
        # Replace the stored chain inside a single transaction: drop the old
        # rows for this parent, then insert the new ones.
        with self._db.transaction():
            self._db.delete(self._table, ["parent"], {"parent": self.key})
            self._db.insert(self._table, *rows)

    def _load(self, manager: CollectionManager) -> CollectionSearch:
        # Docstring inherited from ChainedCollectionRecord.
        childColumn = self._table.columns.child
        sql = sqlalchemy.sql.select(
            childColumn,
        ).select_from(
            self._table
        ).where(
            self._table.columns.parent == self.key
        ).order_by(
            self._table.columns.position
        )
        # The table stores child IDs; turn them into names via the manager.
        childNames = []
        for row in self._db.query(sql):
            childNames.append(manager[row._mapping[childColumn]].name)
        return CollectionSearch.fromExpression(childNames)
# Type of the collection ID used as the key of the in-memory record cache.
K = TypeVar("K")


class DefaultCollectionManager(Generic[K], CollectionManager):
    """Default `CollectionManager` implementation.

    This implementation uses the record classes defined in this module and
    relies on the same schema assumptions as those record classes.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    tables : `CollectionTablesTuple`
        Named tuple of SQLAlchemy table objects.
    collectionIdName : `str`
        Name of the primary key column of the collections table.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.

    Notes
    -----
    Records are pre-fetched "aggressively" and cached in memory.  The cache
    is synchronized from the database when `refresh` is called.
    """

    def __init__(self, db: Database, tables: CollectionTablesTuple, collectionIdName: str, *,
                 dimensions: DimensionRecordStorageManager):
        self._db = db
        self._tables = tables
        self._collectionIdName = collectionIdName
        self._records: Dict[K, CollectionRecord] = {}  # indexed by record ID
        self._dimensions = dimensions

    def refresh(self) -> None:
        # Docstring inherited from CollectionManager.
        collectionTable = self._tables.collection
        runTable = self._tables.run
        # Outer join: collections without a matching run row are kept.
        sql = sqlalchemy.sql.select(
            *collectionTable.columns, *runTable.columns
        ).select_from(
            collectionTable.join(runTable, isouter=True)
        )
        # Collect into temporaries rather than mutating self._records in
        # place, for exception safety.
        fetched = []
        chainRecords = []
        TimespanReprClass = self._db.getTimespanRepresentation()
        for row in self._db.query(sql).mappings():
            collection_id = row[collectionTable.columns[self._collectionIdName]]
            name = row[collectionTable.columns.name]
            collectionType = CollectionType(row["type"])
            record: CollectionRecord
            if collectionType is CollectionType.RUN:
                record = DefaultRunRecord(
                    key=collection_id,
                    name=name,
                    db=self._db,
                    table=runTable,
                    idColumnName=self._collectionIdName,
                    host=row[runTable.columns.host],
                    timespan=TimespanReprClass.extract(row),
                )
            elif collectionType is CollectionType.CHAINED:
                record = DefaultChainedCollectionRecord(
                    db=self._db,
                    key=collection_id,
                    table=self._tables.collection_chain,
                    name=name,
                    universe=self._dimensions.universe,
                )
                chainRecords.append(record)
            else:
                record = CollectionRecord(key=collection_id, name=name, type=collectionType)
            fetched.append(record)
        self._setRecordCache(fetched)
        # Chains are refreshed only once the cache holds every record.
        for chainRecord in chainRecords:
            try:
                chainRecord.refresh(self)
            except MissingCollectionError:
                # This indicates a race condition in which some other client
                # created a new collection and added it as a child of this
                # (pre-existing) chain between the time we fetched all
                # collections and the time we queried for parent-child
                # relationships.
                # Because that's some other unrelated client, we shouldn't
                # care about that parent collection anyway, so we just drop
                # it on the floor (a manual refresh can be used to get it
                # back).
                self._removeCachedRecord(chainRecord)

    def register(self, name: str, type: CollectionType,
                 doc: Optional[str] = None) -> Tuple[CollectionRecord, bool]:
        # Docstring inherited from CollectionManager.
        existing = self._getByName(name)
        if existing is not None:
            # Already known by name; this call registered nothing.
            return existing, False
        row, inserted_or_updated = self._db.sync(
            self._tables.collection,
            keys={"name": name},
            compared={"type": int(type)},
            extra={"doc": doc},
            returning=[self._collectionIdName],
        )
        assert isinstance(inserted_or_updated, bool)
        assert row is not None
        collection_id = row[self._collectionIdName]
        record: CollectionRecord
        if type is CollectionType.RUN:
            # RUN collections also need a row in the run table.
            TimespanReprClass = self._db.getTimespanRepresentation()
            runRow, _ = self._db.sync(
                self._tables.run,
                keys={self._collectionIdName: collection_id},
                returning=("host",) + TimespanReprClass.getFieldNames(),
            )
            assert runRow is not None
            record = DefaultRunRecord(
                db=self._db,
                key=collection_id,
                name=name,
                table=self._tables.run,
                idColumnName=self._collectionIdName,
                host=runRow["host"],
                timespan=TimespanReprClass.extract(runRow),
            )
        elif type is CollectionType.CHAINED:
            record = DefaultChainedCollectionRecord(
                db=self._db,
                key=collection_id,
                name=name,
                table=self._tables.collection_chain,
                universe=self._dimensions.universe,
            )
        else:
            record = CollectionRecord(key=collection_id, name=name, type=type)
        self._addCachedRecord(record)
        return record, inserted_or_updated

    def remove(self, name: str) -> None:
        # Docstring inherited from CollectionManager.
        record = self._getByName(name)
        if record is None:
            raise MissingCollectionError(f"No collection with name '{name}' found.")
        # This may raise, in which case the cache is left untouched.
        idName = self._collectionIdName
        self._db.delete(self._tables.collection, [idName], {idName: record.key})
        self._removeCachedRecord(record)

    def find(self, name: str) -> CollectionRecord:
        # Docstring inherited from CollectionManager.
        found = self._getByName(name)
        if found is None:
            raise MissingCollectionError(f"No collection with name '{name}' found.")
        return found

    def __getitem__(self, key: Any) -> CollectionRecord:
        # Docstring inherited from CollectionManager.
        try:
            return self._records[key]
        except KeyError as err:
            raise MissingCollectionError(f"Collection with key '{key}' not found.") from err

    def __iter__(self) -> Iterator[CollectionRecord]:
        for record in self._records.values():
            yield record

    def getDocumentation(self, key: Any) -> Optional[str]:
        # Docstring inherited from CollectionManager.
        collectionTable = self._tables.collection
        sql = sqlalchemy.sql.select(
            collectionTable.columns.doc
        ).select_from(
            collectionTable
        ).where(
            collectionTable.columns[self._collectionIdName] == key
        )
        return self._db.query(sql).scalar()

    def setDocumentation(self, key: Any, doc: Optional[str]) -> None:
        # Docstring inherited from CollectionManager.
        self._db.update(
            self._tables.collection,
            {self._collectionIdName: "key"},
            {"key": key, "doc": doc},
        )

    def _setRecordCache(self, records: Iterable[CollectionRecord]) -> None:
        """Replace the internal record cache with the given records.

        Any previously cached records are discarded.
        """
        cache: Dict[K, CollectionRecord] = {}
        self._records = cache
        for record in records:
            cache[record.key] = record

    def _addCachedRecord(self, record: CollectionRecord) -> None:
        """Add a single record to the cache, keyed by its ``key``.
        """
        self._records[record.key] = record

    def _removeCachedRecord(self, record: CollectionRecord) -> None:
        """Remove a single record from the cache.
        """
        del self._records[record.key]

    @abstractmethod
    def _getByName(self, name: str) -> Optional[CollectionRecord]:
        """Find a collection record given the collection name.

        Returns `None` when no record has that name (callers in this class
        test the result against `None`).
        """
        raise NotImplementedError()