Coverage for python/lsst/daf/butler/registry/collections/_base.py : 74%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ()
25from abc import abstractmethod
26from collections import namedtuple
27import itertools
28from typing import (
29 Any,
30 Dict,
31 Generic,
32 Iterable,
33 Iterator,
34 Optional,
35 Type,
36 TYPE_CHECKING,
37 TypeVar,
38)
40import sqlalchemy
42from ...core import DatabaseTimespanRepresentation, ddl, Timespan
43from .._collectionType import CollectionType
44from ..interfaces import (
45 ChainedCollectionRecord,
46 CollectionManager,
47 CollectionRecord,
48 MissingCollectionError,
49 RunRecord,
50)
51from ..wildcards import CollectionSearch, Ellipsis
53if TYPE_CHECKING: 53 ↛ 54line 53 didn't jump to line 54, because the condition on line 53 was never true
54 from ..interfaces import Database
def _makeCollectionForeignKey(sourceColumnName: str, collectionIdName: str,
                              **kwargs: Any) -> ddl.ForeignKeySpec:
    """Define foreign key specification that refers to collections table.

    Parameters
    ----------
    sourceColumnName : `str`
        Name of the column in the referring table.
    collectionIdName : `str`
        Name of the column in collections table that identifies it (PK).
    **kwargs
        Additional keyword arguments passed directly to `ddl.ForeignKeySpec`.

    Returns
    -------
    spec : `ddl.ForeignKeySpec`
        Foreign key specification.

    Notes
    -----
    This function assumes fixed name ("collection") of a collections table.
    There is also a general assumption that collection primary key consists
    of a single column.
    """
    # Single-column key on both sides: one source column in the referring
    # table, one target column in the "collection" table.
    return ddl.ForeignKeySpec("collection", source=(sourceColumnName,), target=(collectionIdName,),
                              **kwargs)
# Bundle of the three tables used by the classes in this module; the field
# names are "collection", "run" and "collection_chain".
CollectionTablesTuple = namedtuple("CollectionTablesTuple", ["collection", "run", "collection_chain"])
def makeRunTableSpec(collectionIdName: str, collectionIdType: type,
                     tsRepr: Type[DatabaseTimespanRepresentation]) -> ddl.TableSpec:
    """Define specification for "run" table.

    Parameters
    ----------
    collectionIdName : `str`
        Name of the column in collections table that identifies it (PK).
    collectionIdType
        Type of the PK column in the collections table, one of the
        `sqlalchemy` types.
    tsRepr : `type` [ `DatabaseTimespanRepresentation` ]
        Subclass of `DatabaseTimespanRepresentation` that encapsulates how
        timespans are stored in this database.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for run table.

    Notes
    -----
    Assumption here and in the code below is that the name of the identifying
    column is the same in both collections and run tables. The names of
    non-identifying columns containing run metadata are fixed.
    """
    result = ddl.TableSpec(
        fields=[
            ddl.FieldSpec(collectionIdName, dtype=collectionIdType, primaryKey=True),
            ddl.FieldSpec("host", dtype=sqlalchemy.String, length=128),
        ],
        foreignKeys=[
            # The run table's PK column is also a foreign key to the
            # identically-named column of the collection table.
            _makeCollectionForeignKey(collectionIdName, collectionIdName, onDelete="CASCADE"),
        ],
    )
    # Timespan columns are supplied by the database-specific representation
    # and are created nullable.
    for fieldSpec in tsRepr.makeFieldSpecs(nullable=True):
        result.fields.add(fieldSpec)
    return result
def makeCollectionChainTableSpec(collectionIdName: str, collectionIdType: type) -> ddl.TableSpec:
    """Define specification for "collection_chain" table.

    Parameters
    ----------
    collectionIdName : `str`
        Name of the column in collections table that identifies it (PK).
    collectionIdType
        Type of the PK column in the collections table, one of the
        `sqlalchemy` types.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for collection chain table.

    Notes
    -----
    Collection chain is simply an ordered one-to-many relation between
    collections. The names of the columns in the table ("parent",
    "position", "child" and "dataset_type_name") are fixed and also
    hardcoded in the code below.
    """
    return ddl.TableSpec(
        fields=[
            # ("parent", "position") together form the primary key.
            ddl.FieldSpec("parent", dtype=collectionIdType, primaryKey=True),
            ddl.FieldSpec("position", dtype=sqlalchemy.SmallInteger, primaryKey=True),
            ddl.FieldSpec("child", dtype=collectionIdType, nullable=False),
            ddl.FieldSpec("dataset_type_name", dtype=sqlalchemy.String, length=128, nullable=True),
        ],
        foreignKeys=[
            # Only the "parent" reference is declared with ON DELETE CASCADE.
            _makeCollectionForeignKey("parent", collectionIdName, onDelete="CASCADE"),
            _makeCollectionForeignKey("child", collectionIdName),
        ],
    )
class DefaultRunRecord(RunRecord):
    """Default `RunRecord` implementation.

    This class assumes the same run table definition as produced by
    `makeRunTableSpec` method. The only non-fixed name in the schema
    is the PK column name, this needs to be passed in a constructor.

    Parameters
    ----------
    db : `Database`
        Registry database.
    key
        Unique collection ID, can be the same as ``name`` if ``name`` is used
        for identification. Usually this is an integer or string, but can be
        other database-specific type.
    name : `str`
        Run collection name.
    table : `sqlalchemy.schema.Table`
        Table for run records.
    idColumnName : `str`
        Name of the identifying column in run table.
    host : `str`, optional
        Name of the host where run was produced.
    timespan : `Timespan`, optional
        Timespan for this run. If `None`, a timespan with both bounds set
        to `None` is used.
    """
    def __init__(self, db: Database, key: Any, name: str, *, table: sqlalchemy.schema.Table,
                 idColumnName: str, host: Optional[str] = None,
                 timespan: Optional[Timespan] = None):
        super().__init__(key=key, name=name, type=CollectionType.RUN)
        self._db = db
        self._table = table
        self._host = host
        if timespan is None:
            timespan = Timespan(begin=None, end=None)
        self._timespan = timespan
        self._idName = idColumnName

    def update(self, host: Optional[str] = None,
               timespan: Optional[Timespan] = None) -> None:
        # Docstring inherited from RunRecord.
        if timespan is None:
            timespan = Timespan(begin=None, end=None)
        row = {
            self._idName: self.key,
            "host": host,
        }
        # The timespan representation fills ``row`` in place (it is passed
        # as ``result``); presumably this adds the timespan column values --
        # confirm against DatabaseTimespanRepresentation.update.
        self._db.getTimespanRepresentation().update(timespan, result=row)
        count = self._db.update(self._table, {self._idName: self.key}, row)
        if count != 1:
            raise RuntimeError(f"Run update affected {count} records; expected exactly one.")
        # Only touch the in-memory state after the database call succeeded.
        self._host = host
        self._timespan = timespan

    @property
    def host(self) -> Optional[str]:
        # Docstring inherited from RunRecord.
        return self._host

    @property
    def timespan(self) -> Timespan:
        # Docstring inherited from RunRecord.
        return self._timespan
class DefaultChainedCollectionRecord(ChainedCollectionRecord):
    """Default `ChainedCollectionRecord` implementation.

    This class assumes the same chain table definition as produced by
    `makeCollectionChainTableSpec` method. All column names in the table are
    fixed and hard-coded in the methods.

    Parameters
    ----------
    db : `Database`
        Registry database.
    key
        Unique collection ID, can be the same as ``name`` if ``name`` is used
        for identification. Usually this is an integer or string, but can be
        other database-specific type.
    name : `str`
        Collection name.
    table : `sqlalchemy.schema.Table`
        Table for chain relationship records.
    """
    def __init__(self, db: Database, key: Any, name: str, *, table: sqlalchemy.schema.Table):
        super().__init__(key=key, name=name)
        self._db = db
        self._table = table

    def _update(self, manager: CollectionManager, children: CollectionSearch) -> None:
        # Docstring inherited from ChainedCollectionRecord.
        rows = []
        # A single counter numbers every row, so a child with several
        # dataset type names occupies several consecutive positions.
        position = itertools.count()
        for child, restriction in children.iterPairs(manager, flattenChains=False):
            if restriction.names is Ellipsis:
                # Unrestricted child: one row with no dataset type name.
                rows.append({"parent": self.key, "child": child.key,
                             "position": next(position), "dataset_type_name": None})
            else:
                for name in restriction.names:
                    rows.append({"parent": self.key, "child": child.key,
                                 "position": next(position), "dataset_type_name": name})
        # Replace all existing rows for this parent within one transaction.
        with self._db.transaction():
            self._db.delete(self._table, ["parent"], {"parent": self.key})
            self._db.insert(self._table, *rows)

    def _load(self, manager: CollectionManager) -> CollectionSearch:
        # Docstring inherited from ChainedCollectionRecord.
        sql = sqlalchemy.sql.select(
            [self._table.columns.child, self._table.columns.dataset_type_name]
        ).select_from(
            self._table
        ).where(
            self._table.columns.parent == self.key
        ).order_by(
            self._table.columns.position
        )
        # It's fine to have consecutive rows with the same collection name
        # and different dataset type names - CollectionSearch will group those
        # up for us.
        children = []
        for row in self._db.query(sql):
            key = row[self._table.columns.child]
            restriction = row[self._table.columns.dataset_type_name]
            if not restriction:
                # `_update` writes None for an unrestricted child; any falsy
                # value (including an empty string) is read back as ``...``.
                restriction = ...
            record = manager[key]
            children.append((record.name, restriction))
        return CollectionSearch.fromExpression(children)
# Type variable for the collection key type used by the generic manager.
K = TypeVar("K")
class DefaultCollectionManager(Generic[K], CollectionManager):
    """Default `CollectionManager` implementation.

    This implementation uses record classes defined in this module and is
    based on the same assumptions about schema outlined in the record classes.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    tables : `CollectionTablesTuple`
        Named tuple of SQLAlchemy table objects.
    collectionIdName : `str`
        Name of the column in collections table that identifies it (PK).

    Notes
    -----
    Implementation uses "aggressive" pre-fetching and caching of the records
    in memory. Memory cache is synchronized from database when `refresh`
    method is called.
    """
    def __init__(self, db: Database, tables: CollectionTablesTuple, collectionIdName: str):
        self._db = db
        self._tables = tables
        self._collectionIdName = collectionIdName
        self._records: Dict[K, CollectionRecord] = {}  # indexed by record ID

    def refresh(self) -> None:
        # Docstring inherited from CollectionManager.
        # Outer join so that non-RUN collections (no row in the run table)
        # are still returned.
        sql = sqlalchemy.sql.select(
            self._tables.collection.columns + self._tables.run.columns
        ).select_from(
            self._tables.collection.join(self._tables.run, isouter=True)
        )
        # Put found records into a temporary instead of updating self._records
        # in place, for exception safety.
        records = []
        chains = []
        tsRepr = self._db.getTimespanRepresentation()
        for row in self._db.query(sql).fetchall():
            collection_id = row[self._tables.collection.columns[self._collectionIdName]]
            name = row[self._tables.collection.columns.name]
            collectionType = CollectionType(row["type"])
            record: CollectionRecord
            if collectionType is CollectionType.RUN:
                record = DefaultRunRecord(
                    key=collection_id,
                    name=name,
                    db=self._db,
                    table=self._tables.run,
                    idColumnName=self._collectionIdName,
                    host=row[self._tables.run.columns.host],
                    timespan=tsRepr.extract(row),
                )
            elif collectionType is CollectionType.CHAINED:
                record = DefaultChainedCollectionRecord(db=self._db,
                                                        key=collection_id,
                                                        table=self._tables.collection_chain,
                                                        name=name)
                chains.append(record)
            else:
                record = CollectionRecord(key=collection_id, name=name, type=collectionType)
            records.append(record)
        self._setRecordCache(records)
        # Chains are refreshed only after the cache has been replaced, and
        # each one is handed this manager.
        for chain in chains:
            chain.refresh(self)

    def register(self, name: str, type: CollectionType) -> CollectionRecord:
        # Docstring inherited from CollectionManager.
        record = self._getByName(name)
        if record is None:
            # Not cached: sync the collection row and get its ID back.
            row, _ = self._db.sync(
                self._tables.collection,
                keys={"name": name},
                compared={"type": int(type)},
                returning=[self._collectionIdName],
            )
            assert row is not None
            collection_id = row[self._collectionIdName]
            if type is CollectionType.RUN:
                # RUN collections also get a matching row in the run table.
                tsRepr = self._db.getTimespanRepresentation()
                row, _ = self._db.sync(
                    self._tables.run,
                    keys={self._collectionIdName: collection_id},
                    returning=("host",) + tsRepr.getFieldNames(),
                )
                assert row is not None
                record = DefaultRunRecord(
                    db=self._db,
                    key=collection_id,
                    name=name,
                    table=self._tables.run,
                    idColumnName=self._collectionIdName,
                    host=row["host"],
                    timespan=tsRepr.extract(row),
                )
            elif type is CollectionType.CHAINED:
                record = DefaultChainedCollectionRecord(db=self._db, key=collection_id, name=name,
                                                        table=self._tables.collection_chain)
            else:
                record = CollectionRecord(key=collection_id, name=name, type=type)
            self._addCachedRecord(record)
        return record

    def remove(self, name: str) -> None:
        # Docstring inherited from CollectionManager.
        record = self._getByName(name)
        if record is None:
            raise MissingCollectionError(f"No collection with name '{name}' found.")
        # This may raise
        self._db.delete(self._tables.collection, [self._collectionIdName],
                        {self._collectionIdName: record.key})
        # Drop from the cache only after the database delete went through.
        self._removeCachedRecord(record)

    def find(self, name: str) -> CollectionRecord:
        # Docstring inherited from CollectionManager.
        result = self._getByName(name)
        if result is None:
            raise MissingCollectionError(f"No collection with name '{name}' found.")
        return result

    def __getitem__(self, key: Any) -> CollectionRecord:
        # Docstring inherited from CollectionManager.
        try:
            return self._records[key]
        except KeyError as err:
            raise MissingCollectionError(f"Collection with key '{key}' not found.") from err

    def __iter__(self) -> Iterator[CollectionRecord]:
        # Iterates over the cached records only; no database access.
        yield from self._records.values()

    def _setRecordCache(self, records: Iterable[CollectionRecord]) -> None:
        """Set internal record cache to contain given records,
        old cached records will be removed.
        """
        self._records = {}
        for record in records:
            self._records[record.key] = record

    def _addCachedRecord(self, record: CollectionRecord) -> None:
        """Add single record to cache.
        """
        self._records[record.key] = record

    def _removeCachedRecord(self, record: CollectionRecord) -> None:
        """Remove single record from cache.
        """
        del self._records[record.key]

    @abstractmethod
    def _getByName(self, name: str) -> Optional[CollectionRecord]:
        """Find collection record given collection name.

        Returns `None` when no record with this name exists; callers in
        this class rely on that to detect missing collections.
        """
        raise NotImplementedError()