Coverage for python/lsst/daf/butler/registry/collections/_base.py : 69%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = []
25from abc import abstractmethod
26import astropy.time
27import itertools
28from typing import (
29 Any,
30 Iterator,
31 NamedTuple,
32 Optional,
33 TYPE_CHECKING,
34)
36import sqlalchemy
38from ...core import ddl
39from ...core.timespan import Timespan, TIMESPAN_FIELD_SPECS
40from .._collectionType import CollectionType
41from ..interfaces import (
42 ChainedCollectionRecord,
43 CollectionManager,
44 CollectionRecord,
45 MissingCollectionError,
46 RunRecord,
47)
48from ..wildcards import CollectionSearch
50if TYPE_CHECKING: 50 ↛ 51line 50 didn't jump to line 51, because the condition on line 50 was never true
51 from .database import Database
def _makeCollectionForeignKey(sourceColumnName: str, collectionIdName: str, **kwargs) -> ddl.ForeignKeySpec:
    """Build a foreign key specification pointing at the collections table.

    Parameters
    ----------
    sourceColumnName : `str`
        Column of the referring table that holds the collection identifier.
    collectionIdName : `str`
        Column of the collections table that identifies a collection (PK).
    **kwargs
        Extra keyword arguments forwarded unchanged to `ddl.ForeignKeySpec`.

    Returns
    -------
    spec : `ddl.ForeignKeySpec`
        Foreign key specification.

    Notes
    -----
    The name of the collections table is hard-coded as ``"collection"``, and
    the collection primary key is assumed to be a single column.
    """
    # Both sides of the key are single-column tuples.
    referringColumns = (sourceColumnName,)
    referencedColumns = (collectionIdName,)
    return ddl.ForeignKeySpec("collection", source=referringColumns, target=referencedColumns, **kwargs)
def makeRunTableSpec(collectionIdName: str, collectionIdType: type):
    """Build the table specification for the "run" table.

    Parameters
    ----------
    collectionIdName : `str`
        Name of the column in collections table that identifies it (PK).
        The run table uses the same name for its own primary key column.
    collectionIdType
        Type of the PK column in the collections table, one of the
        `sqlalchemy` types.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for run table.

    Notes
    -----
    The identifying column carries the same name in the collections table
    and in the run table. The remaining columns (the two timespan bounds
    taken from ``TIMESPAN_FIELD_SPECS`` and ``host``) have fixed names.
    """
    runFields = [
        ddl.FieldSpec(collectionIdName, dtype=collectionIdType, primaryKey=True),
        TIMESPAN_FIELD_SPECS.begin,
        TIMESPAN_FIELD_SPECS.end,
        ddl.FieldSpec("host", dtype=sqlalchemy.String, length=128),
    ]
    # The PK column doubles as the link to the collections table, with
    # onDelete="CASCADE" requested on that link.
    collectionLink = _makeCollectionForeignKey(collectionIdName, collectionIdName, onDelete="CASCADE")
    return ddl.TableSpec(fields=runFields, foreignKeys=[collectionLink])
def makeCollectionChainTableSpec(collectionIdName: str, collectionIdType: type):
    """Build the table specification for the "collection_chain" table.

    Parameters
    ----------
    collectionIdName : `str`
        Name of the column in collections table that identifies it (PK).
    collectionIdType
        Type of the PK column in the collections table, one of the
        `sqlalchemy` types.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for collection chain table.

    Notes
    -----
    A collection chain is an ordered one-to-many relation between
    collections. The column names used here (``parent``, ``position``,
    ``child``, ``dataset_type_name``) are fixed and are also hard-coded in
    the record classes of this module.
    """
    # (parent, position) together form the primary key.
    chainFields = [
        ddl.FieldSpec("parent", dtype=collectionIdType, primaryKey=True),
        ddl.FieldSpec("position", dtype=sqlalchemy.SmallInteger, primaryKey=True),
        ddl.FieldSpec("child", dtype=collectionIdType, nullable=False),
        ddl.FieldSpec("dataset_type_name", dtype=sqlalchemy.String, length=128, nullable=True),
    ]
    # Only the parent link requests onDelete="CASCADE"; the child link is
    # created with the default foreign key options.
    parentLink = _makeCollectionForeignKey("parent", collectionIdName, onDelete="CASCADE")
    childLink = _makeCollectionForeignKey("child", collectionIdName)
    return ddl.TableSpec(fields=chainFields, foreignKeys=[parentLink, childLink])
class DefaultRunRecord(RunRecord):
    """Default `RunRecord` implementation.

    This class relies on the run table layout produced by
    `makeRunTableSpec`. The only name in that schema that is not fixed is
    the PK column name, which has to be passed to the constructor.

    Parameters
    ----------
    db : `Database`
        Registry database.
    key
        Unique collection ID, can be the same as ``name`` if ``name`` is used
        for identification. Usually this is an integer or string, but can be
        other database-specific type.
    name : `str`
        Run collection name.
    table : `sqlalchemy.schema.Table`
        Table for run records.
    idColumnName : `str`
        Name of the identifying column in run table.
    host : `str`, optional
        Name of the host where run was produced.
    timespan : `Timespan`, optional
        Timespan for this run. When omitted, a timespan whose ``begin`` and
        ``end`` are both `None` is used.
    """
    def __init__(self, db: Database, key: Any, name: str, *, table: sqlalchemy.schema.Table,
                 idColumnName: str, host: Optional[str] = None,
                 timespan: Optional[Timespan[astropy.time.Time]] = None):
        super().__init__(key=key, name=name, type=CollectionType.RUN)
        self._db = db
        self._table = table
        self._idName = idColumnName
        self._host = host
        # Always hold a Timespan instance, never None.
        self._timespan = Timespan(begin=None, end=None) if timespan is None else timespan

    def update(self, host: Optional[str] = None, timespan: Optional[Timespan[astropy.time.Time]] = None):
        # Docstring inherited from RunRecord.
        newTimespan = Timespan(begin=None, end=None) if timespan is None else timespan
        values = {
            self._idName: self.key,
            TIMESPAN_FIELD_SPECS.begin.name: newTimespan.begin,
            TIMESPAN_FIELD_SPECS.end.name: newTimespan.end,
            "host": host
        }
        count = self._db.update(self._table, {self._idName: self.key}, values)
        if count != 1:
            raise RuntimeError(f"Run update affected {count} records; expected exactly one.")
        # The in-memory copy is changed only after the database call has
        # reported exactly one affected row.
        self._host = host
        self._timespan = newTimespan

    @property
    def host(self) -> Optional[str]:
        # Docstring inherited from RunRecord.
        return self._host

    @property
    def timespan(self) -> Timespan[astropy.time.Time]:
        # Docstring inherited from RunRecord.
        return self._timespan
class DefaultChainedCollectionRecord(ChainedCollectionRecord):
    """Default `ChainedCollectionRecord` implementation.

    This class relies on the chain table layout produced by
    `makeCollectionChainTableSpec`. All column names of that table are fixed
    and hard-coded in the methods below.

    Parameters
    ----------
    db : `Database`
        Registry database.
    key
        Unique collection ID, can be the same as ``name`` if ``name`` is used
        for identification. Usually this is an integer or string, but can be
        other database-specific type.
    name : `str`
        Collection name.
    table : `sqlalchemy.schema.Table`
        Table for chain relationship records.
    """
    def __init__(self, db: Database, key: Any, name: str, *, table: sqlalchemy.schema.Table):
        super().__init__(key=key, name=name)
        self._table = table
        self._db = db

    def _update(self, manager: CollectionManager, children: CollectionSearch):
        # Docstring inherited from ChainedCollectionRecord.
        positions = itertools.count()
        newRows = []
        for child, restriction in children.iter(manager, withRestrictions=True, flattenChains=False):
            # A child whose restriction names are ``...`` yields a single row
            # with `None` as dataset type name; otherwise one row is written
            # per name. Positions keep increasing across all rows.
            datasetTypeNames = (None,) if restriction.names is ... else restriction.names
            for datasetTypeName in datasetTypeNames:
                newRows.append({"parent": self.key, "child": child.key,
                                "position": next(positions), "dataset_type_name": datasetTypeName})
        # Replace the rows of this parent inside a single transaction.
        with self._db.transaction():
            self._db.delete(self._table, ["parent"], {"parent": self.key})
            self._db.insert(self._table, *newRows)

    def _load(self, manager: CollectionManager) -> CollectionSearch:
        # Docstring inherited from ChainedCollectionRecord.
        columns = self._table.columns
        query = sqlalchemy.sql.select(
            [columns.child, columns.dataset_type_name]
        ).select_from(
            self._table
        ).where(
            columns.parent == self.key
        ).order_by(
            columns.position
        )
        # Consecutive rows may repeat the same child with different dataset
        # type names; per the original note, CollectionSearch groups those.
        expression = []
        for row in self._db.query(query):
            childKey = row[columns.child]
            # `_update` writes `None` for an unrestricted child; any falsy
            # stored value (`None` or "") is read back as ``...``.
            datasetTypeName = row[columns.dataset_type_name] or ...
            expression.append((manager[childKey].name, datasetTypeName))
        return CollectionSearch.fromExpression(expression)
class DefaultCollectionManager(CollectionManager):
    """Default `CollectionManager` implementation.

    This implementation uses record classes defined in this module and is
    based on the same assumptions about schema outlined in the record classes.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    tables : `NamedTuple`
        Named tuple of SQLAlchemy table objects. The attributes used by this
        class are ``collection``, ``run`` and ``collection_chain``.
    collectionIdName : `str`
        Name of the column in collections table that identifies it (PK).

    Notes
    -----
    Implementation uses "aggressive" pre-fetching and caching of the records
    in memory. Memory cache is synchronized from database when `refresh`
    method is called.

    Subclasses have to implement `_getByName`, which maps a collection name
    to a cached record.
    """
    def __init__(self, db: Database, tables: NamedTuple[sqlalchemy.schema.Table, ...],
                 collectionIdName: str):
        self._db = db
        self._tables = tables
        self._collectionIdName = collectionIdName
        self._records = {}  # indexed by record ID

    def refresh(self):
        # Docstring inherited from CollectionManager.
        # Outer join: presumably so that collections without a matching run
        # row are still returned (their run columns are then empty).
        sql = sqlalchemy.sql.select(
            self._tables.collection.columns + self._tables.run.columns
        ).select_from(
            self._tables.collection.join(self._tables.run, isouter=True)
        )
        # Put found records into a temporary instead of updating self._records
        # in place, for exception safety.
        records = []
        chains = []
        for row in self._db.query(sql).fetchall():
            collection_id = row[self._tables.collection.columns[self._collectionIdName]]
            name = row[self._tables.collection.columns.name]
            # Named ``collectionType`` so that the ``type`` builtin is not
            # shadowed inside this method.
            collectionType = CollectionType(row["type"])
            if collectionType is CollectionType.RUN:
                record = DefaultRunRecord(
                    key=collection_id,
                    name=name,
                    db=self._db,
                    table=self._tables.run,
                    idColumnName=self._collectionIdName,
                    host=row[self._tables.run.columns.host],
                    timespan=Timespan(
                        begin=row[self._tables.run.columns[TIMESPAN_FIELD_SPECS.begin.name]],
                        end=row[self._tables.run.columns[TIMESPAN_FIELD_SPECS.end.name]],
                    )
                )
            elif collectionType is CollectionType.CHAINED:
                record = DefaultChainedCollectionRecord(db=self._db,
                                                        key=collection_id,
                                                        table=self._tables.collection_chain,
                                                        name=name)
                chains.append(record)
            else:
                record = CollectionRecord(key=collection_id, name=name, type=collectionType)
            records.append(record)
        self._setRecordCache(records)
        # Chains are refreshed only after the cache has been replaced; they
        # receive this manager and can look their children up through it.
        for chain in chains:
            chain.refresh(self)

    def register(self, name: str, type: CollectionType) -> CollectionRecord:
        # Docstring inherited from CollectionManager.
        # NOTE(review): when a record with this name is already cached it is
        # returned as is; the requested ``type`` is not compared with it.
        record = self._getByName(name)
        if record is None:
            row, _ = self._db.sync(
                self._tables.collection,
                keys={"name": name},
                compared={"type": int(type)},
                returning=[self._collectionIdName],
            )
            collection_id = row[self._collectionIdName]
            if type is CollectionType.RUN:
                # RUN collections get a companion row in the run table whose
                # host and timespan columns are read back for the record.
                row, _ = self._db.sync(
                    self._tables.run,
                    keys={self._collectionIdName: collection_id},
                    returning={"host", TIMESPAN_FIELD_SPECS.begin.name, TIMESPAN_FIELD_SPECS.end.name},
                )
                record = DefaultRunRecord(
                    db=self._db,
                    key=collection_id,
                    name=name,
                    table=self._tables.run,
                    idColumnName=self._collectionIdName,
                    host=row["host"],
                    timespan=Timespan(
                        row[TIMESPAN_FIELD_SPECS.begin.name],
                        row[TIMESPAN_FIELD_SPECS.end.name]
                    ),
                )
            elif type is CollectionType.CHAINED:
                record = DefaultChainedCollectionRecord(db=self._db, key=collection_id, name=name,
                                                        table=self._tables.collection_chain)
            else:
                record = CollectionRecord(key=collection_id, name=name, type=type)
            self._addCachedRecord(record)
        return record

    def remove(self, name: str):
        # Docstring inherited from CollectionManager.
        record = self._getByName(name)
        if record is None:
            raise MissingCollectionError(f"No collection with name '{name}' found.")
        # This may raise; the cache entry is dropped only after the database
        # delete has returned.
        self._db.delete(self._tables.collection, [self._collectionIdName],
                        {self._collectionIdName: record.key})
        self._removeCachedRecord(record)

    def find(self, name: str) -> CollectionRecord:
        # Docstring inherited from CollectionManager.
        result = self._getByName(name)
        if result is None:
            raise MissingCollectionError(f"No collection with name '{name}' found.")
        return result

    def __getitem__(self, key: Any) -> CollectionRecord:
        # Docstring inherited from CollectionManager.
        # A missing key raises MissingCollectionError; `None` is never
        # returned, hence the non-optional return annotation.
        try:
            return self._records[key]
        except KeyError as err:
            raise MissingCollectionError(f"Collection with key '{key}' not found.") from err

    def __iter__(self) -> Iterator[CollectionRecord]:
        yield from self._records.values()

    def _setRecordCache(self, records: Iterator[CollectionRecord]):
        """Set internal record cache to contain given records,
        old cached records will be removed.

        Parameters
        ----------
        records : iterable of `CollectionRecord`
            Records to cache, indexed by their ``key`` attribute.

        Notes
        -----
        The new mapping is built completely before it replaces the old one,
        so an exception raised while iterating ``records`` leaves the
        existing cache untouched.
        """
        self._records = {record.key: record for record in records}

    def _addCachedRecord(self, record: CollectionRecord):
        """Add single record to cache.

        Parameters
        ----------
        record : `CollectionRecord`
            Record to store under its ``key``; an existing entry with the
            same key is overwritten.
        """
        self._records[record.key] = record

    def _removeCachedRecord(self, record: CollectionRecord):
        """Remove single record from cache.

        Parameters
        ----------
        record : `CollectionRecord`
            Record whose ``key`` is removed from the cache.

        Raises
        ------
        KeyError
            Raised if no record is cached under that key.
        """
        del self._records[record.key]

    @abstractmethod
    def _getByName(self, name: str) -> Optional[CollectionRecord]:
        """Find collection record given collection name.

        Parameters
        ----------
        name : `str`
            Collection name.

        Returns
        -------
        record : `CollectionRecord` or `None`
            Matching record; `register`, `remove` and `find` treat a `None`
            result as "no such collection".
        """
        raise NotImplementedError()