/[thuban]/branches/WIP-pyshapelib-bramz/Thuban/Model/transientdb.py
ViewVC logotype

Annotation of /branches/WIP-pyshapelib-bramz/Thuban/Model/transientdb.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1923 - (hide annotations)
Fri Nov 7 12:07:01 2003 UTC (21 years, 4 months ago) by bh
Original Path: trunk/thuban/Thuban/Model/transientdb.py
File MIME type: text/x-python
File size: 25170 byte(s)
Tweak the usage of the sqlite database to make common use cases
more responsive. In most cases copying the data to the sqlite
database takes so long that using sqlite doesn't have enough
advantages.

(TransientTableBase.ValueRange): Add
comments about performance and query the min and max in separate
statements because only that way will indexes be used.
(TransientTableBase.UniqueValues): Add some comments about
performance.
(AutoTransientTable.ValueRange, AutoTransientTable.UniqueValues):
Do not copy the data to the transient DB but use the transient
copy if it exists. See the new comments for the performance trade
offs

1 bh 765 # Copyright (C) 2003 by Intevation GmbH
2     # Authors:
3     # Bernhard Herzog <[email protected]>
4     #
5     # This program is free software under the GPL (>=v2)
6     # Read the file COPYING coming with the software for details.
7    
8     """Database for transient data
9    
10     This database is intended for data representations needed during the
11     lifetime of a Thuban session but which is not permanent. Examples of
12     this are for instance a join of two DBF files where the DBF files are
13     the permanent representation of the data and the join only exists in the
14     Thuban session and is reconstructed when the session is opened.
15     """
16    
17     __version__ = "$Revision$"
18     # $Source$
19     # $Id$
20    
21     from sqlite import connect
22    
23 jan 1020 from base import TitledObject
24    
25 bh 765 import table
26    
27     sql_type_map = {
28     table.FIELDTYPE_INT: "INTEGER",
29     table.FIELDTYPE_STRING: "VARCHAR",
30     table.FIELDTYPE_DOUBLE: "FLOAT",
31     }
32    
33     type_converter_map = {
34     table.FIELDTYPE_INT: int,
35     table.FIELDTYPE_STRING: str,
36     table.FIELDTYPE_DOUBLE: float,
37     }
38    
39     class TransientDatabase:
40    
41     def __init__(self, filename):
42     self.filename = filename
43     self.conn = connect(filename)
44     # Counters to produce unique table and column names
45     self.num_tables = 0
46     self.num_cols = 0
47     # Since there's only once process using the SQLite database, we
48     # might be able to get a tad more speed with default_synchronous
49     # OFF. So far I haven't seen any measurable speedup, though.
50     #self.execute("PRAGMA default_synchronous = OFF")
51    
52     def __del__(self):
53     self.close()
54    
55     def close(self):
56 bh 777 if self.conn is not None:
57 bh 765 self.conn.close()
58 bh 777 self.conn = None
59 bh 765
60     def new_table_name(self):
61     self.num_tables += 1
62     return "Table%03d" % self.num_tables
63    
64     def new_column_name(self):
65     self.num_cols += 1
66     return "Col%03d" % self.num_cols
67    
68     def execute(self, *args):
69     """execute the SQL statement in the database and return the result"""
70     cursor = self.conn.cursor()
71     cursor.execute(*args)
72     result = cursor.fetchone()
73     self.conn.commit()
74     return result
75    
76     def cursor(self):
77     return self.conn.cursor()
78    
79    
80     class ColumnReference:
81    
82     def __init__(self, name, type, internal_name):
83     self.name = name
84     self.type = type
85     self.internal_name = internal_name
86    
87    
88 bh 818 class TransientTableBase(table.OldTableInterfaceMixin):
89 bh 765
90     """Base class for tables in the transient database"""
91    
92     def __init__(self, transient_db):
93     """Initialize the table for use with the given transient db"""
94     self.db = transient_db
95     self.tablename = self.db.new_table_name()
96     self.indexed_columns = {}
97     self.read_record_cursor = None
98     self.read_record_last_row = None
99 bh 785 self.read_record_last_result = None
100 bh 765
101     def create(self, columns):
102     self.columns = columns
103     self.name_to_column = {}
104     self.orig_names = []
105     self.internal_to_orig = {}
106     self.orig_to_internal = {}
107 bh 818 self.column_map = {}
108 bh 765
109     # Create the column objects and fill various maps and lists
110 bh 818 for index in range(len(self.columns)):
111     col = self.columns[index]
112 bh 765 self.name_to_column[col.name] = col
113     self.orig_names.append(col.name)
114     self.internal_to_orig[col.internal_name] = col.name
115     self.orig_to_internal[col.name] = col.internal_name
116 bh 818 self.column_map[col.name] = col
117     self.column_map[index] = col
118 bh 765
119     # Build the CREATE TABLE statement and create the table in the
120     # database
121 bh 841 table_types = ["id INTEGER PRIMARY KEY"]
122 bh 765 for col in self.columns:
123     table_types.append("%s %s" % (col.internal_name,
124     sql_type_map[col.type]))
125     table_stmt = "CREATE TABLE %s (\n %s\n);" % (self.tablename,
126     ",\n ".join(table_types))
127     self.db.execute(table_stmt)
128    
129     def transient_table(self):
130     """
131     Return a table whose underlying implementation is in the transient db
132     """
133     return self
134    
135     def ensure_index(self, column):
136     """Ensure that there's an index on the given column"""
137     if not column in self.indexed_columns:
138     internal_name = self.orig_to_internal[column]
139     indexname = "Index_%s_%s" % (self.tablename, internal_name)
140     stmt = "CREATE INDEX %s ON %s (%s);" % (indexname, self.tablename,
141     internal_name)
142     self.db.execute(stmt)
143     self.indexed_columns[column] = 1
144    
145 bh 818 def NumColumns(self):
146 bh 765 return len(self.columns)
147    
148 bh 818 def NumRows(self):
149 bh 765 result = self.db.execute("SELECT count(*) FROM %s;" % self.tablename)
150     return int(result[0])
151    
152 bh 818 def Columns(self):
153     return self.columns
154    
155     def Column(self, col):
156     return self.column_map[col]
157    
158 bh 839 def HasColumn(self, col):
159     """Return whether the table has a column with the given name or index
160     """
161     return self.column_map.has_key(col)
162    
163 bh 1662 def RowIdToOrdinal(self, gid):
164     """Return the row ordinal given its id
165    
166     At the moment the transient tables are only used for tables that
167     don't distinguish between row number and row id, so the value is
168     returned unchanged.
169     """
170     return gid
171    
172     def RowOrdinalToId(self, num):
173     """Return the rowid for given its ordinal
174    
175     At the moment the transient tables are only used for tables that
176     don't distinguish between row number and row id, so the value is
177     returned unchanged.
178     """
179     return num
180    
181     def ReadRowAsDict(self, index, row_is_ordinal = 0):
182     """Return the entire row as a dictionary with column names as keys
183    
184     The row_is_ordinal is ignored because at the moment the
185     transient tables are only used for DBF files where it doesn't
186     matter.
187     """
188 bh 849 # Implementation Strategy: Executing a completely new select
189     # statement every time this method is called is too slow. The
190     # most important usage is to read the records more or less
191     # sequentially. This happens e.g. when drawing a layer with a
192     # classification where the shapes are drawn in order of the
193     # shape ids. Another pattern is that the same row is requested
194     # several times in a row. This happens in the table view, for
195     # instance.
196    
197     # We can exploit this to make access faster by having one cursor
198     # open all the time and keeping the last row read around in case
199     # the same row is accessed again the next time and if the row
200     # index is larger than the row we have read last we simply fetch
201     # rows from the cursor until we've reached the requested row. If
202     # the requested row index is smaller then we start a new cursor.
203    
204     # FIXME: So far this scheme seems to work well enough. Obvious
205     # improvements would be to start the cursor at exactly the
206     # requested row (should be efficient and easy to do now that the
207     # id is the primary key) and to perhaps to also start a new
208     # cursor if the requested index is much larger than the last row
209     # so that we don't read and discard lots of the rows.
210    
211     # Check whether we have to start a new cursor
212 bh 765 if self.read_record_cursor is None or index <self.read_record_last_row:
213     stmt = ("SELECT %s FROM %s;"
214     % (", ".join([c.internal_name for c in self.columns]),
215     self.tablename))
216     self.read_record_cursor = self.db.cursor()
217     self.read_record_cursor.execute(stmt)
218     self.read_record_last_row = -1
219 bh 785 self.read_record_last_result = None
220    
221     # Now we should have a cursor at a position less than or equal
222     # to the index so the following if statement will always set
223     # result to a suitable value
224     assert index >= self.read_record_last_row
225    
226     if index == self.read_record_last_row:
227 bh 818 result = self.read_record_last_result
228 bh 785 else:
229     for i in range(index - self.read_record_last_row):
230     result = self.read_record_cursor.fetchone()
231     self.read_record_last_result = result
232 bh 765 self.read_record_last_row = index
233 bh 785 return dict(zip(self.orig_names, result))
234 bh 765
235 bh 1662 def ReadValue(self, row, col, row_is_ordinal = 0):
236 bh 849 """Return the value of the specified row and column
237    
238     The col parameter may be the index of the column or its name.
239 bh 1662
240     The row_is_ordinal is ignored because at the moment the
241     transient tables are only used for DBF files where it doesn't
242     matter.
243 bh 849 """
244     # Depending on the actual access patterns of the table data, it
245     # might be a bit faster in some circumstances to not implement
246     # this via ReadRowAsDict, but this simple implementation should
247     # be fast enough for most purposes.
248     return self.ReadRowAsDict(row)[self.column_map[col].name]
249    
250 bh 818 def ValueRange(self, col):
251 bh 1923 # Performance notes:
252     #
253     # In sqlite 2.8.6 the min and max aggregate functions can use an
254     # index but only when used as the only expression in the select
255     # statement (i.e. 'select min(col), max(col) from tbl;' will not
256     # use the index but 'select min(col) from tbl;' will) so we
257     # query the minimum and maximum separately.
258     #
259     # With the separate statements we can take advantage of an index
260     # if it exists. If the index doesn't exist, creating it first
261     # and then using it in the query is slower than the queries
262     # without an index. Creating the index is only an advantage if
263     # the queries are performed multiple times. With the current use
264     # patterns where ValueRange is only used occasionally by the
265     # classification generation dialog creating the index only for
266     # this usage is not really worth it, so we don't.
267 bh 818 col = self.column_map[col]
268 bh 765 iname = col.internal_name
269 bh 1923 min = self.db.execute("SELECT min(%s) FROM %s;"
270     % (iname, self.tablename))[0]
271     max = self.db.execute("SELECT max(%s) FROM %s;"
272     % (iname, self.tablename))[0]
273 bh 765 converter = type_converter_map[col.type]
274 bh 818 return (converter(min), converter(max))
275 bh 765
276 bh 818 def UniqueValues(self, col):
277 bh 1923 # Performance notes:
278     #
279     # In sqlite 2.8.6 there doesn't seem to be a way to query the
280     # unique items that uses an index. I've tried
281     #
282     # SELECT col FROM tbl GROUP BY col;
283     #
284     # and
285     #
286     # SELECT DISTINCT col FROM tbl;
287     #
288     # and in both cases the index is not used. If the index isn't
289     # used it doesn't make sense to call self.ensure_index.
290 bh 818 iname = self.column_map[col].internal_name
291 bh 765 cursor = self.db.cursor()
292     cursor.execute("SELECT %s FROM %s GROUP BY %s;"
293     % (iname, self.tablename, iname))
294     result = []
295     while 1:
296     row = cursor.fetchone()
297     if row is None:
298     break
299     result.append(row[0])
300     return result
301    
302 frank 1026 def Width(self, col):
303     """Return the maximum width of values in the column
304    
305 bh 1381 The return value is the the maximum length of string
306     representation of the values in the column (represented by index
307     or name).
308     """
309 frank 1026 max = 0
310 bh 1381
311 frank 1026 type = self.column_map[col].type
312     iname = self.column_map[col].internal_name
313     cursor = self.db.cursor()
314     cursor.execute("SELECT %s FROM %s;" % (iname, self.tablename))
315     values = [ i[0] for i in cursor.fetchall()]
316     if not values:
317     return None
318    
319 bh 1381 if type == table.FIELDTYPE_DOUBLE:
320 bh 1043 format = "%.12f"
321 bh 1381 elif type == table.FIELDTYPE_INT:
322 frank 1026 format = "%d"
323     else:
324     format = "%s"
325     for value in values:
326     if value is None: continue
327     l = len(format % value)
328     if l > max:
329     max = l
330    
331     return max
332    
333 bh 841 def SimpleQuery(self, left, comparison, right):
334     """Return the indices of all rows that matching a condition.
335 bh 765
336 bh 841 Parameters:
337     left -- The column object for the left side of the comparison
338    
339     comparison -- The comparison operator as a string. It must be
340     one of '==', '!=', '<', '<=', '>=', '>'
341    
342     right -- The right hand side of the comparison. It must be
343     either a column object or a value, i.e. a string,
344     int or float.
345    
346     The return value is a sorted list of the indices of the rows
347     where the condition is true.
348     """
349     if comparison not in ("==", "!=", "<", "<=", ">=", ">"):
350     raise ValueError("Comparison operator %r not allowed" % comparison)
351    
352     if hasattr(right, "internal_name"):
353     right_template = right.internal_name
354     params = ()
355     else:
356     right_template = "%s"
357     params = (right,)
358    
359     query = "SELECT id FROM %s WHERE %s %s %s ORDER BY id;" \
360     % (self.tablename, left.internal_name, comparison,
361     right_template)
362    
363     cursor = self.db.cursor()
364     cursor.execute(query, params)
365     result = []
366     while 1:
367     row = cursor.fetchone()
368     if row is None:
369     break
370     result.append(row[0])
371     return result
372    
373 bh 984 def Dependencies(self):
374     """Placeholder for a method in a derived class.
375 bh 841
376 bh 984 Return a sequence with the tables and other data objects that
377     self depends on.
378     """
379     raise NotImplementedError
380    
381    
382 jan 1020 class TransientTable(TitledObject, TransientTableBase):
383 bh 765
384     """A Table in a transient DB that starts as the copy of a Thuban Table."""
385    
386     def __init__(self, transient_db, table):
387     """Create a new table in the given transient DB as a copy of table
388    
389     The table argument can be any object implementing the Table
390     interface.
391     """
392     TransientTableBase.__init__(self, transient_db)
393 jan 1020 TitledObject.__init__(self, table.Title())
394 bh 765 self.create(table)
395    
396     def create(self, table):
397     columns = []
398 bh 818 for col in table.Columns():
399     columns.append(ColumnReference(col.name, col.type,
400 bh 765 self.db.new_column_name()))
401     TransientTableBase.create(self, columns)
402    
403     # copy the input table to the transient db
404 bh 841
405     # A key to insert to use for the formatting of the insert
406     # statement. The key must not be equal to any of the column
407     # names so we construct one by building a string of x's that is
408     # longer than any of the column names
409     id_key = max([len(col.name) for col in self.columns]) * "x"
410    
411     insert_template = "INSERT INTO %s (id, %s) VALUES (%%(%s)s, %s);" \
412 bh 765 % (self.tablename,
413     ", ".join([col.internal_name
414     for col in self.columns]),
415 bh 841 id_key,
416 bh 765 ", ".join(["%%(%s)s" % col.name
417     for col in self.columns]))
418     cursor = self.db.cursor()
419 bh 818 for i in range(table.NumRows()):
420 bh 841 row = table.ReadRowAsDict(i)
421     row[id_key] = i
422     cursor.execute(insert_template, row)
423 bh 765 self.db.conn.commit()
424    
425    
426    
427 jan 1020 class TransientJoinedTable(TitledObject, TransientTableBase):
428 bh 765
429     """A Table in the transient DB that contains a join of two tables"""
430    
431     def __init__(self, transient_db, left_table, left_field,
432 frank 1009 right_table, right_field = None, outer_join = False):
433 bh 765 """Create a new table in the transient DB as a join of two tables.
434    
435     Both input tables, left_table and right_table must have a
436     transient_table method that returns a table object for a table
437 frank 1002 in the transient database. The join is performed on the condition
438 bh 765 that the value of the left_field column the the left table is
439     equal to the value of the right_field in the right_table.
440    
441 bh 1364 The joined table contains all columns of the input tables,
442     however, the column names of the right table may be changed
443     slightly to make them unique in the joined table. This is
444     currently done by appending a sufficient number of underscores
445     ('_').
446 bh 765 """
447     TransientTableBase.__init__(self, transient_db)
448 bh 984 self.dependencies = (left_table, right_table)
449 bh 765 self.left_table = left_table.transient_table()
450     self.left_field = left_field
451     self.right_table = right_table.transient_table()
452     if right_field:
453     self.right_field = right_field
454     else:
455     self.right_field = self.left_field
456 frank 1009 self.outer_join = outer_join
457 jan 1020
458     title = "Join of %(left)s and %(right)s" \
459     % {"left": self.left_table.Title(),
460     "right": self.right_table.Title()}
461     TitledObject.__init__(self, title)
462    
463 bh 765 self.create()
464    
465     def create(self):
466     """Internal: Create the table with the joined data"""
467     self.tablename = self.db.new_table_name()
468    
469     self.right_table.ensure_index(self.right_field)
470    
471 bh 1328 # determine the internal column names to join on before
472     # coalescing the column information because if the external
473     # column names are the same they will be mapped to the same
474     # internal name afterwards.
475     internal_left_col = self.left_table.orig_to_internal[self.left_field]
476 bh 1364 internal_right_col =self.right_table.orig_to_internal[self.right_field]
477 bh 1328
478 bh 765 # Coalesce the column information
479     visited = {}
480     columns = []
481 frank 1333 newcolumns = []
482 bh 1364 for table in (self.left_table, self.right_table):
483     for col in table.Columns():
484     colname = col.name
485     # We can't allow multiple columns with the same
486     # original name, so append '_' to this one until
487     # it is unique.
488     # FIXME: There should be a better solution.
489     while colname in visited:
490     colname = colname + '_'
491     columns.append((table.tablename, col))
492     newcol = ColumnReference(colname, col.type,
493     "Col%03d" % (len(newcolumns)+1))
494     newcolumns.append(newcol)
495     visited[colname] = 1
496 frank 1333 TransientTableBase.create(self, newcolumns)
497 bh 765
498     # Copy the joined data to the table.
499 frank 1333 newinternal_names = [col.internal_name for col in self.columns]
500     internal_references = ["%s.%s" % (table, col.internal_name)
501     for table, col in columns]
502 frank 1009 if self.outer_join:
503     join_operator = 'LEFT OUTER JOIN'
504     else:
505     join_operator = 'JOIN'
506 bh 841 stmt = ("INSERT INTO %s (id, %s) SELECT %s.id, %s FROM %s"
507 frank 1333 " %s %s ON %s.%s = %s.%s;"
508 bh 841 % (self.tablename,
509 frank 1333 ", ".join(newinternal_names),
510 bh 841 self.left_table.tablename,
511 frank 1333 ", ".join(internal_references),
512 bh 841 self.left_table.tablename,
513 frank 1009 join_operator,
514 bh 841 self.right_table.tablename,
515 frank 1333 self.left_table.tablename,
516 bh 1328 internal_left_col,
517 frank 1333 self.right_table.tablename,
518 bh 1328 internal_right_col))
519 bh 765 self.db.execute(stmt)
520    
521 bh 984 def Dependencies(self):
522     """Return a tuple with the two tables the join depends on."""
523     return self.dependencies
524 bh 765
525 bh 1375 def JoinType(self):
526     """Return the type of the join (either 'INNER' or 'LEFT OUTER')"""
527     if self.outer_join:
528     return "LEFT OUTER"
529     else:
530     return "INNER"
531 bh 984
532 bh 1375
533 jan 1020 class AutoTransientTable(TitledObject, table.OldTableInterfaceMixin):
534 bh 765
535     """Table that copies data to a transient table on demand.
536    
537     The AutoTransientTable takes another table as input and copies data
538     to a table in a TransientDatabase instance on demand.
539     """
540    
541     def __init__(self, transient_db, table):
542 jan 1020 TitledObject.__init__(self, table.Title())
543 bh 765 self.transient_db = transient_db
544     self.table = table
545     self.t_table = None
546    
547 bh 818 def Columns(self):
548     return self.table.Columns()
549 bh 765
550 bh 818 def Column(self, col):
551     return self.table.Column(col)
552 bh 765
553 bh 839 def HasColumn(self, col):
554     """Return whether the table has a column with the given name or index
555     """
556     return self.table.HasColumn(col)
557    
558 bh 818 def NumRows(self):
559     return self.table.NumRows()
560 bh 765
561 bh 818 def NumColumns(self):
562     return self.table.NumColumns()
563 bh 765
564 bh 1662 def RowIdToOrdinal(self, gid):
565     """Return the row ordinal given its id"""
566     if self.t_table is not None:
567     return self.t_table.RowIdToOrdinal(gid)
568     else:
569     return self.table.RowIdToOrdinal(gid)
570    
571     def RowOrdinalToId(self, num):
572     """Return the rowid for given its ordinal"""
573     if self.t_table is not None:
574     return self.t_table.RowOrdinalToId(num)
575     else:
576     return self.table.RowOrdinalToId(num)
577    
578     def ReadRowAsDict(self, record, row_is_ordinal = 0):
579 bh 765 """Return the record no. record as a dict mapping field names to values
580     """
581     if self.t_table is not None:
582 bh 1662 return self.t_table.ReadRowAsDict(record,
583     row_is_ordinal = row_is_ordinal)
584 bh 765 else:
585 bh 1662 return self.table.ReadRowAsDict(record,
586     row_is_ordinal = row_is_ordinal)
587 bh 765
588 bh 1662 def ReadValue(self, row, col, row_is_ordinal = 0):
589 bh 849 """Return the value of the specified row and column
590    
591     The col parameter may be the index of the column or its name.
592     """
593     if self.t_table is not None:
594 bh 1662 return self.t_table.ReadValue(row, col,
595     row_is_ordinal = row_is_ordinal)
596 bh 849 else:
597 bh 1662 return self.table.ReadValue(row, col,
598     row_is_ordinal = row_is_ordinal)
599 bh 849
600 bh 765 def copy_to_transient(self):
601     """Internal: Create a transient table and copy the data into it"""
602     self.t_table = TransientTable(self.transient_db, self)
603    
604     def transient_table(self):
605     """
606     Return a table whose underlying implementation is in the transient db
607     """
608     if self.t_table is None:
609     self.copy_to_transient()
610     return self.t_table
611    
612 bh 818 def ValueRange(self, col):
613 bh 1923 # Performance of sqlite vs. DBF for this method:
614     #
615     # If the table has been copied to the sqlite database it's
616     # faster to use it even if there is no index on that column.
617     # Otherwise it's faster to simply loop through all rows in the
618     # DBF file. Copying the data to the sqlite database can take
619     # very long for large amounts of data
620     #
621     # Of course if the table is not a DBF file the issue could be
622     # different, although copying the data into sqlite first will
623     # likely always be slower than simply querying the non-sqlite
624     # table directly. Currently only DBFfiles and memory tables are
625     # used as the underlying non-sqlite table, though.
626     if self.t_table is not None:
627     return self.t_table.ValueRange(col)
628     else:
629     return self.table.ValueRange(col)
630 bh 765
631 bh 839 def UniqueValues(self, col):
632 bh 1923 # The performance trade-offs for this method are basically the
633     # same as for ValueRange except that currently there doesn't
634     # seem to be a way to take advantage of indexes in this case in
635     # sqlite. However, but it's still faster to query the transient
636     # table if it already exists.
637     if self.t_table is not None:
638     return self.t_table.UniqueValues(col)
639     else:
640     return self.table.UniqueValues(col)
641 bh 844
642     def SimpleQuery(self, left, comparison, right):
643     if self.t_table is None:
644     self.copy_to_transient()
645     # Make sure to use the column object of the transient table. The
646     # left argument is always a column object so we can just ask the
647     # t_table for the right object.
648 jonathan 933 if hasattr(right, "name"):
649     return self.t_table.SimpleQuery(self.t_table.Column(left.name),
650     comparison,
651     self.t_table.Column(right.name))
652     else:
653     return self.t_table.SimpleQuery(self.t_table.Column(left.name),
654     comparison, right)
655 bh 984
656     def Dependencies(self):
657     """Return a tuple containing the original table"""
658     return (self.table,)
659 frank 1026
660 bh 1043 def Width(self, col):
661     return self.table.Width(col)

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

[email protected]
ViewVC Help
Powered by ViewVC 1.1.26