/[thuban]/branches/WIP-pyshapelib-bramz/Thuban/Model/transientdb.py
ViewVC logotype

Contents of /branches/WIP-pyshapelib-bramz/Thuban/Model/transientdb.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2734 - (show annotations)
Thu Mar 1 12:42:59 2007 UTC (18 years ago) by bramz
File MIME type: text/x-python
File size: 26542 byte(s)
made a copy
1 # Copyright (C) 2003 by Intevation GmbH
2 # Authors:
3 # Bernhard Herzog <[email protected]>
4 #
5 # This program is free software under the GPL (>=v2)
6 # Read the file COPYING coming with the software for details.
7
8 """Database for transient data
9
10 This database is intended for data representations needed during the
11 lifetime of a Thuban session but which is not permanent. Examples of
12 this are for instance a join of two DBF files where the DBF files are
13 the permanent representation of the data and the join only exists in the
14 Thuban session and is reconstructed when the session is opened.
15 """
16
17 __version__ = "$Revision$"
18 # $Source$
19 # $Id$
20
21 # Pysqlite version 1. and 2. behaves quiet differently
22 # Pysqlite uses a different paramstyle. The older version
23 # support format and pyformat while pysqlite2 supports only qmark
24 # and named.
25 # The sqlite2 boolean variable is used to manage specific part of the code
26 try:
27 # Using SQLITE 2.x
28 sqlite2 = True
29 from pysqlite2 import dbapi2 as sqlite
30 except ImportError:
31 # Using SQLITE 1.x
32 sqlite2 = False
33 import sqlite
34
35
36 from base import TitledObject
37
38 import table
39
40 sql_type_map = {
41 table.FIELDTYPE_INT: "INTEGER",
42 table.FIELDTYPE_STRING: "VARCHAR",
43 table.FIELDTYPE_DOUBLE: "FLOAT",
44 }
45
46 type_converter_map = {
47 table.FIELDTYPE_INT: int,
48 table.FIELDTYPE_STRING: str,
49 table.FIELDTYPE_DOUBLE: float,
50 }
51
52 class TransientDatabase:
53
54 def __init__(self, filename):
55 self.filename = filename
56 self.conn = sqlite.connect(filename)
57 # Counters to produce unique table and column names
58 self.num_tables = 0
59 self.num_cols = 0
60 # Since there's only once process using the SQLite database, we
61 # might be able to get a tad more speed with default_synchronous
62 # OFF. So far I haven't seen any measurable speedup, though.
63 #self.execute("PRAGMA default_synchronous = OFF")
64
65 def __del__(self):
66 self.close()
67
68 def close(self):
69 if self.conn is not None:
70 self.conn.close()
71 self.conn = None
72
73 def new_table_name(self):
74 self.num_tables += 1
75 return "Table%03d" % self.num_tables
76
77 def new_column_name(self):
78 self.num_cols += 1
79 return "Col%03d" % self.num_cols
80
81 def execute(self, *args):
82 """execute the SQL statement in the database and return the result"""
83 cursor = self.conn.cursor()
84 cursor.execute(*args)
85 result = cursor.fetchone()
86 self.conn.commit()
87 return result
88
89 def cursor(self):
90 return self.conn.cursor()
91
92
93 class ColumnReference:
94
95 def __init__(self, name, type, internal_name):
96 self.name = name
97 self.type = type
98 self.internal_name = internal_name
99
100
101 class TransientTableBase:
102
103 """Base class for tables in the transient database"""
104
105 def __init__(self, transient_db):
106 """Initialize the table for use with the given transient db"""
107 self.db = transient_db
108 self.tablename = self.db.new_table_name()
109 self.indexed_columns = {}
110 self.read_record_cursor = None
111 self.read_record_last_row = None
112 self.read_record_last_result = None
113
114 def create(self, columns):
115 self.columns = columns
116 self.name_to_column = {}
117 self.orig_names = []
118 self.internal_to_orig = {}
119 self.orig_to_internal = {}
120 self.column_map = {}
121
122 # Create the column objects and fill various maps and lists
123 for index in range(len(self.columns)):
124 col = self.columns[index]
125 self.name_to_column[col.name] = col
126 self.orig_names.append(col.name)
127 self.internal_to_orig[col.internal_name] = col.name
128 self.orig_to_internal[col.name] = col.internal_name
129 self.column_map[col.name] = col
130 self.column_map[index] = col
131
132 # Build the CREATE TABLE statement and create the table in the
133 # database
134 table_types = ["id INTEGER PRIMARY KEY"]
135 for col in self.columns:
136 table_types.append("%s %s" % (col.internal_name,
137 sql_type_map[col.type]))
138 table_stmt = "CREATE TABLE %s (\n %s\n);" % (self.tablename,
139 ",\n ".join(table_types))
140 self.db.execute(table_stmt)
141
142 def transient_table(self):
143 """
144 Return a table whose underlying implementation is in the transient db
145 """
146 return self
147
148 def ensure_index(self, column):
149 """Ensure that there's an index on the given column"""
150 if not column in self.indexed_columns:
151 internal_name = self.orig_to_internal[column]
152 indexname = "Index_%s_%s" % (self.tablename, internal_name)
153 stmt = "CREATE INDEX %s ON %s (%s);" % (indexname, self.tablename,
154 internal_name)
155 self.db.execute(stmt)
156 self.indexed_columns[column] = 1
157
158 def NumColumns(self):
159 return len(self.columns)
160
161 def NumRows(self):
162 result = self.db.execute("SELECT count(*) FROM %s;" % self.tablename)
163 return int(result[0])
164
165 def Columns(self):
166 return self.columns
167
168 def Column(self, col):
169 return self.column_map[col]
170
171 def HasColumn(self, col):
172 """Return whether the table has a column with the given name or index
173 """
174 return self.column_map.has_key(col)
175
176 def RowIdToOrdinal(self, gid):
177 """Return the row ordinal given its id
178
179 At the moment the transient tables are only used for tables that
180 don't distinguish between row number and row id, so the value is
181 returned unchanged.
182 """
183 return gid
184
185 def RowOrdinalToId(self, num):
186 """Return the rowid for given its ordinal
187
188 At the moment the transient tables are only used for tables that
189 don't distinguish between row number and row id, so the value is
190 returned unchanged.
191 """
192 return num
193
194 def ReadRowAsDict(self, index, row_is_ordinal = 0):
195 """Return the entire row as a dictionary with column names as keys
196
197 The row_is_ordinal is ignored because at the moment the
198 transient tables are only used for DBF files where it doesn't
199 matter.
200 """
201 # Implementation Strategy: Executing a completely new select
202 # statement every time this method is called is too slow. The
203 # most important usage is to read the records more or less
204 # sequentially. This happens e.g. when drawing a layer with a
205 # classification where the shapes are drawn in order of the
206 # shape ids. Another pattern is that the same row is requested
207 # several times in a row. This happens in the table view, for
208 # instance.
209
210 # We can exploit this to make access faster by having one cursor
211 # open all the time and keeping the last row read around in case
212 # the same row is accessed again the next time and if the row
213 # index is larger than the row we have read last we simply fetch
214 # rows from the cursor until we've reached the requested row. If
215 # the requested row index is smaller then we start a new cursor.
216
217 # FIXME: So far this scheme seems to work well enough. Obvious
218 # improvements would be to start the cursor at exactly the
219 # requested row (should be efficient and easy to do now that the
220 # id is the primary key) and to perhaps to also start a new
221 # cursor if the requested index is much larger than the last row
222 # so that we don't read and discard lots of the rows.
223
224 # Check whether we have to start a new cursor
225 if self.read_record_cursor is None or index <self.read_record_last_row:
226 stmt = ("SELECT %s FROM %s;"
227 % (", ".join([c.internal_name for c in self.columns]),
228 self.tablename))
229 self.read_record_cursor = self.db.cursor()
230 self.read_record_cursor.execute(stmt)
231 self.read_record_last_row = -1
232 self.read_record_last_result = None
233
234 # Now we should have a cursor at a position less than or equal
235 # to the index so the following if statement will always set
236 # result to a suitable value
237 assert index >= self.read_record_last_row
238
239 if index == self.read_record_last_row:
240 result = self.read_record_last_result
241 else:
242 for i in range(index - self.read_record_last_row):
243 result = self.read_record_cursor.fetchone()
244 self.read_record_last_result = result
245 self.read_record_last_row = index
246 return dict(zip(self.orig_names, result))
247
248 def ReadValue(self, row, col, row_is_ordinal = 0):
249 """Return the value of the specified row and column
250
251 The col parameter may be the index of the column or its name.
252
253 The row_is_ordinal is ignored because at the moment the
254 transient tables are only used for DBF files where it doesn't
255 matter.
256 """
257 # Depending on the actual access patterns of the table data, it
258 # might be a bit faster in some circumstances to not implement
259 # this via ReadRowAsDict, but this simple implementation should
260 # be fast enough for most purposes.
261 return self.ReadRowAsDict(row)[self.column_map[col].name]
262
263 def ValueRange(self, col):
264 # Performance notes:
265 #
266 # In sqlite 2.8.6 the min and max aggregate functions can use an
267 # index but only when used as the only expression in the select
268 # statement (i.e. 'select min(col), max(col) from tbl;' will not
269 # use the index but 'select min(col) from tbl;' will) so we
270 # query the minimum and maximum separately.
271 #
272 # With the separate statements we can take advantage of an index
273 # if it exists. If the index doesn't exist, creating it first
274 # and then using it in the query is slower than the queries
275 # without an index. Creating the index is only an advantage if
276 # the queries are performed multiple times. With the current use
277 # patterns where ValueRange is only used occasionally by the
278 # classification generation dialog creating the index only for
279 # this usage is not really worth it, so we don't.
280 col = self.column_map[col]
281 iname = col.internal_name
282 min = self.db.execute("SELECT min(%s) FROM %s;"
283 % (iname, self.tablename))[0]
284 max = self.db.execute("SELECT max(%s) FROM %s;"
285 % (iname, self.tablename))[0]
286 converter = type_converter_map[col.type]
287 return (converter(min), converter(max))
288
289 def UniqueValues(self, col):
290 # Performance notes:
291 #
292 # In sqlite 2.8.6 there doesn't seem to be a way to query the
293 # unique items that uses an index. I've tried
294 #
295 # SELECT col FROM tbl GROUP BY col;
296 #
297 # and
298 #
299 # SELECT DISTINCT col FROM tbl;
300 #
301 # and in both cases the index is not used. If the index isn't
302 # used it doesn't make sense to call self.ensure_index.
303 iname = self.column_map[col].internal_name
304 cursor = self.db.cursor()
305 cursor.execute("SELECT %s FROM %s GROUP BY %s;"
306 % (iname, self.tablename, iname))
307 result = []
308 while 1:
309 row = cursor.fetchone()
310 if row is None:
311 break
312 result.append(row[0])
313 return result
314
315 def Width(self, col):
316 """Return the maximum width of values in the column
317
318 The return value is the the maximum length of string
319 representation of the values in the column (represented by index
320 or name).
321 """
322 max = 0
323
324 type = self.column_map[col].type
325 iname = self.column_map[col].internal_name
326 cursor = self.db.cursor()
327 cursor.execute("SELECT %s FROM %s;" % (iname, self.tablename))
328 values = [ i[0] for i in cursor.fetchall()]
329 if not values:
330 return None
331
332 if type == table.FIELDTYPE_DOUBLE:
333 format = "%.12f"
334 elif type == table.FIELDTYPE_INT:
335 format = "%d"
336 else:
337 format = "%s"
338 for value in values:
339 if value is None: continue
340 l = len(format % value)
341 if l > max:
342 max = l
343
344 return max
345
346 def SimpleQuery(self, left, comparison, right):
347 """Return the indices of all rows that matching a condition.
348
349 Parameters:
350 left -- The column object for the left side of the comparison
351
352 comparison -- The comparison operator as a string. It must be
353 one of '==', '!=', '<', '<=', '>=', '>'
354
355 right -- The right hand side of the comparison. It must be
356 either a column object or a value, i.e. a string,
357 int or float.
358
359 The return value is a sorted list of the indices of the rows
360 where the condition is true.
361 """
362 if comparison not in ("==", "!=", "<", "<=", ">=", ">"):
363 raise ValueError("Comparison operator %r not allowed" % comparison)
364
365 if hasattr(right, "internal_name"):
366 right_template = right.internal_name
367 params = ()
368 else:
369 if sqlite2:
370 right_template = "?"
371 else: right_template = "%s"
372 params = (right,)
373
374 query = "SELECT id FROM %s WHERE %s %s %s ORDER BY id;" \
375 % (self.tablename, left.internal_name, comparison,
376 right_template)
377
378 cursor = self.db.cursor()
379 cursor.execute(query, params)
380 result = []
381 while 1:
382 row = cursor.fetchone()
383 if row is None:
384 break
385 result.append(row[0])
386 return result
387
388 def Dependencies(self):
389 """Placeholder for a method in a derived class.
390
391 Return a sequence with the tables and other data objects that
392 self depends on.
393 """
394 raise NotImplementedError
395
396
397 class TransientTable(TitledObject, TransientTableBase):
398
399 """A Table in a transient DB that starts as the copy of a Thuban Table."""
400
401 def __init__(self, transient_db, table):
402 """Create a new table in the given transient DB as a copy of table
403
404 The table argument can be any object implementing the Table
405 interface.
406 """
407 TransientTableBase.__init__(self, transient_db)
408 TitledObject.__init__(self, table.Title())
409 self.create(table)
410
411 def create(self, table):
412 columns = []
413 for col in table.Columns():
414 columns.append(ColumnReference(col.name, col.type,
415 self.db.new_column_name()))
416 TransientTableBase.create(self, columns)
417
418 # copy the input table to the transient db
419
420 # A key to insert to use for the formatting of the insert
421 # statement. The key must not be equal to any of the column
422 # names so we construct one by building a string of x's that is
423 # longer than any of the column names
424 id_key = max([len(col.name) for col in self.columns]) * "x"
425
426 if sqlite2:
427 insert_template = "INSERT INTO %s (id, %s) VALUES (%s, %s);" \
428 % (self.tablename,
429 ", ".join([col.internal_name
430 for col in self.columns]),
431 '?',
432 ", ".join(["?" for col in self.columns]))
433
434 else:
435 insert_template = "INSERT INTO %s (id, %s) VALUES (%%(%s)s, %s);" \
436 % (self.tablename,
437 ", ".join([col.internal_name
438 for col in self.columns]),
439 id_key,
440 ", ".join(["%%(%s)s" % col.name
441 for col in self.columns]))
442 cursor = self.db.cursor()
443 for i in range(table.NumRows()):
444 row = table.ReadRowAsDict(i)
445 row[id_key] = i
446 if sqlite2:
447 params = [i]
448 for col in self.columns:
449 params.append(row[col.name])
450 cursor.execute(insert_template, params)
451 else:
452 cursor.execute(insert_template, row)
453 self.db.conn.commit()
454
455
456
457 class TransientJoinedTable(TitledObject, TransientTableBase):
458
459 """A Table in the transient DB that contains a join of two tables"""
460
461 def __init__(self, transient_db, left_table, left_field,
462 right_table, right_field = None, outer_join = False):
463 """Create a new table in the transient DB as a join of two tables.
464
465 Both input tables, left_table and right_table must have a
466 transient_table method that returns a table object for a table
467 in the transient database. The join is performed on the condition
468 that the value of the left_field column the the left table is
469 equal to the value of the right_field in the right_table.
470
471 The joined table contains all columns of the input tables,
472 however, the column names of the right table may be changed
473 slightly to make them unique in the joined table. This is
474 currently done by appending a sufficient number of underscores
475 ('_').
476 """
477 TransientTableBase.__init__(self, transient_db)
478 self.dependencies = (left_table, right_table)
479 self.left_table = left_table.transient_table()
480 self.left_field = left_field
481 self.right_table = right_table.transient_table()
482 if right_field:
483 self.right_field = right_field
484 else:
485 self.right_field = self.left_field
486 self.outer_join = outer_join
487
488 title = "Join of %(left)s and %(right)s" \
489 % {"left": self.left_table.Title(),
490 "right": self.right_table.Title()}
491 TitledObject.__init__(self, title)
492
493 self.create()
494
495 def create(self):
496 """Internal: Create the table with the joined data"""
497 self.tablename = self.db.new_table_name()
498
499 self.right_table.ensure_index(self.right_field)
500
501 # determine the internal column names to join on before
502 # coalescing the column information because if the external
503 # column names are the same they will be mapped to the same
504 # internal name afterwards.
505 internal_left_col = self.left_table.orig_to_internal[self.left_field]
506 internal_right_col =self.right_table.orig_to_internal[self.right_field]
507
508 # Coalesce the column information
509 visited = {}
510 columns = []
511 newcolumns = []
512 for table in (self.left_table, self.right_table):
513 for col in table.Columns():
514 colname = col.name
515 # We can't allow multiple columns with the same
516 # original name, so append '_' to this one until
517 # it is unique.
518 # FIXME: There should be a better solution.
519 while colname in visited:
520 colname = colname + '_'
521 columns.append((table.tablename, col))
522 newcol = ColumnReference(colname, col.type,
523 "Col%03d" % (len(newcolumns)+1))
524 newcolumns.append(newcol)
525 visited[colname] = 1
526 TransientTableBase.create(self, newcolumns)
527
528 # Copy the joined data to the table.
529 newinternal_names = [col.internal_name for col in self.columns]
530 internal_references = ["%s.%s" % (table, col.internal_name)
531 for table, col in columns]
532 if self.outer_join:
533 join_operator = 'LEFT OUTER JOIN'
534 else:
535 join_operator = 'JOIN'
536 stmt = ("INSERT INTO %s (id, %s) SELECT %s.id, %s FROM %s"
537 " %s %s ON %s.%s = %s.%s;"
538 % (self.tablename,
539 ", ".join(newinternal_names),
540 self.left_table.tablename,
541 ", ".join(internal_references),
542 self.left_table.tablename,
543 join_operator,
544 self.right_table.tablename,
545 self.left_table.tablename,
546 internal_left_col,
547 self.right_table.tablename,
548 internal_right_col))
549 self.db.execute(stmt)
550
551 def Dependencies(self):
552 """Return a tuple with the two tables the join depends on."""
553 return self.dependencies
554
555 def JoinType(self):
556 """Return the type of the join (either 'INNER' or 'LEFT OUTER')"""
557 if self.outer_join:
558 return "LEFT OUTER"
559 else:
560 return "INNER"
561
562
563 class AutoTransientTable(TitledObject):
564
565 """Table that copies data to a transient table on demand.
566
567 The AutoTransientTable takes another table as input and copies data
568 to a table in a TransientDatabase instance on demand.
569 """
570
571 def __init__(self, transient_db, table):
572 TitledObject.__init__(self, table.Title())
573 self.transient_db = transient_db
574 self.table = table
575 self.t_table = None
576
577 def Columns(self):
578 return self.table.Columns()
579
580 def Column(self, col):
581 return self.table.Column(col)
582
583 def HasColumn(self, col):
584 """Return whether the table has a column with the given name or index
585 """
586 return self.table.HasColumn(col)
587
588 def NumRows(self):
589 return self.table.NumRows()
590
591 def NumColumns(self):
592 return self.table.NumColumns()
593
594 def RowIdToOrdinal(self, gid):
595 """Return the row ordinal given its id"""
596 if self.t_table is not None:
597 return self.t_table.RowIdToOrdinal(gid)
598 else:
599 return self.table.RowIdToOrdinal(gid)
600
601 def RowOrdinalToId(self, num):
602 """Return the rowid for given its ordinal"""
603 if self.t_table is not None:
604 return self.t_table.RowOrdinalToId(num)
605 else:
606 return self.table.RowOrdinalToId(num)
607
608 def ReadRowAsDict(self, record, row_is_ordinal = 0):
609 """Return the record no. record as a dict mapping field names to values
610 """
611 if self.t_table is not None:
612 return self.t_table.ReadRowAsDict(record,
613 row_is_ordinal = row_is_ordinal)
614 else:
615 return self.table.ReadRowAsDict(record,
616 row_is_ordinal = row_is_ordinal)
617
618 def ReadValue(self, row, col, row_is_ordinal = 0):
619 """Return the value of the specified row and column
620
621 The col parameter may be the index of the column or its name.
622 """
623 if self.t_table is not None:
624 return self.t_table.ReadValue(row, col,
625 row_is_ordinal = row_is_ordinal)
626 else:
627 return self.table.ReadValue(row, col,
628 row_is_ordinal = row_is_ordinal)
629
630 def copy_to_transient(self):
631 """Internal: Create a transient table and copy the data into it"""
632 self.t_table = TransientTable(self.transient_db, self)
633
634 def transient_table(self):
635 """
636 Return a table whose underlying implementation is in the transient db
637 """
638 if self.t_table is None:
639 self.copy_to_transient()
640 return self.t_table
641
642 def ValueRange(self, col):
643 # Performance of sqlite vs. DBF for this method:
644 #
645 # If the table has been copied to the sqlite database it's
646 # faster to use it even if there is no index on that column.
647 # Otherwise it's faster to simply loop through all rows in the
648 # DBF file. Copying the data to the sqlite database can take
649 # very long for large amounts of data
650 #
651 # Of course if the table is not a DBF file the issue could be
652 # different, although copying the data into sqlite first will
653 # likely always be slower than simply querying the non-sqlite
654 # table directly. Currently only DBFfiles and memory tables are
655 # used as the underlying non-sqlite table, though.
656 if self.t_table is not None:
657 return self.t_table.ValueRange(col)
658 else:
659 return self.table.ValueRange(col)
660
661 def UniqueValues(self, col):
662 # The performance trade-offs for this method are basically the
663 # same as for ValueRange except that currently there doesn't
664 # seem to be a way to take advantage of indexes in this case in
665 # sqlite. However, but it's still faster to query the transient
666 # table if it already exists.
667 if self.t_table is not None:
668 return self.t_table.UniqueValues(col)
669 else:
670 return self.table.UniqueValues(col)
671
672 def SimpleQuery(self, left, comparison, right):
673 if self.t_table is None:
674 self.copy_to_transient()
675 # Make sure to use the column object of the transient table. The
676 # left argument is always a column object so we can just ask the
677 # t_table for the right object.
678 if hasattr(right, "name"):
679 return self.t_table.SimpleQuery(self.t_table.Column(left.name),
680 comparison,
681 self.t_table.Column(right.name))
682 else:
683 return self.t_table.SimpleQuery(self.t_table.Column(left.name),
684 comparison, right)
685
686 def Dependencies(self):
687 """Return a tuple containing the original table"""
688 return (self.table,)
689
690 def Width(self, col):
691 return self.table.Width(col)
692
693 def write_record(self, row, values):
694 """Write the values to the given row.
695
696 This is a very experimental feature which doesn't work in all
697 cases, so you better know what you're doing when calling this
698 method.
699 """
700 self.table.write_record(row, values)

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

[email protected]
ViewVC Help
Powered by ViewVC 1.1.26