# Thuban/Model/transientdb.py

# Copyright (C) 2003 by Intevation GmbH
# Authors:
# Bernhard Herzog <[email protected]>
#
# This program is free software under the GPL (>=v2)
# Read the file COPYING coming with the software for details.

"""Database for transient data

This database is intended for data representations that are needed
during the lifetime of a Thuban session but are not permanent. An
example is a join of two DBF files: the DBF files are the permanent
representation of the data, whereas the join only exists in the Thuban
session and is reconstructed when the session is opened.
"""

__version__ = "$Revision$"
# $Source$
# $Id$

from sqlite import connect

from base import TitledObject

import table

# SQL type names used in CREATE TABLE statements, keyed by the Thuban
# field type of the column.
sql_type_map = {
    table.FIELDTYPE_INT: "INTEGER",
    table.FIELDTYPE_STRING: "VARCHAR",
    table.FIELDTYPE_DOUBLE: "FLOAT",
}

# Callables that turn a value read from the database back into a python
# object of the right kind, keyed by the Thuban field type of the column.
type_converter_map = {
    table.FIELDTYPE_INT: int,
    table.FIELDTYPE_STRING: str,
    table.FIELDTYPE_DOUBLE: float,
}

class TransientDatabase:

    """Connection to the SQLite database that holds the transient tables

    Besides owning the database connection the instance hands out the
    unique names for the tables and columns created in the database.
    """

    def __init__(self, filename):
        """Open the database stored in the file filename"""
        # Make sure the attribute exists before we try to connect so
        # that close() -- and therefore __del__ -- also works when
        # connect raises an exception.
        self.conn = None
        self.filename = filename
        # Counters to produce unique table and column names
        self.num_tables = 0
        self.num_cols = 0
        self.conn = connect(filename)
        # Since there's only one process using the SQLite database, we
        # might be able to get a tad more speed with default_synchronous
        # OFF. So far I haven't seen any measurable speedup, though.
        #self.execute("PRAGMA default_synchronous = OFF")

    def __del__(self):
        self.close()

    def close(self):
        """Close the database connection

        It's safe to call this method more than once and also on an
        instance whose connection was never established.
        """
        # Use getattr because __del__ is also called for instances whose
        # __init__ did not run to completion.
        conn = getattr(self, "conn", None)
        if conn is not None:
            conn.close()
            self.conn = None

    def new_table_name(self):
        """Return a table name not used before by this database object"""
        self.num_tables += 1
        return "Table%03d" % self.num_tables

    def new_column_name(self):
        """Return a column name not used before by this database object"""
        self.num_cols += 1
        return "Col%03d" % self.num_cols

    def execute(self, *args):
        """Execute the SQL statement in the database and return the result

        The arguments are passed through to the execute method of a new
        cursor. The return value is the first row of the result as
        returned by the cursor's fetchone method. The transaction is
        committed before the method returns.
        """
        cursor = self.conn.cursor()
        cursor.execute(*args)
        result = cursor.fetchone()
        self.conn.commit()
        return result

    def cursor(self):
        """Return a new cursor for the database connection"""
        return self.conn.cursor()


class ColumnReference:

    """Description of one column of a table in the transient database

    Instances are simple records with three public attributes: name is
    the column name used by the callers, type the field type of the
    column and internal_name the name of the column in the database.
    """

    def __init__(self, name, type, internal_name):
        self.internal_name = internal_name
        self.type = type
        self.name = name


class TransientTableBase(table.OldTableInterfaceMixin):

    """Base class for tables in the transient database"""

    def __init__(self, transient_db):
        """Initialize the table for use with the given transient db"""
        self.db = transient_db
        self.tablename = self.db.new_table_name()
        # The names of the columns ensure_index has created an index for
        self.indexed_columns = {}
        # The state of the cursor cache used by ReadRowAsDict
        self.read_record_cursor = None
        self.read_record_last_row = None
        self.read_record_last_result = None

    def create(self, columns):
        """Create the table with the given columns in the database

        The columns parameter is a sequence of column objects with the
        attributes name, type and internal_name (e.g. ColumnReference
        instances). The type has to be one of the keys of sql_type_map.
        """
        self.columns = columns
        self.name_to_column = {}
        self.orig_names = []
        self.internal_to_orig = {}
        self.orig_to_internal = {}
        self.column_map = {}

        # Fill the various maps and lists. Note that column_map can be
        # indexed with both the name and the index of a column.
        for index in range(len(self.columns)):
            col = self.columns[index]
            self.name_to_column[col.name] = col
            self.orig_names.append(col.name)
            self.internal_to_orig[col.internal_name] = col.name
            self.orig_to_internal[col.name] = col.internal_name
            self.column_map[col.name] = col
            self.column_map[index] = col

        # Build the CREATE TABLE statement and create the table in the
        # database
        table_types = ["id INTEGER PRIMARY KEY"]
        for col in self.columns:
            table_types.append("%s %s" % (col.internal_name,
                                          sql_type_map[col.type]))
        table_stmt = "CREATE TABLE %s (\n    %s\n);" % (self.tablename,
                                                   ",\n    ".join(table_types))
        self.db.execute(table_stmt)

    def transient_table(self):
        """
        Return a table whose underlying implementation is in the transient db
        """
        return self

    def ensure_index(self, column):
        """Ensure that there's an index on the given column

        The column parameter is the name of the column. The index is
        only created the first time the method is called for a column.
        """
        if column not in self.indexed_columns:
            internal_name = self.orig_to_internal[column]
            indexname = "Index_%s_%s" % (self.tablename, internal_name)
            stmt = "CREATE INDEX %s ON %s (%s);" % (indexname, self.tablename,
                                                    internal_name)
            self.db.execute(stmt)
            self.indexed_columns[column] = 1

    def NumColumns(self):
        """Return the number of columns of the table"""
        return len(self.columns)

    def NumRows(self):
        """Return the number of rows of the table"""
        result = self.db.execute("SELECT count(*) FROM %s;" % self.tablename)
        return int(result[0])

    def Columns(self):
        """Return the sequence of column objects of the table"""
        return self.columns

    def Column(self, col):
        """Return the column object for the column given by name or index"""
        return self.column_map[col]

    def HasColumn(self, col):
        """Return whether the table has a column with the given name or index
        """
        return col in self.column_map

    def RowIdToOrdinal(self, gid):
        """Return the row ordinal given its id

        At the moment the transient tables are only used for tables that
        don't distinguish between row number and row id, so the value is
        returned unchanged.
        """
        return gid

    def RowOrdinalToId(self, num):
        """Return the rowid for given its ordinal

        At the moment the transient tables are only used for tables that
        don't distinguish between row number and row id, so the value is
        returned unchanged.
        """
        return num

    def ReadRowAsDict(self, index, row_is_ordinal = 0):
        """Return the entire row as a dictionary with column names as keys

        The row_is_ordinal is ignored because at the moment the
        transient tables are only used for DBF files where it doesn't
        matter.

        Raise IndexError if index is negative or if the cursor yields
        no row for index.
        """
        # Implementation Strategy: Executing a completely new select
        # statement every time this method is called is too slow. The
        # most important usage is to read the records more or less
        # sequentially. This happens e.g. when drawing a layer with a
        # classification where the shapes are drawn in order of the
        # shape ids. Another pattern is that the same row is requested
        # several times in a row. This happens in the table view, for
        # instance.

        # We can exploit this to make access faster by having one cursor
        # open all the time and keeping the last row read around in case
        # the same row is accessed again the next time and if the row
        # index is larger than the row we have read last we simply fetch
        # rows from the cursor until we've reached the requested row. If
        # the requested row index is smaller then we start a new cursor.

        # FIXME: So far this scheme seems to work well enough. Obvious
        # improvements would be to start the cursor at exactly the
        # requested row (should be efficient and easy to do now that the
        # id is the primary key) and to perhaps to also start a new
        # cursor if the requested index is much larger than the last row
        # so that we don't read and discard lots of the rows.

        if index < 0:
            raise IndexError("row index %r out of range" % (index,))

        # Check whether we have to start a new cursor
        if (self.read_record_cursor is None
            or index < self.read_record_last_row):
            stmt = ("SELECT %s FROM %s;"
                    % (", ".join([c.internal_name for c in self.columns]),
                       self.tablename))
            self.read_record_cursor = self.db.cursor()
            self.read_record_cursor.execute(stmt)
            self.read_record_last_row = -1
            self.read_record_last_result = None

        # Now we should have a cursor at a position less than or equal
        # to the index so the following if statement will always set
        # result to a suitable value
        assert index >= self.read_record_last_row

        if index == self.read_record_last_row:
            result = self.read_record_last_result
        else:
            for i in range(index - self.read_record_last_row):
                result = self.read_record_cursor.fetchone()
                if result is None:
                    # We ran off the end of the table. Forget the
                    # cursor so that the next call starts from scratch
                    # instead of working with a half updated cache.
                    self.read_record_cursor = None
                    self.read_record_last_row = None
                    self.read_record_last_result = None
                    raise IndexError("row index %r out of range" % (index,))
                self.read_record_last_result = result
        self.read_record_last_row = index
        return dict(zip(self.orig_names, result))

    def ReadValue(self, row, col, row_is_ordinal = 0):
        """Return the value of the specified row and column

        The col parameter may be the index of the column or its name.

        The row_is_ordinal is ignored because at the moment the
        transient tables are only used for DBF files where it doesn't
        matter.
        """
        # Depending on the actual access patterns of the table data, it
        # might be a bit faster in some circumstances to not implement
        # this via ReadRowAsDict, but this simple implementation should
        # be fast enough for most purposes.
        return self.ReadRowAsDict(row)[self.column_map[col].name]

    def ValueRange(self, col):
        """Return the minimum and maximum value of the column as a tuple

        The col parameter may be the index of the column or its name.
        A value the database reports as NULL (which SQLite does for the
        min and max of a column without non-NULL values, e.g. in an
        empty table) is returned as None.
        """
        # Performance notes:
        #
        # In sqlite 2.8.6 the min and max aggregate functions can use an
        # index but only when used as the only expression in the select
        # statement (i.e. 'select min(col), max(col) from tbl;' will not
        # use the index but 'select min(col) from tbl;' will) so we
        # query the minimum and maximum separately.
        #
        # With the separate statements we can take advantage of an index
        # if it exists. If the index doesn't exist, creating it first
        # and then using it in the query is slower than the queries
        # without an index. Creating the index is only an advantage if
        # the queries are performed multiple times. With the current use
        # patterns where ValueRange is only used occasionally by the
        # classification generation dialog creating the index only for
        # this usage is not really worth it, so we don't.
        col = self.column_map[col]
        iname = col.internal_name
        minimum = self.db.execute("SELECT min(%s) FROM %s;"
                                  % (iname, self.tablename))[0]
        maximum = self.db.execute("SELECT max(%s) FROM %s;"
                                  % (iname, self.tablename))[0]
        converter = type_converter_map[col.type]
        # Don't feed None to the converter. int and float would raise a
        # TypeError and str would turn it into the string "None".
        if minimum is not None:
            minimum = converter(minimum)
        if maximum is not None:
            maximum = converter(maximum)
        return (minimum, maximum)

    def UniqueValues(self, col):
        """Return a list with the distinct values of the column

        The col parameter may be the index of the column or its name.
        """
        # Performance notes:
        #
        # In sqlite 2.8.6 there doesn't seem to be a way to query the
        # unique items that uses an index. I've tried
        #
        #   SELECT col FROM tbl GROUP BY col;
        #
        # and
        #
        #   SELECT DISTINCT col FROM tbl;
        #
        # and in both cases the index is not used. If the index isn't
        # used it doesn't make sense to call self.ensure_index.
        iname = self.column_map[col].internal_name
        cursor = self.db.cursor()
        cursor.execute("SELECT %s FROM %s GROUP BY %s;"
                       % (iname, self.tablename, iname))
        return [row[0] for row in cursor.fetchall()]

    def Width(self, col):
        """Return the maximum width of values in the column

        The return value is the maximum length of string
        representation of the values in the column (represented by index
        or name). None values are ignored. If the table has no rows the
        return value is None.
        """
        column = self.column_map[col]
        cursor = self.db.cursor()
        cursor.execute("SELECT %s FROM %s;"
                       % (column.internal_name, self.tablename))
        values = [row[0] for row in cursor.fetchall()]
        if not values:
            return None

        if column.type == table.FIELDTYPE_DOUBLE:
            template = "%.12f"
        elif column.type == table.FIELDTYPE_INT:
            template = "%d"
        else:
            template = "%s"

        widest = 0
        for value in values:
            if value is None:
                continue
            width = len(template % value)
            if width > widest:
                widest = width

        return widest

    def SimpleQuery(self, left, comparison, right):
        """Return the indices of all rows matching a condition.

        Parameters:
           left -- The column object for the left side of the comparison

           comparison -- The comparison operator as a string. It must be
                         one of '==', '!=', '<', '<=', '>=', '>'

           right -- The right hand side of the comparison. It must be
                    either a column object or a value, i.e. a string,
                    int or float.

        The return value is a sorted list of the indices of the rows
        where the condition is true.

        Raise ValueError if comparison is not one of the allowed
        operators.
        """
        if comparison not in ("==", "!=", "<", "<=", ">=", ">"):
            raise ValueError("Comparison operator %r not allowed" % comparison)

        if hasattr(right, "internal_name"):
            # Compare with another column of the table
            right_template = right.internal_name
            params = ()
        else:
            # Compare with a value. Let the database module do the
            # quoting by passing the value as a parameter.
            right_template = "%s"
            params = (right,)

        query = "SELECT id FROM %s WHERE %s %s %s ORDER BY id;" \
                % (self.tablename, left.internal_name, comparison,
                   right_template)

        cursor = self.db.cursor()
        cursor.execute(query, params)
        return [row[0] for row in cursor.fetchall()]

    def Dependencies(self):
        """Placeholder for a method in a derived class.

        Return a sequence with the tables and other data objects that
        self depends on.
        """
        raise NotImplementedError


class TransientTable(TitledObject, TransientTableBase):

    """A Table in a transient DB that starts as the copy of a Thuban Table."""

    def __init__(self, transient_db, table):
        """Create a new table in the given transient DB as a copy of table

        The table argument can be any object implementing the Table
        interface.
        """
        TransientTableBase.__init__(self, transient_db)
        TitledObject.__init__(self, table.Title())
        self.create(table)

    def create(self, table):
        """Internal: Create the table in the database and copy the data

        The new table gets one column for each column of table, each
        with a new internal name, and all rows of table are inserted
        with the row index as id.
        """
        columns = []
        for col in table.Columns():
            columns.append(ColumnReference(col.name, col.type,
                                           self.db.new_column_name()))
        TransientTableBase.create(self, columns)

        # copy the input table to the transient db

        # A key to insert to use for the formatting of the insert
        # statement. The key must not be equal to any of the column
        # names so we construct one by building a string of x's that is
        # longer than any of the column names. The + 1 is essential:
        # without it the key has the same length as the longest name
        # and collides with it if that name consists only of x's (think
        # of a table with the columns "x" and "y").
        id_key = (max([len(col.name) for col in self.columns]) + 1) * "x"

        insert_template = "INSERT INTO %s (id, %s) VALUES (%%(%s)s, %s);" \
                               % (self.tablename,
                                  ", ".join([col.internal_name
                                             for col in self.columns]),
                                  id_key,
                                  ", ".join(["%%(%s)s" % col.name
                                             for col in self.columns]))
        cursor = self.db.cursor()
        for i in range(table.NumRows()):
            row = table.ReadRowAsDict(i)
            row[id_key] = i
            cursor.execute(insert_template, row)
        # Commit only once after all rows have been inserted
        self.db.conn.commit()

class TransientJoinedTable(TitledObject, TransientTableBase):

    """A Table in the transient DB that contains a join of two tables"""

    def __init__(self, transient_db, left_table, left_field,
                 right_table, right_field = None, outer_join = False):
        """Create a new table in the transient DB as a join of two tables.

        Both left_table and right_table have to provide a
        transient_table method returning a table object for a table in
        the transient database. The rows are joined on the condition
        that the value of the left_field column of the left table is
        equal to the value of the right_field column of the right
        table. If right_field is omitted or false, left_field is used
        for the right table as well. If outer_join is true the tables
        are combined with a 'LEFT OUTER JOIN', otherwise with a plain
        'JOIN'.

        The joined table has all the columns of both input tables. The
        names of the columns may get a number of underscores ('_')
        appended so that all names in the joined table are unique.
        """
        TransientTableBase.__init__(self, transient_db)
        # Dependencies reports the tables as they were passed in, the
        # join itself works on their transient versions.
        self.dependencies = (left_table, right_table)
        self.left_table = left_table.transient_table()
        self.left_field = left_field
        self.right_table = right_table.transient_table()
        self.right_field = right_field or self.left_field
        self.outer_join = outer_join

        title = "Join of %(left)s and %(right)s" \
                % {"left": self.left_table.Title(),
                   "right": self.right_table.Title()}
        TitledObject.__init__(self, title)

        self.create()

    def create(self):
        """Internal: Create the table with the joined data"""
        self.tablename = self.db.new_table_name()

        self.right_table.ensure_index(self.right_field)

        # Look up the internal names of the columns to join on right
        # away. This has to happen before the columns of the two tables
        # are merged because after that equal external names are mapped
        # to the same internal name.
        left_join_col = self.left_table.orig_to_internal[self.left_field]
        right_join_col = self.right_table.orig_to_internal[self.right_field]

        # Merge the column information of the two tables. For each
        # column we need the fully qualified reference to the column in
        # its source table and a column object for the joined table.
        taken = {}
        source_references = []
        joined_columns = []
        for source in (self.left_table, self.right_table):
            for col in source.Columns():
                # The names have to be unique in the joined table, so
                # append '_' until the name hasn't been used yet.
                # FIXME: There should be a better solution.
                unique_name = col.name
                while unique_name in taken:
                    unique_name = unique_name + '_'
                taken[unique_name] = 1
                source_references.append("%s.%s" % (source.tablename,
                                                    col.internal_name))
                joined_columns.append(
                    ColumnReference(unique_name, col.type,
                                    "Col%03d" % (len(joined_columns) + 1)))
        TransientTableBase.create(self, joined_columns)

        # Fill the new table with the joined data.
        join_operator = 'JOIN'
        if self.outer_join:
            join_operator = 'LEFT OUTER JOIN'
        left_name = self.left_table.tablename
        right_name = self.right_table.tablename
        target_names = [col.internal_name for col in self.columns]
        stmt = ("INSERT INTO %s (id, %s) SELECT %s.id, %s FROM %s"
                " %s %s ON %s.%s = %s.%s;"
                % (self.tablename,
                   ", ".join(target_names),
                   left_name,
                   ", ".join(source_references),
                   left_name,
                   join_operator,
                   right_name,
                   left_name,
                   left_join_col,
                   right_name,
                   right_join_col))
        self.db.execute(stmt)

    def Dependencies(self):
        """Return a tuple with the two tables the join was created from."""
        return self.dependencies

    def JoinType(self):
        """Return the kind of join as a string, 'INNER' or 'LEFT OUTER'"""
        if not self.outer_join:
            return "INNER"
        return "LEFT OUTER"


class AutoTransientTable(TitledObject, table.OldTableInterfaceMixin):

    """Table that copies data to a transient table on demand.

    The AutoTransientTable wraps another table. The data of that table
    is only copied to a table in a TransientDatabase instance when the
    transient version is actually needed.
    """

    def __init__(self, transient_db, table):
        TitledObject.__init__(self, table.Title())
        self.transient_db = transient_db
        self.table = table
        # The transient copy. None as long as it hasn't been created.
        self.t_table = None

    def _source_table(self):
        """Internal: Return the transient copy if it exists or else the
        original table
        """
        if self.t_table is None:
            return self.table
        return self.t_table

    def Columns(self):
        return self.table.Columns()

    def Column(self, col):
        return self.table.Column(col)

    def HasColumn(self, col):
        """Return whether the table has a column with the given name or index
        """
        return self.table.HasColumn(col)

    def NumRows(self):
        return self.table.NumRows()

    def NumColumns(self):
        return self.table.NumColumns()

    def RowIdToOrdinal(self, gid):
        """Return the ordinal of the row with the id gid"""
        return self._source_table().RowIdToOrdinal(gid)

    def RowOrdinalToId(self, num):
        """Return the id of the row with the ordinal num"""
        return self._source_table().RowOrdinalToId(num)

    def ReadRowAsDict(self, record, row_is_ordinal = 0):
        """Return the given record as a dict mapping field names to values
        """
        source = self._source_table()
        return source.ReadRowAsDict(record, row_is_ordinal = row_is_ordinal)

    def ReadValue(self, row, col, row_is_ordinal = 0):
        """Return the value of the specified row and column

        The col parameter may be the index of the column or its name.
        """
        source = self._source_table()
        return source.ReadValue(row, col, row_is_ordinal = row_is_ordinal)

    def copy_to_transient(self):
        """Internal: Create a transient table and copy the data into it"""
        self.t_table = TransientTable(self.transient_db, self)

    def transient_table(self):
        """
        Return a table whose underlying implementation is in the transient db
        """
        if self.t_table is None:
            self.copy_to_transient()
        return self.t_table

    def ValueRange(self, col):
        # Performance of sqlite vs. DBF for this method:
        #
        # Once the table has been copied to the sqlite database, using
        # the copy is faster even without an index on the column.
        # Without the copy it's faster to simply loop through all rows
        # of the DBF file because copying the data to the sqlite
        # database can take very long for large amounts of data.
        #
        # For tables that are not DBF files the issue could be
        # different, although copying the data into sqlite first will
        # likely always be slower than simply querying the non-sqlite
        # table directly. Currently only DBF files and memory tables
        # are used as the underlying non-sqlite table, though.
        return self._source_table().ValueRange(col)

    def UniqueValues(self, col):
        # The performance trade-offs are basically the same as for
        # ValueRange except that currently there doesn't seem to be a
        # way to take advantage of indexes in sqlite for this query.
        # It's still faster to query the transient table if it already
        # exists.
        return self._source_table().UniqueValues(col)

    def SimpleQuery(self, left, comparison, right):
        if self.t_table is None:
            self.copy_to_transient()
        transient = self.t_table
        # The query has to be run with the column objects of the
        # transient table. The left argument is always a column object
        # so we can simply ask the transient table for its counterpart.
        # The right argument may be a plain value which is passed
        # through unchanged.
        right_is_column = hasattr(right, "name")
        left_column = transient.Column(left.name)
        if right_is_column:
            right = transient.Column(right.name)
        return transient.SimpleQuery(left_column, comparison, right)

    def Dependencies(self):
        """Return a tuple containing the original table"""
        return (self.table,)

    def Width(self, col):
        return self.table.Width(col)
# Name	Value
# svn:eol-style	native
# svn:keywords	Author Date Id Revision