# Copyright (c) 2001, 2002, 2003 by Intevation GmbH
# Authors:
# Bernhard Herzog <[email protected]>
# Jan-Oliver Wagner <[email protected]>
# Frank Koormann <[email protected]>
#
# This program is free software under the GPL (>=v2)
# Read the file COPYING coming with Thuban for details.

"""
Classes for handling tables of data.
"""

__version__ = "$Revision$"

import os
import inspect
import warnings

from base import TitledObject

from Thuban import internal_from_unicode, unicode_from_internal

import dbflib

# Field types a Table instance can report for its columns.
FIELDTYPE_INT = "int"
FIELDTYPE_STRING = "string"
FIELDTYPE_DOUBLE = "double"


# Translation from dbflib's field type constants to our constants.
dbflib_fieldtypes = {
    dbflib.FTString: FIELDTYPE_STRING,
    dbflib.FTInteger: FIELDTYPE_INT,
    dbflib.FTDouble: FIELDTYPE_DOUBLE,
    }


class DBFColumn:

    """Description of a single column of a DBFTable

    Public instance attributes:

    name -- the column's name
    type -- the column's type; one of FIELDTYPE_STRING, FIELDTYPE_INT
            or FIELDTYPE_DOUBLE
    index -- the position of the column in the table
    width -- width of the data stored in the column
    prec -- precision of the data (only meaningful when type is
            FIELDTYPE_DOUBLE)
    """

    def __init__(self, name, type, width, prec, index):
        self.index = index
        self.name = name
        self.type = type
        self.prec = prec
        self.width = width


class DBFTable(TitledObject):

    """
    Table interface for the data in a DBF file
    """

    # Implementation strategy regarding writing to a DBF file:
    #
    # Most of the time Thuban only needs to read from a table and it is
    # important that Thuban can work with read-only files. Therefore the
    # DBF file is opened only for reading initially. Only when
    # write_record is called we try to open the DBF file for writing as
    # well. If that succeeds the read/write DBF file will be used for
    # all IO afterwards.
    #
    # It's important to use the same DBF file object for both reading
    # and writing to make sure that reading a records after writing
    # returns the new values. With two separate objects this wouldn't
    # work because a DBF file object buffers some data

    def __init__(self, filename):
        """Open the DBF file filename read-only and read its column info

        filename -- name of the DBF file; stored as an absolute path.
        """
        self.filename = os.path.abspath(filename)

        # Omit the extension in the title as it's not really needed and
        # it can be confusing because dbflib removes extensions and
        # appends some variations of '.dbf' before it tries to open the
        # file. So the title could be e.g. myshapefile.shp when the real
        # filename is myshapefile.dbf
        title = os.path.splitext(os.path.basename(self.filename))[0]
        TitledObject.__init__(self, title)

        self.dbf = dbflib.open(filename, return_unicode = True)

        # If true, self.dbf is open for writing.
        self._writable = 0

        # Create the column information objects.  column_map maps both
        # the column name and the column index to the DBFColumn object.
        self.columns = []
        self.column_map = {}
        for i in range(self.NumColumns()):
            ftype, name, width, prec = self.dbf.field_info(i)
            name = internal_from_unicode(name)
            ftype = dbflib_fieldtypes[ftype]
            index = len(self.columns)

            col = DBFColumn(name, ftype, width, prec, index)
            self.columns.append(col)
            self.column_map[name] = col
            self.column_map[index] = col

    def NumRows(self):
        """Return the number of rows in the table"""
        return self.dbf.record_count()

    def NumColumns(self):
        """Return the number of columns in the table"""
        return self.dbf.field_count()

    def Columns(self):
        """Return the table's column definitions

        The return value is a sequence of DBFColumn instances, one for
        each column.
        """
        return self.columns

    def Column(self, col):
        """Return information about the column given by its name or index

        The returned object is an instance of DBFColumn
        """
        return self.column_map[col]

    def HasColumn(self, col):
        """Return whether the table has a column with the given name or index
        """
        # `in` instead of the deprecated dict.has_key (removed in Py3)
        return col in self.column_map

    def RowIdToOrdinal(self, gid):
        """Return the row ordinal given its id

        Since for DBFTables the row id is the row number, return the
        value unchanged.
        """
        return gid

    def RowOrdinalToId(self, num):
        """Return the rowid for given its ordinal

        Since for DBFTables the row id is the row number, return the
        value unchanged.
        """
        return num

    def ReadRowAsDict(self, row, row_is_ordinal = 0):
        """Return the entire row as a dictionary with column names as keys

        The row_is_ordinal is ignored for DBF tables because the row id
        is always the row number.
        """
        return self.dbf.read_record(row)

    def ReadValue(self, row, col, row_is_ordinal = 0):
        """Return the value of the specified row and column

        The col parameter may be the index of the column or its name.

        The row_is_ordinal is ignored for DBF tables because the row id
        is always the row number.
        """
        return self.dbf.read_attribute(row, self.column_map[col].index)

    def ValueRange(self, col):
        """Return the minimum and maximum values of the values in the column

        The return value is a tuple (min, max) unless the table is empty
        in which case the return value is None.
        """
        count = self.NumRows()

        if count == 0:
            return None

        # Local names chosen so as not to shadow the min/max builtins
        minval = maxval = self.ReadValue(0, col)
        for i in range(1, count):
            value = self.ReadValue(i, col)
            if value < minval:
                minval = value
            elif value > maxval:
                maxval = value

        return (minval, maxval)

    def UniqueValues(self, col):
        """Return a sorted list of all unique values in the column col"""
        # A dict is used as a set of the values seen so far
        seen = {}

        for i in range(self.NumRows()):
            seen[self.ReadValue(i, col)] = 0

        # sorted() instead of dict.keys(); .sort() -- works on both
        # Python 2 and 3 and avoids shadowing the dict builtin
        return sorted(seen)

    def Dependencies(self):
        """Return an empty sequence. The DBFTable doesn't depend on anything"""
        return ()

    # DBF specific interface parts.

    def Width(self, col):
        """Return column width"""
        return self.column_map[col].width

    def Destroy(self):
        """Close the underlying DBF file and drop the reference to it"""
        self.dbf.close()
        self.dbf = None

    def write_record(self, record, values):
        """Write the values into the record

        The values parameter may either be a dictionary or a sequence.

        If it's a dictionary the keys must be the names of the fields
        and their value must have a suitable type. Only the fields
        actually contained in the dictionary are written. Fields for
        which there's no item in the dict are not modified.

        If it's a sequence, all fields must be present in the right
        order.
        """
        if not self._writable:
            # Lazily reopen read/write and replace the read-only file
            # object so that subsequent reads see the new data (a DBF
            # file object buffers some data).
            new_dbf = dbflib.open(self.filename, "r+b", return_unicode = True)
            self.dbf.close()
            self.dbf = new_dbf
            self._writable = 1
        self.dbf.write_record(record, values)
        self.dbf.commit()

    def FileName(self):
        """Return the filename the DBFTable was instantiated with"""
        return self.filename


class MemoryColumn:

    """Description of a column of a MemoryTable

    Public instance attributes: name, type and index.
    """

    def __init__(self, name, type, index):
        self.index = index
        self.name = name
        self.type = type
class MemoryTable(TitledObject):

    """Very simple table implementation that operates on a list of tuples"""

    def __init__(self, fields, data):
        """Initialize the MemoryTable

        Parameters:
        fields -- List of (name, field_type) pairs
        data -- List of tuples, one for each row of data
        """
        self.data = data
        title = 'MemoryTable'
        TitledObject.__init__(self, title)

        # Create the column information objects.  column_map maps both
        # the column name and the column index to the MemoryColumn.
        self.columns = []
        self.column_map = {}
        for name, ftype in fields:
            index = len(self.columns)
            col = MemoryColumn(name, ftype, index)
            self.columns.append(col)
            self.column_map[name] = col
            self.column_map[index] = col

    def NumColumns(self):
        """Return the number of columns in the table"""
        return len(self.columns)

    def Column(self, col):
        """Return information about the column given by its name or index

        The returned object is an instance of MemoryColumn.
        """
        return self.column_map[col]

    def Columns(self):
        """Return the table's column definitions

        The return value is a sequence of MemoryColumn instances, one
        for each column.
        """
        return self.columns

    def HasColumn(self, col):
        """Return whether the table has a column with the given name or index
        """
        # `in` instead of the deprecated dict.has_key (removed in Py3)
        return col in self.column_map

    def NumRows(self):
        """Return the number of rows in the table"""
        return len(self.data)

    def RowIdToOrdinal(self, gid):
        """Return the row ordinal given its id

        Since for MemoryTables the row id is the row number, return the
        value unchanged.
        """
        return gid

    def RowOrdinalToId(self, num):
        """Return the rowid for given its ordinal

        Since for MemoryTables the row id is the row number, return the
        value unchanged.
        """
        return num

    def ReadValue(self, row, col, row_is_ordinal = 0):
        """Return the value of the specified row and column

        The col parameter may be the index of the column or its name.

        The row_is_ordinal is ignored for MemoryTables because the row
        id is always the row number.
        """
        return self.data[row][self.column_map[col].index]

    def ReadRowAsDict(self, index, row_is_ordinal = 0):
        """Return the entire row as a dictionary with column names as keys

        The row_is_ordinal is ignored for MemoryTables because the row
        id is always the row number.
        """
        return dict([(col.name, self.data[index][col.index])
                     for col in self.columns])

    def ValueRange(self, col):
        """Return the minimum and maximum values of the values in the column

        The return value is a tuple (min, max) unless the table is empty
        in which case the return value is None.
        """
        index = self.column_map[col].index
        values = [row[index] for row in self.data]
        if not values:
            return None

        return min(values), max(values)

    def UniqueValues(self, col):
        """Return a sorted list of all unique values in the column col

        col can be either column index or name.
        """
        # A dict is used as a set of the values seen so far
        seen = {}

        for i in range(self.NumRows()):
            seen[self.ReadValue(i, col)] = 0

        # sorted() instead of dict.keys(); .sort() -- works on both
        # Python 2 and 3 and avoids shadowing the dict builtin
        return sorted(seen)

    def Width(self, col):
        """Return the maximum width of values in the column

        The return value is the maximum length of the string
        representation of the values in the column (given by index or
        name).  Return None if the table has no rows.
        """
        column = self.column_map[col]
        values = [row[column.index] for row in self.data]
        if not values:
            return None

        # Format chosen by column type; doubles use a fixed precision
        # of 12 to match what table_to_dbf writes.
        if column.type == FIELDTYPE_DOUBLE:
            format = "%.12f"
        elif column.type == FIELDTYPE_INT:
            format = "%d"
        else:
            format = "%s"
        # max() over the formatted lengths; `values` is non-empty here
        return max([len(format % value) for value in values])

    def Dependencies(self):
        """Return an empty sequence. The MemoryTable doesn't depend on anything
        """
        return ()

    def write_record(self, record, values):
        """Replace row number record with the tuple values"""
        # TODO: Check for correct lenght and perhaps also
        # for correct types in case values is a tuple. How to report problems?
        # TODO: Allow values to be a dictionary and write the single
        # fields that are specified.
        self.data[record] = values



def _find_dbf_column_names(names): |
412 |
"""Determine the column names to use in a DBF file |
413 |
|
414 |
DBF files have a length limit of 10 characters on the column names |
415 |
so when writing an arbitrary Thuban table to a DBF file we may have |
416 |
we may have to rename some of the columns making sure that they're |
417 |
unique in the DBF file too. |
418 |
|
419 |
Names that are already short enough will stay the same. Longer names |
420 |
will be truncated to 10 characters or if that isn't unique it will |
421 |
be truncated more and filled up with digits. |
422 |
|
423 |
The parameter names should be a list of the column names. The return |
424 |
value will be a dictionary mapping the names in the input list to |
425 |
the names to use in the DBF file. |
426 |
""" |
427 |
# mapping from the original names in table to the names in the DBF |
428 |
# file |
429 |
name_map = {} |
430 |
|
431 |
# First, we keep all names that are already short enough |
432 |
for i in range(len(names) - 1, -1, -1): |
433 |
if len(names[i]) <= 10: |
434 |
name_map[names[i]] = names[i] |
435 |
del names[i] |
436 |
|
437 |
# dict used as a set of all names already used as DBF column names |
438 |
used = name_map.copy() |
439 |
|
440 |
# Go through all longer names. If the name truncated to 10 |
441 |
# characters is not used already, we use that. Otherwise we truncate |
442 |
# it more and append numbers until we get an unused name |
443 |
for name in names: |
444 |
truncated = name[:10] |
445 |
num = 0; numstr = "" |
446 |
#print "truncated", truncated, num |
447 |
while truncated in used and len(numstr) < 10: |
448 |
num += 1 |
449 |
numstr = str(num) |
450 |
truncated = name[:10 - len(numstr)] + numstr |
451 |
#print "truncated", truncated, num |
452 |
if len(numstr) >= 10: |
453 |
# This case should never happen in practice as tables with |
454 |
# 10^10 columns seem very unlikely :) |
455 |
raise ValueError("Can't find unique dbf column name") |
456 |
|
457 |
name_map[name] = truncated |
458 |
used[truncated] = 1 |
459 |
|
460 |
return name_map |
def table_to_dbf(table, filename, rows = None):
    """Create the dbf file filename from the table.

    If rows is not None (the default) then it must be a list of row
    indices to be saved to the file, otherwise all rows are saved.
    """
    dbf = dbflib.create(filename, code_page = dbflib.LDID_ESRI_ANSI,
                        return_unicode = True)

    # Map our field type constants back to dbflib's constants.  Named
    # differently from the module-level dbflib_fieldtypes to avoid
    # shadowing it.
    dbf_fieldtypes = {FIELDTYPE_STRING: dbflib.FTString,
                      FIELDTYPE_INT: dbflib.FTInteger,
                      FIELDTYPE_DOUBLE: dbflib.FTDouble}

    name_map = _find_dbf_column_names([col.name for col in table.Columns()])

    # Initialise the header. Distinguish between DBFTable and others:
    # only DBFColumn objects carry a precision, so fall back to 12 for
    # doubles from other table types.
    for col in table.Columns():
        width = table.Width(col.name)
        if col.type == FIELDTYPE_DOUBLE:
            prec = getattr(col, "prec", 12)
        else:
            prec = 0
        dbf.add_field(unicode_from_internal(name_map[col.name]),
                      dbf_fieldtypes[col.type],
                      width, prec)

    if rows is None:
        rows = range(table.NumRows())

    # Records are written consecutively even when rows is a sparse
    # selection, hence the separate record counter via enumerate.
    for rec_num, i in enumerate(rows):
        record = {}
        for key, value in table.ReadRowAsDict(i).items():
            record[name_map[key]] = value
        dbf.write_record(rec_num, record)
    dbf.close()
def table_to_csv(table, filename, rows = None):
    """Export table to csv file.

    If rows is not None (the default) then it must be a list of row
    indices to be saved to the file, otherwise all rows are saved.

    The first line of the file is a '#'-prefixed header listing the
    column names; each following line holds one row, comma separated.
    """
    columns = table.Columns()
    # Avoid shadowing the `file` builtin; close the file even if
    # reading a row raises.
    out = open(filename, "w")
    try:
        if columns:
            out.write("#" + ",".join([col.name for col in columns]) + "\n")

        if rows is None:
            rows = range(table.NumRows())

        for i in rows:
            record = table.ReadRowAsDict(i)
            if len(record):
                out.write(",".join(["%s" % record[col.name]
                                    for col in columns]) + "\n")
    finally:
        out.close()