/[thuban]/branches/WIP-pyshapelib-Unicode/thuban/Thuban/Model/table.py
ViewVC logotype

Contents of /branches/WIP-pyshapelib-Unicode/thuban/Thuban/Model/table.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2839 - (show annotations)
Thu Feb 14 00:19:14 2008 UTC (17 years ago) by bernhard
File MIME type: text/x-python
File size: 16586 byte(s)
* Thuban/Model/table.py: Using internal_from_unicode() when reading
dbflib column names and unicode_from_internal() when writing them.

* test/test_load.py(TestNonAsciiColumnName): Applied change rev2838 from
  test/test_load_1_0.py(TestNonAsciiColumnName).

1 # Copyright (c) 2001, 2002, 2003 by Intevation GmbH
2 # Authors:
3 # Bernhard Herzog <[email protected]>
4 # Jan-Oliver Wagner <[email protected]>
5 # Frank Koormann <[email protected]>
6 #
7 # This program is free software under the GPL (>=v2)
8 # Read the file COPYING coming with Thuban for details.
9
10 """
11 Classes for handling tables of data.
12 """
13
14 __version__ = "$Revision$"
15
16 import os
17 import inspect
18 import warnings
19
20 from base import TitledObject
21
22 from Thuban import internal_from_unicode, unicode_from_internal
23
24 import dbflib
25
# The field types supported by a Table instance. Column objects carry
# one of these strings in their "type" attribute.
FIELDTYPE_INT = "int"
FIELDTYPE_STRING = "string"
FIELDTYPE_DOUBLE = "double"


# Map the dbflib constants for the field types to our constants.
# DBFTable.__init__ uses this to translate the type reported by
# dbf.field_info() into one of the FIELDTYPE_* values.
dbflib_fieldtypes = {dbflib.FTString: FIELDTYPE_STRING,
                     dbflib.FTInteger: FIELDTYPE_INT,
                     dbflib.FTDouble: FIELDTYPE_DOUBLE}
36
37
class DBFColumn:

    """Description of a single column of a DBFTable

    A DBFColumn is a plain record. Its public attributes are:

    name -- Name of the column
    type -- Type of the column (one of FIELDTYPE_STRING, FIELDTYPE_INT
            or FIELDTYPE_DOUBLE)
    index -- The index of the column
    width -- The width of the data in the column
    prec -- The precision of the data (only valid for
            type == FIELDTYPE_DOUBLE)
    """

    def __init__(self, name, type, width, prec, index):
        # Store the constructor arguments unchanged under the same names.
        self.name, self.type = name, type
        self.width, self.prec = width, prec
        self.index = index
58
59
class DBFTable(TitledObject):

    """
    Table interface for the data in a DBF file

    Columns can be addressed either by their name or by their index;
    both kinds of keys are stored in self.column_map. The row id of a
    DBFTable is always the row number.
    """

    # Implementation strategy regarding writing to a DBF file:
    #
    # Most of the time Thuban only needs to read from a table and it is
    # important that Thuban can work with read-only files. Therefore the
    # DBF file is opened only for reading initially. Only when
    # write_record is called we try to open the DBF file for writing as
    # well. If that succeeds the read/write DBF file will be used for
    # all IO afterwards.
    #
    # It's important to use the same DBF file object for both reading
    # and writing to make sure that reading a record after writing
    # returns the new values. With two separate objects this wouldn't
    # work because a DBF file object buffers some data

    def __init__(self, filename):
        """Open the DBF file filename for reading and index its columns

        The absolute version of filename is remembered and returned by
        FileName().
        """
        self.filename = os.path.abspath(filename)

        # Omit the extension in the title as it's not really needed and
        # it can be confusing because dbflib removes extensions and
        # appends some variations of '.dbf' before it tries to open the
        # file. So the title could be e.g. myshapefile.shp when the real
        # filename is myshapefile.dbf
        title = os.path.splitext(os.path.basename(self.filename))[0]
        TitledObject.__init__(self, title)

        self.dbf = dbflib.open(filename, return_unicode = True)

        # If true, self.dbf is open for writing.
        self._writable = 0

        # Create the column information objects. Each column is
        # registered in column_map under its name and under its index.
        self.columns = []
        self.column_map = {}
        for i in range(self.NumColumns()):
            ftype, name, width, prec = self.dbf.field_info(i)
            # dbflib hands out unicode names; Thuban works with its
            # internal string representation.
            name = internal_from_unicode(name)
            ftype = dbflib_fieldtypes[ftype]
            index = len(self.columns)

            col = DBFColumn(name, ftype, width, prec, index)
            self.columns.append(col)
            self.column_map[name] = col
            self.column_map[index] = col

    def NumRows(self):
        """Return the number of rows in the table"""
        return self.dbf.record_count()

    def NumColumns(self):
        """Return the number of columns in the table"""
        return self.dbf.field_count()

    def Columns(self):
        """Return the table's column definitions

        The return value is a sequence of DBFColumn instances, one for
        each column.
        """
        return self.columns

    def Column(self, col):
        """Return information about the column given by its name or index

        The returned object is an instance of DBFColumn. A KeyError is
        raised if there is no such column.
        """
        return self.column_map[col]

    def HasColumn(self, col):
        """Return whether the table has a column with the given name or index
        """
        # "in" instead of the has_key() method, which no longer exists
        # in Python 3.
        return col in self.column_map

    def RowIdToOrdinal(self, gid):
        """Return the row ordinal given its id

        Since for DBFTables the row id is the row number, return the
        value unchanged.
        """
        return gid

    def RowOrdinalToId(self, num):
        """Return the row id given its ordinal

        Since for DBFTables the row id is the row number, return the
        value unchanged.
        """
        return num

    def ReadRowAsDict(self, row, row_is_ordinal = 0):
        """Return the entire row as a dictionary with column names as keys

        The row_is_ordinal is ignored for DBF tables because the row id
        is always the row number.
        """
        # NOTE(review): the record is returned exactly as dbflib
        # produces it. Its keys are not passed through
        # internal_from_unicode() the way the column names are in
        # __init__ -- confirm that they match the DBFColumn names.
        return self.dbf.read_record(row)

    def ReadValue(self, row, col, row_is_ordinal = 0):
        """Return the value of the specified row and column

        The col parameter may be the index of the column or its name.

        The row_is_ordinal is ignored for DBF tables because the row id
        is always the row number.
        """
        return self.dbf.read_attribute(row, self.column_map[col].index)

    def ValueRange(self, col):
        """Return the minimum and maximum values of the values in the column

        The return value is a tuple (min, max) unless the table is empty
        in which case the return value is None.
        """
        count = self.NumRows()

        if count == 0:
            return None

        # Single pass over the column; the first value seeds both ends
        # of the range.
        smallest = largest = self.ReadValue(0, col)
        for i in range(1, count):
            value = self.ReadValue(i, col)
            if value < smallest:
                smallest = value
            elif value > largest:
                largest = value

        return (smallest, largest)

    def UniqueValues(self, col):
        """Return a sorted list of all unique values in the column col"""
        # A dictionary is used as a set of the values seen so far.
        seen = {}

        for i in range(self.NumRows()):
            seen[self.ReadValue(i, col)] = 0

        # list() so that sorting in place also works where keys() does
        # not return a list.
        values = list(seen.keys())
        values.sort()
        return values

    def Dependencies(self):
        """Return an empty sequence. The DBFTable doesn't depend on anything"""
        return ()

    # DBF specific interface parts.

    def Width(self, col):
        """Return column width"""
        return self.column_map[col].width

    def Destroy(self):
        """Close the underlying DBF file

        Calling Destroy more than once is harmless.
        """
        if self.dbf is not None:
            self.dbf.close()
            self.dbf = None

    def write_record(self, record, values):
        """Write the values into the record

        The values parameter may either be a dictionary or a sequence.

        If it's a dictionary the keys must be the names of the fields
        and their value must have a suitable type. Only the fields
        actually contained in the dictionary are written. Fields for
        which there's no item in the dict are not modified.

        If it's a sequence, all fields must be present in the right
        order.
        """
        if not self._writable:
            # Open the read/write handle first. If that raises, the
            # read-only handle stays in place and the table remains
            # usable for reading.
            new_dbf = dbflib.open(self.filename, "r+b", return_unicode = True)
            self.dbf.close()
            self.dbf = new_dbf
            self._writable = 1
        self.dbf.write_record(record, values)
        self.dbf.commit()

    def FileName(self):
        """Return the absolute filename of the DBF file

        This is the filename the DBFTable was instantiated with, made
        absolute with os.path.abspath.
        """
        return self.filename
243
244
class MemoryColumn:

    """Description of a single column of a MemoryTable

    Public attributes:

    name -- Name of the column
    type -- Type of the column
    index -- The index of the column
    """

    def __init__(self, name, type, index):
        # Store the constructor arguments unchanged under the same names.
        self.name, self.type, self.index = name, type, index
251
class MemoryTable(TitledObject):

    """Very simple table implementation that operates on a list of tuples

    Columns can be addressed either by their name or by their index;
    both kinds of keys are stored in self.column_map. The row id of a
    MemoryTable is always the row number.
    """

    def __init__(self, fields, data):
        """Initialize the MemoryTable

        Parameters:
        fields -- List of (name, field_type) pairs
        data -- List of tuples, one for each row of data

        The data list is used directly, not copied.
        """
        self.data = data
        title = 'MemoryTable'
        TitledObject.__init__(self, title)

        # Create the column information objects. Each column is
        # registered in column_map under its name and under its index.
        self.columns = []
        self.column_map = {}
        for name, ftype in fields:
            index = len(self.columns)
            col = MemoryColumn(name, ftype, index)
            self.columns.append(col)
            self.column_map[name] = col
            self.column_map[index] = col

    def NumColumns(self):
        """Return the number of columns in the table"""
        return len(self.columns)

    def Column(self, col):
        """Return information about the column given by its name or index

        The returned object is an instance of MemoryColumn. A KeyError
        is raised if there is no such column.
        """
        return self.column_map[col]

    def Columns(self):
        """Return the table's column definitions

        The return value is a sequence of MemoryColumn instances, one
        for each column.
        """
        return self.columns

    def HasColumn(self, col):
        """Return whether the table has a column with the given name or index
        """
        # "in" instead of the has_key() method, which no longer exists
        # in Python 3.
        return col in self.column_map

    def NumRows(self):
        """Return the number of rows in the table"""
        return len(self.data)

    def RowIdToOrdinal(self, gid):
        """Return the row ordinal given its id

        Since for MemoryTables the row id is the row number, return the
        value unchanged.
        """
        return gid

    def RowOrdinalToId(self, num):
        """Return the row id given its ordinal

        Since for MemoryTables the row id is the row number, return the
        value unchanged.
        """
        return num

    def ReadValue(self, row, col, row_is_ordinal = 0):
        """Return the value of the specified row and column

        The col parameter may be the index of the column or its name.

        The row_is_ordinal is ignored for MemoryTables because the row
        id is always the row number.
        """
        return self.data[row][self.column_map[col].index]

    def ReadRowAsDict(self, index, row_is_ordinal = 0):
        """Return the entire row as a dictionary with column names as keys

        The row_is_ordinal is ignored for MemoryTables because the row
        id is always the row number.
        """
        return dict([(col.name, self.data[index][col.index])
                     for col in self.columns])

    def ValueRange(self, col):
        """Return the minimum and maximum values of the values in the column

        The return value is a tuple (min, max) unless the table is empty
        in which case the return value is None.
        """
        index = self.column_map[col].index
        values = [row[index] for row in self.data]
        if not values:
            return None

        return min(values), max(values)

    def UniqueValues(self, col):
        """Return a sorted list of all unique values in the column col

        col can be either column index or name.
        """
        # A dictionary is used as a set of the values seen so far.
        seen = {}

        for i in range(self.NumRows()):
            seen[self.ReadValue(i, col)] = 0

        # list() so that sorting in place also works where keys() does
        # not return a list.
        values = list(seen.keys())
        values.sort()
        return values

    def Width(self, col):
        """Return the maximum width of values in the column

        The return value is the maximum length of the string
        representation of the values in the column (represented by index
        or name). Doubles are formatted with 12 decimal places for this
        purpose. If the table has no rows the return value is None.
        """
        column = self.column_map[col]
        values = [row[column.index] for row in self.data]
        if not values:
            return None

        if column.type == FIELDTYPE_DOUBLE:
            fmt = "%.12f"
        elif column.type == FIELDTYPE_INT:
            fmt = "%d"
        else:
            fmt = "%s"

        widest = 0
        for value in values:
            length = len(fmt % value)
            if length > widest:
                widest = length

        return widest

    def Dependencies(self):
        """Return an empty sequence. The MemoryTable doesn't depend on anything
        """
        return ()

    def write_record(self, record, values):
        """Replace the row with index record by values

        values is stored as the new row without any checks.
        """
        # TODO: Check for correct length and perhaps also
        # for correct types in case values is a tuple. How to report problems?
        # TODO: Allow values to be a dictionary and write the single
        # fields that are specified.
        self.data[record] = values
408
409
410
411 def _find_dbf_column_names(names):
412 """Determine the column names to use in a DBF file
413
414 DBF files have a length limit of 10 characters on the column names
415 so when writing an arbitrary Thuban table to a DBF file we may have
416 we may have to rename some of the columns making sure that they're
417 unique in the DBF file too.
418
419 Names that are already short enough will stay the same. Longer names
420 will be truncated to 10 characters or if that isn't unique it will
421 be truncated more and filled up with digits.
422
423 The parameter names should be a list of the column names. The return
424 value will be a dictionary mapping the names in the input list to
425 the names to use in the DBF file.
426 """
427 # mapping from the original names in table to the names in the DBF
428 # file
429 name_map = {}
430
431 # First, we keep all names that are already short enough
432 for i in range(len(names) - 1, -1, -1):
433 if len(names[i]) <= 10:
434 name_map[names[i]] = names[i]
435 del names[i]
436
437 # dict used as a set of all names already used as DBF column names
438 used = name_map.copy()
439
440 # Go through all longer names. If the name truncated to 10
441 # characters is not used already, we use that. Otherwise we truncate
442 # it more and append numbers until we get an unused name
443 for name in names:
444 truncated = name[:10]
445 num = 0; numstr = ""
446 #print "truncated", truncated, num
447 while truncated in used and len(numstr) < 10:
448 num += 1
449 numstr = str(num)
450 truncated = name[:10 - len(numstr)] + numstr
451 #print "truncated", truncated, num
452 if len(numstr) >= 10:
453 # This case should never happen in practice as tables with
454 # 10^10 columns seem very unlikely :)
455 raise ValueError("Can't find unique dbf column name")
456
457 name_map[name] = truncated
458 used[truncated] = 1
459
460 return name_map
461
def table_to_dbf(table, filename, rows = None):
    """Create the dbf file filename from the table.

    If rows is not None (the default) then it must be a list of row
    indices to be saved to the file, otherwise all rows are saved. The
    selected rows are written as consecutive records starting at 0.

    Column names are shortened for the DBF file with
    _find_dbf_column_names. The DBF file is closed even if writing
    fails part way through.
    """

    dbf = dbflib.create(filename, code_page = dbflib.LDID_ESRI_ANSI,
                        return_unicode = True)
    try:
        # Maps our field type constants to the dbflib ones, i.e. the
        # reverse direction of the module level dbflib_fieldtypes.
        to_dbflib_type = {FIELDTYPE_STRING: dbflib.FTString,
                          FIELDTYPE_INT: dbflib.FTInteger,
                          FIELDTYPE_DOUBLE: dbflib.FTDouble}

        name_map = _find_dbf_column_names([col.name
                                           for col in table.Columns()])

        # Initialise the header. Distinguish between DBFTable and
        # others: only some column objects have a prec attribute, so
        # 12 is used as the precision of double columns without one.
        for col in table.Columns():
            width = table.Width(col.name)
            if col.type == FIELDTYPE_DOUBLE:
                prec = getattr(col, "prec", 12)
            else:
                prec = 0
            dbf.add_field(unicode_from_internal(name_map[col.name]),
                          to_dbflib_type[col.type],
                          width, prec)

        if rows is None:
            rows = range(table.NumRows())

        recNum = 0
        for i in rows:
            record = {}
            # NOTE(review): assumes the keys of ReadRowAsDict are the
            # same strings as the col.name values name_map was built
            # from -- confirm for tables with non-ASCII column names.
            for key, value in table.ReadRowAsDict(i).items():
                record[name_map[key]] = value
            dbf.write_record(recNum, record)
            recNum += 1
    finally:
        # Release the file handle on errors as well.
        dbf.close()
500
def table_to_csv(table, filename, rows = None):
    """Export table to csv file.

    If rows is not None (the default) then it must be a list of row
    indices to be saved to the file, otherwise all rows are saved.

    The first line of the file is a header consisting of '#' followed
    by the comma separated column names. Each row follows as one line
    of comma separated values in column order. Values are written with
    "%s" formatting; no quoting or escaping is done. Rows for which
    ReadRowAsDict returns an empty record are skipped. The file is
    closed even if writing fails part way through.
    """

    outfile = open(filename, "w")
    try:
        columns = table.Columns()
        if columns:
            header = ",".join(["%s" % col.name for col in columns])
            outfile.write("#" + header + "\n")

        if rows is None:
            rows = range(table.NumRows())

        for i in rows:
            record = table.ReadRowAsDict(i)
            if record:
                line = ",".join(["%s" % record[col.name]
                                 for col in columns])
                outfile.write(line + "\n")
    finally:
        # Release the file handle on errors as well.
        outfile.close()
529

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

[email protected]
ViewVC Help
Powered by ViewVC 1.1.26