# author: Sascha L. Teichmann ([email protected])
#
6 |
import sys |
import sys |
7 |
|
import re |
8 |
import codecs |
import codecs |
9 |
|
|
10 |
# Column separator used when splitting input lines and joining output lines.
SEP = '\t'

# Matches file names of the form "...ka_<id>_db.csv" and captures <id>
# (lower-case letters and digits). The dot before "csv" is escaped so that
# only a literal ".csv" suffix is accepted.
FKZ = re.compile(r".*ka_([0-9a-z]+)_db\.csv$")
13 |
|
|
14 |
|
class CSV:
    """In-memory contents of one SEP-separated text file.

    The first line of the file is the header row; one leading '#' is removed
    from the first header name. Every following non-empty line becomes a
    data row.

    Attributes:
        fkz: the value passed to the constructor, stored unchanged
            (may be None).
        headers: list of column names in file order.
        indexed_headers: dict mapping column name -> column index. When a
            name occurs more than once the last occurrence wins.
        rows: list of rows, each a list of cell strings. Rows are stored as
            read; they are neither padded nor truncated to the header length.
    """

    def __init__(self, f, fkz=None):
        """Read the header line and all data rows from *f*.

        Args:
            f: open text file-like object supporting readline() and
                iteration over lines.
            fkz: optional identifier kept on the instance for the caller.
        """
        self.fkz = fkz

        header = self._strip_newlines(f.readline())
        # An empty header line means "no columns"; splitting it would
        # otherwise produce a single phantom column named ''.
        names = header.split(SEP) if header else []
        if names:
            names[0] = names[0].replace('#', '', 1)

        self.headers = names
        self.indexed_headers = dict(
            (name, index) for index, name in enumerate(names))

        rows = []
        for raw in f:
            raw = self._strip_newlines(raw)
            if not raw:
                # Blank lines carry no data.
                continue
            rows.append(raw.split(SEP))
        self.rows = rows

    @staticmethod
    def _strip_newlines(text):
        """Return *text* with every CR and LF character removed."""
        return text.replace('\r', '').replace('\n', '')

    def find(self, header):
        """Return the column index of *header*, or -1 if it is unknown."""
        return self.indexed_headers.get(header, -1)
33 |
|
|
34 |
def main():
    """Merge the files named on the command line into one table on stdout.

    Each argument is opened as latin1 text and parsed with CSV. If the file
    name matches FKZ, the captured identifier (upper-cased) is attached to
    that file's CSV as its fkz value.

    The output header is the union of all input headers. Columns are ordered
    by the largest index at which a header occurs in any input file; headers
    with the same index are ordered by name so the result is deterministic.
    Rows are written file by file, files with more columns first. A cell is
    left empty when the file lacks the column or the row is too short. In a
    column named 'fkz' the file's fkz value (when set) replaces the cell.

    Progress messages go to stderr; the merged table is written latin1
    encoded with CRLF line endings.
    """
    csvs = []
    for arg in sys.argv[1:]:
        sys.stderr.write("file: %s\n" % arg)

        m = FKZ.match(arg)
        if m:
            fkz = m.group(1).upper()
        else:
            fkz = None

        f = codecs.open(arg, "r", "latin1")
        try:
            csvs.append(CSV(f, fkz))
        finally:
            # Best effort: a failing close of a fully read input file must
            # not abort the merge.
            try:
                f.close()
            except (IOError, OSError):
                pass

    # Widest files first; the sort is stable, so files with the same number
    # of columns keep their command-line order.
    csvs.sort(key=lambda table: len(table.headers), reverse=True)

    headers = set()
    for table in csvs:
        headers.update(table.headers)

    # Sorting (position, name) pairs orders by position first and breaks
    # ties by name instead of by set iteration order.
    ranked = sorted(
        (max(table.find(header) for table in csvs), header)
        for header in headers)
    order = [header for _, header in ranked]

    sys.stderr.write("headers: %d\n" % len(headers))

    out = codecs.getwriter("latin1")(sys.stdout)
    try:
        out.write("#%s\r\n" % SEP.join(order))
        for table in csvs:
            # Resolve every output column once per file, not once per row.
            columns = [table.find(header) for header in order]
            for row in table.rows:
                cells = []
                for header, idx in zip(order, columns):
                    if header == 'fkz' and table.fkz:
                        cells.append(table.fkz)
                    elif 0 <= idx < len(row):
                        cells.append(row[idx])
                    else:
                        # Column missing in this file, or the row is
                        # shorter than the header line.
                        cells.append('')
                out.write("%s\r\n" % SEP.join(cells))
    finally:
        try:
            out.close()
        except (IOError, OSError):
            pass
88 |
|
|
89 |
# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()