Python CSV Normalizer (norm.py)
From SnOwy - Ed's Wiki Notebook
Contents |
Abstract
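Reads comma-separated numeric rows from standard input and rescales each column independently (min-max normalization) so that the column's minimum maps to 0.2 and its maximum maps to 0.8; the output range is therefore [0.2, 0.8].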
Usage
Unix
cat input.txt | python norm.py > output.txt
Windows
see normwin.py instead.
Example
Input File
1011.02, 21, 0.003121, 0.1
1320.02, 34, 0.013400, 0.8
1993.04, 45, 0.031001, 0.2
1321.04, 31, 0.003999, 0.6
2100.11, 39, 0.020310, 0.3
Output File
Spaces have been added for clarity-- they don't exist in the actual output.
0.2, 0.2, 0.2, 0.2
0.37023386497, 0.525, 0.421212338594, 0.8
0.74101313941, 0.8, 0.8, 0.285714285714
0.370795802, 0.45, 0.218895265423, 0.628571428571
0.8, 0.65, 0.569921090387, 0.371428571429
Code
import sys
import math
# NOTE(review): disabled "-r" flag kept from the original; nothing below uses it.
#if len(sys.argv) > 1 and sys.argv[1] == "-r": rank = True
#else: rank = False

# Each column is mapped linearly onto [RANGE_LOW, RANGE_LOW + RANGE_SPAN],
# i.e. [0.2, 0.8].
RANGE_LOW = 0.2
RANGE_SPAN = 0.6


def parse_rows(stream):
    """Parse comma-separated numeric lines into a list of float rows.

    stream is any iterable of text lines (for example sys.stdin).  Every
    line must have the same number of comma-separated fields as the first
    line, and every field must be accepted by float().

    Exits via sys.exit() with an error message when a line has the wrong
    number of fields or holds a non-numeric field.  The line numbers in
    those messages are zero-based.
    """
    rows = []
    cardinality = None
    for iline, line in enumerate(stream):
        fields = line.split(',')
        if cardinality is None:
            cardinality = len(fields)
        elif cardinality != len(fields):
            sys.exit("Error: Line " + str(iline) + " has " +
                     str(len(fields)) + " elements, expected " +
                     str(cardinality) + ".")
        try:
            rows.append([float(field) for field in fields])
        except ValueError:
            sys.exit("Error: Line " + str(iline) + " contains non-number.")
    return rows


def _column_bounds(rows):
    """Return (min_vals, max_vals), the per-column extremes of rows."""
    min_vals = list(rows[0])
    max_vals = list(rows[0])
    for values in rows:
        for i, val in enumerate(values):
            # A NaN bound is replaced by whatever value comes next, so a
            # NaN in the first row does not lock the bound to NaN.
            if math.isnan(min_vals[i]) or val < min_vals[i]:
                min_vals[i] = val
            if math.isnan(max_vals[i]) or val > max_vals[i]:
                max_vals[i] = val
    return min_vals, max_vals


def _scale(val, low, high):
    """Map val from [low, high] onto [RANGE_LOW, RANGE_LOW + RANGE_SPAN]."""
    span = high - low
    if span == 0:
        # Constant column: the linear map is undefined (it used to raise
        # ZeroDivisionError), so place the value at the middle of the
        # output range.  A NaN value stays NaN.
        if math.isnan(val):
            return val
        return RANGE_LOW + RANGE_SPAN / 2
    return RANGE_LOW + ((val - low) / span) * RANGE_SPAN


def normalize(rows):
    """Return a new list of rows with every column rescaled to [0.2, 0.8].

    rows is a list of equal-length lists of floats.  Each column is scaled
    independently: its minimum maps to 0.2 and its maximum to 0.8.  A
    column whose values are all equal maps to 0.5.  An empty input yields
    an empty list.  The input rows are not modified.
    """
    if not rows:
        return []
    min_vals, max_vals = _column_bounds(rows)
    return [
        [_scale(val, low, high)
         for val, low, high in zip(values, min_vals, max_vals)]
        for values in rows
    ]


def main():
    """Normalize CSV rows read from stdin and write them to stdout."""
    for values in normalize(parse_rows(sys.stdin)):
        # sys.stdout.write works on both Python 2 and Python 3, unlike the
        # print statement.
        sys.stdout.write(",".join(str(val) for val in values) + "\n")


if __name__ == "__main__":
    main()