#!python

"""
This runs a K-means algorithm on some text/plain data.
Usage: g_kmeans [flags] K <data >output
where flags are:

-- : end of flags

-header AAA BBB : add AAA=BBB to the header.

-clip .### : use a trimmed mean instead of a regular mean in the algorithm.
This makes the algorithm robust against wild data.
	
-uidname NNN : this just changes the name of the output column that
contains the name of each datum.   This is intended to make it easier
to feed the output into other programs.

Input format:
C{N} numeric columns separated by white space, a hash mark, and a name for each datum.
The hash mark and the name are optional.

Output format:

L{fiatio} format on the standard output.
"""

import sys
from gmisclib import die
from gmisclib import chunkio
from gmisclib import fiatio
from g_classifiers import g_kmeans

def parse_args(argv):
	"""Parse the command-line flags of g_kmeans.

	Flags are consumed from the front of C{argv} until the first word that
	does not start with '-', or until a literal '--' (which is itself
	consumed).

	@param argv: the command-line words, without the program name.
		The list is not modified.
	@return: C{(hdrs, clip, uidname, rest)} where C{hdrs} is a dict of the
		extra header items given with -header, C{clip} is the float given
		with -clip (or None), C{uidname} is the name given with -uidname
		(default 'uid') and C{rest} is the list of words left over after
		the flags.
	@raise ValueError: on an unrecognized flag, a flag that is missing
		its argument(s), or a -clip value that is not a number.
	"""
	arglist = list(argv)
	hdrs = {}
	clip = None
	uidname = 'uid'
	while arglist and arglist[0].startswith('-'):
		arg = arglist.pop(0)
		if arg == '--':
			break
		elif arg == '-header':
			# Check first, so that a truncated command line gives a
			# readable message rather than an IndexError from pop().
			if len(arglist) < 2:
				raise ValueError('Flag -header needs two arguments: a name and a value')
			k = arglist.pop(0)
			v = arglist.pop(0)
			hdrs[k] = v
		elif arg == '-uidname':
			if not arglist:
				raise ValueError('Flag -uidname needs an argument')
			uidname = arglist.pop(0)
		elif arg == '-clip':
			if not arglist:
				raise ValueError('Flag -clip needs an argument')
			text = arglist.pop(0)
			try:
				clip = float(text)
			except ValueError:
				raise ValueError('Flag -clip needs a number, not %r' % text)
		else:
			raise ValueError('Unrecognized flag: %s' % arg)
	return (hdrs, clip, uidname, arglist)


def main(argv):
	"""Cluster the data on the standard input and write the result,
	via a L{fiatio} writer, to the standard output.

	@param argv: the command-line words, without the program name.
	"""
	try:
		hdrs, clip, uidname, rest = parse_args(argv)
	except ValueError as x:
		# Report bad flags as a one-line message, not a traceback.
		# NOTE(review): die.die() is assumed not to return; the usage
		# check just below has always relied on that.
		die.die(str(x))
	if not rest:
		if __doc__:	# The docstring is None under "python -OO".
			sys.stderr.write(__doc__)
		die.die("Usage: g_kmeans [flags] number_of_classes <data >output")
	try:
		ncl = int(rest[0])
	except ValueError:
		die.die('number_of_classes must be an integer, not %r' % rest[0])

	d = g_kmeans.read_data(sys.stdin)
	# The second item of the result is not needed here.
	clusters, _map_to_cluster, err = g_kmeans.kmeans(d, ncl,
						distfcn=g_kmeans.euclid, clip=clip)

	w = fiatio.writer(sys.stdout)
	# Header: user-supplied items first, then the run parameters and
	# a center and a size for each cluster.
	w.headers(hdrs)
	w.header('Ncl', ncl)
	if clip is not None:
		w.header('Clip', clip)
	w.header('Error', err)
	for (clnum, descr) in enumerate(clusters):
		w.header('cluster%d' % clnum,
			chunkio.chunkstring_w().write_NumArray(descr.center, str).close()
			)
		w.header('size%d' % clnum, descr.size)
	# Data: one line per datum, giving its cluster number and its name.
	for (clnum, descr) in enumerate(clusters):
		for datum_uid in descr.membership:
			w.datum({'clnum': clnum, uidname: datum_uid})
	w.flush()


if __name__ == '__main__':
	main(sys.argv[1:])
