#!/usr/bin/env python

"""A classifier that assumes that P is linear in position.
This is known as a (linear) logistic discriminant analysis.


This is a script that can be run from the Linux command line.
Usage: l_classifier [flags] < input_data >log_file
This script also produces two files: classes.chunk and classified.fiat.

Flags:
	- C{-test}	Run some tests.
	- C{-D}	Print extra debug information.  Repeated -D flags increase verbosity.
	- C{-quiet}	Print less.
	- C{-c 0.NN}	Ignore the specified fraction (C{0 <= 0.NN < 0.5}) of the worst classifications.
		See L{q_classifier_r.evaluate_Bayes} for details.  When building
		the classifiers, if C{0.NN > 0}, this essentially says that "nothing is extremely
		improbable, because there's a C{0.NN} chance that it is just a mistake."
		This makes the classifier boundaries less sensitive to points on the
		outskirts of regions.
	- C{-ftest 0.NN}		Use a fraction C{0 < 0.NN < 1} of the data for the test set; the remainder
		is used for training the classifiers.
	- C{-coverage N.N}	This script generates a group of classifiers for a particular test-set/training-set
		pair, but then it samples a new test set and repeats until an average datum appears in a
		test set C{N.N} times.
	- C{-nperdim N.N} This controls how many classifiers are generated per test-set/training-set pair.
		The number is C{N.N} times the number of dimensions in the feature vector.
	- C{-i filename} Take input from the specified file instead of the standard input.

The input data is a multicolumn ASCII file with one line per measurement to be classified.
Columns are separated by whitespace and are:
	- 1:	The correct class (i.e. the 'gold standard').  Obviously, the classification problem
		gets more difficult as the number of distinct classes gets larger.
	- 2 - N+1:	The various components of the feature vector to be used for classification.
		All lines must have the same number of components.
	- *:	An optional hash mark followed by an arbitrary identifier for that measurement.
		(If no identifier is supplied, it will be called "Line:NN" based on the line number in
		this input file.)  Identifiers don't affect the computation, but they do let you
		connect values in the output files to feature vectors in the input file.

The standard output contains miscellaneous progress information and lines
(that are prefixed with "WRONG") that list incorrect classifications.  However,
comprehensive classification information can be found in
C{classes.chunk}. This provides a list of all the classifiers that were generated,
and contains enough information to reconstruct the classifiers so that they could
be applied to another set of data.
(C{classes.chunk} is in a format readable by C{chunkio.py}.)    It is recommended that it
be read in by L{read_classified.read_classes_header} (if you just
want the top few lines of header information), or
L{read_classified.read_classes} for the full description.

The header contains information on classifier performance.  It contains
attribute/value pairs as follows:
	- B{Pcorrect} Average fraction of correct classification
	- B{PSigma} The standard deviation (among the classifiers) of fraction of correct classification.
	- B{total}	Total number of classifications (i.e. number of data
		times number of classifiers); for example, C{3660}.
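	- B{nok}, B{K}, B{KSigma} Further header attributes (sample values from one run:
		C{nok 8}, C{K 0.994231955051}); their meaning is not documented here.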
 	- B{Chance} The overall probability of accidentally making a correct classification.
		(This is an average over all classifiers.)
 	- B{ChSigma} The standard deviation (among the classifiers) of C{Chance}.
	- B{Perfection} How well a perfect classifier could perform on that test set (normally 1.0).
	- B{PerfectionSigma} The standard deviation of C{Perfection} across all classifiers (normally 0).
 	- B{N_per_dim}, B{Ftest}, B{Coverage} Parameters set by command line flags
	- B{classifier_type} "linear_discriminant_classifier" for this program.

@note: A useful general reference is:
	@inbook{webb:spr:logistic,
   	author    = {Andrew Webb},
   	title     = {Statistical Pattern Recognition},
   	pages     = {124--132},
   	year      = {1999},
   	publisher = {Arnold},
   	address   = {London, New York},
   	note = {ISBN 0 340 74164  3}
	}
@note: This code was described in an appendix to
	"Dimensions of durational variation in speech",
	by Anastassia Loukina, Greg Kochanski, Burton Rosner, Chilin Shih, and Elinor Keane,
	submitted 2010 to J. Acoustical Society of America.
"""

from gmisclib import die
from g_classifiers import q_classifier_r as Q
from g_classifiers import l_classifier_guts as QC

if __name__ == '__main__':
	# Command-line driver: parse the flags described in the module docstring,
	# then hand the input stream and the collected settings to QC.go_auto().
	import sys

	def _pop_value(flag, remaining):
		"""Remove and return the value that follows C{flag} on the command line.

		Calls C{die.die} when the flag was the last argument, instead of
		letting C{list.pop} fail with an uninformative C{IndexError}.
		"""
		if not remaining:
			die.die("Flag %s requires a value" % flag)
		return remaining.pop(0)

	Modify = None
	Stdin = sys.stdin
#	evnm = None
	ftrim = None
	Verbose = 1
	arglist = sys.argv[1:]
	while arglist and arglist[0].startswith('-'):
		arg = arglist.pop(0)
		if arg == '--':
			break
		elif arg == '-coverage':
			QC.COVERAGE = float(_pop_value(arg, arglist))
		elif arg == '-nperdim':
			QC.N_PER_DIM = float(_pop_value(arg, arglist))
# 		elif arg == '-evaluator':
# 			evnm = _pop_value(arg, arglist)
		elif arg == '-ftest':
			QC.FTEST = float(_pop_value(arg, arglist))
			# Explicit check rather than assert, so it survives "python -O".
			if not (0.0 < QC.FTEST < 1.0):
				die.die("-ftest requires 0 < value < 1, got %s" % QC.FTEST)
		elif arg == '-c':
			ftrim = float(_pop_value(arg, arglist))
			# Explicit check rather than assert, so it survives "python -O".
			if not (0.0 <= ftrim < 0.5):
				die.die("-c requires 0 <= value < 0.5, got %s" % ftrim)
		elif arg == '-quiet':
			# This must set Verbose (capitalized): that is the name passed
			# to QC.go_auto() below.
			Verbose = 0
		elif arg == '-verbose':
			# NOTE(review): -1 is the value the original code assigned; whether
			# go_auto() treats -1 as "more verbose" is not visible here -- confirm.
			Verbose = -1
		elif arg == '-flatten':
			Modify = Q.default_modify_class
		elif arg == '-i':
			Stdin = open(_pop_value(arg, arglist), 'r')
		elif arg == '-D':
			Q.D += 1
		elif arg == '-test':
			QC.test()
			sys.exit(0)
		elif arg == '-test_2_bias':
			QC.test_2_bias(QC.l_classifier_desc)
			sys.exit(0)
		else:
			die.warn('Unrecognized argument: %s' % arg)
			sys.stderr.write(__doc__)
			die.exit(1)
	if len(arglist) != 0:
		die.die("Extra arguments: %s" % str(arglist))

	try:
		QC.go_auto(Stdin, coverage=QC.COVERAGE, n_per_dim=QC.N_PER_DIM,
			ftrim=ftrim, ftest=QC.FTEST, verbose=Verbose, modify_class=Modify,
#			evnm=evnm
			)
	finally:
		# Close a file opened via -i; leave the process's own stdin alone.
		if Stdin is not sys.stdin:
			Stdin.close()
