/* $Id: descr_stats.c,v 1.2 1999/03/02 16:38:31 roca Exp $ */
/*
 * descr_stats.c
 *
 *	Created by  : V.Roca
 *	Date	    : February 1999
 *
 *	This file calculates various statistics on a set of values.
 *	(mean, median, variance, standard deviation, confidence interval)
 */
/*
 * (c) Copyright 1998/1999 - V. Roca (vincent.roca@lip6.fr)
 * This tool is provided as is, without any warranty.
 * Permission to use, copy and modify is provided for non commercial
 * purposes as long as this notice appears on all copies.
 */

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <math.h>

/*
 * Debug tracing.  Call as TRACE((fmt, args...)): the inner parentheses
 * carry the whole printf argument list as a single macro argument.
 * Expands to a printf call when DEBUG is defined, to nothing otherwise.
 */
#ifdef DEBUG
#define TRACE(m)	printf m
#else
#define TRACE(m)
#endif

/* NB: each argument may be evaluated twice, avoid side effects */
#define min(a,b)	((a) < (b) ? (a) : (b))
#define max(a,b)	((a) > (b) ? (a) : (b))


#define STEPVAL		0.01	/* size of an interval for histograms */
#define MAXNAMELEN	256	/* max name string length */
#define MAXCMDLEN	2048	/* max command string length */
#define MAXVAL		16384	/* max nb of values */


float	val[MAXVAL];		/* array of values from input file */
int	valnb;			/* nb of values of array */
int	middle;			/* index in val[] used by main for the median */


/*
 * Print the command line synopsis on stdout, then terminate the
 * program with exit(-1).  Only argv[0] (the program name) is used;
 * argc is ignored.
 */
void
usage (int	argc,
       char	*argv[])
{
	const char	*progname = argv[0];

	printf("usage: %s row if\n"
	       "\trow\trow to consider in the input file (first row is 1)\n"
	       "\tif\tinput file\n",
	       progname);
	exit(-1);
}


/*
 * Find the confidence interval around the given value.
 *
 * Works on the global val[] / valnb, which are expected to be sorted in
 * increasing order (main loads them from the output of "sort -n").
 * The function first looks for the index i such that
 * val[i] < center <= val[i+1], then grows a window [inf, sup] around it
 * one sample at a time, always taking the neighbour closest to center,
 * for floor(valnb * confidence) steps.
 *
 *	confidence	fraction of the samples to cover (e.g. 0.95)
 *	center		value the interval is built around
 *
 * Returns the largest of the two distances between center and the window
 * bounds, or -1.0 when no two consecutive samples bracket center.
 */
float
confidence_int (float	confidence,
		float	center)
{
	int	i_center;
	int	found = 0;		/* set once center is bracketed */
	int	inf, sup;
	int	need;			/* we need so many values... */
	int	nb;			/* ...but we have this number */
	float	delta_inf;
	float	delta_sup;

	for (i_center = 0; i_center < valnb - 1; i_center++) {
		if (val[i_center] < center && val[i_center+1] >= center) {
			found = 1;
			break;
		}
	}
	/*
	 * Use an explicit flag: testing i_center against 0 would also
	 * reject the legitimate case where center lies between val[0]
	 * and val[1].
	 */
	if (!found)
		return -1.0;	/* not found */

	need = (int)floor((double)(valnb * confidence));
	inf = sup = i_center;
	for (nb = 0; nb < need; nb++) {
		if (inf == 0 && sup == valnb - 1)
			break;	/* whole array covered, cannot grow further */
		if (inf == 0)
			sup++;
		else if (sup == valnb - 1)
			inf--;
		else if ((center - val[inf - 1]) < (val[sup + 1] - center))
			inf--;
		else
			sup++;
	}
	delta_inf = center - val[inf];
	delta_sup = val[sup] - center;
	TRACE(("delta_inf=%f, delta_sup=%f, inf=%d, sup=%d, need=%d\n",
		delta_inf, delta_sup, inf, sup, need));
	return (max(delta_inf, delta_sup));
}


void
main (int	argc,
      char	*argv[])
{
	FILE	*ifp, *ofp;		/* input/output file ptrs */
	char	ifn[MAXNAMELEN];	/* input file name string */
	char	ofn[MAXNAMELEN];	/* output file name string */
	char	demfn[MAXNAMELEN];	/* .dem file name string */
	char	cmd[MAXCMDLEN];		/* command string */
	int	row;			/* colomn number to keep */
	int	i;
	float	sum, mean, median, variance, std_deviation, range;
	char	answer;
	float	upto, step, half_step;
	int	nb_per_int;		/* histograms: sample nb per interval */
	float	int_90_mean;
	float	int_90_median;
	float	int_95_mean;
	float	int_95_median;
	float	int_99_mean;
	float	int_99_median;

	if (argc != 3)
		usage(argc, argv);

	row = atoi(argv[1]);
	strncpy(ifn, argv[2], MAXNAMELEN);

	sprintf(ofn, "/tmp/of%d", getpid());
	TRACE(("\ttemp file is %s\n", ofn));

	/* keep only the appropriate row */
	sprintf(cmd, "cat %s | awk '{ print $%d }' > %s", ifn, row, ofn);
	TRACE(("--> keep row cmd: \"%s\"\n", cmd));
	system(cmd);

	strncpy(ifn, ofn, MAXNAMELEN);
	sprintf(ofn, "/tmp/of%d", getpid()+1);
	TRACE(("\tnew temp file is %s\n", ofn));

	/* sort the file (specify ifn both for input and output file) */
	sprintf(cmd, "sort -n %s > %s", ifn, ofn);
	TRACE(("--> sort cmd: \"%s\"\n", cmd));
	system(cmd);

	strncpy(ifn, ofn, MAXNAMELEN);
	if ((ifp = fopen(ifn, "r")) < 0) {
		perror("open");
		usage(argc, argv);
	}

	/* store values of the input file into our array */
	valnb = 0;
	while (fscanf(ifp, "%f", &val[valnb]) == 1)
		valnb++;
	if (valnb <= 1) {
		printf("ERROR: no value read in %s\n", ifn);
		exit(-1);
	}

	/* now calculate various stats */
	for (i = 0; i < valnb; i++) {
		sum += val[i];
	}

	mean = sum / valnb;

	if (valnb % 2) {
		/*
		 * odd nb of samples
		 */
		middle = floor(valnb / 2);
		median = val[middle];
	} else {
		/*
		 * even nb of samples
		 * val[middle] | median | val[middle+1]
		 */
		middle = (floor)((valnb - 1) / 2);
		median = (val[middle] + val[middle+1]) / 2;
	}

	variance = 0.0;
	for (i = 0; i < valnb; i++) {
		variance += (val[i] - mean) * (val[i] - mean);
	}
	variance = variance / (valnb - 1);
	std_deviation = sqrt(variance);

	range = val[valnb -1] - val[0];

	int_90_mean = confidence_int(0.90, mean);
	int_95_mean = confidence_int(0.95, mean);
	int_99_mean = confidence_int(0.99, mean);
	int_90_median = confidence_int(0.90, median);
	int_95_median = confidence_int(0.95, median);
	int_99_median = confidence_int(0.99, median);

	/*
	 * Print results
	 */
#ifdef NEVERDEF
	printf(
"	------------------------------------------------------\n\
	nb of samples = %d\n\
	mean = %f\n\
	median = %f\n\
	variance = %f\n\
	standard deviation = %f\n\
	range = %f\n\
	confidence interval around mean %f:\n\
		90: +/- %f\n\
		95: +/- %f\n\
		99: +/- %f\n\
	confidence interval around median %f:\n\
		90: +/- %f\n\
		95: +/- %f\n\
		99: +/- %f\n\
	------------------------------------------------------\n",
		valnb, mean, median, variance, std_deviation, range,
		mean, int_90_mean, int_95_mean, int_99_mean,
		median, int_90_median, int_95_median, int_99_median);
#else
	printf(
"	------------------------------------------------------\n\
	nb of samples = %d\n\
	mean = %f\n\
	median = %f\n\
	variance = %f\n\
	standard deviation = %f\n\
	range = %f\n\
	confidence interval around mean %f:\n\
		90: +/- %f\n\
		95: +/- %f\n\
		99: +/- %f\n\
	------------------------------------------------------\n",
		valnb, mean, median, variance, std_deviation, range,
		mean, int_90_mean, int_95_mean, int_99_mean);
#endif

	/*
	 * Histogram file preparation
	 */
	answer = 'n';
	printf("Continue with histogram (y/n)[n] ? ");
	scanf("%c", &answer);
	if (answer != 'y')
		goto cleanup;

	step = STEPVAL;
	half_step = step / 2;
	printf("use step %f\n", step);

	sprintf(ofn, "/tmp/histo%d.dat", getpid());
	sprintf(demfn, "/tmp/histo%d.dem", getpid());
	printf("histogram data file is:		%s\n", ofn);
	printf("histogram gnuplot file is:	%s\n", demfn);

	if ((ofp = fopen(ofn, "w")) < 0) {
		perror("open");
		exit(-1);
	}

	for (upto = val[0] + step, i = 0, nb_per_int = 0;
	     i < valnb;
	     i++) {
		if (val[i] < upto) {
			nb_per_int ++;
		} else {
			/* switch to next interval */
			fprintf(ofp, "%f %d\n", upto - half_step, nb_per_int);
			upto = val[i] + step;
			nb_per_int = 1;
		}
	}
	/* don't forget the last one... */
	fprintf(ofp, "%f %d\n", upto - half_step, nb_per_int);

	/* and now prepar the histo.dem file */
	sprintf(cmd, "echo \'
		set title  \"Histogram\"\n\
		set xlabel \"value\"\n\
		set ylabel \"number of samples per %.3f interval\"\n\
		set autoscale\n\
		set nolabel\n\
		set grid\n\
		plot [] [0:] \"%s\" with impulses\n\
		pause -1 \"Hit return to continue\" \' > %s",
		step, ofn, demfn);
	TRACE(("--> histo.dem creation cmd: \"%s\"\n", cmd));
	system(cmd);

#ifdef NEVERDEF
	/* and finally launch gnuplot... */
	/*sprintf(cmd, "xterm -e gnuplot -fn 5x8 %s", demfn);*/
	sprintf(cmd, "gnuplot %s", demfn);
	TRACE(("--> gnuplot cmd: \"%s\"\n", cmd));
	system(cmd);
#endif

cleanup:
#ifdef DEBUG
	/* keep temp output files */
#else
	/* remove everything */
	sprintf(cmd, "rm /tmp/of[0-9]*");
	system(cmd);
#endif
}

