a Java class for generating box plot statistics

While working on a Hadoop project, I found myself needing a lightweight Java class that computes box plot statistics (quartiles, whiskers, outliers, etc.), so I wrote the class appended below. The code does not display the plots; it only computes the necessary values, since I’m planning to display the plots with matplotlib:

import java.util.ArrayList;
import java.util.Collections;

/**
 * Computes the numbers needed to draw a box plot for a set of values:
 * the three quartiles, the two whisker ends, and the outliers beyond them.
 *
 * <p>A value is an outlier when it lies more than 1.5 times the interquartile
 * range below the first quartile or above the third quartile. Each whisker
 * ends at the most extreme value that is not an outlier on that side.
 *
 * <p>This class only computes the values; it does not draw anything.
 */
public class BoxPlotStats {
    /** Whisker reach expressed as a multiple of the interquartile range. */
    private static final double WHISKER_IQR_FACTOR = 1.5;

    double first_quartile;
    double median;
    double third_quartile;
    /** Largest value that is not above the upper whisker limit. */
    Double upper_max_within;
    /** Smallest value that is not below the lower whisker limit. */
    Double lower_max_within;
    /** Values above the upper whisker limit, in ascending order. */
    ArrayList<Double> upper_outliers;
    /** Values below the lower whisker limit, in ascending order. */
    ArrayList<Double> lower_outliers;

    // this method adapted from http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/
    /**
     * Returns the requested percentile of an already sorted list, linearly
     * interpolating between the two neighbouring elements when the requested
     * position falls between indices.
     *
     * @param values  the values, sorted in ascending order; this method does
     *                not sort or modify the list
     * @param percent the percentile as a fraction in {@code [0, 1]}
     *                (for example {@code 0.5} for the median)
     * @return the interpolated percentile value
     * @throws IllegalArgumentException if {@code values} is empty or
     *         {@code percent} is not within {@code [0, 1]}
     * @throws NullPointerException if {@code values} or the element that is
     *         read is {@code null}
     */
    public static double percentile(ArrayList<? extends Number> values, double percent) {
        if (values.isEmpty()) {
            throw new IllegalArgumentException("values must not be empty");
        }
        // Written as a negated conjunction so that NaN is rejected as well.
        if (!(percent >= 0.0 && percent <= 1.0)) {
            throw new IllegalArgumentException("percent must be within [0, 1] but was " + percent);
        }

        // Fractional index of the requested percentile in the sorted list.
        double k = (values.size() - 1) * percent;
        int below = (int) Math.floor(k);
        int above = (int) Math.ceil(k);
        if (below == above) {
            // The position is an exact index, so there is nothing to interpolate.
            return values.get(below).doubleValue();
        }
        // above - below == 1 here, so the two weights sum to one.
        double d0 = values.get(below).doubleValue() * (above - k);
        double d1 = values.get(above).doubleValue() * (k - below);
        return d0 + d1;
    }

    /** Prints every statistic on its own labelled line, framed by blank lines. */
    public void display() {
        System.out.println();
        System.out.println("Lower outliers: " + lower_outliers);
        System.out.println("Lower whisker: " + lower_max_within);
        System.out.println("First quartile: " + first_quartile);
        System.out.println("Median: " + median);
        System.out.println("Third quartile: " + third_quartile);
        System.out.println("Upper whisker: " + upper_max_within);
        System.out.println("Upper outliers: " + upper_outliers);
        System.out.println();
    }

    /**
     * Prints all statistics as one comma-separated line in the order: lower
     * outliers, lower whisker, first quartile, median, third quartile, upper
     * whisker, upper outliers. The outlier fields hold their values joined
     * with {@code |} and are empty when there are no outliers.
     */
    public void displayOneLiner() {
        String lo = joinWithPipe(lower_outliers);
        String uo = joinWithPipe(upper_outliers);
        System.out.println(lo + "," + lower_max_within + "," + first_quartile + "," + median + ","
                + third_quartile + "," + upper_max_within + "," + uo);
    }

    /** Joins the items with {@code |}; an empty list yields an empty string. */
    private static String joinWithPipe(ArrayList<Double> items) {
        StringBuilder joined = new StringBuilder();
        for (Double item : items) {
            if (joined.length() > 0) {
                joined.append('|');
            }
            joined.append(item);
        }
        return joined.toString();
    }

    /**
     * Computes the box plot statistics for the given values.
     *
     * <p>The values are copied before sorting, so the caller's list is left
     * untouched. Any {@link Number} type is accepted; each element is
     * converted with {@link Number#doubleValue()}.
     *
     * @param values the values to summarise, in any order
     * @throws IllegalArgumentException if {@code values} is empty
     * @throws NullPointerException if {@code values} or one of its elements
     *         is {@code null}
     */
    BoxPlotStats(ArrayList<? extends Number> values) {
        if (values.isEmpty()) {
            throw new IllegalArgumentException("values must not be empty");
        }

        // Sort a private copy so the caller's list keeps its original order.
        ArrayList<Double> sorted = new ArrayList<Double>(values.size());
        for (Number value : values) {
            sorted.add(value.doubleValue());
        }
        Collections.sort(sorted);

        first_quartile = percentile(sorted, 0.25);
        median = percentile(sorted, 0.5);
        third_quartile = percentile(sorted, 0.75);
        double range_limits = WHISKER_IQR_FACTOR * (third_quartile - first_quartile);

        double limit_upper_whisker = third_quartile + range_limits;
        double limit_lower_whisker = first_quartile - range_limits;

        upper_outliers = new ArrayList<Double>();
        lower_outliers = new ArrayList<Double>();
        // Start each whisker at the opposite extreme so that any value inside
        // the limits can replace it during the scan below.
        upper_max_within = sorted.get(0);
        lower_max_within = sorted.get(sorted.size() - 1);

        for (Double value : sorted) {
            if (value > limit_upper_whisker) {
                upper_outliers.add(value);
            }
            if (value < limit_lower_whisker) {
                lower_outliers.add(value);
            }

            if (value > upper_max_within && value <= limit_upper_whisker) {
                upper_max_within = value;
            }
            if (value < lower_max_within && value >= limit_lower_whisker) {
                lower_max_within = value;
            }
        }
    }

}
This entry was posted in engineering and tagged , , . Bookmark the permalink.

Leave a Reply

Your email address will not be published.

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>