While working on a Hadoop project, I found myself needing a lightweight Java class that computes box plot statistics (e.g. quartiles and outliers), so I wrote the class appended below. The code does not display the plots; it only computes the necessary values, since I’m planning on displaying the plots with matplotlib:
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Computes the values needed to draw a box plot for a set of numbers:
 * the three quartiles, the two whisker ends, and the outliers beyond each
 * whisker. Nothing is plotted here; the results are exposed as fields and
 * can be printed with {@link #display()} or {@link #displayOneLiner()}.
 *
 * <p>Whiskers follow the 1.5 x IQR rule: a value is an outlier when it lies
 * more than 1.5 times the interquartile range above the third quartile or
 * below the first quartile. Each whisker ends at the most extreme data value
 * that is still inside that limit.
 *
 * <p>Field names are kept as-is (snake_case, package-visible) so existing
 * code that reads them keeps compiling.
 */
public class BoxPlotStats {
    /** Whisker reach, expressed as a multiple of the interquartile range. */
    private static final double WHISKER_IQR_FACTOR = 1.5;

    /** 25th percentile of the data (linear interpolation). */
    double first_quartile;
    /** 50th percentile of the data (linear interpolation). */
    double median;
    /** 75th percentile of the data (linear interpolation). */
    double third_quartile;
    /** Largest data value that is not above the upper whisker limit. */
    Double upper_max_within;
    /** Smallest data value that is not below the lower whisker limit. */
    Double lower_max_within;
    /** Data values above the upper whisker limit, in ascending order. */
    ArrayList<Double> upper_outliers;
    /** Data values below the lower whisker limit, in ascending order. */
    ArrayList<Double> lower_outliers;

    /**
     * Returns the requested percentile of an already sorted list, linearly
     * interpolating between the two nearest ranks when the position falls
     * between elements.
     *
     * <p>Adapted from
     * http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/
     *
     * @param values  the data, sorted in ascending order; this method does not
     *                sort or modify it
     * @param percent the percentile as a fraction between 0.0 and 1.0 inclusive
     *                (for example 0.25 for the first quartile)
     * @return the interpolated percentile value
     * @throws NullPointerException     if {@code values} is null
     * @throws IllegalArgumentException if {@code values} is empty or
     *                                  {@code percent} is NaN or outside [0, 1]
     */
    public static double percentile(List<Double> values, double percent) {
        if (values == null) {
            throw new NullPointerException("values must not be null");
        }
        if (values.isEmpty()) {
            throw new IllegalArgumentException("values must not be empty");
        }
        // Written as a negated range test so that NaN is rejected too.
        if (!(percent >= 0.0 && percent <= 1.0)) {
            throw new IllegalArgumentException("percent must be within [0, 1] but was " + percent);
        }

        // Fractional rank of the requested percentile within the sorted list.
        double k = (values.size() - 1) * percent;
        double f = Math.floor(k);
        double c = Math.ceil(k);
        if (f == c) {
            // The rank lands exactly on an element; no interpolation needed.
            return values.get((int) k);
        }
        // Weight each neighbour by its closeness to the fractional rank.
        double d0 = values.get((int) f) * (c - k);
        double d1 = values.get((int) c) * (k - f);
        return d0 + d1;
    }

    /** Prints every statistic on its own labelled line, framed by blank lines. */
    public void display() {
        System.out.println();
        System.out.println("Lower outliers: " + lower_outliers);
        System.out.println("Lower whisker: " + lower_max_within);
        System.out.println("First quartile: " + first_quartile);
        System.out.println("Median: " + median);
        System.out.println("Third quartile: " + third_quartile);
        System.out.println("Upper whisker: " + upper_max_within);
        System.out.println("Upper outliers: " + upper_outliers);
        System.out.println();
    }

    /**
     * Prints all statistics as a single comma-separated line in the order:
     * lower outliers, lower whisker, first quartile, median, third quartile,
     * upper whisker, upper outliers. The outlier fields hold their values
     * separated by {@code |} and are empty when there are no outliers.
     */
    public void displayOneLiner() {
        String lo = joinWithPipes(lower_outliers);
        String uo = joinWithPipes(upper_outliers);
        System.out.println(lo + "," + lower_max_within + "," + first_quartile + "," + median + ","
                + third_quartile + "," + upper_max_within + "," + uo);
    }

    /** Joins the values with a {@code |} separator; an empty list yields an empty string. */
    private static String joinWithPipes(List<Double> items) {
        StringBuilder joined = new StringBuilder();
        for (Double item : items) {
            if (joined.length() > 0) {
                joined.append('|');
            }
            joined.append(item);
        }
        return joined.toString();
    }

    /**
     * Computes the box plot statistics for the given data.
     *
     * @param values the data in any order; it is copied before sorting, so the
     *               caller's list is left untouched
     * @throws NullPointerException     if {@code values} is null or contains null
     * @throws IllegalArgumentException if {@code values} is empty
     */
    BoxPlotStats(List<Double> values) {
        if (values == null) {
            throw new NullPointerException("values must not be null");
        }
        if (values.isEmpty()) {
            throw new IllegalArgumentException("cannot compute box plot statistics for an empty list");
        }

        // Sort a private copy so the caller's list is not reordered as a side effect.
        List<Double> sorted = new ArrayList<Double>(values);
        Collections.sort(sorted);

        first_quartile = percentile(sorted, 0.25);
        median = percentile(sorted, 0.5);
        third_quartile = percentile(sorted, 0.75);

        double range_limits = WHISKER_IQR_FACTOR * (third_quartile - first_quartile);
        double limit_upper_whisker = third_quartile + range_limits;
        double limit_lower_whisker = first_quartile - range_limits;

        upper_outliers = new ArrayList<Double>();
        lower_outliers = new ArrayList<Double>();

        // Start each whisker at the opposite extreme of the data so the scan
        // below can only move it outward, towards its limit.
        upper_max_within = sorted.get(0);
        lower_max_within = sorted.get(sorted.size() - 1);

        for (Double value : sorted) {
            if (value > limit_upper_whisker) {
                upper_outliers.add(value);
            }
            if (value < limit_lower_whisker) {
                lower_outliers.add(value);
            }
            if (value > upper_max_within && value <= limit_upper_whisker) {
                upper_max_within = value;
            }
            if (value < lower_max_within && value >= limit_lower_whisker) {
                lower_max_within = value;
            }
        }
    }
}



