Identifying and Removing Outliers

From Q
Jump to: navigation, search

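This script checks numeric questions for outliers (values more than three standard deviations from the mean). For each question that contains outliers, it creates a copy of the question, labeled "OUTLIERS REMOVED", in which the outlying values are set to missing.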

To run this script

  1. Ensure that all numeric questions have their Question Type set to Number, Number - Multi or Number - Grid (the script ignores questions of any other type).
  2. Save this file (File:Identifying Outliers for Numeric Variables.QScript) to a location somewhere on your computer/network (but not where it is currently located).
  3. Run the QScript.
// Identifies outliers in numeric questions and builds outlier-free copies.
//
// For every visible question of type "Number", "Number - Multi" or
// "Number - Grid" in the first data file of the project, the script:
//   1. adds a SUMMARY table to a new "Numeric data" report group;
//   2. derives permissible limits (mean +/- number_sd_from_mean * SD) for
//      each cell of that table;
//   3. when any cell's minimum or maximum falls outside its limits, creates
//      JavaScript variables in which out-of-range values become NaN, groups
//      them into a question named "<name> OUTLIERS REMOVED", and adds a
//      second SUMMARY table for that new question.
// A text report is passed to log() at the end.

// defining outliers: a value is an outlier when it lies more than this many
// standard deviations from the mean
var number_sd_from_mean = 3;

// Appends a SUMMARY table for `question` to `report_group` and returns it.
function appendSummaryTable(report_group, question) {
  var table = report_group.appendTable();
  table.primary = question;
  table.secondary = "SUMMARY";
  table.cellStatistics = ['Average', 'Standard Deviation', 'Minimum', 'Maximum', 'Base n', 'Missing n'];
  return table;
}

// Returns true when `value` is a real number that can serve as a limit.
function isUsableLimit(value) {
  return typeof value === "number" && !isNaN(value);
}

// Builds the expression for the outlier-free copy of one variable.
// Values exactly on a limit are kept: the detection step below only treats
// values strictly beyond the limits as outliers, so the filter must agree.
// When a limit is unavailable (e.g. the SD could not be computed) the
// variable is copied unchanged; comparing against NaN would otherwise turn
// every value of that variable into missing data.
function outlierFilterExpression(variable_name, lower_limit, upper_limit) {
  if (!isUsableLimit(lower_limit) || !isUsableLimit(upper_limit)) {
    return variable_name;
  }
  return "if (" + variable_name + " >= " + lower_limit + " && " + variable_name + " <= " + upper_limit + ") " + variable_name + "; else NaN";
}

// getting data (only the first data file of the project is examined)
var data = project.dataFiles[0];
var questions = data.questions;
var n_questions = questions.length;
var log_report = "Looking for outliers in numeric variables\r\n";
log_report += "Test: value is more than " + number_sd_from_mean + " standard deviations from the mean\r\n";
log_report += "Where the data contains outliers, a copy of the data is made and it is labeled as 'OUTLIERS REMOVED'\r\n";
log_report += "\r\n";

// creating the tables in the report tree
var group = project.report.appendGroup();
group.name = "Numeric data";

// constructing the variables
for (var q_counter = 0; q_counter < n_questions; q_counter++) {
  var question = questions[q_counter];
  var is_numeric = question.questionType == "Number" || question.questionType == "Number - Multi" || question.questionType == "Number - Grid";
  if (question.isHidden || !is_numeric) {
    continue;
  }

  // constructing the outputs
  var t = appendSummaryTable(group, question);
  var output = t.calculateOutput();
  var n_columns = question.questionType == "Number - Grid" ? output.numberColumns - 1 : 1; // excludes sums
  // presumably the last row is also a sum; Math.max keeps a single-row table usable
  var n_rows = Math.max(1, output.numberRows - 1);

  // fetch each statistic once rather than once per cell
  var averages = output.get('Average');
  var standard_deviations = output.get('Standard Deviation');
  var minimums = output.get('Minimum');
  var maximums = output.get('Maximum');

  // determining if there are outliers
  var outliers = false;
  var min_permissible = Array(n_columns * n_rows);
  var max_permissible = Array(n_columns * n_rows);
  for (var column = 0; column < n_columns; column++) {
    for (var row = 0; row < n_rows; row++) {
      // limits are stored column by column
      var v_counter = row + column * n_rows;
      var mean = averages[row][column];
      var sd = standard_deviations[row][column];
      min_permissible[v_counter] = mean - number_sd_from_mean * sd;
      max_permissible[v_counter] = mean + number_sd_from_mean * sd;
      var too_low = minimums[row][column] < min_permissible[v_counter];
      var too_high = maximums[row][column] > max_permissible[v_counter];
      if (too_low || too_high) {
        outliers = true;
      }
    }
  }

  if (outliers) { // creating new data
    log_report += question.name + " contains outliers\r\n";
    var variables = question.variables;
    var new_variables = [];
    for (var v = 0; v < variables.length; v++) {
      var v_name = variables[v].name;
      // NOTE(review): this assumes question.variables is ordered the same way
      // as the limits above (rows within columns) - confirm for grid questions.
      var expression = outlierFilterExpression(v_name, min_permissible[v], max_permissible[v]);
      new_variables.push(data.newJavaScriptVariable(expression, false, v_name + "_noOutliers", variables[v].label, null));
    }
    var new_question = data.setQuestion(question.name + " OUTLIERS REMOVED", question.questionType, new_variables);
    t = appendSummaryTable(group, new_question);
  }
}
log(log_report + '\r\n\r\nFinished!');


See also