Identifying and Removing Outliers

From Q
Jump to navigation Jump to search

This script checks for outliers in numeric questions.

To run this script

  1. Ensure that all numeric questions have the correct Question Type set: Number, Number - Multi, or Number - Grid (the script ignores questions of any other type).
  2. Save this file (File:Identifying Outliers for Numeric Variables.QScript) to a location somewhere on your computer/network (but not where it is currently located).
  3. Run the QScript.
// Identifies outliers in visible numeric questions of the first data file.
// For each such question a SUMMARY table is added to a "Numeric data" report
// group. If any value lies more than `number_sd_from_mean` standard deviations
// from its variable's mean, a copy of the question is created in which the
// outlying values are replaced by NaN, and a second table shows the copy.

// defining outliers: a value is an outlier when it is more than this many
// standard deviations away from the mean of its variable
var number_sd_from_mean = 3;

// getting data
var data = project.dataFiles[0];
var questions = data.questions;
var n_questions = questions.length;
var log_report = "Looking for outliers in numeric variables\r\n";
log_report += "Test: value is more than " + number_sd_from_mean + " standard deviations from the mean\r\n";
log_report += "Where the data contains outliers, a copy of the data is made and it is labeled as 'OUTLIERS REMOVED'\r\n";
log_report += "\r\n";

// creating the group in the report tree that will hold the tables
var group = project.report.appendGroup();
group.name = "Numeric data";

// constructing the variables
for (var q_counter = 0; q_counter < n_questions; q_counter++) {
  var question = questions[q_counter];
  if (!question.isHidden && (question.questionType == "Number" || question.questionType == "Number - Multi" || question.questionType == "Number - Grid")) {
    // constructing the outputs
    var t = group.appendTable();
    t.primary = question;
    t.secondary = "SUMMARY";
    t.cellStatistics = ['Average', 'Standard Deviation', 'Minimum', 'Maximum', 'Base n', 'Missing n'];
    var output = t.calculateOutput();
    var n_columns = question.questionType == "Number - Grid" ? output.numberColumns - 1 : 1; //excludes sums
    var n_rows = Math.max(1, output.numberRows - 1);

    // determining if there are outliers, and recording the permissible range
    // of every cell so that it can be reused when building the new variables
    var outliers = false;
    var min_permissable = Array(n_columns * n_rows);
    var max_permissable = Array(n_columns * n_rows);
    for (var column = 0; column < n_columns; column++) {
      for (var row = 0; row < n_rows; row++) {
        // NOTE(review): this index is later used to look up question.variables[v];
        // it assumes the variables are ordered row-within-column - confirm for grids.
        var v_counter = row + column * n_rows;
        var mean = output.get('Average')[row][column];
        var sd = output.get('Standard Deviation')[row][column];
        min_permissable[v_counter] = mean - number_sd_from_mean * sd;
        max_permissable[v_counter] = mean + number_sd_from_mean * sd;
        var too_low = output.get('Minimum')[row][column] < min_permissable[v_counter];
        var too_high = output.get('Maximum')[row][column] > max_permissable[v_counter];
        if (too_low || too_high) {
          outliers = true;
        }
      }
    }

    if (outliers) { //creating new data
      log_report += question.name + " contains outliers\r\n";
      var variables = question.variables;
      var new_variables = Array();
      for (var v = 0; v < variables.length; v++) {
        var v_name = variables[v].name;
        // The bounds are inclusive so that this filter matches the detection
        // test above (an outlier is strictly outside the range). With exclusive
        // bounds a value sitting exactly on a bound would be blanked, and a
        // constant variable (standard deviation of 0) would become all NaN.
        var expression = "if (" + v_name + " >= " + min_permissable[v] + " && " + v_name + " <= " + max_permissable[v] + ") " + v_name + "; else NaN";
        new_variables.push(data.newJavaScriptVariable(expression, false, v_name + "_noOutliers", variables[v].label, null));
      }
      var new_question = data.setQuestion(question.name + " OUTLIERS REMOVED", question.questionType, new_variables);

      // table showing the same statistics for the cleaned copy
      t = group.appendTable();
      t.primary = new_question;
      t.secondary = "SUMMARY";
      t.cellStatistics = ['Average', 'Standard Deviation', 'Minimum', 'Maximum', 'Base n', 'Missing n'];
    }
  }
}
log(log_report + '\r\n\r\nFinished!');


See also