Identifying and Removing Outliers
Jump to navigation
Jump to search
This script checks for outliers in numeric questions.
To run this script
- Ensure that all numeric questions have the correct Question Type: Number, Number - Multi or Number - Grid (questions of any other type are ignored by this script).
- Save this file (File:Identifying Outliers for Numeric Variables.QScript) somewhere on your computer or network (a different location from where it is currently stored).
- Run the QScript.
// Looks for outliers in every visible numeric question of the first data file.
// For each such question a SUMMARY table is added to a new report group; when
// any of the question's variables contains an outlier, a copy of the question
// is created with the outlying values replaced by NaN, and a second table is
// added for that copy. A text report is written to the log at the end.

// defining outliers: a value is an outlier when it is more than this many
// standard deviations away from its variable's mean
var number_sd_from_mean = 3;

// statistics shown on every table created by this script
var summary_statistics = ['Average', 'Standard Deviation', 'Minimum', 'Maximum', 'Base n', 'Missing n'];

// Builds the expression for a JavaScript variable that keeps the values of
// v_name lying within [min_value, max_value] and replaces the rest with NaN.
// The bounds are inclusive so that the filter agrees with the detection test
// below (which only flags values strictly outside the bounds); with exclusive
// bounds a constant variable (standard deviation 0) would lose all its values.
function outlierFilterExpression(v_name, min_value, max_value) {
    return "if (" + v_name + " >= " + min_value + " && " + v_name + " <= " + max_value + ") " + v_name + "; else NaN";
}

// getting data
var data = project.dataFiles[0];
var questions = data.questions;
var n_questions = questions.length;
var log_report = "Looking for outliers in numeric variables\r\n";
log_report += "Test: value is more than " + number_sd_from_mean + " standard deviations from the mean\r\n";
log_report += "Where the data contains outliers, a copy of the data is made and it is labeled as 'OUTLIERS REMOVED'\r\n";
log_report += "\r\n";

// creating the tables in the report tree
var group = project.report.appendGroup();
group.name = "Numeric data";

// constructing the variables
for (var q_counter = 0; q_counter < n_questions; q_counter++) {
    var question = questions[q_counter];
    var is_numeric = question.questionType == "Number" || question.questionType == "Number - Multi" || question.questionType == "Number - Grid";
    if (question.isHidden || !is_numeric)
        continue;

    // constructing the outputs
    var t = group.appendTable();
    t.primary = question;
    t.secondary = "SUMMARY";
    t.cellStatistics = summary_statistics.slice();
    var output = t.calculateOutput();
    var n_columns = question.questionType == "Number - Grid" ? output.numberColumns - 1 : 1; //excludes sums
    var n_rows = Math.max(1, output.numberRows - 1);

    // the statistics are fetched once per table rather than once per cell
    var averages = output.get('Average');
    var standard_deviations = output.get('Standard Deviation');
    var minimums = output.get('Minimum');
    var maximums = output.get('Maximum');

    // determining if there are outliers
    var outliers = false;
    var min_permissable = new Array(n_columns * n_rows);
    var max_permissable = new Array(n_columns * n_rows);
    for (var column = 0; column < n_columns; column++) {
        for (var row = 0; row < n_rows; row++) {
            // NOTE(review): assumes question.variables is ordered column by
            // column (all rows of column 0, then column 1, ...) - confirm for
            // "Number - Grid" questions.
            var v_counter = row + column * n_rows;
            var mean = averages[row][column];
            var sd = standard_deviations[row][column];
            min_permissable[v_counter] = mean - number_sd_from_mean * sd;
            max_permissable[v_counter] = mean + number_sd_from_mean * sd;
            var too_low = minimums[row][column] < min_permissable[v_counter];
            var too_high = maximums[row][column] > max_permissable[v_counter];
            if (too_low || too_high)
                outliers = true;
        }
    }
    if (!outliers)
        continue;

    // creating new data: one filtered copy of each variable in the question
    log_report += question.name + " contains outliers\r\n";
    var variables = question.variables;
    var new_variables = [];
    for (var v = 0; v < variables.length; v++) {
        var v_name = variables[v].name;
        var expression = outlierFilterExpression(v_name, min_permissable[v], max_permissable[v]);
        new_variables.push(data.newJavaScriptVariable(expression, false, v_name + "_noOutliers", variables[v].label, null));
    }
    var new_question = data.setQuestion(question.name + " OUTLIERS REMOVED", question.questionType, new_variables);

    // table showing the summary statistics with the outliers removed
    t = group.appendTable();
    t.primary = new_question;
    t.secondary = "SUMMARY";
    t.cellStatistics = summary_statistics.slice();
}
log(log_report + '\r\n\r\nFinished!');
See also
- QScript for an explanation of how to run this code.
- QScript Reference for technical information.
- JavaScript for information about the JavaScript programming language.