Create New Variables - Identifying and Removing Outliers

From Q
Jump to: navigation, search

This QScript checks the selected data for outliers and creates new copies of the data with the outliers removed. Outliers are defined as values that are not within a certain number of standard deviations from the variable mean. The new copies of data will have the outlying values replaced with NaN. Data that does not contain outliers will not be copied.

Technical details

You will be asked to specify:

  1. The data to check.
  2. The cut-off value to use to define outliers. Respondents whose value is not within this many standard deviations of the mean of a variable will be considered outliers. The default value is 3 standard deviations.

A new folder will be created in the report tree that contains tables for the selected data and any new copies of data with the outliers removed.

The new copies of the variables use a JavaScript formula that assigns a value of NaN to respondents with outlying values. The means and standard deviations are calculated once, when this script is run. As a result, the definition of an outlier in the variables where outliers have been removed will not be updated if the underlying data changes.

How to apply this QScript

  • Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
  • Click on the QScript when it appears in the QScripts and Rules section of the search results.

OR

  • Select Automate > Browse Online Library.
  • Select this QScript from the list.

Customizing the QScript

This QScript is written in JavaScript and can be customized by copying and modifying the JavaScript.

Customizing QScripts in Q4.11 and more recent versions

  • Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
  • Hover your mouse over the QScript when it appears in the QScripts and Rules section of the search results.
  • Press Edit a Copy (bottom-left corner of the preview).
  • Modify the JavaScript (see QScripts for more detail on this).
  • Either:
    • Run the QScript, by pressing the blue triangle button.
    • Save the QScript and run it at a later time, using Automate > Run QScript (Macro) from File.

Customizing QScripts in older versions

  • Copy the JavaScript shown on this page.
  • Create a new text file, giving it a file extension of .QScript. See here for more information about how to do this.
  • Modify the JavaScript (see QScripts for more detail on this).
  • Run the file using Automate > Run QScript (Macro) from File.

JavaScript

// The script checks for outliers in selected variables, and creates new variables with the outliers
// removed if any are found.
 
includeWeb("QScript Selection Functions");
includeWeb("QScript Functions to Generate Outputs"); 
includeWeb("QScript Table Functions"); 

 
// Run the script; main() returns true on completion and false when it stopped early.
if (main()) {
    conditionallyEmptyLog("QScript finished.");
} else {
    log("QScript cancelled.");
}
 
/**
 * Entry point of the QScript.
 *
 * Lets the user choose numeric questions (desktop) or takes the current
 * selection (web), checks every variable of each question for outliers via
 * checkVariableForOutliers, and for every question with at least one outlying
 * variable creates a "<name> OUTLIERS REMOVED" copy whose variables are
 * JavaScript variables returning NaN for values outside the permitted range.
 * Summary tables for the original and the cleaned questions are added to a new
 * "Checked for outliers" group in the report.
 *
 * @returns {boolean} true when the script ran to completion, false when it
 *     stopped early (nothing to process, invalid selection, or the cut-off
 *     prompt returned no value).
 */
function main() {
    // Padding, in page units, used when positioning tables on a web page.
    var TABLE_PAD = 20;

    // Appends a SUMMARY table for the given question to a group or page.
    function addSummaryTable(container, primary_question) {
        var table = container.appendTable();
        table.primary = primary_question;
        table.secondary = "SUMMARY";
        table.cellStatistics = ['Average', 'Standard Deviation', 'Minimum', 'Maximum', 'Base n', 'Missing n'];
        return table;
    }

    // Builds the "New variable(s) ... created. " sentence; every name is wrapped in quotes.
    function makeWordList(words) {
        if (words.length == 1)
            return "New variable '" + words[0] + "' created. ";
        return "New variables '" + words.slice(0, words.length - 1).join("', '")
                 + "', and '" + words[words.length - 1] + "' created. ";
    }

    // On the web just take from what is selected.
    var web_mode = (!!Q.isOnTheWeb && Q.isOnTheWeb());
    var questions;

    if (!web_mode) {
        // Selection of questions to process
        var data = dataFileSelection();
        var candidate_questions = getAllQuestionsByTypes(data, ["Number", "Number - Multi", "Number - Grid"]);
        if (candidate_questions.length < 1) {
            log("No numeric variables found.");
            return false;
        }

        questions = selectManyQuestions("Select the data to check for outliers:", candidate_questions).questions;
    } else {
        var allowed_types = ["Numeric", "Numeric - Multi", "Numeric - Grid"];
        var selected_questions = project.report.selectedQuestions();
        if (selected_questions.length == 0) {
            log("To use this QScript, you must select at least one question from Data Sets with one of the following types: " + allowed_types.join(", "));
            return false;
        }
        var sorted_selection = splitArrayIntoApplicableAndNotApplicable(selected_questions, function (q) {
            return allowed_types.indexOf(q.variableSetStructure) != -1 && !q.isBanner;
        });
        questions = sorted_selection.applicable;
        if (sorted_selection.notApplicable.length != 0) {
            log("The type for each of the selected questions must be one of: " + allowed_types.join(", "));
            return false;
        }
    }
    if (questions.length < 1) {
        log("No questions selected.");
        return false;
    }

    // Specify the number of standard deviations that defines an 'outlier'.
    // The web version always uses the default of 3.
    var number_sd_from_mean = 3;
    if (!web_mode) {
        number_sd_from_mean = NaN;
        while (isNaN(number_sd_from_mean)) {
            var response = prompt("Enter the cut-off value to use to identify outliers. Respondents whose value is not within this many standard deviations from the mean will be considered outliers.", 3);
            // No response at all is treated as a cancellation rather than as a cut-off of 0.
            if (response === null || response === undefined)
                return false;
            // Number("") is 0, so blank input has to be rejected explicitly.
            var trimmed = String(response).replace(/^\s+|\s+$/g, "");
            var parsed = trimmed === "" ? NaN : Number(trimmed);
            // Zero or negative cut-offs would make the permitted range empty or inverted.
            if (isFinite(parsed) && parsed > 0)
                number_sd_from_mean = parsed;
            else
                alert('The cut-off value must be a positive number.');
        }
    }

    // Begin the report
    var paragraphs = ["The tables below show data that has been checked for outliers.",
                      "Test: value is more than " + number_sd_from_mean + " standard deviations from the mean.",
                      "Where the data contains outliers, a copy of the data is made and it is labeled as 'OUTLIERS REMOVED'",
                      " "];

    // creating the tables in the report tree
    var group = project.report.appendGroup();
    group.name = "Checked for outliers";

    // Check each of the input variables and construct new variables where needed
    var outliers_found = false;
    var outlier_list = [];
    questions.forEach(function (question) {
        if (question.isHidden)
            return;

        var data_file = question.dataFile;
        var new_table;
        var page;
        var side_by_side = false;
        var max_width;
        var max_height;

        // Make a table for the original question
        if (!web_mode) {
            new_table = addSummaryTable(group, question);
        } else {
            // Position both tables side by side, not possible with "TwoItems" page, so use left/height properties
            page = group.appendPage('Blank');
            page.name = question.name;
            new_table = addSummaryTable(page, question);

            max_width = page.width - 2 * TABLE_PAD;
            max_height = page.height - 3 * TABLE_PAD;  // need space for description text
            if (new_table.height > page.height - TABLE_PAD) {  // place tall tables side-by-side
                side_by_side = true;
                new_table.left = TABLE_PAD;
                max_width = max_width - TABLE_PAD;  // account for middle space
                if (new_table.width > max_width / 2)
                    new_table.width = max_width / 2;
                if (new_table.height > max_height) {
                    new_table.top = TABLE_PAD;
                    new_table.height = max_height;
                }
            } else {  // place 2nd table underneath first
                new_table.top = TABLE_PAD;
                max_height = max_height - TABLE_PAD;  // account for middle space
                if (new_table.height > max_height / 2)
                    new_table.height = max_height / 2;
                if (new_table.width > max_width) {
                    new_table.left = TABLE_PAD;
                    new_table.width = max_width;
                }
            }
        }

        // Check the statistics for each variable in the question to determine
        // if there are any outliers
        var outlier_data = question.variables.map(function (v) {
            return checkVariableForOutliers(v, number_sd_from_mean);
        });
        var has_outliers = outlier_data.some(function (obj) {
            return obj.hasOutliers;
        });
        if (!has_outliers)
            return;

        // Generate new variables if any of the variables in the question contains outliers
        outliers_found = true;
        outlier_list.push(question.name);
        var variables = question.variables;
        var new_variables = [];
        for (var v = 0; v < variables.length; v++) {
            var v_name = variables[v].name;
            // Inclusive bounds: only values strictly outside the permitted range are
            // treated as outliers when checking, so values exactly on a boundary are
            // kept. This also preserves constant variables, whose range collapses to
            // a single point.
            var expression = "if (" + v_name + " >= " + outlier_data[v].minPermissable
                             + " && " + v_name + " <= " + outlier_data[v].maxPermissable + ") "
                             + v_name + "; else NaN";
            new_variables.push(data_file.newJavaScriptVariable(expression, false, preventDuplicateVariableName(data_file, v_name + "_noOutliers"), variables[v].label, null));
        }
        var new_question = data_file.setQuestion(preventDuplicateQuestionName(data_file, question.name + " OUTLIERS REMOVED"), question.questionType, new_variables);

        if (!web_mode) {
            addSummaryTable(group, new_question);
            return;
        }

        var new_table2 = addSummaryTable(page, new_question);
        if (side_by_side) {  // place to right of tall table
            new_table2.left = new_table.width + 2 * TABLE_PAD;
            if (new_table2.width > max_width / 2)
                new_table2.width = max_width / 2;
            if (new_table2.height > max_height) {
                new_table2.top = TABLE_PAD;
                new_table2.height = max_height;
            }
        } else {  // place 2nd table underneath first
            new_table2.top = new_table.height + 2 * TABLE_PAD;
            if (new_table2.height > max_height / 2)
                new_table2.height = max_height / 2;
            if (new_table2.width > max_width) {
                new_table2.left = TABLE_PAD;
                new_table2.width = max_width;
            }
        }

        var descriptive_text = page.appendText();
        descriptive_text.text = "Respondents with values larger than three standard deviations from mean have been removed.";
        if (side_by_side)
            descriptive_text.top = 2 * TABLE_PAD + new_table.height;
        else
            descriptive_text.top = 3 * TABLE_PAD + new_table.height + new_table2.height;
    });

    if (!web_mode) {
        if (outliers_found) {
            paragraphs.push("Outliers found in:");
            paragraphs.push("");
            paragraphs = paragraphs.concat(outlier_list);
        } else
            paragraphs.push('No outliers found');

        simpleHTMLReport(paragraphs, "Checked for outliers", group, true);
    } else {
        if (outliers_found) {
            log(makeWordList(outlier_list)
                + "Tables showing the variables with and without outliers removed have been added to the bottom of the document."
                + " Respondents whose value are not within " + number_sd_from_mean
                + " standard deviations from the mean were considered outliers.");
        } else
            log("No outliers detected. Tables showing a summary of the original variables have been added to the bottom of the document.");
    }
    return true;
}
 
/**
 * Checks a single variable for outliers.
 *
 * A temporary numeric duplicate of the variable is tabulated in a temporary
 * SUMMARY table to read its Average, Standard Deviation, Minimum and Maximum.
 * The variable has outliers when its minimum is below, or its maximum is
 * above, mean -/+ number_sd_from_mean * standard deviation.
 *
 * The temporary table and variable are removed even if building or
 * calculating the table throws, so a failure does not leave stray items in
 * the project.
 *
 * @param variable The variable to check.
 * @param number_sd_from_mean Number of standard deviations from the mean
 *     beyond which a value is an outlier.
 * @returns {{hasOutliers: boolean, maxPermissable: number, minPermissable: number}}
 *     Whether any value lies strictly outside the permitted range, and the
 *     bounds of that range.
 */
function checkVariableForOutliers(variable, number_sd_from_mean) {
    // Generate table data for single variable
    var temp_var = variable.duplicate();
    var temp_table = null;
    try {
        temp_var.variableType = "Numeric";
        temp_table = project.report.appendTable();
        temp_table.primary = temp_var.question;
        temp_table.secondary = "SUMMARY";
        removeRules(temp_table);
        var output = temp_table.calculateOutput();

        // Use table stats to check for outliers
        var mean = output.get('Average')[0][0];
        var sd = output.get('Standard Deviation')[0][0];
        var min_permissable = mean - number_sd_from_mean * sd;
        var max_permissable = mean + number_sd_from_mean * sd;
        var too_low = output.get('Minimum')[0][0] < min_permissable;
        var too_high = output.get('Maximum')[0][0] > max_permissable;

        return {
            hasOutliers: too_low || too_high,
            maxPermissable: max_permissable,
            minPermissable: min_permissable
        };
    } finally {
        // Tidy up: table first, then the duplicated variable it was built on.
        if (temp_table !== null)
            temp_table.deleteItem();
        temp_var.deleteVariable();
    }
}


See also